diff --git a/docs/vibevoice-vllm-asr.md b/docs/vibevoice-vllm-asr.md index 8096469..fa43735 100644 --- a/docs/vibevoice-vllm-asr.md +++ b/docs/vibevoice-vllm-asr.md @@ -10,6 +10,7 @@ Deploy VibeVoice ASR model as a high-performance API service using [vLLM](https: - **📡 OpenAI-Compatible API**: Standard `/v1/chat/completions` endpoint with streaming support - **🎵 Long Audio Support**: Process up to 60+ minutes of audio in a single request - **🔌 Plugin Architecture**: No vLLM source code modification required - just install and run +- **⚡ Data Parallel (DP)**: Run independent model replicas across multiple GPUs with automatic load balancing behind a single port ## 🛠️ Installation @@ -35,6 +36,66 @@ docker run -d --gpus all --name vibevoice-vllm \ -c "python3 /app/vllm_plugin/scripts/start_server.py" ``` +## ⚡ Multi-GPU Deployment + +The launcher supports two types of GPU parallelism via `--tp` and `--dp` flags: + +| Flag | Name | What it does | +|------|------|-------------| +| `--tp N` | Tensor Parallel | Splits **one model** across N GPUs (for models too large for a single GPU) | +| `--dp N` | Data Parallel | Runs **N independent replicas**, one per GPU, with automatic load balancing behind a single port | + +### Data Parallel (Recommended for scaling throughput) + +Run 4 independent replicas on 4 GPUs — vLLM automatically distributes incoming requests: + +```bash +docker run -d --gpus '"device=0,1,2,3"' --name vibevoice-vllm \ + --ipc=host \ + -p 8000:8000 \ + -e VIBEVOICE_FFMPEG_MAX_CONCURRENCY=64 \ + -e PYTORCH_ALLOC_CONF=expandable_segments:True \ + -v $(pwd):/app \ + -w /app \ + --entrypoint bash \ + vllm/vllm-openai:v0.14.1 \ + -c "python3 /app/vllm_plugin/scripts/start_server.py --dp 4" +``` + +### Tensor Parallel + +Split a single model across 2 GPUs (useful if GPU memory is limited): + +```bash +docker run -d --gpus '"device=0,1"' --name vibevoice-vllm \ + --ipc=host \ + -p 8000:8000 \ + -e VIBEVOICE_FFMPEG_MAX_CONCURRENCY=64 \ + -e PYTORCH_ALLOC_CONF=expandable_segments:True \ + -v $(pwd):/app \ + -w /app \ + --entrypoint bash \ + vllm/vllm-openai:v0.14.1 \ + -c "python3 /app/vllm_plugin/scripts/start_server.py --tp 2" +``` + +### Hybrid (DP × TP) + +Combine both — e.g., 2 replicas, each split across 2 GPUs (4 GPUs total): + +```bash +docker run -d --gpus '"device=0,1,2,3"' --name vibevoice-vllm \ + --ipc=host \ + -p 8000:8000 \ + -v $(pwd):/app \ + -w /app \ + --entrypoint bash \ + vllm/vllm-openai:v0.14.1 \ + -c "python3 /app/vllm_plugin/scripts/start_server.py --dp 2 --tp 2" +``` + +> **Note**: Total GPUs required = `dp × tp`. Make sure to expose enough GPU devices in the Docker `--gpus` flag. + 3. View logs ```bash docker logs -f vibevoice-vllm diff --git a/vllm_plugin/scripts/start_server.py b/vllm_plugin/scripts/start_server.py index 7032391..78b85ef 100644 --- a/vllm_plugin/scripts/start_server.py +++ b/vllm_plugin/scripts/start_server.py @@ -77,10 +77,14 @@ def generate_tokenizer(model_path: str) -> None: ) -def start_vllm_server(model_path: str, port: int) -> None: +def start_vllm_server(model_path: str, port: int, + tensor_parallel_size: int = 1, + data_parallel_size: int = 1) -> None: """Start vLLM server (replaces current process).""" print(f"\n{'='*60}") print(f" Starting vLLM server on port {port}") + print(f" Tensor Parallel (TP): {tensor_parallel_size}") + print(f" Data Parallel (DP): {data_parallel_size}") print(f"{'='*60}\n") vllm_cmd = [ @@ -96,7 +100,8 @@ def start_vllm_server(model_path: str, port: int) -> None: "--no-enable-prefix-caching", "--enable-chunked-prefill", "--chat-template-content-format", "openai", - "--tensor-parallel-size", "1", + "--tensor-parallel-size", str(tensor_parallel_size), + "--data-parallel-size", str(data_parallel_size), "--allowed-local-media-path", "/app", "--port", str(port), ] @@ -110,12 +115,18 @@ def main(): formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: - # Start with default settings + # Start with default settings (single GPU) python3 start_server.py # Use custom port python3 start_server.py --port 8080 + # Data parallel: 4 independent replicas on 4 GPUs (load balancing) + python3 start_server.py --dp 4 + + # Tensor parallel: split model across 2 GPUs + python3 start_server.py --tp 2 + # Skip dependency installation (if already installed) python3 start_server.py --skip-deps """ @@ -141,6 +152,20 @@ Examples: action="store_true", help="Skip generating tokenizer files" ) + parser.add_argument( + "--tp", "--tensor-parallel-size", + type=int, + default=1, + dest="tensor_parallel_size", + help="Tensor parallel size: split one model across N GPUs (default: 1)" + ) + parser.add_argument( + "--dp", "--data-parallel-size", + type=int, + default=1, + dest="data_parallel_size", + help="Data parallel size: run N independent model replicas for load balancing (default: 1)" + ) args = parser.parse_args() print("\n" + "="*60) @@ -162,7 +187,9 @@ Examples: generate_tokenizer(model_path) # Step 5: Start vLLM server - start_vllm_server(model_path, args.port) + start_vllm_server(model_path, args.port, + tensor_parallel_size=args.tensor_parallel_size, + data_parallel_size=args.data_parallel_size) if __name__ == "__main__":