Add data parallel (DP) support to vLLM server launcher
- Add --dp/--data-parallel-size flag for running independent model replicas across multiple GPUs with automatic load balancing behind a single port
- Add --tp/--tensor-parallel-size flag (previously hardcoded to 1)
- Update docs/vibevoice-vllm-asr.md with multi-GPU deployment guide covering DP, TP, and hybrid (DP × TP) configurations

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -10,6 +10,7 @@ Deploy VibeVoice ASR model as a high-performance API service using [vLLM](https:
|
|||||||
- **📡 OpenAI-Compatible API**: Standard `/v1/chat/completions` endpoint with streaming support
|
- **📡 OpenAI-Compatible API**: Standard `/v1/chat/completions` endpoint with streaming support
|
||||||
- **🎵 Long Audio Support**: Process up to 60+ minutes of audio in a single request
|
- **🎵 Long Audio Support**: Process up to 60+ minutes of audio in a single request
|
||||||
- **🔌 Plugin Architecture**: No vLLM source code modification required - just install and run
|
- **🔌 Plugin Architecture**: No vLLM source code modification required - just install and run
|
||||||
|
- **⚡ Data Parallel (DP)**: Run independent model replicas across multiple GPUs with automatic load balancing behind a single port
|
||||||
|
|
||||||
## 🛠️ Installation
|
## 🛠️ Installation
|
||||||
|
|
||||||
@@ -35,6 +36,66 @@ docker run -d --gpus all --name vibevoice-vllm \
|
|||||||
-c "python3 /app/vllm_plugin/scripts/start_server.py"
|
-c "python3 /app/vllm_plugin/scripts/start_server.py"
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## ⚡ Multi-GPU Deployment
|
||||||
|
|
||||||
|
The launcher supports two types of GPU parallelism via `--tp` and `--dp` flags:
|
||||||
|
|
||||||
|
| Flag | Name | What it does |
|
||||||
|
|------|------|-------------|
|
||||||
|
| `--tp N` | Tensor Parallel | Splits **one model** across N GPUs (for models too large for a single GPU) |
|
||||||
|
| `--dp N` | Data Parallel | Runs **N independent replicas**, each on its own GPU (or its own group of GPUs when combined with `--tp`), with automatic load balancing behind a single port |
|
||||||
|
|
||||||
|
### Data Parallel (Recommended for scaling throughput)
|
||||||
|
|
||||||
|
Run 4 independent replicas on 4 GPUs — vLLM automatically distributes incoming requests:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker run -d --gpus '"device=0,1,2,3"' --name vibevoice-vllm \
|
||||||
|
--ipc=host \
|
||||||
|
-p 8000:8000 \
|
||||||
|
-e VIBEVOICE_FFMPEG_MAX_CONCURRENCY=64 \
|
||||||
|
-e PYTORCH_ALLOC_CONF=expandable_segments:True \
|
||||||
|
-v $(pwd):/app \
|
||||||
|
-w /app \
|
||||||
|
--entrypoint bash \
|
||||||
|
vllm/vllm-openai:v0.14.1 \
|
||||||
|
-c "python3 /app/vllm_plugin/scripts/start_server.py --dp 4"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Tensor Parallel
|
||||||
|
|
||||||
|
Split a single model across 2 GPUs (useful if GPU memory is limited):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker run -d --gpus '"device=0,1"' --name vibevoice-vllm \
|
||||||
|
--ipc=host \
|
||||||
|
-p 8000:8000 \
|
||||||
|
-e VIBEVOICE_FFMPEG_MAX_CONCURRENCY=64 \
|
||||||
|
-e PYTORCH_ALLOC_CONF=expandable_segments:True \
|
||||||
|
-v $(pwd):/app \
|
||||||
|
-w /app \
|
||||||
|
--entrypoint bash \
|
||||||
|
vllm/vllm-openai:v0.14.1 \
|
||||||
|
-c "python3 /app/vllm_plugin/scripts/start_server.py --tp 2"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Hybrid (DP × TP)
|
||||||
|
|
||||||
|
Combine both — e.g., 2 replicas, each split across 2 GPUs (4 GPUs total):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker run -d --gpus '"device=0,1,2,3"' --name vibevoice-vllm \
|
||||||
|
--ipc=host \
|
||||||
|
-p 8000:8000 \
|
||||||
|
-v $(pwd):/app \
|
||||||
|
-w /app \
|
||||||
|
--entrypoint bash \
|
||||||
|
vllm/vllm-openai:v0.14.1 \
|
||||||
|
-c "python3 /app/vllm_plugin/scripts/start_server.py --dp 2 --tp 2"
|
||||||
|
```
|
||||||
|
|
||||||
|
> **Note**: Total GPUs required = `dp × tp`. Make sure to expose enough GPU devices in the Docker `--gpus` flag.
|
||||||
|
|
||||||
3. View logs
|
3. View logs
|
||||||
```bash
|
```bash
|
||||||
docker logs -f vibevoice-vllm
|
docker logs -f vibevoice-vllm
|
||||||
|
|||||||
@@ -77,10 +77,14 @@ def generate_tokenizer(model_path: str) -> None:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def start_vllm_server(model_path: str, port: int) -> None:
|
def start_vllm_server(model_path: str, port: int,
|
||||||
|
tensor_parallel_size: int = 1,
|
||||||
|
data_parallel_size: int = 1) -> None:
|
||||||
"""Start vLLM server (replaces current process)."""
|
"""Start vLLM server (replaces current process)."""
|
||||||
print(f"\n{'='*60}")
|
print(f"\n{'='*60}")
|
||||||
print(f" Starting vLLM server on port {port}")
|
print(f" Starting vLLM server on port {port}")
|
||||||
|
print(f" Tensor Parallel (TP): {tensor_parallel_size}")
|
||||||
|
print(f" Data Parallel (DP): {data_parallel_size}")
|
||||||
print(f"{'='*60}\n")
|
print(f"{'='*60}\n")
|
||||||
|
|
||||||
vllm_cmd = [
|
vllm_cmd = [
|
||||||
@@ -96,7 +100,8 @@ def start_vllm_server(model_path: str, port: int) -> None:
|
|||||||
"--no-enable-prefix-caching",
|
"--no-enable-prefix-caching",
|
||||||
"--enable-chunked-prefill",
|
"--enable-chunked-prefill",
|
||||||
"--chat-template-content-format", "openai",
|
"--chat-template-content-format", "openai",
|
||||||
"--tensor-parallel-size", "1",
|
"--tensor-parallel-size", str(tensor_parallel_size),
|
||||||
|
"--data-parallel-size", str(data_parallel_size),
|
||||||
"--allowed-local-media-path", "/app",
|
"--allowed-local-media-path", "/app",
|
||||||
"--port", str(port),
|
"--port", str(port),
|
||||||
]
|
]
|
||||||
@@ -110,12 +115,18 @@ def main():
|
|||||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||||
epilog="""
|
epilog="""
|
||||||
Examples:
|
Examples:
|
||||||
# Start with default settings
|
# Start with default settings (single GPU)
|
||||||
python3 start_server.py
|
python3 start_server.py
|
||||||
|
|
||||||
# Use custom port
|
# Use custom port
|
||||||
python3 start_server.py --port 8080
|
python3 start_server.py --port 8080
|
||||||
|
|
||||||
|
# Data parallel: 4 independent replicas on 4 GPUs (load balancing)
|
||||||
|
python3 start_server.py --dp 4
|
||||||
|
|
||||||
|
# Tensor parallel: split model across 2 GPUs
|
||||||
|
python3 start_server.py --tp 2
|
||||||
|
|
||||||
# Skip dependency installation (if already installed)
|
# Skip dependency installation (if already installed)
|
||||||
python3 start_server.py --skip-deps
|
python3 start_server.py --skip-deps
|
||||||
"""
|
"""
|
||||||
@@ -141,6 +152,20 @@ Examples:
|
|||||||
action="store_true",
|
action="store_true",
|
||||||
help="Skip generating tokenizer files"
|
help="Skip generating tokenizer files"
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--tp", "--tensor-parallel-size",
|
||||||
|
type=int,
|
||||||
|
default=1,
|
||||||
|
dest="tensor_parallel_size",
|
||||||
|
help="Tensor parallel size: split one model across N GPUs (default: 1)"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--dp", "--data-parallel-size",
|
||||||
|
type=int,
|
||||||
|
default=1,
|
||||||
|
dest="data_parallel_size",
|
||||||
|
help="Data parallel size: run N independent model replicas for load balancing (default: 1)"
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
print("\n" + "="*60)
|
print("\n" + "="*60)
|
||||||
@@ -162,7 +187,9 @@ Examples:
|
|||||||
generate_tokenizer(model_path)
|
generate_tokenizer(model_path)
|
||||||
|
|
||||||
# Step 5: Start vLLM server
|
# Step 5: Start vLLM server
|
||||||
start_vllm_server(model_path, args.port)
|
start_vllm_server(model_path, args.port,
|
||||||
|
tensor_parallel_size=args.tensor_parallel_size,
|
||||||
|
data_parallel_size=args.data_parallel_size)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
Reference in New Issue
Block a user