Merge pull request #213 from Damon-Salvetore/vllm-1
Replace install_deps.sh with start_server.py one-click deployment
This commit is contained in:
+23
-39
@@ -15,70 +15,54 @@ Deploy VibeVoice ASR model as a high-performance API service using [vLLM](https:
|
||||
|
||||
Using Official vLLM Docker Image (Recommended)
|
||||
|
||||
1. Clone the repository
|
||||
```bash
|
||||
# 1. Pull the official vLLM image
|
||||
docker pull vllm/vllm-openai:latest
|
||||
git clone https://github.com/microsoft/VibeVoice.git
|
||||
cd VibeVoice
|
||||
```
|
||||
|
||||
# 2. Start an interactive container
|
||||
docker run -it --gpus all --name vibevoice-vllm \
|
||||
2. Launch the server (background mode)
|
||||
```bash
|
||||
docker run -d --gpus all --name vibevoice-vllm \
|
||||
--ipc=host \
|
||||
-p 8000:8000 \
|
||||
-e VIBEVOICE_FFMPEG_MAX_CONCURRENCY=64 \
|
||||
-e PYTORCH_ALLOC_CONF=expandable_segments:True \
|
||||
-v /path/to/models:/models \
|
||||
-v /path/to/VibeVoice:/app \
|
||||
-v $(pwd):/app \
|
||||
-w /app \
|
||||
--entrypoint bash \
|
||||
vllm/vllm-openai:latest
|
||||
|
||||
# 3. Inside container: Install system dependencies
|
||||
bash vllm_plugin/scripts/install_deps.sh
|
||||
|
||||
# 4. Inside container: Install VibeVoice with vLLM support
|
||||
pip install -e .[vllm]
|
||||
|
||||
# 5. Inside container: (Optional) Generate tokenizer files if needed
|
||||
python3 -m vllm_plugin.tools.generate_tokenizer_files --output /models/your_model
|
||||
|
||||
# 6. Inside container: Start vLLM server
|
||||
vllm serve /models/your_model \
|
||||
--served-model-name vibevoice \
|
||||
--trust-remote-code \
|
||||
--dtype bfloat16 \
|
||||
--max-num-seqs 64 \
|
||||
--max-model-len 65536 \
|
||||
--max-num-batched-tokens 32768 \
|
||||
--gpu-memory-utilization 0.8 \
|
||||
--enforce-eager \
|
||||
--no-enable-prefix-caching \
|
||||
--enable-chunked-prefill \
|
||||
--chat-template-content-format openai \
|
||||
--tensor-parallel-size 1 \
|
||||
--allowed-local-media-path /app \
|
||||
--port 8000
|
||||
vllm/vllm-openai:latest \
|
||||
-c "python3 /app/vllm_plugin/scripts/start_server.py"
|
||||
```
|
||||
|
||||
> **Note**: This approach allows you to switch models, adjust parameters, and debug issues without rebuilding the container.
|
||||
3. View logs
|
||||
```bash
|
||||
docker logs -f vibevoice-vllm
|
||||
```
|
||||
|
||||
> **Note**:
|
||||
> - The `-d` flag runs the container in background (detached mode)
|
||||
> - Use `docker stop vibevoice-vllm` to stop the service
|
||||
> - The model will be downloaded to the Hugging Face cache (`~/.cache/huggingface`) inside the container
|
||||
|
||||
## 🚀 Quick Start
|
||||
## 🚀 Usage
|
||||
|
||||
### Test the API
|
||||
|
||||
Once the vLLM server is running, test it with the provided script:
|
||||
|
||||
```bash
|
||||
# Run the test script (inside container)
|
||||
python3 vllm_plugin/tests/test_api.py /path/to/audio.wav
|
||||
# Run the test (use container path /app/...)
|
||||
docker exec -it vibevoice-vllm python3 vllm_plugin/tests/test_api.py /app/audio.wav
|
||||
```
|
||||
|
||||
> **Note**: The audio file must be inside the mounted directory (`/app` in the container). Copy your audio to the VibeVoice folder before testing.
|
||||
|
||||
### Environment Variables
|
||||
|
||||
| Variable | Description | Default |
|
||||
|----------|-------------|---------|
|
||||
| `VIBEVOICE_FFMPEG_MAX_CONCURRENCY` | Maximum FFmpeg processes for audio decoding | `64` |
|
||||
| `PYTORCH_CUDA_ALLOC_CONF` | CUDA memory allocator config | `expandable_segments:True` |
|
||||
| `PYTORCH_ALLOC_CONF` | PyTorch memory allocator config | `expandable_segments:True` |
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,23 +0,0 @@
|
||||
#!/bin/bash
# Installs the OS-level packages used by the VibeVoice vLLM plugin.
# Meant to be executed inside the vLLM container before the plugin is used.

# Abort on the first failing command.
set -e

echo "Installing system dependencies for VibeVoice vLLM plugin..."

# Refresh the package index before installing.
apt-get update

# FFmpeg, libsndfile and git, installed in a single apt transaction.
packages=(ffmpeg libsndfile1 git)
apt-get install -y "${packages[@]}"

echo "✅ System dependencies installed successfully!"

# Follow-up instructions; the leading blank line separates them from the
# success message above.
cat <<'EOF'

Next steps:
 1. Install VibeVoice: pip install -e .[vllm]
 2. Generate tokenizer files (if needed): python -m vllm_plugin.tools.generate_tokenizer_files -o /path/to/model
 3. Start vLLM server: vllm serve <model_path> --trust-remote-code --enforce-eager --no-enable-prefix-caching
EOF
|
||||
@@ -0,0 +1,169 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
VibeVoice vLLM ASR Server Launcher
|
||||
|
||||
One-click deployment script that handles:
|
||||
1. Installing system dependencies (FFmpeg, etc.)
|
||||
2. Installing VibeVoice Python package
|
||||
3. Downloading model from HuggingFace
|
||||
4. Generating tokenizer files
|
||||
5. Starting vLLM server
|
||||
|
||||
Usage:
|
||||
python3 start_server.py [--model MODEL_ID] [--port PORT]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
|
||||
def run_command(cmd: list[str], description: str, shell: bool = False) -> None:
    """Print a banner describing a command, then run it to completion.

    Args:
        cmd: Command and its arguments as a list of tokens. A single
            pre-built command string is also accepted when ``shell`` is true.
        description: Human-readable text shown inside the banner.
        shell: When true, execute the command through the system shell.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status (``check=True``).
    """
    rule = "=" * 60
    # Flush explicitly: when stdout is a pipe rather than a TTY, Python
    # block-buffers it, and the child process (which writes straight to the
    # file descriptor) would otherwise print before its own banner.
    print(f"\n{rule}\n {description}\n{rule}\n", flush=True)

    if shell and not isinstance(cmd, str):
        # With shell=True on POSIX only the first list item is treated as the
        # command string; the remaining items become arguments to the shell
        # itself. Collapse the tokens into one safely quoted string instead.
        import shlex

        cmd = shlex.join(cmd)

    subprocess.run(cmd, shell=shell, check=True)
|
||||
|
||||
|
||||
def install_system_deps() -> None:
    """Refresh the apt package index, then install ffmpeg and libsndfile1."""
    apt_steps = (
        (["apt-get", "update"], "Updating package list"),
        (
            ["apt-get", "install", "-y", "ffmpeg", "libsndfile1"],
            "Installing FFmpeg and audio libraries",
        ),
    )
    # Order matters: the index must be refreshed before the install runs.
    for argv, label in apt_steps:
        run_command(argv, label)
|
||||
|
||||
|
||||
def install_vibevoice() -> None:
    """Editable-install the checkout at ``/app`` together with its ``vllm`` extra."""
    # Invoke pip through the running interpreter so the package lands in the
    # same environment this script is executing in.
    pip_install = [sys.executable, "-m", "pip", "install", "-e", "/app[vllm]"]
    run_command(pip_install, "Installing VibeVoice with vLLM support")
|
||||
|
||||
|
||||
def download_model(model_id: str) -> str:
    """Download a model snapshot via ``huggingface_hub`` and return its path.

    Args:
        model_id: Model identifier passed to ``snapshot_download``.

    Returns:
        The local path returned by ``snapshot_download``. No cache directory
        is passed, so the library's default location is used.
    """
    rule = "=" * 60
    # Flush so the banner is visible before the download starts, even when
    # stdout is a pipe and therefore block-buffered.
    print(f"\n{rule}\n Downloading model: {model_id}\n{rule}\n", flush=True)

    # Imported here rather than at module level.
    # NOTE(review): presumably so the script can start before its Python
    # dependencies are installed - confirm.
    import warnings

    from huggingface_hub import snapshot_download

    # Silence every warning raised during the download (the original intent
    # was to hide huggingface_hub deprecation warnings).
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        model_path = snapshot_download(model_id)

    print(
        f"\n{rule}\n"
        " ✅ Model downloaded successfully!\n"
        f" 📁 Path: {model_path}\n"
        f"{rule}\n",
        flush=True,
    )
    return model_path
|
||||
|
||||
|
||||
def generate_tokenizer(model_path: str) -> None:
    """Run the tokenizer-file generator module with ``model_path`` as its output."""
    # Launched as a module with the current interpreter.
    tokenizer_cmd = [sys.executable, "-m", "vllm_plugin.tools.generate_tokenizer_files"]
    tokenizer_cmd += ["--output", model_path]
    run_command(tokenizer_cmd, "Generating tokenizer files")
|
||||
|
||||
|
||||
def start_vllm_server(model_path: str, port: int) -> None:
    """Start the vLLM server, replacing the current process.

    On success this function never returns: ``os.execvp`` swaps the running
    Python process for the ``vllm`` executable found on ``PATH``.

    Args:
        model_path: Path of the model passed to ``vllm serve``.
        port: Port passed to ``vllm serve`` via ``--port``.

    Raises:
        OSError: If the ``vllm`` executable cannot be executed.
    """
    rule = "=" * 60
    print(f"\n{rule}\n Starting vLLM server on port {port}\n{rule}\n")

    vllm_cmd = [
        "vllm", "serve", model_path,
        "--served-model-name", "vibevoice",
        "--trust-remote-code",
        "--dtype", "bfloat16",
        "--max-num-seqs", "64",
        "--max-model-len", "65536",
        "--max-num-batched-tokens", "32768",
        "--gpu-memory-utilization", "0.8",
        "--enforce-eager",
        "--no-enable-prefix-caching",
        "--enable-chunked-prefill",
        "--chat-template-content-format", "openai",
        "--tensor-parallel-size", "1",
        "--allowed-local-media-path", "/app",
        "--port", str(port),
    ]

    # exec replaces the process image immediately and does not flush Python's
    # buffered file objects. Without these flushes, anything still sitting in
    # the stdout/stderr buffers (e.g. when output is piped rather than a TTY)
    # is silently lost.
    sys.stdout.flush()
    sys.stderr.flush()

    os.execvp("vllm", vllm_cmd)
|
||||
|
||||
|
||||
def main() -> None:
    """Parse command-line options and run the deployment steps in order.

    The steps are: install system dependencies (unless ``--skip-deps``),
    install the VibeVoice package, download the model, generate tokenizer
    files (unless ``--skip-tokenizer``), and finally start the vLLM server.
    """
    parser = argparse.ArgumentParser(
        description="VibeVoice vLLM ASR Server - One-Click Deployment",
        # Keep the epilog's line breaks exactly as written below.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Start with default settings
python3 start_server.py

# Use custom port
python3 start_server.py --port 8080

# Skip dependency installation (if already installed)
python3 start_server.py --skip-deps
"""
    )
    parser.add_argument(
        "--model", "-m",
        default="microsoft/VibeVoice-ASR",
        help="HuggingFace model ID (default: microsoft/VibeVoice-ASR)"
    )
    parser.add_argument(
        "--port", "-p",
        type=int,
        default=8000,
        help="Server port (default: 8000)"
    )
    parser.add_argument(
        "--skip-deps",
        action="store_true",
        help="Skip installing system dependencies"
    )
    parser.add_argument(
        "--skip-tokenizer",
        action="store_true",
        help="Skip generating tokenizer files"
    )
    args = parser.parse_args()

    print("\n" + "="*60)
    print(" VibeVoice vLLM ASR Server - One-Click Deployment")
    # Flush so the banner is emitted before any child-process output when
    # stdout is a pipe (block-buffered) rather than a TTY.
    print("="*60, flush=True)

    # Step 1: Install system dependencies
    if not args.skip_deps:
        install_system_deps()

    # Step 2: Install VibeVoice (always runs; --skip-deps does not cover it)
    install_vibevoice()

    # Step 3: Download model
    model_path = download_model(args.model)

    # Step 4: Generate tokenizer files
    if not args.skip_tokenizer:
        generate_tokenizer(model_path)

    # Step 5: Start vLLM server
    start_vllm_server(model_path, args.port)
|
||||
|
||||
|
||||
# Run the launcher only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user