Merge pull request #213 from Damon-Salvetore/vllm-1

Replace install_deps.sh with start_server.py one-click deployment
2026-01-26 16:49:38 +08:00
parent d11d756b61 1eb04f53a2
commit c4ee4fe716
3 changed files with 192 additions and 62 deletions
@@ -15,70 +15,54 @@ Deploy VibeVoice ASR model as a high-performance API service using [vLLM](https:
 Using Official vLLM Docker Image (Recommended)
 1. Clone the repository
 ```bash
-# 1. Pull the official vLLM image
+git clone https://github.com/microsoft/VibeVoice.git
-docker pull vllm/vllm-openai:latest
+cd VibeVoice
 ```
-# 2. Start an interactive container
+2. Launch the server (background mode)
-docker run -it --gpus all --name vibevoice-vllm \
+```bash
 docker run -d --gpus all --name vibevoice-vllm \
  --ipc=host \
  -p 8000:8000 \
  -e VIBEVOICE_FFMPEG_MAX_CONCURRENCY=64 \
  -e PYTORCH_ALLOC_CONF=expandable_segments:True \
-  -v /path/to/models:/models \
+  -v $(pwd):/app \
  -v /path/to/VibeVoice:/app \
  -w /app \
  --entrypoint bash \
-  vllm/vllm-openai:latest
+  vllm/vllm-openai:latest \
-
+  -c "python3 /app/vllm_plugin/scripts/start_server.py"
 # 3. Inside container: Install system dependencies
 bash vllm_plugin/scripts/install_deps.sh
 # 4. Inside container: Install VibeVoice with vLLM support
 pip install -e .[vllm]
 # 5. Inside container: (Optional) Generate tokenizer files if needed
 python3 -m vllm_plugin.tools.generate_tokenizer_files --output /models/your_model
 # 6. Inside container: Start vLLM server
 vllm serve /models/your_model \
  --served-model-name vibevoice \
  --trust-remote-code \
  --dtype bfloat16 \
  --max-num-seqs 64 \
  --max-model-len 65536 \
  --max-num-batched-tokens 32768 \
  --gpu-memory-utilization 0.8 \
  --enforce-eager \
  --no-enable-prefix-caching \
  --enable-chunked-prefill \
  --chat-template-content-format openai \
  --tensor-parallel-size 1 \
  --allowed-local-media-path /app \
  --port 8000
 ```
-> **Note**: This approach allows you to switch models, adjust parameters, and debug issues without rebuilding the container.
+3. View logs
 ```bash
 docker logs -f vibevoice-vllm
 ```
 > **Note**: 
 > - The `-d` flag runs the container in background (detached mode)
 > - Use `docker stop vibevoice-vllm` to stop the service
 > - The model will be downloaded to the Hugging Face cache (`~/.cache/huggingface`) inside the container
-## 🚀 Quick Start
+## 🚀 Usage
 ### Test the API
 Once the vLLM server is running, test it with the provided script:
 ```bash
-# Run the test script (inside container)
+# Run the test (use container path /app/...)
-python3 vllm_plugin/tests/test_api.py /path/to/audio.wav
+docker exec -it vibevoice-vllm python3 vllm_plugin/tests/test_api.py /app/audio.wav
 ```
-
+> **Note**: The audio file must be inside the mounted directory (`/app` in the container). Copy your audio to the VibeVoice folder before testing.
 ### Environment Variables
 | Variable | Description | Default |
 |----------|-------------|---------|
 | `VIBEVOICE_FFMPEG_MAX_CONCURRENCY` | Maximum FFmpeg processes for audio decoding | `64` |
-| `PYTORCH_CUDA_ALLOC_CONF` | CUDA memory allocator config | `expandable_segments:True` |
+| `PYTORCH_ALLOC_CONF` | PyTorch memory allocator config | `expandable_segments:True` |
@@ -1,23 +0,0 @@
 #!/bin/bash
 # System-level prerequisites for the VibeVoice vLLM plugin.
 # Meant to be executed inside the vLLM container before the plugin is used.
 # Abort on the first failing command.
 set -e
 printf '%s\n' "Installing system dependencies for VibeVoice vLLM plugin..."
 # Refresh the apt package index
 apt-get update
 # FFmpeg, libsndfile and git in a single install call
 apt-get install -y ffmpeg libsndfile1 git
 # Success message followed by the manual follow-up steps, one argument per output line
 printf '%s\n' \
    "✅ System dependencies installed successfully!" \
    "" \
    "Next steps:" \
    "  1. Install VibeVoice: pip install -e .[vllm]" \
    "  2. Generate tokenizer files (if needed): python -m vllm_plugin.tools.generate_tokenizer_files -o /path/to/model" \
    "  3. Start vLLM server: vllm serve <model_path> --trust-remote-code --enforce-eager --no-enable-prefix-caching"
@@ -0,0 +1,169 @@
 #!/usr/bin/env python3
 """
 VibeVoice vLLM ASR Server Launcher
 One-click deployment script that handles:
 1. Installing system dependencies (FFmpeg, etc.)
 2. Installing VibeVoice Python package
 3. Downloading model from HuggingFace
 4. Generating tokenizer files
 5. Starting vLLM server
 Usage:
    python3 start_server.py [--model MODEL_ID] [--port PORT]
 """
 import argparse
 import os
 import subprocess
 import sys
 def run_command(cmd: list[str], description: str, shell: bool = False) -> None:
    """Print a banner describing a command, then run it.

    Args:
        cmd: Command and its arguments. With ``shell=True`` a plain string is
            also accepted and is handed to the shell unchanged.
        description: Human-readable text shown inside the banner.
        shell: When true, run the command through the system shell.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status (``check=True``).
    """
    import shlex

    rule = "=" * 60
    # Flush so the banner is written before the child process starts emitting
    # its own output, even when stdout is buffered (pipe or log file).
    print(f"\n{rule}\n  {description}\n{rule}\n", flush=True)
    if shell and not isinstance(cmd, str):
        # On POSIX, shell=True with a sequence executes only the first item as
        # the command string; the remaining items become arguments to the
        # shell itself. Quote and join so the whole command actually runs.
        cmd = shlex.join(cmd)
    subprocess.run(cmd, shell=shell, check=True)
 def install_system_deps() -> None:
    """Refresh the apt package index, then install ffmpeg and libsndfile1."""
    apt_steps = (
        (["apt-get", "update"], "Updating package list"),
        (
            ["apt-get", "install", "-y", "ffmpeg", "libsndfile1"],
            "Installing FFmpeg and audio libraries",
        ),
    )
    # Steps run in order; a failing step raises from run_command and stops the rest.
    for argv, banner in apt_steps:
        run_command(argv, banner)
 def install_vibevoice() -> None:
    """Pip-install the checkout at /app in editable mode with the ``vllm`` extra."""
    # Invoke pip through the running interpreter so the package lands in the
    # same environment that executes this script.
    pip_argv = [sys.executable, "-m", "pip", "install"]
    pip_argv += ["-e", "/app[vllm]"]
    run_command(pip_argv, "Installing VibeVoice with vLLM support")
 def download_model(model_id: str) -> str:
    """Fetch ``model_id`` with ``snapshot_download`` and return the local path.

    No cache directory is passed, so huggingface_hub's default location is used.
    """
    divider = "=" * 60
    print(f"\n{divider}")
    print(f"  Downloading model: {model_id}")
    print(f"{divider}\n")

    # Imported lazily, after the banner, exactly where the download happens.
    import warnings
    from huggingface_hub import snapshot_download

    # Every warning raised while the download runs is silenced.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        local_dir = snapshot_download(model_id)

    summary = (
        f"\n{divider}",
        "  ✅ Model downloaded successfully!",
        f"  📁 Path: {local_dir}",
        f"{divider}\n",
    )
    print("\n".join(summary))
    return local_dir
 def generate_tokenizer(model_path: str) -> None:
    """Run ``vllm_plugin.tools.generate_tokenizer_files`` with ``--output model_path``."""
    module_name = "vllm_plugin.tools.generate_tokenizer_files"
    argv = [sys.executable, "-m", module_name, "--output", model_path]
    run_command(argv, "Generating tokenizer files")
 def start_vllm_server(model_path: str, port: int) -> None:
    """Replace the current process with ``vllm serve`` for ``model_path``.

    Args:
        model_path: Model path handed to ``vllm serve``.
        port: TCP port passed as ``--port``.

    On success this function does not return, because ``os.execvp`` replaces
    the running Python process with the vLLM server.

    Raises:
        OSError: If the ``vllm`` executable cannot be executed.
    """
    print(f"\n{'='*60}")
    print(f"  Starting vLLM server on port {port}")
    print(f"{'='*60}\n")
    vllm_cmd = [
        "vllm", "serve", model_path,
        "--served-model-name", "vibevoice",
        "--trust-remote-code",
        "--dtype", "bfloat16",
        "--max-num-seqs", "64",
        "--max-model-len", "65536",
        "--max-num-batched-tokens", "32768",
        "--gpu-memory-utilization", "0.8",
        "--enforce-eager",
        "--no-enable-prefix-caching",
        "--enable-chunked-prefill",
        "--chat-template-content-format", "openai",
        "--tensor-parallel-size", "1",
        "--allowed-local-media-path", "/app",
        "--port", str(port),
    ]
    # exec* replaces the process image without flushing Python's stdio
    # buffers; anything still buffered (this banner and earlier progress
    # output) would be lost when stdout is a pipe or log file.
    sys.stdout.flush()
    sys.stderr.flush()
    os.execvp("vllm", vllm_cmd)
 def main():
    """Parse the command line and run the deployment steps in order.

    Steps: system packages (unless ``--skip-deps``), VibeVoice install, model
    download, tokenizer generation (unless ``--skip-tokenizer``), then the
    vLLM server launch.
    """
    usage_examples = """
 Examples:
    # Start with default settings
    python3 start_server.py
    # Use custom port
    python3 start_server.py --port 8080
    # Skip dependency installation (if already installed)
    python3 start_server.py --skip-deps
        """
    parser = argparse.ArgumentParser(
        description="VibeVoice vLLM ASR Server - One-Click Deployment",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )
    # (flags, keyword settings) pairs, registered in this order.
    option_specs = (
        (
            ("--model", "-m"),
            {
                "default": "microsoft/VibeVoice-ASR",
                "help": "HuggingFace model ID (default: microsoft/VibeVoice-ASR)",
            },
        ),
        (
            ("--port", "-p"),
            {"type": int, "default": 8000, "help": "Server port (default: 8000)"},
        ),
        (
            ("--skip-deps",),
            {"action": "store_true", "help": "Skip installing system dependencies"},
        ),
        (
            ("--skip-tokenizer",),
            {"action": "store_true", "help": "Skip generating tokenizer files"},
        ),
    )
    for flags, settings in option_specs:
        parser.add_argument(*flags, **settings)
    opts = parser.parse_args()

    rule = "=" * 60
    print(f"\n{rule}")
    print("  VibeVoice vLLM ASR Server - One-Click Deployment")
    print(rule)

    # 1. System packages (optional)
    if not opts.skip_deps:
        install_system_deps()
    # 2. VibeVoice Python package (always installed)
    install_vibevoice()
    # 3. Model download
    model_dir = download_model(opts.model)
    # 4. Tokenizer files (optional)
    if not opts.skip_tokenizer:
        generate_tokenizer(model_dir)
    # 5. Hand the process over to the vLLM server
    start_vllm_server(model_dir, opts.port)
 # Run the deployment only when this file is executed as a script, not on import.
 if __name__ == "__main__":
    main()