# VibeVoice/vllm_plugin/scripts/start_server.py

#!/usr/bin/env python3
"""
VibeVoice vLLM ASR Server Launcher

One-click deployment script that handles:
1. Installing system dependencies (FFmpeg, etc.)
2. Installing VibeVoice Python package
3. Downloading model from HuggingFace
4. Generating tokenizer files
5. Starting vLLM server

Usage:
    python3 start_server.py [--model MODEL_ID] [--port PORT] [--skip-deps] [--skip-tokenizer]
"""

import argparse
import os
import subprocess
import sys


def run_command(cmd: list[str], description: str, shell: bool = False) -> None:
    """Print a banner for *description*, then run *cmd* and fail on error.

    Args:
        cmd: Program and arguments as an argv list. A ready-made command
            string is also accepted and is passed through unchanged.
        description: Human-readable label shown in the banner printed
            above the command's output.
        shell: When true, execute the command through the system shell.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
    """
    import shlex

    print(f"\n{'='*60}")
    print(f"  {description}")
    # Flush so the banner is emitted before the child writes to the same
    # stream; otherwise it can appear late when stdout is a pipe or file.
    print(f"{'='*60}\n", flush=True)

    if shell and not isinstance(cmd, str):
        # With shell=True on POSIX, only the first item of a sequence is
        # used as the command text; the remaining items become arguments to
        # the shell itself. Join (with quoting) so the whole argv is run.
        cmd = shlex.join(cmd)

    subprocess.run(cmd, shell=shell, check=True)


def install_system_deps() -> None:
    """Refresh the apt package index, then install ffmpeg and libsndfile1."""
    # Each entry is (argv, banner label); steps run in the listed order and
    # a failing step raises before the next one starts.
    apt_steps = (
        (["apt-get", "update"], "Updating package list"),
        (
            ["apt-get", "install", "-y", "ffmpeg", "libsndfile1"],
            "Installing FFmpeg and audio libraries",
        ),
    )
    for argv, label in apt_steps:
        run_command(argv, label)


def install_vibevoice() -> None:
    """Pip-install the project at /app in editable mode with its vllm extra."""
    # Use the running interpreter's pip so the package lands in the same
    # environment this script executes in.
    pip_argv = [sys.executable, "-m", "pip", "install"]
    pip_argv += ["-e", "/app[vllm]"]
    run_command(pip_argv, "Installing VibeVoice with vLLM support")


def download_model(model_id: str) -> str:
    """Fetch *model_id* from HuggingFace and return the local snapshot path.

    The snapshot is stored wherever ``snapshot_download`` places it by
    default; no cache directory is passed explicitly.
    """
    rule = "=" * 60
    print(f"\n{rule}", f"  Downloading model: {model_id}", f"{rule}\n", sep="\n")

    # Imported lazily, after the banner, as in the rest of this launcher flow.
    import warnings
    from huggingface_hub import snapshot_download

    # Intended to hide huggingface_hub deprecation noise; note that the
    # "ignore" filter silences every warning category inside this block.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        local_snapshot = snapshot_download(model_id)

    print(
        f"\n{rule}",
        "  ✅ Model downloaded successfully!",
        f"  📁 Path: {local_snapshot}",
        f"{rule}\n",
        sep="\n",
    )
    return local_snapshot


def generate_tokenizer(model_path: str) -> None:
    """Run the tokenizer-file generator module with *model_path* as --output."""
    generator_module = "vllm_plugin.tools.generate_tokenizer_files"
    argv = [sys.executable, "-m", generator_module, "--output", model_path]
    run_command(argv, "Generating tokenizer files")


def start_vllm_server(model_path: str, port: int) -> None:
    """Start the vLLM server by replacing the current process.

    Args:
        model_path: Path (or identifier) handed to ``vllm serve``.
        port: Port number passed to the server via ``--port``.

    Raises:
        OSError: If the ``vllm`` executable cannot be launched. On success
            this function never returns, because the process image is
            replaced.
    """
    print(f"\n{'='*60}")
    print(f"  Starting vLLM server on port {port}")
    print(f"{'='*60}\n")

    vllm_cmd = [
        "vllm", "serve", model_path,
        "--served-model-name", "vibevoice",
        "--trust-remote-code",
        "--dtype", "bfloat16",
        "--max-num-seqs", "64",
        "--max-model-len", "65536",
        # Optional flag, currently disabled:
        # "--max-num-batched-tokens", "32768",
        "--gpu-memory-utilization", "0.8",
        # Optional flag, currently disabled:
        # "--enforce-eager",
        "--no-enable-prefix-caching",
        "--enable-chunked-prefill",
        "--chat-template-content-format", "openai",
        "--tensor-parallel-size", "1",
        "--allowed-local-media-path", "/app",
        "--port", str(port),
    ]

    # exec* replaces the process immediately and does not flush Python's
    # buffered streams, so anything still buffered (such as the banner
    # above when stdout is a pipe or file) would be lost without this.
    sys.stdout.flush()
    sys.stderr.flush()

    os.execvp(vllm_cmd[0], vllm_cmd)


def main():
    """Parse CLI options and run the deployment steps in order.

    Steps: optional system-dependency install, VibeVoice install, model
    download, optional tokenizer generation, then the vLLM server launch.
    """
    title = "VibeVoice vLLM ASR Server - One-Click Deployment"
    usage_examples = (
        "\n"
        "Examples:\n"
        "    # Start with default settings\n"
        "    python3 start_server.py\n"
        "\n"
        "    # Use custom port\n"
        "    python3 start_server.py --port 8080\n"
        "\n"
        "    # Skip dependency installation (if already installed)\n"
        "    python3 start_server.py --skip-deps\n"
        "        "
    )

    cli = argparse.ArgumentParser(
        description=title,
        # Raw formatter keeps the hand-laid-out examples intact in --help.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )
    cli.add_argument(
        "--model", "-m",
        default="microsoft/VibeVoice-ASR",
        help="HuggingFace model ID (default: microsoft/VibeVoice-ASR)",
    )
    cli.add_argument(
        "--port", "-p",
        type=int,
        default=8000,
        help="Server port (default: 8000)",
    )
    cli.add_argument(
        "--skip-deps",
        action="store_true",
        help="Skip installing system dependencies",
    )
    cli.add_argument(
        "--skip-tokenizer",
        action="store_true",
        help="Skip generating tokenizer files",
    )
    opts = cli.parse_args()

    rule = "=" * 60
    print("\n" + rule)
    print("  " + title)
    print(rule)

    # 1. System packages (skippable via --skip-deps).
    if not opts.skip_deps:
        install_system_deps()

    # 2. The VibeVoice package itself is always (re)installed.
    install_vibevoice()

    # 3. Fetch the model; later steps work on the returned local path.
    local_model = download_model(opts.model)

    # 4. Tokenizer files (skippable via --skip-tokenizer).
    if not opts.skip_tokenizer:
        generate_tokenizer(local_model)

    # 5. Hand the process over to the vLLM server.
    start_vllm_server(local_model, opts.port)


# Run the deployment flow only when executed as a script, not on import.
if __name__ == "__main__":
    main()