From 1eb04f53a279e37327e8a57c6e294bbc7d50bf66 Mon Sep 17 00:00:00 2001
From: YingboHAO <3259482542@qq.com>
Date: Mon, 26 Jan 2026 07:26:29 +0000
Subject: [PATCH] Replace install_deps.sh with start_server.py one-click
 deployment

---
 docs/vibevoice-vllm-asr.md          |  62 ++++------
 vllm_plugin/scripts/install_deps.sh |  23 ----
 vllm_plugin/scripts/start_server.py | 169 ++++++++++++++++++++++++++++
 3 files changed, 192 insertions(+), 62 deletions(-)
 delete mode 100644 vllm_plugin/scripts/install_deps.sh
 create mode 100644 vllm_plugin/scripts/start_server.py

diff --git a/docs/vibevoice-vllm-asr.md b/docs/vibevoice-vllm-asr.md
index f32cab4..16cb2ca 100644
--- a/docs/vibevoice-vllm-asr.md
+++ b/docs/vibevoice-vllm-asr.md
@@ -15,70 +15,54 @@ Deploy VibeVoice ASR model as a high-performance API service using [vLLM](https:
 
 Using Official vLLM Docker Image (Recommended)
 
+1. Clone the repository
 ```bash
-# 1. Pull the official vLLM image
-docker pull vllm/vllm-openai:latest
+git clone https://github.com/microsoft/VibeVoice.git
+cd VibeVoice
+```
 
-# 2. Start an interactive container
-docker run -it --gpus all --name vibevoice-vllm \
+2. Launch the server (background mode)
+```bash
+docker run -d --gpus all --name vibevoice-vllm \
   --ipc=host \
   -p 8000:8000 \
   -e VIBEVOICE_FFMPEG_MAX_CONCURRENCY=64 \
   -e PYTORCH_ALLOC_CONF=expandable_segments:True \
-  -v /path/to/models:/models \
-  -v /path/to/VibeVoice:/app \
+  -v "$(pwd)":/app \
   -w /app \
   --entrypoint bash \
-  vllm/vllm-openai:latest
-
-# 3. Inside container: Install system dependencies
-bash vllm_plugin/scripts/install_deps.sh
-
-# 4. Inside container: Install VibeVoice with vLLM support
-pip install -e .[vllm]
-
-# 5. Inside container: (Optional) Generate tokenizer files if needed
-python3 -m vllm_plugin.tools.generate_tokenizer_files --output /models/your_model
-
-# 6. Inside container: Start vLLM server
-vllm serve /models/your_model \
-  --served-model-name vibevoice \
-  --trust-remote-code \
-  --dtype bfloat16 \
-  --max-num-seqs 64 \
-  --max-model-len 65536 \
-  --max-num-batched-tokens 32768 \
-  --gpu-memory-utilization 0.8 \
-  --enforce-eager \
-  --no-enable-prefix-caching \
-  --enable-chunked-prefill \
-  --chat-template-content-format openai \
-  --tensor-parallel-size 1 \
-  --allowed-local-media-path /app \
-  --port 8000
+  vllm/vllm-openai:latest \
+  -c "python3 /app/vllm_plugin/scripts/start_server.py"
 ```
 
-> **Note**: This approach allows you to switch models, adjust parameters, and debug issues without rebuilding the container.
+3. View logs
+```bash
+docker logs -f vibevoice-vllm
+```
 
+> **Note**:
+> - The `-d` flag runs the container in the background (detached mode).
+> - Use `docker stop vibevoice-vllm` to stop the service.
+> - The model is downloaded to the HuggingFace cache (`~/.cache/huggingface`) inside the container, so it is downloaded again if the container is removed.
 
-## 🚀 Quick Start
+## 🚀 Usage
 
 ### Test the API
 
 Once the vLLM server is running, test it with the provided script:
 
 ```bash
-# Run the test script (inside container)
-python3 vllm_plugin/tests/test_api.py /path/to/audio.wav
+# Run the test (use container path /app/...)
+docker exec -it vibevoice-vllm python3 vllm_plugin/tests/test_api.py /app/audio.wav
 ```
-
+> **Note**: The audio file must be inside the mounted directory (`/app` in the container). Copy your audio to the VibeVoice folder before testing.
 
 ### Environment Variables
 
 | Variable | Description | Default |
 |----------|-------------|---------|
 | `VIBEVOICE_FFMPEG_MAX_CONCURRENCY` | Maximum FFmpeg processes for audio decoding | `64` |
-| `PYTORCH_CUDA_ALLOC_CONF` | CUDA memory allocator config | `expandable_segments:True` |
+| `PYTORCH_ALLOC_CONF` | PyTorch memory allocator config | `expandable_segments:True` |
 
 
 
diff --git a/vllm_plugin/scripts/install_deps.sh b/vllm_plugin/scripts/install_deps.sh
deleted file mode 100644
index 1e62f45..0000000
--- a/vllm_plugin/scripts/install_deps.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-# Install system dependencies for VibeVoice vLLM plugin
-# Run this script inside the vLLM container before using the plugin
-
-set -e
-
-echo "Installing system dependencies for VibeVoice vLLM plugin..."
-
-# Update package list
-apt-get update
-
-# Install FFmpeg and audio processing libraries
-apt-get install -y \
-    ffmpeg \
-    libsndfile1 \
-    git
-
-echo "✅ System dependencies installed successfully!"
-echo ""
-echo "Next steps:"
-echo "  1. Install VibeVoice: pip install -e .[vllm]"
-echo "  2. Generate tokenizer files (if needed): python -m vllm_plugin.tools.generate_tokenizer_files -o /path/to/model"
-echo "  3. Start vLLM server: vllm serve <model_path> --trust-remote-code --enforce-eager --no-enable-prefix-caching"
diff --git a/vllm_plugin/scripts/start_server.py b/vllm_plugin/scripts/start_server.py
new file mode 100644
index 0000000..ae11b20
--- /dev/null
+++ b/vllm_plugin/scripts/start_server.py
@@ -0,0 +1,169 @@
+#!/usr/bin/env python3
+"""
+VibeVoice vLLM ASR Server Launcher
+
+One-click deployment script that handles:
+1. Installing system dependencies (FFmpeg, etc.)
+2. Installing VibeVoice Python package
+3. Downloading model from HuggingFace
+4. Generating tokenizer files
+5. Starting vLLM server
+
+Usage:
+    python3 start_server.py [--model MODEL_ID] [--port PORT] [--skip-deps] [--skip-tokenizer]
+"""
+
+import argparse
+import os
+import subprocess
+import sys
+
+
+def run_command(cmd: list[str], description: str, shell: bool = False) -> None:
+    """Print a banner for ``description``, then run ``cmd``; raises CalledProcessError on non-zero exit."""
+    import shlex  # local import: only the shell=True path needs it
+    # flush=True: without a TTY stdout is block-buffered and the banner would trail the child's output.
+    print(f"\n{'='*60}\n  {description}\n{'='*60}\n", flush=True)
+    if shell and not isinstance(cmd, str):
+        # A list under shell=True would execute only cmd[0] on POSIX, so build one quoted line.
+        cmd = shlex.join(cmd)
+    subprocess.run(cmd, shell=shell, check=True)
+
+
+def install_system_deps() -> None:
+    """Refresh the apt index, then install ffmpeg and libsndfile1 without interactive prompts."""
+    # The launcher may run detached (no TTY); keep debconf from waiting on a prompt such as tzdata's.
+    os.environ.setdefault("DEBIAN_FRONTEND", "noninteractive")
+    run_command(["apt-get", "update"], "Updating package list")
+    packages = ["ffmpeg", "libsndfile1"]
+    run_command(["apt-get", "install", "-y", *packages], "Installing FFmpeg and audio libraries")
+
+
+def install_vibevoice() -> None:
+    """Editable-install the checkout at /app with its ``vllm`` extra, using this interpreter's pip."""
+    pip_install = [sys.executable, "-m", "pip", "install"]
+    target = "/app[vllm]"  # /app is where the documented `docker run -v ...:/app` mounts the repo
+    banner = "Installing VibeVoice with vLLM support"
+    run_command([*pip_install, "-e", target], banner)
+
+
+def download_model(model_id: str) -> str:
+    """Fetch ``model_id`` from the HuggingFace Hub into the default cache.
+
+    Returns the local snapshot path reported by ``snapshot_download``.
+    """
+    rule = "=" * 60
+    # flush=True: without a TTY stdout is block-buffered, so the banner would otherwise
+    # show up after the download output (or be lost when the launcher later exec()s).
+    print(f"\n{rule}\n  Downloading model: {model_id}\n{rule}\n", flush=True)
+
+    import warnings
+    from huggingface_hub import snapshot_download
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")  # silences every warning raised during the download
+        model_path = snapshot_download(model_id)
+
+    print(f"\n{rule}\n  ✅ Model downloaded successfully!\n  📁 Path: {model_path}\n{rule}\n", flush=True)
+    return model_path
+
+
+def generate_tokenizer(model_path: str) -> None:
+    """Run ``vllm_plugin.tools.generate_tokenizer_files`` as a module, passing ``model_path`` as ``--output``."""
+    module = "vllm_plugin.tools.generate_tokenizer_files"
+    run_command(
+        [sys.executable, "-m", module, "--output", model_path],
+        "Generating tokenizer files",
+    )
+
+
+def start_vllm_server(model_path: str, port: int) -> None:
+    """Replace this process with ``vllm serve`` for ``model_path`` listening on ``port``.
+
+    Does not return on success; ``os.execvp`` raises ``OSError`` if ``vllm`` cannot be run.
+    """
+    # os.execvp() does not flush Python's stdio buffers; flush now or buffered log output is lost.
+    print(f"\n{'='*60}\n  Starting vLLM server on port {port}\n{'='*60}\n", flush=True)
+    sys.stderr.flush()
+
+    vllm_cmd = [
+        "vllm", "serve", model_path,
+        "--served-model-name", "vibevoice",
+        "--trust-remote-code",
+        "--dtype", "bfloat16",
+        "--max-num-seqs", "64",
+        "--max-model-len", "65536",
+        "--max-num-batched-tokens", "32768",
+        "--gpu-memory-utilization", "0.8",
+        "--enforce-eager", "--no-enable-prefix-caching", "--enable-chunked-prefill",
+        "--chat-template-content-format", "openai",
+        "--tensor-parallel-size", "1",
+        "--allowed-local-media-path", "/app",
+        "--port", str(port),
+    ]
+    os.execvp("vllm", vllm_cmd)
+
+
+def main():
+    """Parse command-line options and run the deployment steps in order.
+
+    On success the final step replaces this process with the vLLM server.
+    """
+    usage_examples = """
+Examples:
+    # Start with default settings
+    python3 start_server.py
+
+    # Use custom port
+    python3 start_server.py --port 8080
+
+    # Skip dependency installation (if already installed)
+    python3 start_server.py --skip-deps
+        """
+    cli = argparse.ArgumentParser(
+        description="VibeVoice vLLM ASR Server - One-Click Deployment",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=usage_examples,
+    )
+    cli.add_argument(
+        "--model", "-m",
+        default="microsoft/VibeVoice-ASR",
+        help="HuggingFace model ID (default: microsoft/VibeVoice-ASR)",
+    )
+    cli.add_argument(
+        "--port", "-p",
+        type=int, default=8000,
+        help="Server port (default: 8000)",
+    )
+    for flag, help_text in (
+        ("--skip-deps", "Skip installing system dependencies"),
+        ("--skip-tokenizer", "Skip generating tokenizer files"),
+    ):
+        cli.add_argument(flag, action="store_true", help=help_text)
+    options = cli.parse_args()
+
+    banner_rule = "=" * 60
+    print("\n" + banner_rule)
+    print("  VibeVoice vLLM ASR Server - One-Click Deployment")
+    print(banner_rule)
+
+    # Step 1: system packages via apt, unless --skip-deps was given.
+    if not options.skip_deps:
+        install_system_deps()
+
+    # Step 2: the VibeVoice Python package is installed on every run (--skip-deps does not cover it).
+    install_vibevoice()
+
+    # Step 3: turn the model ID into a local snapshot path.
+    local_model_dir = download_model(options.model)
+
+    # Step 4: tokenizer files, unless --skip-tokenizer was given.
+    if not options.skip_tokenizer:
+        generate_tokenizer(local_model_dir)
+
+    # Step 5: hand the process over to the vLLM server.
+    start_vllm_server(local_model_dir, options.port)
+
+
+if __name__ == "__main__":  # run the launcher only when executed as a script, not on import
+    main()