From 1eb04f53a279e37327e8a57c6e294bbc7d50bf66 Mon Sep 17 00:00:00 2001 From: YingboHAO <3259482542@qq.com> Date: Mon, 26 Jan 2026 07:26:29 +0000 Subject: [PATCH] Replace install_deps.sh with start_server.py one-click deployment --- docs/vibevoice-vllm-asr.md | 62 ++++------ vllm_plugin/scripts/install_deps.sh | 23 ---- vllm_plugin/scripts/start_server.py | 169 ++++++++++++++++++++++++++++ 3 files changed, 192 insertions(+), 62 deletions(-) delete mode 100644 vllm_plugin/scripts/install_deps.sh create mode 100644 vllm_plugin/scripts/start_server.py diff --git a/docs/vibevoice-vllm-asr.md b/docs/vibevoice-vllm-asr.md index f32cab4..16cb2ca 100644 --- a/docs/vibevoice-vllm-asr.md +++ b/docs/vibevoice-vllm-asr.md @@ -15,70 +15,54 @@ Deploy VibeVoice ASR model as a high-performance API service using [vLLM](https: Using Official vLLM Docker Image (Recommended) +1. Clone the repository ```bash -# 1. Pull the official vLLM image -docker pull vllm/vllm-openai:latest +git clone https://github.com/microsoft/VibeVoice.git +cd VibeVoice +``` -# 2. Start an interactive container -docker run -it --gpus all --name vibevoice-vllm \ +2. Launch the server (background mode) +```bash +docker run -d --gpus all --name vibevoice-vllm \ --ipc=host \ -p 8000:8000 \ -e VIBEVOICE_FFMPEG_MAX_CONCURRENCY=64 \ -e PYTORCH_ALLOC_CONF=expandable_segments:True \ - -v /path/to/models:/models \ - -v /path/to/VibeVoice:/app \ + -v $(pwd):/app \ -w /app \ --entrypoint bash \ - vllm/vllm-openai:latest - -# 3. Inside container: Install system dependencies -bash vllm_plugin/scripts/install_deps.sh - -# 4. Inside container: Install VibeVoice with vLLM support -pip install -e .[vllm] - -# 5. Inside container: (Optional) Generate tokenizer files if needed -python3 -m vllm_plugin.tools.generate_tokenizer_files --output /models/your_model - -# 6. 
Inside container: Start vLLM server -vllm serve /models/your_model \ - --served-model-name vibevoice \ - --trust-remote-code \ - --dtype bfloat16 \ - --max-num-seqs 64 \ - --max-model-len 65536 \ - --max-num-batched-tokens 32768 \ - --gpu-memory-utilization 0.8 \ - --enforce-eager \ - --no-enable-prefix-caching \ - --enable-chunked-prefill \ - --chat-template-content-format openai \ - --tensor-parallel-size 1 \ - --allowed-local-media-path /app \ - --port 8000 + vllm/vllm-openai:latest \ + -c "python3 /app/vllm_plugin/scripts/start_server.py" ``` -> **Note**: This approach allows you to switch models, adjust parameters, and debug issues without rebuilding the container. +3. View logs +```bash +docker logs -f vibevoice-vllm +``` +> **Note**: +> - The `-d` flag runs the container in background (detached mode) +> - Use `docker stop vibevoice-vllm` to stop the service +> - The model will be downloaded to HuggingFace cache (`~/.cache/huggingface`) inside the container -## 🚀 Quick Start +## 🚀 Usage ### Test the API Once the vLLM server is running, test it with the provided script: ```bash -# Run the test script (inside container) -python3 vllm_plugin/tests/test_api.py /path/to/audio.wav +# Run the test (use container path /app/...) +docker exec -it vibevoice-vllm python3 vllm_plugin/tests/test_api.py /app/audio.wav ``` - +> **Note**: The audio file must be inside the mounted directory (`/app` in the container). Copy your audio to the VibeVoice folder before testing. 
### Environment Variables | Variable | Description | Default | |----------|-------------|---------| | `VIBEVOICE_FFMPEG_MAX_CONCURRENCY` | Maximum FFmpeg processes for audio decoding | `64` | -| `PYTORCH_CUDA_ALLOC_CONF` | CUDA memory allocator config | `expandable_segments:True` | +| `PYTORCH_ALLOC_CONF` | PyTorch memory allocator config | `expandable_segments:True` | diff --git a/vllm_plugin/scripts/install_deps.sh b/vllm_plugin/scripts/install_deps.sh deleted file mode 100644 index 1e62f45..0000000 --- a/vllm_plugin/scripts/install_deps.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -# Install system dependencies for VibeVoice vLLM plugin -# Run this script inside the vLLM container before using the plugin - -set -e - -echo "Installing system dependencies for VibeVoice vLLM plugin..." - -# Update package list -apt-get update - -# Install FFmpeg and audio processing libraries -apt-get install -y \ - ffmpeg \ - libsndfile1 \ - git - -echo "✅ System dependencies installed successfully!" -echo "" -echo "Next steps:" -echo " 1. Install VibeVoice: pip install -e .[vllm]" -echo " 2. Generate tokenizer files (if needed): python -m vllm_plugin.tools.generate_tokenizer_files -o /path/to/model" -echo " 3. Start vLLM server: vllm serve --trust-remote-code --enforce-eager --no-enable-prefix-caching" diff --git a/vllm_plugin/scripts/start_server.py b/vllm_plugin/scripts/start_server.py new file mode 100644 index 0000000..ae11b20 --- /dev/null +++ b/vllm_plugin/scripts/start_server.py @@ -0,0 +1,169 @@ +#!/usr/bin/env python3 +""" +VibeVoice vLLM ASR Server Launcher + +One-click deployment script that handles: +1. Installing system dependencies (FFmpeg, etc.) +2. Installing VibeVoice Python package +3. Downloading model from HuggingFace +4. Generating tokenizer files +5. 
"""
VibeVoice vLLM ASR Server Launcher

One-click deployment script that handles:
1. Installing system dependencies (FFmpeg, etc.)
2. Installing VibeVoice Python package
3. Downloading model from HuggingFace
4. Generating tokenizer files
5. Starting vLLM server

Usage:
    python3 start_server.py [--model MODEL_ID] [--port PORT]
"""

import argparse
import os
import shlex
import subprocess
import sys

# Directory the repository is expected to be mounted at. It is used both as
# the editable-install target and as vLLM's allowed local media path, so the
# two must stay in sync.
APP_DIR = "/app"

_BANNER_RULE = "=" * 60


def _print_banner(*lines: str) -> None:
    """Print a ruled banner containing *lines* and flush stdout.

    The flush keeps the banner ahead of any output written directly by a
    child process (or by the exec'ed server) when stdout is a pipe rather
    than a terminal.
    """
    print(f"\n{_BANNER_RULE}")
    for line in lines:
        print(f" {line}")
    print(f"{_BANNER_RULE}\n", flush=True)


def run_command(cmd: list[str], description: str, shell: bool = False) -> None:
    """Run a command with logging.

    Args:
        cmd: Command as an argument list. A single string is also accepted
            when ``shell`` is true.
        description: Human-readable text shown in the banner before running.
        shell: Run the command through the system shell.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
    """
    _print_banner(description)
    if shell and not isinstance(cmd, str) and os.name == "posix":
        # With shell=True on POSIX only the first list item is executed as
        # the command; the remaining items become positional parameters of
        # the shell itself. Join into one properly quoted command line.
        cmd = shlex.join(cmd)
    subprocess.run(cmd, shell=shell, check=True)


def install_system_deps() -> None:
    """Install system dependencies (FFmpeg, etc.) via apt-get."""
    run_command(["apt-get", "update"], "Updating package list")
    run_command(
        ["apt-get", "install", "-y", "ffmpeg", "libsndfile1"],
        "Installing FFmpeg and audio libraries",
    )


def install_vibevoice() -> None:
    """Install the VibeVoice package from ``APP_DIR`` in editable mode."""
    run_command(
        [sys.executable, "-m", "pip", "install", "-e", f"{APP_DIR}[vllm]"],
        "Installing VibeVoice with vLLM support",
    )


def download_model(model_id: str) -> str:
    """Download a model from HuggingFace using the default cache.

    Args:
        model_id: HuggingFace repository ID.

    Returns:
        The local path returned by ``snapshot_download``.
    """
    _print_banner(f"Downloading model: {model_id}")

    # Imported lazily, as in the original script, so that merely importing
    # this module does not require huggingface_hub.
    import warnings
    from huggingface_hub import snapshot_download

    # Intended to hide huggingface_hub deprecation noise. NOTE: "ignore"
    # without a category silences every warning raised inside this block.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        model_path = snapshot_download(model_id)

    _print_banner(
        "✅ Model downloaded successfully!",
        f"📁 Path: {model_path}",
    )
    return model_path


def generate_tokenizer(model_path: str) -> None:
    """Generate tokenizer files for the model, writing into *model_path*."""
    run_command(
        [
            sys.executable, "-m", "vllm_plugin.tools.generate_tokenizer_files",
            "--output", model_path,
        ],
        "Generating tokenizer files",
    )


def build_vllm_command(model_path: str, port: int) -> list[str]:
    """Return the ``vllm serve`` argument vector for *model_path* and *port*."""
    return [
        "vllm", "serve", model_path,
        "--served-model-name", "vibevoice",
        "--trust-remote-code",
        "--dtype", "bfloat16",
        "--max-num-seqs", "64",
        "--max-model-len", "65536",
        "--max-num-batched-tokens", "32768",
        "--gpu-memory-utilization", "0.8",
        "--enforce-eager",
        "--no-enable-prefix-caching",
        "--enable-chunked-prefill",
        "--chat-template-content-format", "openai",
        "--tensor-parallel-size", "1",
        "--allowed-local-media-path", APP_DIR,
        "--port", str(port),
    ]


def start_vllm_server(model_path: str, port: int) -> None:
    """Start vLLM server (replaces current process).

    Does not return on success.

    Raises:
        OSError: If the ``vllm`` executable cannot be executed.
    """
    _print_banner(f"Starting vLLM server on port {port}")
    vllm_cmd = build_vllm_command(model_path, port)

    # exec* replaces the process image without flushing Python's buffered
    # streams, so anything still buffered would be lost.
    sys.stdout.flush()
    sys.stderr.flush()
    os.execvp(vllm_cmd[0], vllm_cmd)


def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the launcher."""
    parser = argparse.ArgumentParser(
        description="VibeVoice vLLM ASR Server - One-Click Deployment",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Start with default settings
  python3 start_server.py

  # Use custom port
  python3 start_server.py --port 8080

  # Skip dependency installation (if already installed)
  python3 start_server.py --skip-deps
        """,
    )
    parser.add_argument(
        "--model", "-m",
        default="microsoft/VibeVoice-ASR",
        help="HuggingFace model ID (default: microsoft/VibeVoice-ASR)",
    )
    parser.add_argument(
        "--port", "-p",
        type=int,
        default=8000,
        help="Server port (default: 8000)",
    )
    parser.add_argument(
        "--skip-deps",
        action="store_true",
        help="Skip installing system dependencies",
    )
    parser.add_argument(
        "--skip-tokenizer",
        action="store_true",
        help="Skip generating tokenizer files",
    )
    return parser


def main(argv=None) -> None:
    """Run the full deployment pipeline and hand over to the vLLM server.

    Args:
        argv: Optional argument list; defaults to ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)

    print("\n" + _BANNER_RULE)
    print(" VibeVoice vLLM ASR Server - One-Click Deployment")
    print(_BANNER_RULE, flush=True)

    # Step 1: Install system dependencies
    if not args.skip_deps:
        install_system_deps()

    # Step 2: Install VibeVoice
    install_vibevoice()

    # Step 3: Download model
    model_path = download_model(args.model)

    # Step 4: Generate tokenizer files
    if not args.skip_tokenizer:
        generate_tokenizer(model_path)

    # Step 5: Start vLLM server (replaces this process)
    start_vllm_server(model_path, args.port)


if __name__ == "__main__":
    main()