diff --git a/vllm_plugin/scripts/start_server.py b/vllm_plugin/scripts/start_server.py index 375c29e..19e70d0 100644 --- a/vllm_plugin/scripts/start_server.py +++ b/vllm_plugin/scripts/start_server.py @@ -214,6 +214,14 @@ def start_dp_server(model_path: str, frontend_port: int, f"but only {num_gpus} available" ) + # Auto-tune per-worker env vars based on dp size + ffmpeg_concurrency = max( + 64, int(os.environ.get("VIBEVOICE_FFMPEG_MAX_CONCURRENCY", "64")) + ) + media_threads = max( + 8, int(os.environ.get("VLLM_MEDIA_LOADING_THREAD_COUNT", "8")) + ) + _install_nginx() # Assign internal ports: frontend_port + 100, +101, ... @@ -228,6 +236,8 @@ def start_dp_server(model_path: str, frontend_port: int, print(f" GPUs per replica: {gpus_per_replica}") print(f" Max Num Seqs: {max_num_seqs}") print(f" Max Model Len: {max_model_len}") + print(f" FFmpeg concurrency (per worker): {ffmpeg_concurrency}") + print(f" Media loading threads (per worker): {media_threads}") print(f"{'='*60}\n") # Write nginx config @@ -242,6 +252,8 @@ def start_dp_server(model_path: str, frontend_port: int, env = os.environ.copy() env["CUDA_VISIBLE_DEVICES"] = gpu_ids + env["VIBEVOICE_FFMPEG_MAX_CONCURRENCY"] = str(ffmpeg_concurrency) + env["VLLM_MEDIA_LOADING_THREAD_COUNT"] = str(media_threads) vllm_cmd = _build_vllm_cmd( model_path, port,