fix: auto-tune per-worker env vars in DP mode

Pass VIBEVOICE_FFMPEG_MAX_CONCURRENCY and VLLM_MEDIA_LOADING_THREAD_COUNT explicitly to each worker subprocess so that every worker inherits the correct settings regardless of how the container is launched (with or without --skip-deps).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-27 07:57:49 +00:00
parent 3817f74d46
commit e6b65abb9b
1 changed files with 12 additions and 0 deletions
@@ -214,6 +214,14 @@ def start_dp_server(model_path: str, frontend_port: int,
        f"but only {num_gpus} available"
    )

+    # Per-worker env vars: take the value from the environment, but never go below the floor (64 / 8)
+    ffmpeg_concurrency = max(
+        64, int(os.environ.get("VIBEVOICE_FFMPEG_MAX_CONCURRENCY", "64"))
+    )
+    media_threads = max(
+        8, int(os.environ.get("VLLM_MEDIA_LOADING_THREAD_COUNT", "8"))
+    )
+
    _install_nginx()

    # Assign internal ports: frontend_port + 100, +101, ...
@@ -228,6 +236,8 @@ def start_dp_server(model_path: str, frontend_port: int,
    print(f"  GPUs per replica:  {gpus_per_replica}")
    print(f"  Max Num Seqs:      {max_num_seqs}")
    print(f"  Max Model Len:     {max_model_len}")
+    print(f"  FFmpeg concurrency (per worker): {ffmpeg_concurrency}")
+    print(f"  Media loading threads (per worker): {media_threads}")
    print(f"{'='*60}\n")

    # Write nginx config
@@ -242,6 +252,8 @@ def start_dp_server(model_path: str, frontend_port: int,
        
        env = os.environ.copy()
        env["CUDA_VISIBLE_DEVICES"] = gpu_ids
+        env["VIBEVOICE_FFMPEG_MAX_CONCURRENCY"] = str(ffmpeg_concurrency)
+        env["VLLM_MEDIA_LOADING_THREAD_COUNT"] = str(media_threads)

        vllm_cmd = _build_vllm_cmd(
            model_path, port,