From cd945395d4d63e2fa76144fd019ee39e3152bd79 Mon Sep 17 00:00:00 2001 From: Jianwei Yu Date: Fri, 27 Mar 2026 09:16:05 +0000 Subject: [PATCH] =?UTF-8?q?feat:=20set=20nginx=20workers=20to=202=C3=97dp?= =?UTF-8?q?=20for=20optimal=20HTTP=20throughput?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Nginx worker_processes now defaults to 2×N (where N is the number of DP replicas) instead of 'auto'. This ensures enough HTTP handler processes to fully saturate all GPU backends under heavy concurrent load. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- docs/vibevoice-vllm-asr.md | 4 ++-- vllm_plugin/scripts/start_server.py | 13 ++++++++++--- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/docs/vibevoice-vllm-asr.md b/docs/vibevoice-vllm-asr.md index bde87c8..9ed738b 100644 --- a/docs/vibevoice-vllm-asr.md +++ b/docs/vibevoice-vllm-asr.md @@ -47,9 +47,9 @@ The launcher supports two types of GPU parallelism via `--tp` and `--dp` flags: ### Data Parallel (Recommended for scaling throughput) -Run 4 independent replicas on 4 GPUs with automatic load balancing behind a single port. +Run N independent replicas on N GPUs with automatic load balancing behind a single port. When `--dp N` is specified (N > 1), the launcher automatically starts N independent vLLM -processes behind an nginx reverse proxy for optimal throughput: +processes behind an nginx reverse proxy (2×N workers) for optimal throughput: ```bash docker run -d --gpus '"device=0,1,2,3"' --name vibevoice-vllm \ diff --git a/vllm_plugin/scripts/start_server.py b/vllm_plugin/scripts/start_server.py index 19e70d0..6c50377 100644 --- a/vllm_plugin/scripts/start_server.py +++ b/vllm_plugin/scripts/start_server.py @@ -146,11 +146,18 @@ def _install_nginx() -> None: ) -def _write_nginx_config(frontend_port: int, backend_ports: list[int]) -> str: - """Write nginx config for round-robin load balancing.""" +def _write_nginx_config(frontend_port: int, backend_ports: list[int], + num_workers: int = 0) -> str: + """Write nginx config for round-robin load balancing. + + Args: + num_workers: Number of nginx worker processes. 0 = auto (2 × num backends). + """ + if num_workers <= 0: + num_workers = len(backend_ports) * 2 backends = "\n".join(f" server 127.0.0.1:{p};" for p in backend_ports) config = textwrap.dedent(f"""\ - worker_processes auto; + worker_processes {num_workers}; worker_rlimit_nofile 65536; error_log /dev/stderr warn; pid /tmp/nginx.pid;