From a4add8e52fcf2fb28890d1e4a9a400ba67958ecf Mon Sep 17 00:00:00 2001
From: YingboHAO <3259482542@qq.com>
Date: Sun, 8 Feb 2026 09:58:19 +0000
Subject: [PATCH] vllm_plugin: report max audio tokens and profile with worst-case audio

---
 vllm_plugin/model.py                | 95 ++++++++++++++++++++++++-----
 vllm_plugin/scripts/start_server.py |  4 +-
 2 files changed, 82 insertions(+), 17 deletions(-)

diff --git a/vllm_plugin/model.py b/vllm_plugin/model.py
index 2ca3cd2..1c33355 100644
--- a/vllm_plugin/model.py
+++ b/vllm_plugin/model.py
@@ -530,28 +530,86 @@ class VibeVoiceProcessingInfo(BaseProcessingInfo):
     def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"audio": 1}
 
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
+        """Return the maximum number of audio tokens per item.
+
+        This tells vLLM's scheduler the upper bound so that
+        ``encoder_compute_budget`` is large enough for any audio length
+        the model can handle, preventing the silent scheduling deadlock
+        described in docs/max_num_batched_tokens_issue.md.
+
+        Formula: audio_tokens = ceil(audio_samples / compress_ratio) + 3
+        where +3 accounts for speech_start, speech_end, and newline tokens.
+        The result is capped at seq_len, because the model's context
+        window cannot hold more tokens than that.
+        """
+        hf_config = self.get_hf_config()
+
+        def _cfg(key: str, default):
+            if isinstance(hf_config, dict):
+                return hf_config.get(key, default)
+            return getattr(hf_config, key, default)
+
+        compress_ratio = int(_cfg("speech_tok_compress_ratio", 3200))
+        sample_rate = int(_cfg("target_sample_rate", 24000))
+
+        # Upper bound: 61 minutes of audio at the configured sample rate
+        max_audio_samples = 61 * 60 * sample_rate  # 87,840,000 at 24 kHz
+        max_audio_tokens = int(np.ceil(max_audio_samples / compress_ratio)) + 3
+
+        # Cannot exceed the model's context window
+        max_audio_tokens = min(max_audio_tokens, seq_len)
+
+        return {"audio": max_audio_tokens}
+
 
 class VibeVoiceDummyInputsBuilder(BaseDummyInputsBuilder[VibeVoiceProcessingInfo]):
     """
     Build dummy inputs for multimodal profiling.
     
-    Dummy text uses the raw <|AUDIO|> token(s). vLLM's processing pipeline will
-    expand each <|AUDIO|> via `VibeVoiceMultiModalProcessor._get_prompt_updates`
-    into the full ASR format:
-        [speech_start_id] + [speech_pad_id] * N + [speech_end_id] + [newline_id]
-    where N is derived from audio length / compress_ratio.
+    vLLM uses dummy inputs to:
+    1. Measure peak GPU activation memory → determine KV cache capacity
+    2. Warm up CUDA graphs
+    
+    The dummy audio length must be consistent with ``get_mm_max_tokens_per_item``
+    so that the memory estimate covers the worst-case (longest audio) scenario.
     """
     
+    def _get_max_audio_samples(self, seq_len: int) -> int:
+        """Return the dummy-audio length in samples for a ``seq_len`` context.
+
+        Token budget: min(ceil(61 min * sr / ratio) + 3, seq_len); the 3 wrapper
+        tokens are subtracted again so the audio expands to exactly that budget.
+        """
+        hf_config = self.info.get_hf_config()
+
+        def _cfg(key: str, default):
+            if isinstance(hf_config, dict):
+                return hf_config.get(key, default)
+            return getattr(hf_config, key, default)
+
+        compress_ratio = int(_cfg("speech_tok_compress_ratio", 3200))
+        sample_rate = int(_cfg("target_sample_rate", 24000))
+
+        # Upper bound: 61 minutes of audio (87,840,000 samples at 24 kHz)
+        max_audio_samples = 61 * 60 * sample_rate
+        max_tokens_from_audio = int(np.ceil(max_audio_samples / compress_ratio)) + 3
+        # Cannot exceed model context window
+        max_tokens = min(max_tokens_from_audio, seq_len)
+        # Only the pad tokens come from audio; keep at least one frame
+        return max(max_tokens - 3, 1) * compress_ratio
+
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_audios = mm_counts.get("audio", 0)
         if num_audios <= 0:
             return ""
         
-        # Get the audio token from our token info helper
         token_info = self.info.get_audio_token_info()
         audio_token = token_info["audio_token"]
-        
-        # Return ONLY the audio tokens - the HF processor adds bos/eos
         return audio_token * num_audios
 
     def get_dummy_mm_data(
@@ -560,16 +618,23 @@ class VibeVoiceDummyInputsBuilder(BaseDummyInputsBuilder[VibeVoiceProcessingInfo
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, Any] | None = None,
     ) -> Dict[str, Any]:
-        """Generate dummy audio data for profiling."""
-        feature_extractor = self.info.get_feature_extractor()
+        """Generate dummy audio data for profiling.
         
-        sampling_rate = feature_extractor.sampling_rate
-        audio_len = feature_extractor.chunk_length * sampling_rate
+        The audio length is derived from ``seq_len`` so that profiling
+        accurately measures memory for the longest audio the model can handle.
+        Supports ``AudioDummyOptions.length`` override for faster startup.
+        """
         num_audios = mm_counts.get("audio", 0)
-        
-        # Generate dummy audio as numpy arrays (what the HF processor expects)
+        max_audio_len = self._get_max_audio_samples(seq_len)
+
+        audio_overrides = mm_options.get("audio") if mm_options else None
+
         return {
-            "audio": [np.zeros(audio_len, dtype=np.float32) for _ in range(num_audios)]
+            "audio": self._get_dummy_audios(
+                length=max_audio_len,
+                num_audios=num_audios,
+                overrides=audio_overrides,
+            )
         }
 
     def get_dummy_processor_inputs(
diff --git a/vllm_plugin/scripts/start_server.py b/vllm_plugin/scripts/start_server.py
index ae11b20..7032391 100644
--- a/vllm_plugin/scripts/start_server.py
+++ b/vllm_plugin/scripts/start_server.py
@@ -90,9 +90,9 @@ def start_vllm_server(model_path: str, port: int) -> None:
         "--dtype", "bfloat16",
         "--max-num-seqs", "64",
         "--max-model-len", "65536",
-        "--max-num-batched-tokens", "32768",
+        # "--max-num-batched-tokens", "32768",
         "--gpu-memory-utilization", "0.8",
-        "--enforce-eager",
+        # "--enforce-eager",
         "--no-enable-prefix-caching",
         "--enable-chunked-prefill",
         "--chat-template-content-format", "openai",