From 79470ff5768e17cbef6a3e1a93d1fd82ecc9a00d Mon Sep 17 00:00:00 2001 From: hydropix <5154373+hydropix@users.noreply.github.com> Date: Sun, 7 Dec 2025 16:10:11 +0100 Subject: [PATCH] Fix: Remove unnecessary Path() conversion for HuggingFace model IDs The model_path was being converted to a Path object and then back to string for from_pretrained() calls. This is unnecessary since HuggingFace accepts strings directly, and causes issues on Windows where Path() converts forward slashes to backslashes (e.g., "microsoft/VibeVoice-Realtime-0.5B" becomes "microsoft\VibeVoice-Realtime-0.5B"). This fix: - Keeps model_path as a string (no behavior change on Linux/macOS) - Fixes Windows compatibility for HuggingFace repo IDs - Removes redundant str() conversions --- demo/web/app.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/demo/web/app.py b/demo/web/app.py index 9c2632d..887d62b 100644 --- a/demo/web/app.py +++ b/demo/web/app.py @@ -45,7 +45,8 @@ class StreamingTTSService: device: str = "cuda", inference_steps: int = 5, ) -> None: - self.model_path = Path(model_path) + # Keep model_path as string for HuggingFace repo IDs (Path() converts / to \ on Windows) + self.model_path = model_path self.inference_steps = inference_steps self.sample_rate = SAMPLE_RATE @@ -66,7 +67,7 @@ class StreamingTTSService: def load(self) -> None: print(f"[startup] Loading processor from {self.model_path}") - self.processor = VibeVoiceStreamingProcessor.from_pretrained(str(self.model_path)) + self.processor = VibeVoiceStreamingProcessor.from_pretrained(self.model_path) # Decide dtype & attention @@ -86,7 +87,7 @@ class StreamingTTSService: # Load model try: self.model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained( - str(self.model_path), + self.model_path, torch_dtype=load_dtype, device_map=device_map, attn_implementation=attn_impl_primary, @@ -99,7 +100,7 @@ class StreamingTTSService: print("Error loading the model. Trying to use SDPA. However, note that only flash_attention_2 has been fully tested, and using SDPA may result in lower audio quality.") self.model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained( - str(self.model_path), + self.model_path, torch_dtype=load_dtype, device_map=self.device, attn_implementation='sdpa',