From 79470ff5768e17cbef6a3e1a93d1fd82ecc9a00d Mon Sep 17 00:00:00 2001
From: hydropix <5154373+hydropix@users.noreply.github.com>
Date: Sun, 7 Dec 2025 16:10:11 +0100
Subject: [PATCH] Fix: Remove unnecessary Path() conversion for HuggingFace
 model IDs

The model_path was being converted to a Path object and then back to a string
for from_pretrained() calls. This round trip is unnecessary since HuggingFace
accepts strings directly, and it breaks repo IDs on Windows, where
str(Path(...)) renders forward slashes as backslashes (e.g.,
"microsoft/VibeVoice-Realtime-0.5B" becomes
"microsoft\VibeVoice-Realtime-0.5B").

This fix:
- Keeps model_path as a string (no behavior change on Linux/macOS)
- Fixes Windows compatibility for HuggingFace repo IDs
- Removes redundant str() conversions
---
 demo/web/app.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/demo/web/app.py b/demo/web/app.py
index 9c2632d..887d62b 100644
--- a/demo/web/app.py
+++ b/demo/web/app.py
@@ -45,7 +45,8 @@ class StreamingTTSService:
         device: str = "cuda",
         inference_steps: int = 5,
     ) -> None:
-        self.model_path = Path(model_path)
+        # Keep model_path as string for HuggingFace repo IDs (Path() converts / to \ on Windows)
+        self.model_path = model_path
         self.inference_steps = inference_steps
         self.sample_rate = SAMPLE_RATE
 
@@ -66,7 +67,7 @@ class StreamingTTSService:
 
     def load(self) -> None:
         print(f"[startup] Loading processor from {self.model_path}")
-        self.processor = VibeVoiceStreamingProcessor.from_pretrained(str(self.model_path))
+        self.processor = VibeVoiceStreamingProcessor.from_pretrained(self.model_path)
 
         
         # Decide dtype & attention
@@ -86,7 +87,7 @@ class StreamingTTSService:
         # Load model
         try:
             self.model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
-                str(self.model_path),
+                self.model_path,
                 torch_dtype=load_dtype,
                 device_map=device_map,
                 attn_implementation=attn_impl_primary,
@@ -99,7 +100,7 @@ class StreamingTTSService:
                 print("Error loading the model. Trying to use SDPA. However, note that only flash_attention_2 has been fully tested, and using SDPA may result in lower audio quality.")
                 
                 self.model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
-                    str(self.model_path),
+                    self.model_path,
                     torch_dtype=load_dtype,
                     device_map=self.device,
                     attn_implementation='sdpa',