more experimental voices
This commit is contained in:
@@ -175,3 +175,4 @@ tags
|
|||||||
/checkpoints/
|
/checkpoints/
|
||||||
exp
|
exp
|
||||||
.gradio/
|
.gradio/
|
||||||
|
experimental_voices
|
||||||
@@ -25,7 +25,10 @@
|
|||||||
|
|
||||||
<strong>2025-12-03: 📣 We open-sourced <a href="docs/vibevoice-realtime-0.5b.md"><strong>VibeVoice‑Realtime‑0.5B</strong></a>, a real‑time text‑to‑speech model that supports streaming text input and robust long-form speech generation. Try it on [Colab](https://colab.research.google.com/github/microsoft/VibeVoice/blob/main/demo/vibevoice_realtime_colab.ipynb).</strong>
|
<strong>2025-12-03: 📣 We open-sourced <a href="docs/vibevoice-realtime-0.5b.md"><strong>VibeVoice‑Realtime‑0.5B</strong></a>, a real‑time text‑to‑speech model that supports streaming text input and robust long-form speech generation. Try it on [Colab](https://colab.research.google.com/github/microsoft/VibeVoice/blob/main/demo/vibevoice_realtime_colab.ipynb).</strong>
|
||||||
|
|
||||||
<strong>2025-12-09: 📣 We’ve added experimental speakers in nine languages (DE, FR, IT, JP, KR, NL, PL, PT, ES) for exploration—welcome to try them out and share your feedback.</strong>
|
<strong>2025-12-09: 📣 We added experimental speakers in nine languages (DE, FR, IT, JP, KR, NL, PL, PT, ES) for exploration. You are welcome to try them out and share your feedback.</strong>
|
||||||
|
|
||||||
|
<strong>2025-12-16: 📣 We added more experimental speakers for exploration, including multilingual voices and 11 English voices in distinct styles. [Try it](docs/vibevoice-realtime-0.5b.md#optional-more-experimental-voices). More speaker types will be added over time.</strong>
|
||||||
|
|
||||||
|
|
||||||
To mitigate deepfake risks and ensure low latency for the first speech chunk, voice prompts are provided in an embedded format. For users requiring voice customization, please reach out to our team. We will also be expanding the range of available speakers.
|
To mitigate deepfake risks and ensure low latency for the first speech chunk, voice prompts are provided in an embedded format. For users requiring voice customization, please reach out to our team. We will also be expanding the range of available speakers.
|
||||||
<br>
|
<br>
|
||||||
|
|||||||
@@ -0,0 +1,48 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
set -e
|
||||||
|
|
||||||
|
echo "[INFO] Starting download of experimental voices..."
|
||||||
|
|
||||||
|
# Absolute path of the current script directory
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
|
||||||
|
# Target directory relative to this script location
|
||||||
|
TARGET_DIR="$SCRIPT_DIR/voices/streaming_model/experimental_voices"
|
||||||
|
|
||||||
|
echo "[INFO] Script directory: $SCRIPT_DIR"
|
||||||
|
echo "[INFO] Target directory: $TARGET_DIR"
|
||||||
|
|
||||||
|
# Ensure the target directory exists
|
||||||
|
echo "[INFO] Creating target directory if needed..."
|
||||||
|
mkdir -p "$TARGET_DIR"
|
||||||
|
|
||||||
|
# List of archives and their URLs
|
||||||
|
FILES=(
|
||||||
|
"experimental_voices_de.tar.gz|https://github.com/user-attachments/files/24035887/experimental_voices_de.tar.gz"
|
||||||
|
"experimental_voices_fr.tar.gz|https://github.com/user-attachments/files/24035880/experimental_voices_fr.tar.gz"
|
||||||
|
"experimental_voices_jp.tar.gz|https://github.com/user-attachments/files/24035882/experimental_voices_jp.tar.gz"
|
||||||
|
"experimental_voices_kr.tar.gz|https://github.com/user-attachments/files/24035883/experimental_voices_kr.tar.gz"
|
||||||
|
"experimental_voices_pl.tar.gz|https://github.com/user-attachments/files/24035885/experimental_voices_pl.tar.gz"
|
||||||
|
"experimental_voices_pt.tar.gz|https://github.com/user-attachments/files/24035886/experimental_voices_pt.tar.gz"
|
||||||
|
"experimental_voices_sp.tar.gz|https://github.com/user-attachments/files/24035884/experimental_voices_sp.tar.gz"
|
||||||
|
"experimental_voices_en1.tar.gz|https://github.com/user-attachments/files/24189272/experimental_voices_en1.tar.gz"
|
||||||
|
"experimental_voices_en2.tar.gz|https://github.com/user-attachments/files/24189273/experimental_voices_en2.tar.gz"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Download, extract, and clean up each archive
|
||||||
|
for entry in "${FILES[@]}"; do
|
||||||
|
IFS="|" read -r FNAME URL <<< "$entry"
|
||||||
|
|
||||||
|
echo "[INFO] Downloading $FNAME ..."
|
||||||
|
wget -O "$FNAME" "$URL"
|
||||||
|
|
||||||
|
echo "[INFO] Extracting $FNAME ..."
|
||||||
|
tar -xzvf "$FNAME" -C "$TARGET_DIR"
|
||||||
|
|
||||||
|
echo "[INFO] Cleaning up $FNAME ..."
|
||||||
|
rm -f "$FNAME"
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "[SUCCESS] All experimental speakers installed successfully!"
|
||||||
|
echo "[SUCCESS] Speakers are located at:"
|
||||||
|
echo " $TARGET_DIR"
|
||||||
@@ -6,6 +6,7 @@ from typing import List, Tuple, Union, Dict, Any
|
|||||||
import time
|
import time
|
||||||
import torch
|
import torch
|
||||||
import copy
|
import copy
|
||||||
|
import glob
|
||||||
|
|
||||||
from vibevoice.modular.modeling_vibevoice_streaming_inference import VibeVoiceStreamingForConditionalGenerationInference
|
from vibevoice.modular.modeling_vibevoice_streaming_inference import VibeVoiceStreamingForConditionalGenerationInference
|
||||||
from vibevoice.processor.vibevoice_streaming_processor import VibeVoiceStreamingProcessor
|
from vibevoice.processor.vibevoice_streaming_processor import VibeVoiceStreamingProcessor
|
||||||
@@ -20,20 +21,8 @@ class VoiceMapper:
|
|||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.setup_voice_presets()
|
self.setup_voice_presets()
|
||||||
|
# for k, v in self.voice_presets.items():
|
||||||
# change name according to our preset voice file
|
# print(f"{k}: {v}")
|
||||||
new_dict = {}
|
|
||||||
for name, path in self.voice_presets.items():
|
|
||||||
|
|
||||||
if '_' in name:
|
|
||||||
name = name.split('_')[0]
|
|
||||||
|
|
||||||
if '-' in name:
|
|
||||||
name = name.split('-')[-1]
|
|
||||||
|
|
||||||
new_dict[name] = path
|
|
||||||
self.voice_presets.update(new_dict)
|
|
||||||
# print(list(self.voice_presets.keys()))
|
|
||||||
|
|
||||||
def setup_voice_presets(self):
|
def setup_voice_presets(self):
|
||||||
"""Setup voice presets by scanning the voices directory."""
|
"""Setup voice presets by scanning the voices directory."""
|
||||||
@@ -50,15 +39,13 @@ class VoiceMapper:
|
|||||||
self.voice_presets = {}
|
self.voice_presets = {}
|
||||||
|
|
||||||
# Get all .pt files in the voices directory
|
# Get all .pt files in the voices directory
|
||||||
pt_files = [f for f in os.listdir(voices_dir)
|
pt_files = glob.glob(os.path.join(voices_dir, "**", "*.pt"), recursive=True)
|
||||||
if f.lower().endswith('.pt') and os.path.isfile(os.path.join(voices_dir, f))]
|
|
||||||
|
|
||||||
# Create dictionary with filename (without extension) as key
|
# Create dictionary with filename (without extension) as key
|
||||||
for pt_file in pt_files:
|
for pt_file in pt_files:
|
||||||
# Remove .pt extension to get the name
|
# key: filename without extension
|
||||||
name = os.path.splitext(pt_file)[0]
|
name = os.path.splitext(os.path.basename(pt_file))[0].lower()
|
||||||
# Create full path
|
full_path = os.path.abspath(pt_file)
|
||||||
full_path = os.path.join(voices_dir, pt_file)
|
|
||||||
self.voice_presets[name] = full_path
|
self.voice_presets[name] = full_path
|
||||||
|
|
||||||
# Sort the voice presets alphabetically by name for better UI
|
# Sort the voice presets alphabetically by name for better UI
|
||||||
@@ -76,14 +63,19 @@ class VoiceMapper:
|
|||||||
def get_voice_path(self, speaker_name: str) -> str:
|
def get_voice_path(self, speaker_name: str) -> str:
|
||||||
"""Get voice file path for a given speaker name"""
|
"""Get voice file path for a given speaker name"""
|
||||||
# First try exact match
|
# First try exact match
|
||||||
|
speaker_name = speaker_name.lower()
|
||||||
if speaker_name in self.voice_presets:
|
if speaker_name in self.voice_presets:
|
||||||
return self.voice_presets[speaker_name]
|
return self.voice_presets[speaker_name]
|
||||||
|
|
||||||
# Try partial matching (case insensitive)
|
# Try partial matching (case insensitive)
|
||||||
speaker_lower = speaker_name.lower()
|
matched_path = None
|
||||||
for preset_name, path in self.voice_presets.items():
|
for preset_name, path in self.voice_presets.items():
|
||||||
if preset_name.lower() in speaker_lower or speaker_lower in preset_name.lower():
|
if preset_name.lower() in speaker_name or speaker_name in preset_name.lower():
|
||||||
return path
|
if matched_path is not None:
|
||||||
|
raise ValueError(f"Multiple voice presets match the speaker name '{speaker_name}', please make the speaker_name more specific.")
|
||||||
|
matched_path = path
|
||||||
|
if matched_path is not None:
|
||||||
|
return matched_path
|
||||||
|
|
||||||
# Default to first voice if no match found
|
# Default to first voice if no match found
|
||||||
default_voice = list(self.voice_presets.values())[0]
|
default_voice = list(self.voice_presets.values())[0]
|
||||||
@@ -229,6 +221,7 @@ def main():
|
|||||||
|
|
||||||
target_device = args.device if args.device != "cpu" else "cpu"
|
target_device = args.device if args.device != "cpu" else "cpu"
|
||||||
voice_sample = voice_mapper.get_voice_path(args.speaker_name)
|
voice_sample = voice_mapper.get_voice_path(args.speaker_name)
|
||||||
|
print(f"Using voice preset for {args.speaker_name}: {voice_sample}")
|
||||||
all_prefilled_outputs = torch.load(voice_sample, map_location=target_device, weights_only=False)
|
all_prefilled_outputs = torch.load(voice_sample, map_location=target_device, weights_only=False)
|
||||||
|
|
||||||
# Prepare inputs for the model
|
# Prepare inputs for the model
|
||||||
|
|||||||
@@ -106,6 +106,24 @@
|
|||||||
"print(\"✅ Downloaded model: microsoft/VibeVoice-Realtime-0.5B\")"
|
"print(\"✅ Downloaded model: microsoft/VibeVoice-Realtime-0.5B\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "dfe30d6f",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"[Optional] More experimental voices"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "bb33c9ce",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"!bash /content/VibeVoice/demo/download_experimental_voices.sh"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "pgKlV7153Ifi",
|
"id": "pgKlV7153Ifi",
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ import argparse, os, uvicorn
|
|||||||
def main():
|
def main():
|
||||||
p = argparse.ArgumentParser()
|
p = argparse.ArgumentParser()
|
||||||
p.add_argument("--port", type=int, default=3000)
|
p.add_argument("--port", type=int, default=3000)
|
||||||
p.add_argument("--model_path", type=str, default="default_model")
|
p.add_argument("--model_path", type=str, default="microsoft/VibeVoice-Realtime-0.5B")
|
||||||
p.add_argument("--device", type=str, default="cuda", choices=["cpu", "cuda", "mpx", "mps"])
|
p.add_argument("--device", type=str, default="cuda", choices=["cpu", "cuda", "mpx", "mps"])
|
||||||
p.add_argument("--reload", action="store_true", help="Reload the model or not")
|
p.add_argument("--reload", action="store_true", help="Reload the model or not")
|
||||||
args = p.parse_args()
|
args = p.parse_args()
|
||||||
|
|||||||
+1
-1
@@ -129,7 +129,7 @@ class StreamingTTSService:
|
|||||||
raise RuntimeError(f"Voices directory not found: {voices_dir}")
|
raise RuntimeError(f"Voices directory not found: {voices_dir}")
|
||||||
|
|
||||||
presets: Dict[str, Path] = {}
|
presets: Dict[str, Path] = {}
|
||||||
for pt_path in voices_dir.glob("*.pt"):
|
for pt_path in voices_dir.rglob("*.pt"):
|
||||||
presets[pt_path.stem] = pt_path
|
presets[pt_path.stem] = pt_path
|
||||||
|
|
||||||
if not presets:
|
if not presets:
|
||||||
|
|||||||
@@ -121,7 +121,11 @@ Tip: Just try it on [Colab](https://colab.research.google.com/github/microsoft/V
|
|||||||
python demo/realtime_model_inference_from_file.py --model_path microsoft/VibeVoice-Realtime-0.5B --txt_path demo/text_examples/1p_vibevoice.txt --speaker_name Carter
|
python demo/realtime_model_inference_from_file.py --model_path microsoft/VibeVoice-Realtime-0.5B --txt_path demo/text_examples/1p_vibevoice.txt --speaker_name Carter
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### [Optional] More experimental voices
|
||||||
|
Download additional experimental multilingual speakers before launching the demo or running inference from files.
|
||||||
|
```bash
|
||||||
|
bash demo/download_experimental_voices.sh
|
||||||
|
```
|
||||||
## Risks and limitations
|
## Risks and limitations
|
||||||
|
|
||||||
While efforts have been made to optimize it through various techniques, it may still produce outputs that are unexpected, biased, or inaccurate. VibeVoice inherits any biases, errors, or omissions produced by its base model (specifically, Qwen2.5 0.5b in this release).
|
While efforts have been made to optimize it through various techniques, it may still produce outputs that are unexpected, biased, or inaccurate. VibeVoice inherits any biases, errors, or omissions produced by its base model (specifically, Qwen2.5 0.5b in this release).
|
||||||
|
|||||||
Reference in New Issue
Block a user