576 lines
20 KiB
Python
576 lines
20 KiB
Python
#!/usr/bin/env python3
|
|
"""
Standalone tool to generate VibeVoice tokenizer files from a Qwen2.5 base tokenizer.

Downloads the base tokenizer files (default: Qwen/Qwen2.5-7B) and patches them
with VibeVoice-specific audio tokens and chat template modifications.

Usage:
    python generate_tokenizer_files.py --output /path/to/output [--compare /path/to/reference]
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import shutil
|
|
import tempfile
|
|
from typing import Optional, Dict, Any
|
|
|
|
|
|
# Qwen2.5 extended tokens (151646-151664).
# These are NOT in base Qwen2-7B but ARE in Qwen2.5 and Qwen2-VL.
# VibeVoice uses some of these for speech: object_ref_start/end, box_start.
# The names are listed in ID order; IDs are assigned consecutively from 151646.
_QWEN25_EXTENDED_TOKEN_NAMES = (
    "<|object_ref_start|>",  # 151646 - used as speech_start_id
    "<|object_ref_end|>",  # 151647 - used as speech_end_id
    "<|box_start|>",  # 151648 - used as speech_pad_id
    "<|box_end|>",  # 151649
    "<|quad_start|>",  # 151650
    "<|quad_end|>",  # 151651
    "<|vision_start|>",  # 151652
    "<|vision_end|>",  # 151653
    "<|vision_pad|>",  # 151654
    "<|image_pad|>",  # 151655
    "<|video_pad|>",  # 151656
    "<tool_call>",  # 151657
    "</tool_call>",  # 151658
    "<|fim_prefix|>",  # 151659
    "<|fim_middle|>",  # 151660
    "<|fim_suffix|>",  # 151661
    "<|fim_pad|>",  # 151662
    "<|repo_name|>",  # 151663
    "<|file_sep|>",  # 151664
)
QWEN25_EXTENDED_TOKENS = {
    name: token_id
    for token_id, name in enumerate(_QWEN25_EXTENDED_TOKEN_NAMES, start=151646)
}

# VibeVoice-specific audio tokens (IDs follow Qwen2.5's last token 151664).
VIBEVOICE_AUDIO_TOKENS = dict(
    zip(
        ("<|AUDIO|>", "<|audio_bos|>", "<|audio_eos|>"),
        range(151665, 151668),
    )
)

# All extended tokens (Qwen2.5 first, then VibeVoice), in ascending ID order.
ALL_EXTENDED_TOKENS = dict(QWEN25_EXTENDED_TOKENS)
ALL_EXTENDED_TOKENS.update(VIBEVOICE_AUDIO_TOKENS)
|
|
|
|
# Chat template with audio support (Jinja source, stored in tokenizer_config.json
# as "chat_template" when the downloaded template cannot be patched in place).
# Key modification: wherever message content is a list of parts, a part with
# part['type'] == 'audio' or 'audio_url' is rendered as the '<|AUDIO|>' token;
# 'text' parts render their text, and string content is emitted unchanged.
# Every tag below starts with the '-' left-trim marker, so the newlines between
# tags are not part of the rendered prompt.
VIBEVOICE_CHAT_TEMPLATE = """{%- if tools %}
{{- '<|im_start|>system\\n' }}
{%- if messages[0]['role'] == 'system' %}
{%- if messages[0]['content'] is string %}
{{- messages[0]['content'] }}
{%- else %}
{%- for part in messages[0]['content'] %}
{%- if part['type'] == 'text' %}
{{- part['text'] }}
{%- elif part['type'] == 'audio' or part['type'] == 'audio_url' %}
{{- '<|AUDIO|>' }}
{%- endif %}
{%- endfor %}
{%- endif %}
{%- else %}
{{- 'You are a helpful assistant.' }}
{%- endif %}
{{- "\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>" }}
{%- for tool in tools %}
{{- "\\n" }}
{{- tool | tojson }}
{%- endfor %}
{{- "\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\"name\\": <function-name>, \\"arguments\\": <args-json-object>}\\n</tool_call><|im_end|>\\n" }}
{%- else %}
{%- if messages[0]['role'] == 'system' %}
{{- '<|im_start|>system\\n' }}
{%- if messages[0]['content'] is string %}
{{- messages[0]['content'] }}
{%- else %}
{%- for part in messages[0]['content'] %}
{%- if part['type'] == 'text' %}
{{- part['text'] }}
{%- elif part['type'] == 'audio' or part['type'] == 'audio_url' %}
{{- '<|AUDIO|>' }}
{%- endif %}
{%- endfor %}
{%- endif %}
{{- '<|im_end|>\\n' }}
{%- else %}
{{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}
{%- endif %}
{%- endif %}
{%- for message in messages %}
{%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
{{- '<|im_start|>' + message.role + '\\n' }}
{%- if message['content'] is string %}
{{- message['content'] }}
{%- else %}
{%- for part in message['content'] %}
{%- if part['type'] == 'text' %}
{{- part['text'] }}
{%- elif part['type'] == 'audio' or part['type'] == 'audio_url' %}
{{- '<|AUDIO|>' }}
{%- endif %}
{%- endfor %}
{%- endif %}
{{- '<|im_end|>\\n' }}
{%- elif message.role == "assistant" %}
{{- '<|im_start|>' + message.role }}
{%- if message.content %}
{{- '\\n' + message.content }}
{%- endif %}
{%- for tool_call in message.tool_calls %}
{%- if tool_call.function is defined %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{{- '\\n<tool_call>\\n{"name": "' }}
{{- tool_call.name }}
{{- '", "arguments": ' }}
{{- tool_call.arguments | tojson }}
{{- '}\\n</tool_call>' }}
{%- endfor %}
{{- '<|im_end|>\\n' }}
{%- elif message.role == "tool" %}
{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
{{- '<|im_start|>user' }}
{%- endif %}
{{- '\\n<tool_response>\\n' }}
{{- message.content }}
{{- '\\n</tool_response>' }}
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
{{- '<|im_end|>\\n' }}
{%- endif %}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|im_start|>assistant\\n' }}
{%- endif %}"""
|
|
|
|
|
|
# Hub repo id used when --qwen-model is not given.
# Default to Qwen2.5-7B which has all the extended tokens (151646-151664).
DEFAULT_QWEN_MODEL = "Qwen/Qwen2.5-7B"
|
|
|
|
|
|
def download_qwen_tokenizer_files(output_dir: str, qwen_model: str = DEFAULT_QWEN_MODEL) -> None:
    """Download the base tokenizer files from a Qwen repo on the Hugging Face Hub.

    Fetches ``vocab.json``, ``merges.txt``, ``tokenizer.json`` and
    ``tokenizer_config.json`` into ``output_dir``, creating the directory
    if it does not exist.

    Args:
        output_dir: Directory the downloaded files are written to.
        qwen_model: Hub repo id to download the tokenizer files from.

    Raises:
        ImportError: If ``huggingface_hub`` is not installed. The original
            import failure is attached as ``__cause__``.
    """
    try:
        # Imported lazily so the rest of the module works without the package.
        from huggingface_hub import hf_hub_download
    except ImportError as exc:
        raise ImportError(
            "Please install huggingface_hub: pip install huggingface_hub"
        ) from exc

    files_to_download = (
        "vocab.json",
        "merges.txt",
        "tokenizer.json",
        "tokenizer_config.json",
    )

    os.makedirs(output_dir, exist_ok=True)

    for filename in files_to_download:
        print(f"Downloading {filename} from {qwen_model}...")
        hf_hub_download(
            repo_id=qwen_model,
            filename=filename,
            local_dir=output_dir,
            # Request real files rather than symlinks into the HF cache.
            # NOTE(review): newer huggingface_hub releases reportedly deprecate
            # this argument - confirm against the pinned version before removing.
            local_dir_use_symlinks=False,
        )
|
|
|
|
|
|
def patch_tokenizer_config(output_dir: str) -> None:
    """Patch ``tokenizer_config.json`` with VibeVoice audio tokens and chat template.

    The file in ``output_dir`` is rewritten in place with these changes:

    1. Every entry of ``ALL_EXTENDED_TOKENS`` whose ID is not already present
       is added to ``added_tokens_decoder``.
    2. The VibeVoice audio tokens are appended to ``additional_special_tokens``.
    3. ``chat_template`` gains an audio branch. An existing template is patched
       in place; if that is not possible, or the config has no template at
       all, ``VIBEVOICE_CHAT_TEMPLATE`` is used instead.
    4. ``model_max_length`` is set to 131072.
    5. ``add_bos_token`` is set to ``False`` when absent.

    Args:
        output_dir: Directory containing ``tokenizer_config.json``.

    Raises:
        FileNotFoundError: If ``tokenizer_config.json`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    import re

    config_path = os.path.join(output_dir, "tokenizer_config.json")

    with open(config_path, "r", encoding="utf-8") as f:
        config = json.load(f)

    # 1. Add ALL extended tokens to added_tokens_decoder (Qwen2.5 + VibeVoice audio).
    # These tokens are registered with "special": False; tool_call tokens are
    # NOT special in Qwen2.5, and the fim/repo tokens are treated the same way.
    non_special_tokens = frozenset((
        "<tool_call>", "</tool_call>", "<|fim_prefix|>",
        "<|fim_middle|>", "<|fim_suffix|>", "<|fim_pad|>",
        "<|repo_name|>", "<|file_sep|>",
    ))
    decoder = config.setdefault("added_tokens_decoder", {})
    for token, token_id in ALL_EXTENDED_TOKENS.items():
        decoder_key = str(token_id)  # JSON object keys are strings
        if decoder_key in decoder:
            continue  # keep whatever the base tokenizer already declares
        decoder[decoder_key] = {
            "content": token,
            "lstrip": False,
            "normalized": False,
            "rstrip": False,
            "single_word": False,
            "special": token not in non_special_tokens,
        }

    # 2. Add audio tokens to additional_special_tokens (without duplicates).
    additional_special = config.setdefault("additional_special_tokens", [])
    for token in VIBEVOICE_AUDIO_TOKENS:
        if token not in additional_special:
            additional_special.append(token)

    # 3. Make the chat_template audio-aware.
    chat_template = config.get("chat_template", "")
    if not chat_template:
        # Nothing to patch: without this fallback the generated config would
        # end up with no audio-capable template at all.
        print(" - No chat_template in base config, using predefined template")
        config["chat_template"] = VIBEVOICE_CHAT_TEMPLATE
    elif "<|AUDIO|>" not in chat_template:
        # Instead of replacing the template entirely, insert an audio branch
        # right after each place where a 'text' part is rendered.
        audio_handler = (
            "{%- elif part['type'] == 'audio' or part['type'] == 'audio_url' %}\n"
            "{{- '<|AUDIO|>' }}"
        )
        pattern = r"(\{\%- if part\['type'\] == 'text' \%\}\s*\n\s*\{\{- part\['text'\] \}\})"

        # A function replacement is inserted literally, so the Jinja text
        # needs no escaping for re.sub's replacement-template syntax.
        modified_template = re.sub(
            pattern,
            lambda match: match.group(1) + "\n " + audio_handler,
            chat_template,
        )

        if modified_template != chat_template:
            config["chat_template"] = modified_template
            print(" - Added audio support to existing chat_template")
        else:
            # Fallback: the template has no per-part text branch to extend.
            print(" - Warning: Could not patch existing template, using predefined template")
            config["chat_template"] = VIBEVOICE_CHAT_TEMPLATE

    # 4. Update model_max_length for long audio support.
    config["model_max_length"] = 131072

    # 5. Add add_bos_token if not present.
    config.setdefault("add_bos_token", False)

    # Write back.
    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(config, f, indent=2, ensure_ascii=False)

    print(f"Patched {config_path}")
|
|
|
|
|
|
def patch_tokenizer_json(output_dir: str) -> None:
    """Patch ``tokenizer.json`` with the extended Qwen2.5 and VibeVoice audio tokens.

    Every entry of ``ALL_EXTENDED_TOKENS`` whose ID is not already listed in
    the file's ``added_tokens`` array is appended to it, and the file is
    rewritten in place. The ``added_tokens`` array is created when missing.

    Args:
        output_dir: Directory containing ``tokenizer.json``.

    Raises:
        FileNotFoundError: If ``tokenizer.json`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    tokenizer_path = os.path.join(output_dir, "tokenizer.json")

    with open(tokenizer_path, "r", encoding="utf-8") as f:
        tokenizer = json.load(f)

    # setdefault guarantees the list exists before anything is appended to it.
    added_tokens = tokenizer.setdefault("added_tokens", [])

    # IDs already declared by the base tokenizer; these entries are left as-is.
    existing_ids = {token_entry.get("id") for token_entry in added_tokens}

    # Tokens registered with "special": False.
    non_special_tokens = frozenset((
        "<tool_call>", "</tool_call>", "<|fim_prefix|>",
        "<|fim_middle|>", "<|fim_suffix|>", "<|fim_pad|>",
        "<|repo_name|>", "<|file_sep|>",
    ))

    # Add ALL extended tokens (Qwen2.5 + VibeVoice audio).
    for token, token_id in ALL_EXTENDED_TOKENS.items():
        if token_id in existing_ids:
            continue
        added_tokens.append({
            "id": token_id,
            "content": token,
            "single_word": False,
            "lstrip": False,
            "rstrip": False,
            "normalized": False,
            "special": token not in non_special_tokens,
        })

    # Write back.
    with open(tokenizer_path, "w", encoding="utf-8") as f:
        json.dump(tokenizer, f, indent=2, ensure_ascii=False)

    print(f"Patched {tokenizer_path}")
|
|
|
|
|
|
def generate_added_tokens_json(output_dir: str) -> None:
    """Write ``added_tokens.json`` derived from ``tokenizer_config.json``.

    Reads ``added_tokens_decoder`` (ID string -> token info) from the config
    in ``output_dir`` and writes the inverse mapping, token content -> integer
    ID, to ``added_tokens.json`` in the same directory. Entries without a
    non-empty ``content`` are skipped.

    Args:
        output_dir: Directory holding the config and receiving the output file.
    """
    source_path = os.path.join(output_dir, "tokenizer_config.json")
    target_path = os.path.join(output_dir, "added_tokens.json")

    with open(source_path, "r", encoding="utf-8") as src:
        decoder = json.load(src).get("added_tokens_decoder", {})

    # Invert the decoder: the JSON keys are ID strings, the output wants ints.
    content_to_id = {
        info.get("content"): int(raw_id)
        for raw_id, info in decoder.items()
        if info.get("content")
    }

    with open(target_path, "w", encoding="utf-8") as dst:
        json.dump(content_to_id, dst, indent=2, ensure_ascii=False)

    print(f"Generated {target_path}")
|
|
|
|
|
|
def generate_special_tokens_map_json(output_dir: str) -> None:
    """Write ``special_tokens_map.json`` with the VibeVoice special tokens.

    ``additional_special_tokens`` lists the VibeVoice audio tokens first,
    followed by ``<|object_ref_start|>``, ``<|object_ref_end|>`` and
    ``<|box_start|>``. ``eos_token``, ``pad_token`` and ``unk_token`` are all
    set to ``<|endoftext|>``.

    Args:
        output_dir: Directory the file is written to.
    """

    def describe(token_text):
        # One additional_special_tokens entry; every flag is off.
        return {
            "content": token_text,
            "lstrip": False,
            "normalized": False,
            "rstrip": False,
            "single_word": False,
        }

    # Audio tokens first, then some commonly used special tokens.
    listed_tokens = list(VIBEVOICE_AUDIO_TOKENS)
    listed_tokens += ["<|object_ref_start|>", "<|object_ref_end|>", "<|box_start|>"]

    special_tokens_map = {
        "additional_special_tokens": [describe(text) for text in listed_tokens],
        "eos_token": "<|endoftext|>",
        "pad_token": "<|endoftext|>",
        "unk_token": "<|endoftext|>",
    }

    output_path = os.path.join(output_dir, "special_tokens_map.json")
    with open(output_path, "w", encoding="utf-8") as handle:
        json.dump(special_tokens_map, handle, indent=2, ensure_ascii=False)

    print(f"Generated {output_path}")
|
|
|
|
|
|
def generate_vibevoice_tokenizer_files(output_dir: str, qwen_model: str = DEFAULT_QWEN_MODEL) -> None:
    """Produce the full set of 6 VibeVoice tokenizer files in ``output_dir``.

    Resulting files:
        1. vocab.json - downloaded, unchanged
        2. merges.txt - downloaded, unchanged
        3. tokenizer.json - downloaded, then extended with the added tokens
        4. tokenizer_config.json - downloaded, then given tokens + chat_template
        5. added_tokens.json - derived from tokenizer_config.json
        6. special_tokens_map.json - built from the VibeVoice tokens

    Args:
        output_dir: Destination directory for all files.
        qwen_model: Hub repo id the base tokenizer is downloaded from.
    """
    print(f"=== Generating VibeVoice tokenizer files to {output_dir} ===\n")

    # Step 1: fetch the base files.
    download_qwen_tokenizer_files(output_dir, qwen_model)

    # Steps 2-5 all operate on output_dir. Order matters: added_tokens.json
    # is derived from the already-patched tokenizer_config.json.
    remaining_steps = (
        patch_tokenizer_config,
        patch_tokenizer_json,
        generate_added_tokens_json,
        generate_special_tokens_map_json,
    )
    for run_step in remaining_steps:
        run_step(output_dir)

    print(f"\n✅ All 6 tokenizer files generated in {output_dir}")
|
|
|
|
|
|
def _abbreviate(value: Any, limit: int = 100) -> str:
    """Return ``str(value)``, truncated to ``limit`` characters plus "..." if longer."""
    text = str(value)
    return text[:limit] + "..." if len(text) > limit else text


def _find_json_diffs(d1: Any, d2: Any, path: str = "") -> list:
    """Recursively describe how JSON value ``d1`` (generated) differs from ``d2``."""
    if isinstance(d1, dict) and isinstance(d2, dict):
        diffs = []
        # Sorted so the report (and which entries a caller truncates to)
        # is the same on every run; set iteration order is not.
        for k in sorted(set(d1) | set(d2)):
            new_path = f"{path}.{k}" if path else k
            if k not in d1:
                diffs.append(f"Missing in generated: {new_path}")
            elif k not in d2:
                diffs.append(f"Extra in generated: {new_path}")
            else:
                diffs.extend(_find_json_diffs(d1[k], d2[k], new_path))
        return diffs

    if isinstance(d1, list) and isinstance(d2, list):
        # Lists are compared as a whole (a detailed diff is complex); one
        # message per list, the length mismatch being the more specific one.
        if len(d1) != len(d2):
            return [f"{path}: list length differs ({len(d1)} vs {len(d2)})"]
        if d1 != d2:
            return [f"{path}: list content differs"]
        return []

    if d1 != d2:
        # Scalars, or values of different JSON types; long values are truncated.
        return [f"{path}: '{_abbreviate(d1)}' vs '{_abbreviate(d2)}'"]
    return []


def compare_json_files(file1: str, file2: str, name: str) -> Dict[str, Any]:
    """Compare two JSON files and describe their differences.

    Args:
        file1: Path of the generated file.
        file2: Path of the reference file.
        name: Label stored in the result under ``"name"``.

    Returns:
        A dict with keys ``"name"``, ``"identical"`` (bool) and
        ``"differences"`` (list of messages, ordered by key path). A missing
        file yields ``identical=False`` with a single "not found" message.

    Raises:
        json.JSONDecodeError: If either existing file is not valid JSON.
    """
    result = {
        "name": name,
        "identical": False,
        "differences": [],
    }

    if not os.path.exists(file1):
        result["differences"].append(f"File 1 not found: {file1}")
        return result

    if not os.path.exists(file2):
        result["differences"].append(f"File 2 not found: {file2}")
        return result

    with open(file1, "r", encoding="utf-8") as f:
        data1 = json.load(f)

    with open(file2, "r", encoding="utf-8") as f:
        data2 = json.load(f)

    if data1 == data2:
        result["identical"] = True
        return result

    result["differences"] = _find_json_diffs(data1, data2)
    return result
|
|
|
|
|
|
def compare_text_files(file1: str, file2: str, name: str) -> Dict[str, Any]:
    """Compare two UTF-8 text files line by line.

    Args:
        file1: Path of the generated file.
        file2: Path of the reference file.
        name: Label stored in the result under ``"name"``.

    Returns:
        A dict with keys ``"name"``, ``"identical"`` (bool) and
        ``"differences"``. For differing files the list holds the line counts
        and then the 1-based line number of the first difference. If one file
        is a prefix of the other, that is the first line only the longer file
        has. If all lines match, a note says only line terminators differ.
    """
    result = {
        "name": name,
        "identical": False,
        "differences": [],
    }

    if not os.path.exists(file1):
        result["differences"].append(f"File 1 not found: {file1}")
        return result

    if not os.path.exists(file2):
        result["differences"].append(f"File 2 not found: {file2}")
        return result

    with open(file1, "r", encoding="utf-8") as f:
        content1 = f.read()

    with open(file2, "r", encoding="utf-8") as f:
        content2 = f.read()

    if content1 == content2:
        result["identical"] = True
        return result

    lines1 = content1.splitlines()
    lines2 = content2.splitlines()
    result["differences"].append(f"Line count: {len(lines1)} vs {len(lines2)}")

    # First position where the common prefix of both files disagrees.
    first_diff = next(
        (i for i, (l1, l2) in enumerate(zip(lines1, lines2), start=1) if l1 != l2),
        None,
    )
    if first_diff is None and len(lines1) != len(lines2):
        # zip() stops at the shorter file: one file is a prefix of the other,
        # so the first difference is the first extra line.
        first_diff = min(len(lines1), len(lines2)) + 1

    if first_diff is not None:
        result["differences"].append(f"First diff at line {first_diff}")
    else:
        # Same lines but different content: only the characters that
        # splitlines() removes can differ.
        result["differences"].append(
            "Content differs only in line terminators (e.g. trailing newline)"
        )

    return result
|
|
|
|
|
|
def compare_with_reference(generated_dir: str, reference_dir: str) -> None:
    """Print a per-file comparison of generated tokenizer files against a reference.

    The six tokenizer files are compared in a fixed order; ``merges.txt`` as
    text, all others as JSON. For each differing file at most five
    differences are printed, followed by a count of the rest. A summary line
    is printed at the end.

    Args:
        generated_dir: Directory with the freshly generated files.
        reference_dir: Directory with the reference files.
    """
    print("\n=== Comparing generated files with reference ===")
    print(f"Generated: {generated_dir}")
    print(f"Reference: {reference_dir}\n")

    max_shown = 5  # differences printed per file before summarizing
    files_to_compare = [
        ("vocab.json", "json"),
        ("merges.txt", "text"),
        ("tokenizer.json", "json"),
        ("tokenizer_config.json", "json"),
        ("added_tokens.json", "json"),
        ("special_tokens_map.json", "json"),
    ]

    mismatched_files = 0
    for filename, file_type in files_to_compare:
        comparator = compare_json_files if file_type == "json" else compare_text_files
        outcome = comparator(
            os.path.join(generated_dir, filename),
            os.path.join(reference_dir, filename),
            filename,
        )

        if outcome["identical"]:
            print(f"✅ {filename}: IDENTICAL")
            continue

        mismatched_files += 1
        print(f"❌ {filename}: DIFFERENT")
        found = outcome["differences"]
        for diff in found[:max_shown]:
            print(f" - {diff}")
        hidden = len(found) - max_shown
        if hidden > 0:
            print(f" ... and {hidden} more differences")

    print()
    if mismatched_files == 0:
        print("🎉 All files are identical!")
    else:
        print("⚠️ Some files have differences. See details above.")
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: generate the tokenizer files, optionally compare.

    Options:
        --output/-o: Directory for the generated files. When omitted, a
            temporary directory is used.
        --compare/-c: Reference directory to compare the generated files with.
        --qwen-model: Hub repo id of the base tokenizer.

    A temporary output directory is deleted at the end unless ``--compare``
    was given, in which case it is kept and its location is printed. A
    user-supplied ``--output`` directory is never deleted.
    """
    parser = argparse.ArgumentParser(
        description="Generate VibeVoice tokenizer files from Qwen2 base"
    )
    parser.add_argument(
        "--output", "-o",
        type=str,
        default=None,
        help="Output directory for generated files (default: temp directory)",
    )
    parser.add_argument(
        "--compare", "-c",
        type=str,
        default=None,
        help="Reference directory to compare generated files against",
    )
    parser.add_argument(
        "--qwen-model",
        type=str,
        default=DEFAULT_QWEN_MODEL,
        help=f"Qwen model to download base tokenizer from (default: {DEFAULT_QWEN_MODEL})",
    )

    args = parser.parse_args()

    # Determine output directory.
    if args.output:
        output_dir = args.output
        cleanup = False  # never delete a directory the user chose
    else:
        output_dir = tempfile.mkdtemp(prefix="vibevoice_tokenizer_")
        cleanup = not args.compare  # Only cleanup if not comparing

    try:
        # Generate files.
        generate_vibevoice_tokenizer_files(output_dir, args.qwen_model)

        # Compare if requested.
        if args.compare:
            compare_with_reference(output_dir, args.compare)

        # Announce the temp location only when it survives this run; pointing
        # the user at a directory that is about to be removed is misleading.
        if not args.output and not cleanup:
            print(f"\nGenerated files are in: {output_dir}")
    finally:
        # cleanup is only ever True for a temp directory created above.
        if cleanup:
            print(f"\nCleaning up temporary directory: {output_dir}")
            shutil.rmtree(output_dir, ignore_errors=True)


if __name__ == "__main__":
    main()
|