VibeVoice/vllm_plugin/tools/generate_tokenizer_files.py

#!/usr/bin/env python3
"""
Standalone tool to generate VibeVoice tokenizer files from a Qwen base tokenizer.

Downloads the base tokenizer files from a Qwen repository on the Hugging Face
Hub (Qwen/Qwen2.5-7B by default) and patches them with VibeVoice-specific
audio tokens and chat template modifications.

Usage:
    python generate_tokenizer_files.py [--output /path/to/output] [--compare /path/to/reference] [--qwen-model REPO_ID]
"""

import argparse
import json
import os
import shutil
import tempfile
from typing import Optional, Dict, Any


# Qwen2.5 extended tokens occupy the contiguous ID range 151646-151664.
# They are NOT in base Qwen2-7B but ARE in Qwen2.5 and Qwen2-VL.
# VibeVoice reuses three of them for speech: object_ref_start/end and box_start.
# The tuple order below defines the IDs, so entries must not be reordered.
QWEN25_EXTENDED_TOKENS = {
    token: token_id
    for token_id, token in enumerate(
        (
            "<|object_ref_start|>",  # 151646 - used as speech_start_id
            "<|object_ref_end|>",    # 151647 - used as speech_end_id
            "<|box_start|>",         # 151648 - used as speech_pad_id
            "<|box_end|>",           # 151649
            "<|quad_start|>",        # 151650
            "<|quad_end|>",          # 151651
            "<|vision_start|>",      # 151652
            "<|vision_end|>",        # 151653
            "<|vision_pad|>",        # 151654
            "<|image_pad|>",         # 151655
            "<|video_pad|>",         # 151656
            "<tool_call>",           # 151657
            "</tool_call>",          # 151658
            "<|fim_prefix|>",        # 151659
            "<|fim_middle|>",        # 151660
            "<|fim_suffix|>",        # 151661
            "<|fim_pad|>",           # 151662
            "<|repo_name|>",         # 151663
            "<|file_sep|>",          # 151664
        ),
        start=151646,
    )
}

# VibeVoice-specific audio tokens; their IDs (151665-151667) directly follow
# Qwen2.5's last extended token (151664).
VIBEVOICE_AUDIO_TOKENS = dict(
    zip(
        ("<|AUDIO|>", "<|audio_bos|>", "<|audio_eos|>"),
        range(151665, 151668),
    )
)

# Union of both tables (Qwen2.5 first, then VibeVoice), in ascending ID order.
ALL_EXTENDED_TOKENS = dict(QWEN25_EXTENDED_TOKENS, **VIBEVOICE_AUDIO_TOKENS)

# Jinja chat template with audio support.
# Key modification: wherever message content is a list of parts, a part whose
# part['type'] is 'audio' or 'audio_url' renders as the '<|AUDIO|>' placeholder,
# while a 'text' part renders part['text'].
# patch_tokenizer_config() installs this template as a fallback when it cannot
# patch the base tokenizer's own chat_template in place.
# NOTE: backslashes are doubled because this is a regular (non-raw) Python
# string; the resulting template text contains single-backslash escapes
# such as \n inside its Jinja string literals.
VIBEVOICE_CHAT_TEMPLATE = """{%- if tools %}
    {{- '<|im_start|>system\\n' }}
    {%- if messages[0]['role'] == 'system' %}
        {%- if messages[0]['content'] is string %}
            {{- messages[0]['content'] }}
        {%- else %}
            {%- for part in messages[0]['content'] %}
                {%- if part['type'] == 'text' %}
                    {{- part['text'] }}
                {%- elif part['type'] == 'audio' or part['type'] == 'audio_url' %}
                    {{- '<|AUDIO|>' }}
                {%- endif %}
            {%- endfor %}
        {%- endif %}
    {%- else %}
        {{- 'You are a helpful assistant.' }}
    {%- endif %}
    {{- "\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>" }}
    {%- for tool in tools %}
        {{- "\\n" }}
        {{- tool | tojson }}
    {%- endfor %}
    {{- "\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\"name\\": <function-name>, \\"arguments\\": <args-json-object>}\\n</tool_call><|im_end|>\\n" }}
{%- else %}
    {%- if messages[0]['role'] == 'system' %}
        {{- '<|im_start|>system\\n' }}
        {%- if messages[0]['content'] is string %}
            {{- messages[0]['content'] }}
        {%- else %}
            {%- for part in messages[0]['content'] %}
                {%- if part['type'] == 'text' %}
                    {{- part['text'] }}
                {%- elif part['type'] == 'audio' or part['type'] == 'audio_url' %}
                    {{- '<|AUDIO|>' }}
                {%- endif %}
            {%- endfor %}
        {%- endif %}
        {{- '<|im_end|>\\n' }}
    {%- else %}
        {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}
    {%- endif %}
{%- endif %}
{%- for message in messages %}
    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
        {{- '<|im_start|>' + message.role + '\\n' }}
        {%- if message['content'] is string %}
            {{- message['content'] }}
        {%- else %}
            {%- for part in message['content'] %}
                {%- if part['type'] == 'text' %}
                    {{- part['text'] }}
                {%- elif part['type'] == 'audio' or part['type'] == 'audio_url' %}
                    {{- '<|AUDIO|>' }}
                {%- endif %}
            {%- endfor %}
        {%- endif %}
        {{- '<|im_end|>\\n' }}
    {%- elif message.role == "assistant" %}
        {{- '<|im_start|>' + message.role }}
        {%- if message.content %}
            {{- '\\n' + message.content }}
        {%- endif %}
        {%- for tool_call in message.tool_calls %}
            {%- if tool_call.function is defined %}
                {%- set tool_call = tool_call.function %}
            {%- endif %}
            {{- '\\n<tool_call>\\n{"name": "' }}
            {{- tool_call.name }}
            {{- '", "arguments": ' }}
            {{- tool_call.arguments | tojson }}
            {{- '}\\n</tool_call>' }}
        {%- endfor %}
        {{- '<|im_end|>\\n' }}
    {%- elif message.role == "tool" %}
        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
            {{- '<|im_start|>user' }}
        {%- endif %}
        {{- '\\n<tool_response>\\n' }}
        {{- message.content }}
        {{- '\\n</tool_response>' }}
        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
            {{- '<|im_end|>\\n' }}
        {%- endif %}
    {%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
    {{- '<|im_start|>assistant\\n' }}
{%- endif %}"""


# Default to Qwen2.5-7B which has all the extended tokens (151646-151664).
# Used as the default for the --qwen-model CLI option and for the qwen_model
# parameter of the download/generate functions.
DEFAULT_QWEN_MODEL = "Qwen/Qwen2.5-7B"


def download_qwen_tokenizer_files(output_dir: str, qwen_model: str = DEFAULT_QWEN_MODEL) -> None:
    """Download the base tokenizer files from a Qwen repo on the Hugging Face Hub.

    Fetches ``vocab.json``, ``merges.txt``, ``tokenizer.json`` and
    ``tokenizer_config.json`` into *output_dir*, creating the directory first
    if it does not exist.

    Args:
        output_dir: Destination directory for the downloaded files.
        qwen_model: Hub repository ID to download from.

    Raises:
        ImportError: If ``huggingface_hub`` is not installed. The original
            import failure is kept as the exception's ``__cause__``.
    """
    try:
        # Imported lazily so the rest of the module works without the dependency.
        from huggingface_hub import hf_hub_download
    except ImportError as err:
        # Chain the cause so the underlying import failure stays visible.
        raise ImportError(
            "Please install huggingface_hub: pip install huggingface_hub"
        ) from err

    files_to_download = (
        "vocab.json",
        "merges.txt",
        "tokenizer.json",
        "tokenizer_config.json",
    )

    os.makedirs(output_dir, exist_ok=True)

    for filename in files_to_download:
        print(f"Downloading {filename} from {qwen_model}...")
        # NOTE(review): local_dir_use_symlinks is reported as deprecated in
        # recent huggingface_hub releases - confirm against the pinned version
        # before removing it.
        hf_hub_download(
            repo_id=qwen_model,
            filename=filename,
            local_dir=output_dir,
            local_dir_use_symlinks=False,
        )


def patch_tokenizer_config(output_dir: str) -> None:
    """Patch ``tokenizer_config.json`` in *output_dir* for VibeVoice, in place.

    Steps applied to the loaded config:

    1. Register every entry of ``ALL_EXTENDED_TOKENS`` in
       ``added_tokens_decoder`` unless its ID is already present.
    2. Append the VibeVoice audio tokens to ``additional_special_tokens``.
    3. Make ``chat_template`` audio-aware: an existing template that lacks
       ``<|AUDIO|>`` is patched by regex, falling back to
       ``VIBEVOICE_CHAT_TEMPLATE`` when the regex matches nothing. A missing
       or empty template is also replaced by ``VIBEVOICE_CHAT_TEMPLATE``
       (previously it was silently left without audio support).
    4. Set ``model_max_length`` to 131072.
    5. Default ``add_bos_token`` to ``False`` when absent.

    Args:
        output_dir: Directory containing ``tokenizer_config.json``.
    """
    import re  # only this function needs regex support

    config_path = os.path.join(output_dir, "tokenizer_config.json")

    with open(config_path, "r", encoding="utf-8") as f:
        config = json.load(f)

    # 1. Add ALL extended tokens to added_tokens_decoder (Qwen2.5 + VibeVoice audio).
    # These tokens are flagged as non-special; everything else is special.
    # (Per the original note, tool_call tokens are NOT special in Qwen2.5.)
    non_special_tokens = (
        "<tool_call>", "</tool_call>", "<|fim_prefix|>",
        "<|fim_middle|>", "<|fim_suffix|>", "<|fim_pad|>",
        "<|repo_name|>", "<|file_sep|>",
    )
    decoder = config.setdefault("added_tokens_decoder", {})
    for token, token_id in ALL_EXTENDED_TOKENS.items():
        key = str(token_id)  # JSON object keys are strings
        if key in decoder:
            continue  # never overwrite an entry shipped by the base tokenizer
        decoder[key] = {
            "content": token,
            "lstrip": False,
            "normalized": False,
            "rstrip": False,
            "single_word": False,
            "special": token not in non_special_tokens,
        }

    # 2. Add audio tokens to additional_special_tokens.
    additional = config.setdefault("additional_special_tokens", [])
    for token in VIBEVOICE_AUDIO_TOKENS:
        if token not in additional:
            additional.append(token)

    # 3. Make the chat template audio-aware.
    chat_template = config.get("chat_template", "")
    if not chat_template:
        # Nothing to patch: without this fallback the generated config would
        # carry no audio-capable template at all.
        print("  - Warning: No chat_template in base config, using predefined template")
        config["chat_template"] = VIBEVOICE_CHAT_TEMPLATE
    elif "<|AUDIO|>" not in chat_template:
        # Instead of replacing the template outright, try to insert an audio
        # branch right after each place where a 'text' part is rendered.
        audio_handler = """{%- elif part['type'] == 'audio' or part['type'] == 'audio_url' %}
                    {{- '<|AUDIO|>' }}"""

        # Matches: {%- if part['type'] == 'text' %} <newline> {{- part['text'] }}
        pattern = r"(\{\%- if part\['type'\] == 'text' \%\}\s*\n\s*\{\{- part\['text'\] \}\})"
        # Real newlines are turned into the two-character escape \n, which
        # re.sub expands back to a newline when it processes the replacement.
        replacement = r"\1\n                " + audio_handler.replace("\n", r"\n")

        modified_template = re.sub(pattern, replacement, chat_template)

        if modified_template != chat_template:
            config["chat_template"] = modified_template
            print("  - Added audio support to existing chat_template")
        else:
            # Fallback: the base template has no per-part text handling to hook into.
            print("  - Warning: Could not patch existing template, using predefined template")
            config["chat_template"] = VIBEVOICE_CHAT_TEMPLATE

    # 4. Update model_max_length for long audio support.
    config["model_max_length"] = 131072

    # 5. Add add_bos_token if not present.
    config.setdefault("add_bos_token", False)

    # Write back
    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(config, f, indent=2, ensure_ascii=False)

    print(f"Patched {config_path}")


def patch_tokenizer_json(output_dir: str) -> None:
    """Patch ``tokenizer.json`` in *output_dir* with the extended tokens, in place.

    Every entry of ``ALL_EXTENDED_TOKENS`` whose ID is not already listed in
    the file's ``added_tokens`` array is appended to it. The array is created
    when the file has none (previously this case raised ``KeyError``).

    Args:
        output_dir: Directory containing ``tokenizer.json``.
    """
    tokenizer_path = os.path.join(output_dir, "tokenizer.json")

    with open(tokenizer_path, "r", encoding="utf-8") as f:
        tokenizer = json.load(f)

    # Create the list if absent so the append below cannot fail.
    added_tokens = tokenizer.setdefault("added_tokens", [])

    # IDs already present; used to avoid adding duplicates.
    existing_ids = {entry.get("id") for entry in added_tokens}

    # These tokens are flagged as non-special; everything else is special.
    non_special_tokens = (
        "<tool_call>", "</tool_call>", "<|fim_prefix|>",
        "<|fim_middle|>", "<|fim_suffix|>", "<|fim_pad|>",
        "<|repo_name|>", "<|file_sep|>",
    )

    # Add ALL extended tokens (Qwen2.5 + VibeVoice audio)
    for token, token_id in ALL_EXTENDED_TOKENS.items():
        if token_id in existing_ids:
            continue
        added_tokens.append({
            "id": token_id,
            "content": token,
            "single_word": False,
            "lstrip": False,
            "rstrip": False,
            "normalized": False,
            "special": token not in non_special_tokens,
        })

    # Write back
    with open(tokenizer_path, "w", encoding="utf-8") as f:
        json.dump(tokenizer, f, indent=2, ensure_ascii=False)

    print(f"Patched {tokenizer_path}")


def generate_added_tokens_json(output_dir: str) -> None:
    """Write ``added_tokens.json`` derived from ``tokenizer_config.json``.

    Reads ``added_tokens_decoder`` from the config in *output_dir* and writes
    the inverse mapping (token content -> integer ID). Entries with missing
    or empty ``content`` are skipped.
    """
    source_path = os.path.join(output_dir, "tokenizer_config.json")
    with open(source_path, "r", encoding="utf-8") as src:
        decoder = json.load(src).get("added_tokens_decoder", {})

    # Invert the decoder table: its keys are stringified IDs.
    token_to_id = {
        entry.get("content"): int(raw_id)
        for raw_id, entry in decoder.items()
        if entry.get("content")
    }

    output_path = os.path.join(output_dir, "added_tokens.json")
    with open(output_path, "w", encoding="utf-8") as dst:
        json.dump(token_to_id, dst, indent=2, ensure_ascii=False)

    print(f"Generated {output_path}")


def generate_special_tokens_map_json(output_dir: str) -> None:
    """Write ``special_tokens_map.json`` into *output_dir*.

    The map names ``<|endoftext|>`` as the eos, pad and unk token. Its
    ``additional_special_tokens`` list holds the VibeVoice audio tokens
    followed by three commonly used special tokens.
    """

    def as_entry(token_text):
        # Every additional special token shares the same flag set.
        return {
            "content": token_text,
            "lstrip": False,
            "normalized": False,
            "rstrip": False,
            "single_word": False,
        }

    # Audio tokens first, then the commonly used special tokens.
    extra_tokens = [
        *VIBEVOICE_AUDIO_TOKENS,
        "<|object_ref_start|>",
        "<|object_ref_end|>",
        "<|box_start|>",
    ]

    special_tokens_map = {
        "additional_special_tokens": [as_entry(text) for text in extra_tokens],
        "eos_token": "<|endoftext|>",
        "pad_token": "<|endoftext|>",
        "unk_token": "<|endoftext|>",
    }

    output_path = os.path.join(output_dir, "special_tokens_map.json")
    with open(output_path, "w", encoding="utf-8") as dst:
        json.dump(special_tokens_map, dst, indent=2, ensure_ascii=False)

    print(f"Generated {output_path}")


def generate_vibevoice_tokenizer_files(output_dir: str, qwen_model: str = DEFAULT_QWEN_MODEL) -> None:
    """Produce the full set of 6 VibeVoice tokenizer files in *output_dir*.

    Resulting files:
    1. vocab.json - downloaded from the base model (unchanged)
    2. merges.txt - downloaded from the base model (unchanged)
    3. tokenizer.json - downloaded, then extended with the extra tokens
    4. tokenizer_config.json - downloaded, then extended with the extra
       tokens and an audio-aware chat_template
    5. added_tokens.json - derived from the patched tokenizer_config.json
    6. special_tokens_map.json - generated with VibeVoice tokens
    """
    print(f"=== Generating VibeVoice tokenizer files to {output_dir} ===\n")

    # Fetch the base files first; every later step works on them in place.
    download_qwen_tokenizer_files(output_dir, qwen_model)

    # Order matters: added_tokens.json is built from the patched config.
    post_download_steps = (
        patch_tokenizer_config,
        patch_tokenizer_json,
        generate_added_tokens_json,
        generate_special_tokens_map_json,
    )
    for step in post_download_steps:
        step(output_dir)

    print(f"\n✅ All 6 tokenizer files generated in {output_dir}")


def _summarize_value(value: Any, limit: int = 100) -> str:
    """Return ``str(value)``, cut to *limit* characters plus "..." when longer."""
    text = str(value)
    return text[:limit] + "..." if len(text) > limit else text


def _collect_json_diffs(generated: Any, reference: Any, path: str = "") -> list:
    """Recursively list the differences between two decoded JSON values.

    Dict keys are visited in sorted order so the output is deterministic.
    Lists are compared as a whole: one message for a length mismatch,
    otherwise one message for differing content.
    """
    if isinstance(generated, dict) and isinstance(reference, dict):
        diffs = []
        for key in sorted(set(generated) | set(reference)):
            child_path = f"{path}.{key}" if path else key
            if key not in generated:
                diffs.append(f"Missing in generated: {child_path}")
            elif key not in reference:
                diffs.append(f"Extra in generated: {child_path}")
            else:
                diffs.extend(
                    _collect_json_diffs(generated[key], reference[key], child_path)
                )
        return diffs

    if isinstance(generated, list) and isinstance(reference, list):
        if len(generated) != len(reference):
            return [
                f"{path}: list length differs ({len(generated)} vs {len(reference)})"
            ]
        # For lists, just check if they're equal (detailed diff is complex).
        if generated != reference:
            return [f"{path}: list content differs"]
        return []

    if generated != reference:
        # Truncate long values for readability.
        return [
            f"{path}: '{_summarize_value(generated)}' vs '{_summarize_value(reference)}'"
        ]
    return []


def compare_json_files(file1: str, file2: str, name: str) -> Dict[str, Any]:
    """Compare two JSON files and describe how they differ.

    Args:
        file1: Path to the generated file (the "generated" side in messages).
        file2: Path to the reference file.
        name: Label stored under the result's ``"name"`` key.

    Returns:
        A dict with ``"name"``, ``"identical"`` (bool) and ``"differences"``
        (list of messages). A missing file yields a single "not found"
        message. Differences are listed in sorted key order, so repeated
        runs give the same output.
    """
    result = {
        "name": name,
        "identical": False,
        "differences": [],
    }

    # Report the first missing file and stop; nothing can be compared.
    for label, path in (("File 1", file1), ("File 2", file2)):
        if not os.path.exists(path):
            result["differences"].append(f"{label} not found: {path}")
            return result

    with open(file1, "r", encoding="utf-8") as f:
        data1 = json.load(f)

    with open(file2, "r", encoding="utf-8") as f:
        data2 = json.load(f)

    if data1 == data2:
        result["identical"] = True
        return result

    result["differences"] = _collect_json_diffs(data1, data2)
    return result


def compare_text_files(file1: str, file2: str, name: str) -> Dict[str, Any]:
    """Compare two UTF-8 text files and summarize how they differ.

    Args:
        file1: Path to the first (generated) file.
        file2: Path to the second (reference) file.
        name: Label stored under the result's ``"name"`` key.

    Returns:
        A dict with ``"name"``, ``"identical"`` (bool) and ``"differences"``
        (list of messages). For differing files the list holds the line
        counts, then the 1-based line number of the first difference. When
        one file is a prefix of the other, that is the first line past the
        shorter file. When the lines match but the raw content does not, a
        note about line terminators / trailing newline is added instead.
    """
    result = {
        "name": name,
        "identical": False,
        "differences": [],
    }

    # Report the first missing file and stop; nothing can be compared.
    for label, path in (("File 1", file1), ("File 2", file2)):
        if not os.path.exists(path):
            result["differences"].append(f"{label} not found: {path}")
            return result

    with open(file1, "r", encoding="utf-8") as f:
        content1 = f.read()

    with open(file2, "r", encoding="utf-8") as f:
        content2 = f.read()

    if content1 == content2:
        result["identical"] = True
        return result

    lines1 = content1.splitlines()
    lines2 = content2.splitlines()
    result["differences"].append(f"Line count: {len(lines1)} vs {len(lines2)}")

    # First differing line within the part both files share.
    first_diff = next(
        (
            line_no
            for line_no, (l1, l2) in enumerate(zip(lines1, lines2), start=1)
            if l1 != l2
        ),
        None,
    )
    if first_diff is None and len(lines1) != len(lines2):
        # zip() stops at the shorter file: the first difference is the line
        # right after the common prefix.
        first_diff = min(len(lines1), len(lines2)) + 1

    if first_diff is not None:
        result["differences"].append(f"First diff at line {first_diff}")
    else:
        # Same lines, different raw content.
        result["differences"].append(
            "Content differs only in line terminators or a trailing newline"
        )

    return result


def compare_with_reference(generated_dir: str, reference_dir: str) -> None:
    """Print a per-file comparison of generated vs. reference tokenizer files.

    Each of the six tokenizer files is compared with the JSON or text
    comparer as appropriate. For a differing file, up to five differences
    are printed, followed by a count of the remainder. A closing line
    summarizes the outcome.
    """
    print("\n=== Comparing generated files with reference ===")
    print(f"Generated: {generated_dir}")
    print(f"Reference: {reference_dir}\n")

    max_shown = 5  # cap on differences printed per file
    targets = (
        ("vocab.json", "json"),
        ("merges.txt", "text"),
        ("tokenizer.json", "json"),
        ("tokenizer_config.json", "json"),
        ("added_tokens.json", "json"),
        ("special_tokens_map.json", "json"),
    )

    mismatched = 0
    for filename, file_type in targets:
        comparer = compare_json_files if file_type == "json" else compare_text_files
        outcome = comparer(
            os.path.join(generated_dir, filename),
            os.path.join(reference_dir, filename),
            filename,
        )

        if outcome["identical"]:
            print(f"✅ {filename}: IDENTICAL")
            continue

        mismatched += 1
        print(f"❌ {filename}: DIFFERENT")
        found = outcome["differences"]
        for diff in found[:max_shown]:
            print(f"   - {diff}")
        hidden = len(found) - max_shown
        if hidden > 0:
            print(f"   ... and {hidden} more differences")

    print()
    if mismatched == 0:
        print("🎉 All files are identical!")
    else:
        print("⚠️  Some files have differences. See details above.")


def main():
    """Command-line entry point: generate tokenizer files, optionally compare.

    With ``--output`` the files are written there and kept. Without it a
    temporary directory is used. That directory is kept when ``--compare``
    is given and deleted otherwise. Its location is only announced when it
    is kept (previously the path was printed and then deleted).
    """
    parser = argparse.ArgumentParser(
        description="Generate VibeVoice tokenizer files from Qwen2 base"
    )
    parser.add_argument(
        "--output", "-o",
        type=str,
        default=None,
        help="Output directory for generated files (default: temp directory)"
    )
    parser.add_argument(
        "--compare", "-c",
        type=str,
        default=None,
        help="Reference directory to compare generated files against"
    )
    parser.add_argument(
        "--qwen-model",
        type=str,
        default=DEFAULT_QWEN_MODEL,
        help=f"Qwen model to download base tokenizer from (default: {DEFAULT_QWEN_MODEL})"
    )

    args = parser.parse_args()

    # Determine output directory
    if args.output:
        output_dir = args.output
        cleanup = False  # never delete a directory the user chose
    else:
        output_dir = tempfile.mkdtemp(prefix="vibevoice_tokenizer_")
        cleanup = not args.compare  # Only cleanup if not comparing

    try:
        # Generate files
        generate_vibevoice_tokenizer_files(output_dir, args.qwen_model)

        # Compare if requested
        if args.compare:
            compare_with_reference(output_dir, args.compare)

        # Only point at the temp directory if it survives this run.
        if not args.output and not cleanup:
            print(f"\nGenerated files are in: {output_dir}")

    finally:
        # cleanup is only ever True for a temp directory created above.
        if cleanup:
            print(f"\nCleaning up temporary directory: {output_dir}")
            shutil.rmtree(output_dir, ignore_errors=True)


if __name__ == "__main__":
    main()