<!-- VibeVoice/index.html -->

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<!--
  The demo clips are deliberately not preloaded with link rel="preload" as="audio":
  every audio element on this page already declares preload="metadata", and eagerly
  fetching all clips (including the 100-minute one) on page load wastes bandwidth.
-->
<!-- NOTE(review): no "fa-" class is used anywhere in this file; confirm Font Awesome is still needed. -->
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.0/css/all.min.css">

<title>VibeVoice</title>
<style>
  /* Colour tokens referenced through var() by the rules below */
  :root {
    --bg:#ffffff;
    --card:#ffffff;
    --muted:#555555;
    --text:#000000;
    --border:#dddddd;
    --accent:#0066cc;
  }
  * { box-sizing:border-box; }
  html, body { margin:0; padding:0; }
  body {
    color:var(--text);
    background: var(--bg);
    font:16px/1.6 system-ui,-apple-system,Segoe UI,Roboto,Helvetica,Arial;
    /* No scroll-snap paging and no fixed height: the whole page flows naturally */
  }

  /* Container: stacks in normal flow, height is not constrained */
  .wrap {
    max-width:1100px;
    width:min(92vw,1100px);
    margin:0 auto;
    padding:24px 0;
  }

  /* Small global spacing so the content stays compact and continuous */
  section.page { padding: 16px 16px; }
  h1 { margin:0 0 12px; font-size:clamp(28px,4vw,40px); text-align:center; }
  h2 { margin:16px 0 10px; font-size:22px; }
  h3 { margin:8px 0 6px; font-size:16px; font-weight:600; color:#222; }
  .muted { color:var(--muted); }

  .links a { text-decoration: none; color: var(--accent); font-weight: 500; }
  .links a:hover { text-decoration: underline; }
  .links .sep { margin: 0 6px; color: var(--muted); }

  .case {
    margin: 32px 0 24px;   /* 32px of space above, 24px below */
  }

  /* Audio + transcript stacked vertically (no stretching needed) */
  .sync-block { display:block; }

  audio { width:100%; display:block; margin:6px 0 8px; }

  /* Transcript box only: cap its own height and scroll, so it never overflows onto the next case */
  .transcript {
    max-height: 20vh;          /* tune as needed (the original note mentions 30vh / 28vh as alternatives) */
    overflow:auto;
    padding:8px;
    border:1px solid var(--border);
    border-radius:10px;
    background:#fff;
    scroll-behavior:smooth;
  }

  /* Transcript line: long words/URLs wrap automatically and never break out of the container.
     The three grid columns hold the .ts, .spk and .txt cells, in that order. */
  .line {
    display:grid;
    grid-template-columns:84px 75px 1fr;
    gap:10px;
    padding:8px 10px;
    border-radius:10px;
    border:1px solid transparent;
    cursor:pointer;
  }
  .line:hover { background:#f7f7f7; border-color:#eee; }
  /* .active is toggled by the page script on the line that matches the playback position */
  .line.active { background:rgba(0,102,204,0.08); border-color:#0066cc; }

  .ts, .spk {
    font-family:ui-monospace,Menlo,Consolas,monospace;
    color:#333; font-size:14px; align-self:center; opacity:.9;
    white-space:nowrap;
  }
  .ts { text-align:right; }
  .spk { text-align:left; }
  .txt {
    white-space:pre-wrap;
    word-break:break-word;      /* key: break long words */
    overflow-wrap:anywhere;     /* key: keep text from overflowing the container */
  }

  /* Drop every fixed-height chain so content is never forcibly squeezed */
  .card, .scroll, .sync-grid { height:auto; min-height:0; }
</style>
</head>
<body>

<div class="wrap">

  <!-- Landing section: flows straight into the demo sections that follow -->
  <section class="page" id="overview">
    <header>
      <h1 style="font-size: 2.3em;">VibeVoice: A Frontier Open-Source Text-to-Speech Model</h1>
      <!-- <p class="links" style="text-align:center; margin:0 0 4px;">
        <a href="https://aka.ms/GeneralAI" target="_blank">MSRA GeneralAI Group</a>
      </p> -->
      <!-- External links open in a new tab; rel="noopener noreferrer" keeps the opened page away from window.opener. -->
      <p class="links" style="text-align:center; margin:0 0 14px;">
        <!-- NOTE(review): this href has no blob/ or raw/ path segment, which GitHub file URLs normally need; confirm it resolves. -->
        <a href="https://github.com/microsoft/VibeVoice/report/TechnicalReport.pdf" target="_blank" rel="noopener noreferrer">📄 Report</a>
        <span class="sep">·</span>
        <!-- The icon is decorative (the link text "Code" names the link), so it is hidden from assistive technology. -->
        <a href="https://github.com/microsoft/VibeVoice" target="_blank" rel="noopener noreferrer"><svg aria-hidden="true" width="16" height="16" fill="currentColor" viewBox="0 0 16 16" style="vertical-align: text-bottom;"><path d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.012 8.012 0 0 0 16 8c0-4.42-3.58-8-8-8z"/></svg> Code</a>
        <span class="sep">·</span>
        <a href="https://huggingface.co/collections/microsoft/vibevoice-68a2ef24a875c44be47b034f" target="_blank" rel="noopener noreferrer">🤗 Hugging Face</a>
        <span class="sep">·</span>
        <a href="https://aka.ms/VibeVoice-Demo" target="_blank" rel="noopener noreferrer">
          <!-- Empty alt: the adjacent text already says "Demo", so a non-empty alt would be read twice. -->
          <img src="assets/image/microphone.svg" alt="" width="16" height="16" style="vertical-align:text-bottom;"> Demo
        </a>
      </p>

      <p class="muted" style="margin:0;">
        VibeVoice is a novel framework designed for generating <b>expressive, long-form, multi-speaker conversational audio</b>, such as podcasts, from text. It addresses significant challenges in traditional Text-to-Speech (TTS) systems, particularly in scalability, speaker consistency, and natural turn-taking.
A core innovation of VibeVoice is its use of continuous speech tokenizers (Acoustic and Semantic) operating at an ultra-low frame rate of 7.5 Hz. These tokenizers efficiently preserve audio fidelity while significantly boosting computational efficiency for processing long sequences. VibeVoice employs a next-token diffusion framework, leveraging a Large Language Model (LLM) to understand textual context and dialogue flow, and a diffusion head to generate high-fidelity acoustic details.
The model can synthesize speech up to 90 minutes long with up to 4 distinct speakers, surpassing the typical 1-2 speaker limits of many prior models.
      </p>

      <div style="display:flex; justify-content:center; align-items:flex-start; gap:60px; margin:20px 0; width:100%;">
        <div style="flex:1; text-align:center;">
          <img src="assets/image/VibeVoice.jpg" alt="VibeVoice Framework" style="width:120%; height:400px; object-fit:contain;">
        </div>
        <div style="flex:1; text-align:center;">
          <img src="assets/image/MOS-preference.png" alt="MOS Preference Results" style="width:75%; height:400px; object-fit:contain;">
        </div>
      </div>
    </header>
  </section>


  <!-- Demo group "Context-Aware Expression": two cases, each pairing one audio clip with one transcript box. -->
  <section class="page" id="demo4">
    <h2>Context-Aware Expression</h2>

    <!-- Per case: the page script reads data-key to set the audio-*/trans-* ids and fetches data-json
         to fill the (initially empty) .transcript box. -->
    <div class="case" data-key="demo4-a" data-json="assets/text/2p_argument_gt_timestamp.json">
      <h3>Spontaneous Emotion</h3>
      <div class="sync-block">
        <audio id="audio-demo4-a" controls preload="metadata">
          <source src="assets/audio/2p_argument.mp3" type="audio/mpeg">
        </audio>
        <div class="transcript" id="trans-demo4-a"></div>
      </div>
    </div>


    <div class="case" data-key="demo4-b" data-json="assets/text/2p_see_u_again_gt_timestamp.json">
      <h3>Spontaneous Singing</h3>
      <div class="sync-block">
        <audio id="audio-demo4-b" controls preload="metadata">
          <source src="assets/audio/2p_see_u_again.mp3" type="audio/mpeg">
        </audio>
        <div class="transcript" id="trans-demo4-b"></div>
      </div>
      <!-- <p style="font-size:0.85em; color:#888; margin:6px 0 0;">* TTS input is the above text only; timestamps are derived from the generated audio and may contain errors.</p> -->
    </div>
  </section>

  <!-- Demo group "Podcast with Background Music": two cases, each pairing one audio clip with one transcript box. -->
  <section class="page" id="demo1">
    <h2>Podcast with Background Music</h2>

    <!-- Per case: the page script reads data-key to set the audio-*/trans-* ids and fetches data-json
         to fill the (initially empty) .transcript box. -->
    <div class="case" data-key="demo1-a" data-json="assets/text/3p_gpt5_gt_timestamp.json">
      <h3>Example 1</h3>
      <div class="sync-block">
        <audio id="audio-demo1-a" controls preload="metadata">
          <source src="assets/audio/3p_gpt5.mp3" type="audio/mpeg">
        </audio>
        <div class="transcript" id="trans-demo1-a"></div>
      </div>
    </div>

    <div class="case" data-key="demo1-b" data-json="assets/text/2p_goat_gt_timestamp.json">
      <h3>Example 2</h3>
      <div class="sync-block">
        <audio id="audio-demo1-b" controls preload="metadata">
          <source src="assets/audio/2p_goat.mp3" type="audio/mpeg">
        </audio>
        <div class="transcript" id="trans-demo1-b"></div>
      </div>
    </div>
  </section>

  <!-- Demo group "Cross-Lingual": two cases, each pairing one audio clip with one transcript box. -->
  <section class="page" id="demo3">
    <h2>Cross-Lingual</h2>

    <!-- Per case: the page script reads data-key to set the audio-*/trans-* ids and fetches data-json
         to fill the (initially empty) .transcript box. -->
    <div class="case" data-key="demo3-a" data-json="assets/text/1p_CH2EN_gt_timestamp.json">
      <h3>Mandarin to English</h3>
      <div class="sync-block">
        <audio id="audio-demo3-a" controls preload="metadata">
          <source src="assets/audio/1p_CH2EN.mp3" type="audio/mpeg">
        </audio>
        <div class="transcript" id="trans-demo3-a"></div>
      </div>
    </div>

    <div class="case" data-key="demo3-b" data-json="assets/text/1p_EN2CH_gt_timestamp.json">
      <h3>English to Mandarin</h3>
      <div class="sync-block">
        <audio id="audio-demo3-b" controls preload="metadata">
          <source src="assets/audio/1p_EN2CH.mp3" type="audio/mpeg">
        </audio>
        <div class="transcript" id="trans-demo3-b"></div>
      </div>
    </div>

  </section>

  <section class="page" id="demo2">
    <h2>Long Conversational Speech</h2>

    <!-- Each case needs its own data-key: the page script derives the audio-*/trans-* ids from it and
         looks the elements up by id, so a repeated key would bind both players to the first case. -->
    <div class="case" data-key="demo2-a" data-json="assets/text/4p_climate_45min_gt_timestamp.json">
      <!-- <h3>Case A</h3> -->
      <div class="sync-block">
        <audio id="audio-demo2-a" controls preload="metadata">
          <source src="assets/audio/4p_climate_45min.mp3" type="audio/mpeg">
        </audio>
        <div class="transcript" id="trans-demo2-a"></div>
      </div>
    </div>

    <div class="case" data-key="demo2-b" data-json="assets/text/4p_climate_100min_gt_timestamp.json">
      <!-- <h3>Case A</h3> -->
      <div class="sync-block">
        <audio id="audio-demo2-b" controls preload="metadata">
          <source src="assets/audio/4p_climate_100min.mp3" type="audio/mpeg">
        </audio>
        <div class="transcript" id="trans-demo2-b"></div>
      </div>
    </div>

  </section>

  <!-- <section class="page" id="evaluation">
    <h2>Subjective Evaluation Results</h2>

    <div style="text-align:center; margin:20px 0;">
      <img src="assets/MOS-all.svg" alt="MOS Evaluation Results" style="max-width:90%; height:auto;">
    </div>
  </section> -->

<!-- <footer style="margin:32px 0 16px; font-size:0.85em; color:#888; text-align:center;">
  * Timestamps are derived from generated audio and may contain errors.
</footer> -->
</div>

<script>
/* ---- Global exclusive playback: when one player starts, every other one is paused ---- */
const allAudios = Array.from(document.querySelectorAll('audio'));
for (const player of allAudios) {
  // The 'play' listener pauses each sibling player, never the one that just started.
  player.addEventListener('play', () => {
    for (const other of allAudios) {
      if (other !== player) other.pause();
    }
  });
}

/* ---- Synced transcript player ---- */
/**
 * Format a time offset given in seconds as "MM:SS.mmm".
 *
 * The value is rounded to whole milliseconds before it is split into fields, so
 * floating-point noise cannot drop a millisecond (1.001 renders as "00:01.001")
 * and a carry propagates cleanly (59.9996 renders as "01:00.000").
 * Minutes are not capped at two digits (6000 renders as "100:00.000").
 *
 * @param {number} t Offset in seconds; non-finite or negative input is treated as 0.
 * @returns {string} The formatted timestamp.
 */
function mmssms(t){
  const secs = Number(t);
  const totalMs = Number.isFinite(secs) && secs > 0 ? Math.round(secs * 1000) : 0;
  const m  = Math.floor(totalMs / 60000);
  const s  = Math.floor((totalMs % 60000) / 1000);
  const ms = totalMs % 1000;
  return `${String(m).padStart(2,'0')}:${String(s).padStart(2,'0')}.${String(ms).padStart(3,'0')}`;
}

/**
 * Keeps one <audio> element and its transcript box in sync: renders one clickable
 * line per segment, highlights the line that contains the current playback time,
 * and seeks the audio when a line is clicked.
 */
class SyncPlayer{
  /**
   * @param {string} key Id suffix; the player uses #audio-<key> and #trans-<key>.
   * @param {Array<Object>} segs Segments read as {start, end?, speaker?, text}. start/end are
   *   compared against audio.currentTime (seconds). A missing end defaults to the next
   *   segment's start, or Infinity for the last segment.
   * @throws {Error} If either element cannot be found.
   * @throws {TypeError} If segs is not an array.
   */
  constructor(key, segs){
    this.audio   = document.getElementById(`audio-${key}`);
    this.transEl = document.getElementById(`trans-${key}`);
    if (!this.audio || !this.transEl){
      throw new Error(`SyncPlayer: missing #audio-${key} or #trans-${key}`);
    }
    if (!Array.isArray(segs)){
      throw new TypeError(`SyncPlayer: segments for "${key}" must be an array`);
    }
    this.idx = -1;      // index of the highlighted segment, -1 when none
    this.lines = [];    // rendered .line elements, parallel to this.segments
    // Copy each segment so that filling in `end` never mutates the caller's objects.
    this.segments = segs.map(s => Object.assign({}, s)).sort((a,b)=>a.start-b.start);
    for(let i=0;i<this.segments.length;i++){
      const cur=this.segments[i], nxt=this.segments[i+1];
      if (cur.end == null) cur.end = nxt ? nxt.start : Infinity;
    }
    this.shouldScroll = false; // set by a user-initiated seek; consumed by onTime()
    this.render(); this.bind();
  }

  /** HTML-escape a value for insertion via innerHTML; null/undefined become ''. */
  esc(s){
    const str = s == null ? '' : String(s);
    return str.replace(/[&<>"']/g, ch => (
      {'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'}[ch]
    ));
  }

  /** Rebuild the transcript box: one .line (timestamp, speaker, text) per segment. */
  render(){
    this.transEl.innerHTML = '';
    this.lines = this.segments.map((seg,i)=>{
      const d = document.createElement('div');
      d.className = 'line';
      d.dataset.idx = i;
      // esc() keeps falsy-but-valid values such as a speaker id of 0.
      d.innerHTML = `
        <div class="ts">${mmssms(seg.start)}</div>
        <div class="spk">${this.esc(seg.speaker)}</div>
        <div class="txt">${this.esc(seg.text)}</div>
      `;
      d.addEventListener('click', () => this.seek(seg.start, true, true));
      this.transEl.appendChild(d);
      return d;
    });
  }

  /**
   * Move playback to t seconds (clamped at 0).
   * @param {number} t Target time; a value that is not a finite number is ignored.
   * @param {boolean} [autoplay=false] Start playback after seeking.
   * @param {boolean} [scroll=false] Ask the next onTime() to scroll to the active line.
   */
  seek(t, autoplay=false, scroll=false){
    const target = Number(t);
    // Assigning a non-finite currentTime throws, so bail out instead.
    if (!Number.isFinite(target)) return;
    this.shouldScroll = !!scroll;
    this.audio.currentTime = Math.max(0, target);
    if (autoplay) this.audio.play().catch(()=>{});
  }

  /**
   * Binary-search the segment whose [start, end) interval contains t.
   * Relies on the segments being sorted by start (done in the constructor).
   * @returns {number} Segment index, or -1 when t falls outside every segment.
   */
  findIdx(t){
    let lo=0, hi=this.segments.length-1, ans=-1;
    while (lo<=hi){
      const mid=(lo+hi)>>1, seg=this.segments[mid];
      if (t < seg.start) hi=mid-1;
      else if (t >= seg.end) lo=mid+1;
      else { ans=mid; break; }
    }
    return ans;
  }

  /** Scroll the transcript box so `line` sits at its top (not centred), clamped to the scroll range. */
  scrollToLine(line){
    const c = this.transEl;
    const targetTop = line.offsetTop - c.offsetTop;
    const maxTop = c.scrollHeight - c.clientHeight;
    c.scrollTo({ top: Math.max(0, Math.min(targetTop, maxTop)), behavior: 'smooth' });
  }

  /**
   * Highlight segment i (-1 clears the highlight) and optionally scroll to it.
   * When i is already the active segment only the scroll is performed.
   */
  setActive(i, scroll=true){
    const next = i >= 0 ? (this.lines[i] || null) : null;
    if (i !== this.idx){
      const prev = this.idx >= 0 ? (this.lines[this.idx] || null) : null;
      if (prev) prev.classList.remove('active');
      if (next) next.classList.add('active');
      this.idx = i;
    }
    if (scroll && next) this.scrollToLine(next);
  }

  /** Sync the highlight (and scroll position) with the audio's current time. */
  onTime(){
    const i = this.findIdx(this.audio.currentTime);
    // Always follow the active line while playing; while paused, scroll only once
    // after a user-initiated seek (shouldScroll), then clear the request.
    const doScroll = !this.audio.paused || this.shouldScroll;
    this.shouldScroll = false;
    this.setActive(i, doScroll);
  }

  /** Attach the audio event listeners that drive the sync. */
  bind(){
    const t = () => this.onTime();
    this.audio.addEventListener('timeupdate', t);
    this.audio.addEventListener('seeked',  () => { this.shouldScroll = true; this.onTime(); });
    this.audio.addEventListener('seeking', () => { this.shouldScroll = true; });
    this.audio.addEventListener('play', t);
    // Replace the open-ended (Infinity) segment end with the real duration once it is known.
    const clampEnds = () => {
      const dur = isFinite(this.audio.duration) ? this.audio.duration : Infinity;
      for (const seg of this.segments){
        if (seg.end === Infinity) seg.end = dur;
      }
    };
    this.audio.addEventListener('loadedmetadata', clampEnds);
    // The metadata may already be loaded by the time the player is constructed,
    // in which case 'loadedmetadata' will not fire again.
    if (this.audio.readyState >= 1 /* HAVE_METADATA */) clampEnds();
  }
}

/* ---- Initialise every case ---- */
/**
 * Wire up every `.case` element: give its <audio> and `.transcript` children the
 * ids derived from `data-key`, fetch the transcript named by `data-json`, and
 * create a SyncPlayer for the pair. Cases without both attributes are skipped;
 * a failure in one case is logged and does not affect the others.
 */
function initCases(){
  const usedKeys = new Set();
  document.querySelectorAll('.case').forEach(section => {
    let key  = section.dataset.key;
    const json = section.dataset.json;
    if (!key || !json) return;
    // Make the audio/transcript ids match the key so SyncPlayer can find them.
    const audioEl = section.querySelector('audio');
    const transEl = section.querySelector('.transcript');
    if (audioEl && transEl){
      // Ids are looked up with getElementById, so a repeated key would bind this
      // player to the first case that used it. Derive a unique key instead.
      if (usedKeys.has(key)){
        let n = 2;
        while (usedKeys.has(`${key}-${n}`)) n++;
        console.warn(`duplicate data-key "${key}"; using "${key}-${n}" instead`);
        key = `${key}-${n}`;
      }
      usedKeys.add(key);
      audioEl.id  = `audio-${key}`;
      transEl.id  = `trans-${key}`;
    }
    fetch(json)
      .then(r => {
        // Surface HTTP errors directly rather than as a JSON parse failure.
        if (!r.ok) throw new Error(`HTTP ${r.status} while loading ${json}`);
        return r.json();
      })
      .then(data => { new SyncPlayer(key, data); })
      .catch(err => console.error(`init ${key} failed:`, err));
  });
}
window.addEventListener('DOMContentLoaded', initCases);
</script>

</body>
</html>