Files
VibeVoice/index.html
T
2025-08-25 10:22:42 -07:00

390 lines
15 KiB
HTML

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width,initial-scale=1" />
<!-- Preload hints for the demo audio clips used by the <audio> players in the body.
     NOTE(review): assets/audio/4p_climate_45min.mp3 is also played on this page but has no hint here,
     and every player declares preload="metadata" while these hints request the whole file; confirm
     that fetching all of these clips up front is intended. -->
<link rel="preload" as="audio" href="assets/audio/2p_see_u_again.mp3">
<link rel="preload" as="audio" href="assets/audio/2p_argument.mp3">
<link rel="preload" as="audio" href="assets/audio/3p_gpt5.mp3">
<link rel="preload" as="audio" href="assets/audio/2p_goat.mp3">
<link rel="preload" as="audio" href="assets/audio/1p_EN2CH.mp3">
<link rel="preload" as="audio" href="assets/audio/1p_CH2EN.mp3">
<link rel="preload" as="audio" href="assets/audio/4p_climate_100min.mp3">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.0/css/all.min.css">
<title>VibeVoice</title>
<style>
/* Design tokens shared by every rule below. */
:root {
  --bg:#ffffff;
  --card:#ffffff;
  --muted:#555555;
  --text:#000000;
  --border:#dddddd;
  --accent:#0066cc;
}
* { box-sizing:border-box; }
html, body { margin:0; padding:0; }
body {
  color:var(--text);
  background: var(--bg);
  /* The generic sans-serif keyword is the last resort when none of the named families resolve. */
  font:16px/1.6 system-ui,-apple-system,Segoe UI,Roboto,Helvetica,Arial,sans-serif;
  /* No scroll-snap paging or fixed heights: the page is an ordinary flowing document. */
}
/* Container: stacked flow layout with no height limit. */
.wrap {
  max-width:1100px;
  width:min(92vw,1100px);
  margin:0 auto;
  padding:24px 0;
}
/* Small global spacing so the content reads as compact and continuous. */
section.page { padding: 16px 16px; }
h1 { margin:0 0 12px; font-size:clamp(28px,4vw,40px); text-align:center; }
h2 { margin:16px 0 10px; font-size:22px; }
h3 { margin:8px 0 6px; font-size:16px; font-weight:600; color:#222; }
.muted { color:var(--muted); }
.links a { text-decoration: none; color: var(--accent); font-weight: 500; }
.links a:hover { text-decoration: underline; }
.links .sep { margin: 0 6px; color: var(--muted); }
.case {
  margin: 32px 0 24px; /* 32px of space above, 24px below */
}
/* Audio and transcript are stacked vertically (no stretching needed). */
.sync-block { display:block; }
audio { width:100%; display:block; margin:6px 0 8px; }
/* Transcript box: cap its own height and scroll inside it so it never spills onto the next case. */
.transcript {
  max-height: 20vh; /* raise (e.g. 28vh / 30vh) to show more lines at once */
  overflow:auto;
  padding:8px;
  border:1px solid var(--border);
  border-radius:10px;
  background:#fff;
  scroll-behavior:smooth;
}
/* Transcript row (timestamp | speaker | text): long words/URLs wrap and never blow out the container. */
.line {
  display:grid;
  grid-template-columns:84px 75px 1fr;
  gap:10px;
  padding:8px 10px;
  border-radius:10px;
  border:1px solid transparent;
  cursor:pointer;
}
.line:hover { background:#f7f7f7; border-color:#eee; }
.line.active { background:rgba(0,102,204,0.08); border-color:#0066cc; }
.ts, .spk {
  font-family:ui-monospace,Menlo,Consolas,monospace;
  color:#333; font-size:14px; align-self:center; opacity:.9;
  white-space:nowrap;
}
.ts { text-align:right; }
.spk { text-align:left; }
.txt {
  white-space:pre-wrap;
  word-break:break-word;   /* legacy value, kept as a fallback for engines without overflow-wrap:anywhere */
  overflow-wrap:anywhere;  /* break inside long words so the text never widens the container */
}
/* Drop any fixed-height chain so content is never force-compressed. */
.card, .scroll, .sync-grid { height:auto; min-height:0; }
</style>
</head>
<body>
<div class="wrap">
<!-- Landing section: flows straight into the demos that follow -->
<section class="page" id="overview">
<header>
<h1 style="font-size: 2.3em;">VibeVoice: A Frontier Open-Source Text-to-Speech Model</h1>
<!-- <p class="links" style="text-align:center; margin:0 0 4px;">
<a href="https://aka.ms/GeneralAI" target="_blank">MSRA GeneralAI Group</a>
</p> -->
<!-- External project links. Each opens in a new tab; rel="noopener" keeps the opened page from
     getting a window.opener handle back to this one. Icons are decorative because the visible
     link text already names the destination. -->
<p class="links" style="text-align:center; margin:0 0 14px;">
  <a href="https://github.com/microsoft/VibeVoice/report/TechnicalReport.pdf" target="_blank" rel="noopener">📄 Report</a>
  <span class="sep">·</span>
  <a href="https://github.com/microsoft/VibeVoice" target="_blank" rel="noopener"><svg aria-hidden="true" width="16" height="16" fill="currentColor" viewBox="0 0 16 16" style="vertical-align: text-bottom;"><path d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.012 8.012 0 0 0 16 8c0-4.42-3.58-8-8-8z"/></svg> Code</a>
  <span class="sep">·</span>
  <a href="https://huggingface.co/collections/microsoft/vibevoice-68a2ef24a875c44be47b034f" target="_blank" rel="noopener">🤗 Hugging Face</a>
  <span class="sep">·</span>
  <a href="https://aka.ms/VibeVoice-Demo" target="_blank" rel="noopener">
    <img src="assets/image/microphone.svg" alt="" width="16" height="16" style="vertical-align:text-bottom;"> Demo
  </a>
</p>
<p class="muted" style="margin:0;">
VibeVoice is a novel framework designed for generating <b>expressive, long-form, multi-speaker conversational audio</b>, such as podcasts, from text. It addresses significant challenges in traditional Text-to-Speech (TTS) systems, particularly in scalability, speaker consistency, and natural turn-taking.
A core innovation of VibeVoice is its use of continuous speech tokenizers (Acoustic and Semantic) operating at an ultra-low frame rate of 7.5 Hz. These tokenizers efficiently preserve audio fidelity while significantly boosting computational efficiency for processing long sequences. VibeVoice employs a next-token diffusion framework, leveraging a Large Language Model (LLM) to understand textual context and dialogue flow, and a diffusion head to generate high-fidelity acoustic details.
The model can synthesize speech up to 90 minutes long with up to 4 distinct speakers, surpassing the typical 1-2 speaker limits of many prior models.
</p>
<div style="display:flex; justify-content:center; align-items:flex-start; gap:60px; margin:20px 0; width:100%;">
<div style="flex:1; text-align:center;">
<img src="assets/image/VibeVoice.jpg" alt="VibeVoice Framework" style="width:120%; height:400px; object-fit:contain;">
</div>
<div style="flex:1; text-align:center;">
<img src="assets/image/MOS-preference.png" alt="MOS Preference Results" style="width:75%; height:400px; object-fit:contain;">
</div>
</div>
</header>
</section>
<section class="page" id="demo4">
  <h2>Context-Aware Expression</h2>

  <!-- Each .case pairs one audio clip with a transcript panel; the page script fills the panel
       from the file named in data-json and looks the elements up through data-key. -->
  <div class="case" data-key="demo4-a" data-json="assets/text/2p_argument_gt_timestamp.json">
    <h3>Spontaneous Emotion</h3>
    <div class="sync-block">
      <audio id="audio-demo4-a" controls preload="metadata">
        <source src="assets/audio/2p_argument.mp3" type="audio/mpeg">
      </audio>
      <div class="transcript" id="trans-demo4-a"></div>
    </div>
  </div>

  <div class="case" data-key="demo4-b" data-json="assets/text/2p_see_u_again_gt_timestamp.json">
    <h3>Spontaneous Singing</h3>
    <div class="sync-block">
      <audio id="audio-demo4-b" controls preload="metadata">
        <source src="assets/audio/2p_see_u_again.mp3" type="audio/mpeg">
      </audio>
      <div class="transcript" id="trans-demo4-b"></div>
    </div>
    <!-- <p style="font-size:0.85em; color:#888; margin:6px 0 0;">* TTS input is the above text only; timestamps are derived from the generated audio and may contain errors.</p> -->
  </div>
</section>
<section class="page" id="demo1">
  <h2>Podcast with Background Music</h2>

  <!-- Each .case pairs one audio clip with a transcript panel; the page script fills the panel
       from the file named in data-json and looks the elements up through data-key. -->
  <div class="case" data-key="demo1-a" data-json="assets/text/3p_gpt5_gt_timestamp.json">
    <h3>Example 1</h3>
    <div class="sync-block">
      <audio id="audio-demo1-a" controls preload="metadata">
        <source src="assets/audio/3p_gpt5.mp3" type="audio/mpeg">
      </audio>
      <div class="transcript" id="trans-demo1-a"></div>
    </div>
  </div>

  <div class="case" data-key="demo1-b" data-json="assets/text/2p_goat_gt_timestamp.json">
    <h3>Example 2</h3>
    <div class="sync-block">
      <audio id="audio-demo1-b" controls preload="metadata">
        <source src="assets/audio/2p_goat.mp3" type="audio/mpeg">
      </audio>
      <div class="transcript" id="trans-demo1-b"></div>
    </div>
  </div>
</section>
<section class="page" id="demo3">
  <h2>Cross-Lingual</h2>

  <!-- Each .case pairs one audio clip with a transcript panel; the page script fills the panel
       from the file named in data-json and looks the elements up through data-key. -->
  <div class="case" data-key="demo3-a" data-json="assets/text/1p_CH2EN_gt_timestamp.json">
    <h3>Mandarin to English</h3>
    <div class="sync-block">
      <audio id="audio-demo3-a" controls preload="metadata">
        <source src="assets/audio/1p_CH2EN.mp3" type="audio/mpeg">
      </audio>
      <div class="transcript" id="trans-demo3-a"></div>
    </div>
  </div>

  <div class="case" data-key="demo3-b" data-json="assets/text/1p_EN2CH_gt_timestamp.json">
    <h3>English to Mandarin</h3>
    <div class="sync-block">
      <audio id="audio-demo3-b" controls preload="metadata">
        <source src="assets/audio/1p_EN2CH.mp3" type="audio/mpeg">
      </audio>
      <div class="transcript" id="trans-demo3-b"></div>
    </div>
  </div>
</section>
<section class="page" id="demo2">
  <h2>Long Conversational Speech</h2>

  <!-- Every case needs its own data-key: the page script derives the audio and transcript element
       ids from it and looks them up with getElementById, so a repeated key makes the second case
       render into (and control) the first case's elements. -->
  <div class="case" data-key="demo2-a" data-json="assets/text/4p_climate_45min_gt_timestamp.json">
    <!-- <h3>Case A</h3> -->
    <div class="sync-block">
      <audio id="audio-demo2-a" controls preload="metadata">
        <source src="assets/audio/4p_climate_45min.mp3" type="audio/mpeg">
      </audio>
      <div class="transcript" id="trans-demo2-a"></div>
    </div>
  </div>

  <div class="case" data-key="demo2-b" data-json="assets/text/4p_climate_100min_gt_timestamp.json">
    <!-- <h3>Case B</h3> -->
    <div class="sync-block">
      <audio id="audio-demo2-b" controls preload="metadata">
        <source src="assets/audio/4p_climate_100min.mp3" type="audio/mpeg">
      </audio>
      <div class="transcript" id="trans-demo2-b"></div>
    </div>
  </div>
</section>
<!-- <section class="page" id="evaluation">
<h2>Subjective Evaluation Results</h2>
<div style="text-align:center; margin:20px 0;">
<img src="assets/MOS-all.svg" alt="MOS Evaluation Results" style="max-width:90%; height:auto;">
</div>
</section> -->
<!-- <footer style="margin:32px 0 16px; font-size:0.85em; color:#888; text-align:center;">
* Timestamps are derived from generated audio and may contain errors.
</footer> -->
</div>
<script>
/* ---- Exclusive playback: starting one player pauses every other player on the page ---- */
const allAudios = Array.from(document.querySelectorAll('audio'));
for (const player of allAudios) {
  player.addEventListener('play', () => {
    for (const other of allAudios) {
      if (other !== player) other.pause();
    }
  });
}
/* ---- Transcript-synchronized player ---- */
/**
 * Format a time offset as "MM:SS.mmm".
 *
 * The offset is rounded to whole milliseconds first and every field is derived from that integer,
 * so binary floating-point error cannot drop a millisecond (1.005 renders as "00:01.005").
 * Minutes are not wrapped at 60, so offsets of an hour or more render as e.g. "62:05.500".
 *
 * @param {number} t Offset in seconds. Non-finite, non-numeric or negative values render as zero.
 * @returns {string} The formatted timestamp.
 */
function mmssms(t){
  const secs = Number(t);
  const totalMs = (Number.isFinite(secs) && secs > 0) ? Math.round(secs * 1000) : 0;
  const m = Math.floor(totalMs / 60000);
  const s = Math.floor((totalMs % 60000) / 1000);
  const ms = totalMs % 1000;
  return `${String(m).padStart(2,'0')}:${String(s).padStart(2,'0')}.${String(ms).padStart(3,'0')}`;
}
/**
 * Keeps one <audio> element and its transcript panel in step: renders one clickable row per
 * segment, highlights the row whose [start, end) interval contains the current playback time,
 * and seeks the audio when a row is clicked.
 */
class SyncPlayer{
  /**
   * @param {string} key  Id suffix; the elements #audio-<key> and #trans-<key> must exist.
   * @param {Array<{start:number, end?:number, speaker?:string, text?:string}>} segs
   *        Transcript segments, times in seconds. A missing `end` defaults to the next
   *        segment's `start` (or to the clip duration for the last segment).
   * @throws {Error} when either element is missing.
   * @throws {TypeError} when `segs` is not an array.
   */
  constructor(key, segs){
    this.audio = document.getElementById(`audio-${key}`);
    this.transEl = document.getElementById(`trans-${key}`);
    if (!this.audio || !this.transEl){
      throw new Error(`SyncPlayer: #audio-${key} or #trans-${key} not found`);
    }
    if (!Array.isArray(segs)){
      throw new TypeError(`SyncPlayer: segments for "${key}" must be an array`);
    }
    this.idx = -1;      // index of the highlighted segment, -1 when none
    this.lines = [];    // row elements, indexed like this.segments
    // Work on copies so filling in `end` below never mutates the caller's objects.
    this.segments = segs.map(s => Object.assign({}, s)).sort((a,b)=>a.start-b.start);
    for(let i=0;i<this.segments.length;i++){
      const cur=this.segments[i], nxt=this.segments[i+1];
      if (cur.end == null) cur.end = nxt ? nxt.start : Infinity;
    }
    // Set by seek() and the seek events; kept for compatibility, but scrolling is currently
    // decided in onTime() from the paused state and this flag is not consulted.
    this.shouldScroll = false;
    this.render(); this.bind();
  }
  /** Escape a value for insertion into HTML; falsy values become the empty string. */
  esc(s){
    return String(s||'').replace(/[&<>"']/g, ch => (
      {'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'}[ch]
    ));
  }
  /** Rebuild the transcript panel: one `.line` row (timestamp, speaker, text) per segment. */
  render(){
    this.transEl.innerHTML = '';
    this.lines = [];
    const frag = document.createDocumentFragment();
    this.segments.forEach((seg,i)=>{
      const d = document.createElement('div');
      d.className = 'line';
      d.dataset.idx = i;
      d.innerHTML = `
<div class="ts">${mmssms(seg.start)}</div>
<div class="spk">${this.esc(seg.speaker || '')}</div>
<div class="txt">${this.esc(seg.text)}</div>
`;
      d.onclick = () => this.seek(seg.start, true, true);
      this.lines.push(d);
      frag.appendChild(d);
    });
    // Single insertion instead of one reflow-triggering append per row.
    this.transEl.appendChild(frag);
  }
  /**
   * Move playback to `t` seconds.
   * @param {number} t Target time; negative or non-finite values are treated as 0, because
   *        assigning a non-finite currentTime throws.
   * @param {boolean} [autoplay=false] Start playback after seeking (rejections are ignored,
   *        e.g. when the browser blocks playback).
   * @param {boolean} [scroll=false] Stored in `shouldScroll`.
   */
  seek(t, autoplay=false, scroll=false){
    this.shouldScroll = !!scroll;
    this.audio.currentTime = Number.isFinite(t) ? Math.max(0, t) : 0;
    if (autoplay) this.audio.play().catch(()=>{});
  }
  /**
   * Binary-search the sorted segments for the one with start <= t < end.
   * @returns {number} Segment index, or -1 when `t` falls in no segment.
   */
  findIdx(t){
    let lo=0, hi=this.segments.length-1, ans=-1;
    while (lo<=hi){
      const mid=(lo+hi)>>1, seg=this.segments[mid];
      if (t < seg.start) hi=mid-1;
      else if (t >= seg.end) lo=mid+1;
      else { ans=mid; break; }
    }
    return ans;
  }
  /** Smooth-scroll the transcript panel so `line` sits at its top edge, clamped to the scroll range. */
  _scrollLineToTop(line){
    const c = this.transEl;
    const targetTop = line.offsetTop - c.offsetTop;
    const maxTop = c.scrollHeight - c.clientHeight;
    c.scrollTo({ top: Math.max(0, Math.min(targetTop, maxTop)), behavior: 'smooth' });
  }
  /**
   * Highlight segment `i` (-1 clears the highlight) and optionally scroll it into view.
   * The scroll is issued even when `i` is already the active segment.
   */
  setActive(i, scroll=true){
    const next = i >= 0 ? (this.lines[i] || null) : null;
    if (i !== this.idx){
      const prev = this.idx >= 0 ? (this.lines[this.idx] || null) : null;
      if (prev) prev.classList.remove('active');
      if (next) next.classList.add('active');
      this.idx = i;
    }
    if (scroll && next) this._scrollLineToTop(next);
  }
  /** Sync the highlight to the current playback time; auto-scroll only while playing. */
  onTime(){
    const i = this.findIdx(this.audio.currentTime);
    const doScroll = !this.audio.paused;
    this.setActive(i, doScroll);
  }
  /** Subscribe to the audio element's events. */
  bind(){
    const t = () => this.onTime();
    this.audio.addEventListener('timeupdate', t);
    this.audio.addEventListener('seeked', () => { this.shouldScroll = true; this.onTime(); });
    this.audio.addEventListener('seeking', () => { this.shouldScroll = true; });
    this.audio.addEventListener('play', t);
    // Replace open-ended (Infinity) segment ends with the real clip duration once it is known.
    const closeOpenEnds = () => {
      const dur = isFinite(this.audio.duration) ? this.audio.duration : Infinity;
      for (let i=0;i<this.segments.length;i++){
        if (this.segments[i].end === Infinity) this.segments[i].end = dur;
      }
    };
    this.audio.addEventListener('loadedmetadata', closeOpenEnds);
    // The transcript is fetched asynchronously, so the metadata may already be loaded and its
    // event already fired; readyState >= 1 (HAVE_METADATA) means the duration is available now.
    if (this.audio.readyState >= 1) closeOpenEnds();
  }
}
/* ---- Initialize every case ---- */
/**
 * Wire up every `.case` element on the page: read its `data-key` and `data-json`, stamp the
 * matching ids onto its <audio> and `.transcript` children, fetch the transcript JSON and hand
 * it to a new SyncPlayer. Cases missing either attribute are skipped. A failure in one case is
 * logged and does not affect the others.
 */
function initCases(){
  const usedKeys = new Set();
  document.querySelectorAll('.case').forEach(section => {
    const declaredKey = section.dataset.key;
    const json = section.dataset.json;
    if (!declaredKey || !json) return;
    // Element ids are derived from the key, so two cases sharing a key would both resolve to the
    // first case's elements. Give later duplicates a numeric suffix instead.
    let key = declaredKey;
    for (let n = 2; usedKeys.has(key); n++) key = `${declaredKey}-${n}`;
    if (key !== declaredKey){
      console.warn(`duplicate data-key "${declaredKey}"; using "${key}" instead`);
    }
    usedKeys.add(key);
    // Make the ids of this case's audio/transcript elements match the key.
    const audioEl = section.querySelector('audio');
    const transEl = section.querySelector('.transcript');
    if (audioEl && transEl){
      audioEl.id = `audio-${key}`;
      transEl.id = `trans-${key}`;
    }
    fetch(json)
      .then(r => {
        // fetch() resolves on HTTP errors too; fail here rather than with an opaque JSON parse error.
        if (!r.ok) throw new Error(`HTTP ${r.status} while loading ${json}`);
        return r.json();
      })
      .then(data => { new SyncPlayer(key, data); })
      .catch(err => console.error(`init ${key} failed:`, err));
  });
}
window.addEventListener('DOMContentLoaded', initCases);
</script>
</body>
</html>