feat(web): optional bring-your-own Xiaomi MiMo TTS key (browser-side synthesis)

Public users share one server TTS key, so Xiaomi's per-key RPM/TPM limits
cause silent playback under concurrency. This adds an OPTIONAL path: a user
can store their own Xiaomi MiMo key in the browser and synthesize voice
client-side against Xiaomi's CORS-open endpoints. The key lives only in
localStorage and is never sent to or logged by our server; the shared server
key still serves everyone who does not opt in.

- components/TtsKeyModal.tsx: shared key modal (key-family + region picker),
  reused by both the home and play pages
- app/play/page.tsx: silence nudge moved beside the mute toggle; modal opens
  in place instead of redirecting to the home page
- app/page.tsx: home page consumes the shared modal + readStoredTtsConfig
- lib/clientTtsConfig.ts, lib/ttsPresets.ts: browser config + region presets
- app/api/{start,scene,insert-beat}: thread the per-request clientTts flag (drops server-side TTS when set); lib/types update
- docs/xiaomi-tts-key.md + README note

Verified with tsc --noEmit (exit 0).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
yuanzonghao
2026-06-04 11:24:16 +08:00
parent 24b674d792
commit b0b2e922d3
13 changed files with 843 additions and 48 deletions
+3 -1
View File
@@ -22,7 +22,9 @@ export async function POST(req: Request) {
}
try {
const config = loadEngineConfig(req.headers);
const base = loadEngineConfig(req.headers);
// See StartRequest.clientTts — BYO clients synth in-browser, so drop server TTS.
const config = body.clientTts ? { ...base, tts: undefined } : base;
const result = await requestInsertBeat(config, body);
return NextResponse.json(result);
} catch (err) {
+3 -1
View File
@@ -23,7 +23,9 @@ export async function POST(req: Request) {
}
try {
const config = loadEngineConfig(req.headers);
const base = loadEngineConfig(req.headers);
// See StartRequest.clientTts — BYO clients synth in-browser, so drop server TTS.
const config = body.clientTts ? { ...base, tts: undefined } : base;
const result = await requestScene(config, body);
return NextResponse.json(result);
} catch (err) {
+5 -1
View File
@@ -41,7 +41,11 @@ export async function POST(req: Request) {
}
try {
const config = loadEngineConfig(req.headers);
const base = loadEngineConfig(req.headers);
// BYO key: the browser provisions + synths voices directly against Xiaomi
// (key never reaches us), so strip server-side TTS so the engine skips all
// provisioning + synth. See StartRequest.clientTts.
const config = body.clientTts ? { ...base, tts: undefined } : base;
const result = await startSession(config, body);
return NextResponse.json(result);
} catch (err) {
+52 -8
View File
@@ -10,14 +10,8 @@ import {
PLOT_STYLES,
type Gender,
} from "@/lib/options";
/* ============================================================================
InfiPlot · 首页(编辑式视觉风格 · 居中构图,呼应低保真原型)
- 顶部 Header:左上角衬线 wordmark logo
"use client";
import { useRouter } from "next/navigation";
import { useEffect, useRef, useState, type ReactNode } from "react";
import { readStoredTtsConfig } from "@/lib/clientTtsConfig";
import { TtsKeyModal } from "@/components/TtsKeyModal";
/* ============================================================================
InfiPlot · 首页(编辑式视觉风格 · 居中构图,呼应低保真原型)
@@ -1771,7 +1765,12 @@ export default function HomePage() {
// 顶部使用提示:默认展示,用户可点 × 永久关闭(localStorage:infiplot:hintClosed)。
const [hintClosed, setHintClosed] = useState(false);
// 自带 TTS Key 弹窗:可选增强,Key 只存浏览器、绝不经过服务器。
const [ttsOpen, setTtsOpen] = useState(false);
const [ttsConfigured, setTtsConfigured] = useState(false);
const styleRow = OPTS.findIndex((o) => o.modal);
const voiceRow = OPTS.findIndex((o) => o.label === "语音配音");
const genderIndex = sel[0] ?? 0;
const gender = (OPTS[0]!.items[genderIndex] as Gender) ?? "男性向";
const phrases = EXAMPLE_PHRASES[gender];
@@ -1826,6 +1825,11 @@ export default function HomePage() {
}
};
// 启动时回填「已启用」徽标——读 localStorage 判断用户是否已存过 Key。
useEffect(() => {
setTtsConfigured(readStoredTtsConfig() != null);
}, []);
// 输入框随内容自动增高:长文本整段可见(打字与点卡片填入都覆盖)。
useEffect(() => {
const el = inputRef.current;
@@ -2067,6 +2071,30 @@ export default function HomePage() {
))}
</div>
{/* 自带 TTS Key 入口:公共语音模型有 RPM/TPM 限额,高并发易静音;
填自己的小米 MiMo Key(免费)→ 稳定配音、延迟更低,且 Key 只存本地。 */}
<div className="mt-5 flex justify-center">
<button
type="button"
onClick={() => setTtsOpen(true)}
className={
"inline-flex items-center gap-2 rounded-full border px-4 py-1.5 font-sans text-xs md:text-[13px] transition-colors " +
(ttsConfigured
? "border-ember-500/40 bg-ember-500/5 text-ember-500 hover:bg-ember-500/10"
: "border-clay-900/15 text-clay-500 hover:border-clay-900/30 hover:text-clay-700")
}
>
<i
className={
ttsConfigured
? "fa-solid fa-circle-check text-[11px]"
: "fa-solid fa-microphone-lines text-[11px]"
}
/>
{ttsConfigured ? "自带配音 Key · 已启用" : "经常没声音?自带配音 Key(可选)"}
</button>
</div>
{/* Usage hint: the user can dismiss it permanently (localStorage: infiplot:hintClosed) */}
{!hintClosed && (
<div className="relative mx-auto mt-10 md:mt-12 max-w-[640px] rounded-sm border border-clay-900/10 bg-cream-100/50 px-8 py-3.5">
@@ -2235,6 +2263,22 @@ export default function HomePage() {
{byoApiOpen && (
<ByoApiModal value={byoApi} onSave={saveByoApi} onClose={() => setByoApiOpen(false)} />
)}
{ttsOpen && (
<TtsKeyModal
onClose={() => setTtsOpen(false)}
onSaved={(configured) => {
setTtsConfigured(configured);
// 启用自带 Key 时顺手把「语音配音」拨到「开启」——否则用户配了 Key
// 却还是静音,体验自相矛盾。停用时不动其选择,尊重用户原本的偏好。
if (configured && voiceRow >= 0) {
const onIdx = OPTS[voiceRow]!.items.indexOf("开启");
if (onIdx >= 0)
setSel((s) => s.map((v, j) => (j === voiceRow ? onIdx : v)));
}
}}
/>
)}
</div>
);
}
+234 -37
View File
@@ -11,19 +11,25 @@ import {
useState,
} from "react";
import { PlayCanvas, type Phase } from "@/components/PlayCanvas";
import { TtsKeyModal } from "@/components/TtsKeyModal";
import { annotateClick } from "@/lib/annotateClient";
import { loadClientTtsConfig } from "@/lib/clientTtsConfig";
import { PRESETS } from "@/lib/presets";
import { provisionVoice, synthesize } from "@infiplot/tts-client";
import type {
Beat,
BeatAudio,
BeatAudioResponse,
BeatChoice,
Character,
CharacterVoice,
InsertBeatResponse,
Scene,
SceneExit,
SceneResponse,
Session,
StartResponse,
TtsConfig,
VisionResponse,
} from "@infiplot/types";
import { track } from "@/lib/analytics";
@@ -47,6 +53,11 @@ function getByoHeaders(): Record<string, string> {
return {};
}
// Consecutive silent (no-audio) beats before we surface the BYO-key nudge to a
// non-BYO, unmuted player. Set high enough that one transient miss won't trip
// it, low enough to catch a scene that's clearly being rate-limited.
const SILENCE_NUDGE_THRESHOLD = 3;
// Cap how long we wait for the browser to download + decode a scene image
// before giving up and rendering anyway. Runware's CDN is usually <2s for a
// 1792×1024 PNG, but over slow links / VPN / strict corp networks the same
@@ -274,6 +285,7 @@ function prefetchScenePath(
baseSession: Session,
steps: ScenePathStep[],
depth: number,
clientTts: boolean,
): void {
if (depth >= PREFETCH_MAX_DEPTH) return;
const key = pathKey(steps);
@@ -288,7 +300,7 @@ function prefetchScenePath(
"Content-Type": "application/json",
...getByoHeaders(),
},
body: JSON.stringify({ session: specSession }),
body: JSON.stringify({ session: specSession, clientTts }),
signal: abort.signal,
});
if (!res.ok) {
@@ -327,7 +339,13 @@ function prefetchScenePath(
characters: data.characters,
storyState: data.storyState,
};
prefetchScenePath(pool, carriedBase, [...steps, nextStep], depth + 1);
prefetchScenePath(
pool,
carriedBase,
[...steps, nextStep],
depth + 1,
clientTts,
);
}
}
@@ -362,6 +380,44 @@ function clearPool(pool: Map<string, PrefetchEntry>): void {
pool.clear();
}
// ──────────────────────────────────────────────────────────────────────
// BYO voice resolution (client-direct Xiaomi TTS).
//
// In BYO mode the server skips all TTS (clientTts:true), so the browser must
// obtain each speaker's reference audio itself. `cache` is keyed by character
// NAME and persists for the whole session, so a voice locked in on a
// character's first speaking beat stays identical across every later scene —
// even though /api/scene returns its characters without `.voice`. Storing the
// in-flight Promise (not the resolved value) dedupes the burst of concurrent
// beats by the same speaker into ONE voicedesign call, which matters because
// Xiaomi rate-limits voicedesign hard.
// ──────────────────────────────────────────────────────────────────────
/**
 * Resolves the reference voice to synthesize `speaker` with, going through
 * `cache` (keyed by character name) so repeat calls for the same name reuse
 * the first promise instead of starting a new lookup.
 *
 * Lookup order: cached promise, then the character's own `voice`, then a
 * fresh `provisionVoice` call driven by `voiceDescription`.
 *
 * @param cache - Per-name map of voice promises; read and mutated by this call.
 * @param cfg - TTS config forwarded to `provisionVoice`.
 * @param speaker - Character whose voice is needed.
 * @returns The voice, or `null` when nothing is cached and the speaker has
 *   neither `voice` nor `voiceDescription`.
 * @throws Rethrows a `provisionVoice` rejection after evicting the cache
 *   entry, so a later call for the same name can try again.
 */
async function resolveByoVoice(
  cache: Map<string, Promise<CharacterVoice>>,
  cfg: TtsConfig,
  speaker: Character,
): Promise<CharacterVoice | null> {
  const { name, voice, voiceDescription } = speaker;

  // Fast path: a promise (settled or still in flight) is already registered.
  const existing = cache.get(name);
  if (existing !== undefined) return existing;

  // The character already carries a voice — register it as an
  // already-resolved promise so later calls hit the fast path above.
  if (voice) {
    const baked = Promise.resolve(voice);
    cache.set(name, baked);
    return baked;
  }

  // Nothing to provision from.
  if (!voiceDescription) return null;

  // Register the in-flight promise BEFORE awaiting it, so concurrent calls
  // for the same name share this single provisioning request.
  const pending = provisionVoice(cfg, voiceDescription);
  cache.set(name, pending);
  try {
    return await pending;
  } catch (err) {
    // Drop the failed entry so the next call starts a new attempt.
    cache.delete(name);
    throw err;
  }
}
// ──────────────────────────────────────────────────────────────────────
// Component
// ──────────────────────────────────────────────────────────────────────
@@ -402,6 +458,16 @@ function PlayInner() {
const [error, setError] = useState<string | null>(null);
const [presentation, setPresentation] = useState(false);
const [lastExitLabel, setLastExitLabel] = useState<string | null>(null);
// Consecutive server-side TTS misses (null audio / failed /api/beat-audio).
// Climbs when the shared server key is rate-limited by MiMo — the exact pain
// BYO fixes — so the play page can nudge non-BYO users to add their own key.
// Reset to 0 on any successful server synth, and again when a BYO key is
// saved from the in-place modal. The BYO synth path never increments it.
const [silenceStrikes, setSilenceStrikes] = useState(0);
// Once the player dismisses the silence nudge, keep it gone for this session.
const [nudgeDismissed, setNudgeDismissed] = useState(false);
// The in-place BYO-key modal, opened from the silence nudge so the player can
// add a key without leaving the play page.
const [ttsModalOpen, setTtsModalOpen] = useState(false);
const startedRef = useRef(false);
const poolRef = useRef<Map<string, PrefetchEntry>>(new Map());
@@ -416,6 +482,21 @@ function PlayInner() {
// 不再单独维护 audioEnabledRef —— 单一来源避免两个 flag 漂移。
const mutedRef = useRef<boolean>(muted);
// Resolved bring-your-own Xiaomi TTS config (region preset + key), read once
// from localStorage. When non-null, the browser provisions + synths voices
// directly against Xiaomi — the key never touches our server — and every
// start/scene/insert-beat request carries clientTts:true so the engine skips
// server-side TTS. null = user hasn't opted in (server default / silent).
const [byoTtsConfig, setByoTtsConfig] = useState<TtsConfig | null>(() =>
loadClientTtsConfig(),
);
const byoTtsRef = useRef<TtsConfig | null>(byoTtsConfig);
// BYO voice cache (see resolveByoVoice). Keyed by character name; persists
// across scenes so each speaker is provisioned at most once per session.
const provisionedVoicesRef = useRef<Map<string, Promise<CharacterVoice>>>(
new Map(),
);
// Mirrors for use inside async handlers (closure-stable)
const sessionRef = useRef<Session | null>(null);
const currentSceneRef = useRef<Scene | null>(null);
@@ -496,34 +577,72 @@ function PlayInner() {
// 「首页选关闭」也走这条路:bootstrap 时 muted 已被初始化为 true。
if (!beat.speaker || !beat.line) return;
const speaker = sess.characters.find((c) => c.name === beat.speaker);
if (!speaker?.voice) return; // not yet provisioned — server can't synth anyway
if (!speaker) return;
const byo = byoTtsRef.current;
// Non-BYO relies on the server having provisioned speaker.voice. BYO
// skipped server TTS, so it needs a baked voice (prebaked card) or a
// voiceDescription to provision from in the browser.
if (!byo && !speaker.voice) return;
if (byo && !speaker.voice && !speaker.voiceDescription) return;
if (beatAudioAbortRef.current.has(beat.id)) return;
const abort = new AbortController();
beatAudioAbortRef.current.set(beat.id, abort);
try {
const res = await fetch("/api/beat-audio", {
method: "POST",
headers: {
"Content-Type": "application/json",
...getByoHeaders(),
},
body: JSON.stringify({
beat: { id: beat.id, line: beat.line, lineDelivery: beat.lineDelivery },
voice: speaker.voice,
}),
signal: abort.signal,
});
if (!res.ok) return;
const json = (await res.json()) as BeatAudioResponse;
// Skip the state write if we've been aborted between the .ok check and
let audio: BeatAudio | null = null;
if (byo) {
// Client-direct: provision (once per speaker, cached) + synth against
// Xiaomi with the user's own key — no /api/beat-audio round-trip and
// the key never touches our server.
const voice = await resolveByoVoice(
provisionedVoicesRef.current,
byo,
speaker,
);
if (!voice || abort.signal.aborted) return;
const out = await synthesize(
byo,
voice,
beat.line,
beat.lineDelivery,
abort.signal,
);
audio = { base64: out.audioBase64, mime: out.mimeType };
} else {
const res = await fetch("/api/beat-audio", {
method: "POST",
headers: {
"Content-Type": "application/json",
...getByoHeaders(),
},
body: JSON.stringify({
beat: { id: beat.id, line: beat.line, lineDelivery: beat.lineDelivery },
voice: speaker.voice,
}),
signal: abort.signal,
});
if (!res.ok) {
setSilenceStrikes((n) => Math.min(n + 1, 99));
return;
}
const json = (await res.json()) as BeatAudioResponse;
audio = json.audio;
// Null audio usually means MiMo rate-limited or timed out the shared
// key — track the streak; a real clip resets it.
if (audio) setSilenceStrikes(0);
else setSilenceStrikes((n) => Math.min(n + 1, 99));
}
// Skip the state write if we've been aborted between the await and
// here — beat ids are scene-local, so a late arrival from a prior
// scene would otherwise overwrite the current scene's audio under the
// same id.
if (json.audio && !abort.signal.aborted) {
setBeatAudioMap((m) => ({ ...m, [beat.id]: json.audio as BeatAudio }));
if (audio && !abort.signal.aborted) {
const settled = audio;
setBeatAudioMap((m) => ({ ...m, [beat.id]: settled }));
}
} catch {
// aborted or network error — silent fallback
// aborted / network / Xiaomi rate-limit — silent fallback (no audio)
} finally {
// Only clear the slot if it's still ours. An aborted prior fetch
// running its finally late could otherwise delete the controller of a
@@ -598,6 +717,27 @@ function PlayInner() {
prefetchSceneAudio();
}, [muted, prefetchSceneAudio]);
// ── BYO key enabled/disabled from the play page (silence nudge → modal) ─
// On enable: point the synth path at the user's key and immediately
// re-synthesize the current scene in-browser, so the voices the player just
// missed come back without a reload (their characters already carry
// server-provisioned `voice`, which resolveByoVoice reuses with the new key).
// On disable: just stop using it; later scenes fall back to the server.
// Callback handed to the key modal's `onSaved`. `configured` says whether a
// key is stored after the save; the ref and the state are updated together so
// async handlers (which read the ref) and rendering (which reads the state)
// agree on the active config.
const handleByoSaved = useCallback(
  (configured: boolean) => {
    const nextConfig = configured ? loadClientTtsConfig() : null;
    byoTtsRef.current = nextConfig;
    setByoTtsConfig(nextConfig);

    // Disabled (or nothing loadable): only the config switch above applies.
    if (!nextConfig) return;

    // Enabled: clear the silence streak, abandon in-flight audio fetches,
    // drop the audio collected so far, then request the scene's audio again.
    setSilenceStrikes(0);
    cancelBeatAudioFetches();
    setBeatAudioMap({});
    prefetchSceneAudio();
  },
  [prefetchSceneAudio],
);
// ── Presentation mode toggle ─────────────────────────────────────────
const togglePresentation = useCallback(async () => {
const entering = !presentation;
@@ -720,7 +860,10 @@ function PlayInner() {
"Content-Type": "application/json",
...getByoHeaders(),
},
body: JSON.stringify(livePayload),
body: JSON.stringify({
...livePayload,
clientTts: !!byoTtsRef.current,
}),
}).then(async (r) => {
if (!r.ok) {
const j = (await r.json().catch(() => ({}))) as { error?: string };
@@ -793,7 +936,7 @@ function PlayInner() {
nextSceneSeed: choice.effect.nextSceneSeed,
},
};
prefetchScenePath(poolRef.current, s, [step], 0);
prefetchScenePath(poolRef.current, s, [step], 0, !!byoTtsRef.current);
}
}, [currentScene?.id, session?.id]);
@@ -948,7 +1091,10 @@ function PlayInner() {
"Content-Type": "application/json",
...getByoHeaders(),
},
body: JSON.stringify({ session: specSession }),
body: JSON.stringify({
session: specSession,
clientTts: !!byoTtsRef.current,
}),
});
if (!res.ok) {
const j = (await res.json().catch(() => ({}))) as { error?: string };
@@ -995,6 +1141,7 @@ function PlayInner() {
body: JSON.stringify({
session,
freeformAction: decision.intent.freeformAction,
clientTts: !!byoTtsRef.current,
}),
});
if (!insertRes.ok) {
@@ -1075,7 +1222,10 @@ function PlayInner() {
"Content-Type": "application/json",
...getByoHeaders(),
},
body: JSON.stringify({ session: specSession }),
body: JSON.stringify({
session: specSession,
clientTts: !!byoTtsRef.current,
}),
});
if (!res.ok) {
const j = (await res.json().catch(() => ({}))) as {
@@ -1163,6 +1313,16 @@ function PlayInner() {
const sceneCount = session?.history.length ?? 0;
const beatCount = visitedBeatsRef.current.length;
// Surface the BYO-key nudge only to an unmuted, non-BYO player whose last few
// beats came back silent (shared key rate-limited) — the exact pain BYO fixes.
// Dismissible for the session.
const showSilenceNudge =
phase === "ready" &&
!muted &&
!byoTtsConfig &&
!nudgeDismissed &&
silenceStrikes >= SILENCE_NUDGE_THRESHOLD;
return (
<div className="min-h-screen flex flex-col">
<header className="px-5 md:px-12 pt-6 md:pt-8 flex items-center justify-between">
@@ -1207,18 +1367,46 @@ function PlayInner() {
</button>
}
aboveCanvasLeft={
<button
type="button"
onClick={toggleMuted}
className="text-[10px] smallcaps text-clay-500 hover:text-ember-500 transition-colors flex items-center gap-2"
aria-label={muted ? "取消静音" : "静音"}
title={muted ? "取消静音" : "静音"}
>
<i
className={`fa-solid ${muted ? "fa-volume-xmark" : "fa-volume-high"} text-[10px]`}
/>
{muted ? "静 · 音" : "有 · 声"}
</button>
<>
<button
type="button"
onClick={toggleMuted}
className="text-[10px] smallcaps text-clay-500 hover:text-ember-500 transition-colors flex items-center gap-2"
aria-label={muted ? "取消静音" : "静音"}
title={muted ? "取消静音" : "静音"}
>
<i
className={`fa-solid ${muted ? "fa-volume-xmark" : "fa-volume-high"} text-[10px]`}
/>
{muted ? "静 · 音" : "有 · 声"}
</button>
{/* Silence nudge — a compact pill right beside the mute toggle.
Clicking opens the BYO-key modal in place (no trip to the
homepage). The × dismisses it for the session. */}
{showSilenceNudge && (
<span className="flex items-center gap-1 animate-fade-in">
<button
type="button"
onClick={() => setTtsModalOpen(true)}
className="inline-flex items-center gap-1.5 rounded-full border border-ember-500/40 bg-ember-500/10 px-2.5 py-1 text-[10px] text-ember-500 hover:bg-ember-500/20 transition-colors"
title="经常没声音?填入你自己的小米 MiMo Key(免费),配音更稳定"
>
<i className="fa-solid fa-volume-xmark text-[9px]" />
Key
</button>
<button
type="button"
onClick={() => setNudgeDismissed(true)}
aria-label="关闭提示"
title="关闭"
className="text-clay-400 hover:text-clay-700 transition-colors"
>
<i className="fa-solid fa-xmark text-[10px]" />
</button>
</span>
)}
</>
}
/>
@@ -1235,7 +1423,16 @@ function PlayInner() {
</p>
)}
</div>
</main>
{ttsModalOpen && (
<TtsKeyModal
onClose={() => setTtsModalOpen(false)}
onSaved={handleByoSaved}
footerNote="保存后会立即用这把 Key 在你的浏览器里合成当前这一幕的配音;本设备后续游玩也会自动使用此 Key。"
/>
)}
</div>
);
}