feat(tts): Xiaomi MiMo per-beat voice + MOCK_IMAGE testing aid (#3)

Adds an optional Xiaomi MiMo TTS layer on top of the scene/beat engine, plus a MOCK_IMAGE flag for cheap local TTS iteration.

- Per-character voice provisioning via MiMo voice design → clone, reference audio persisted in session
- Per-line free-form delivery direction (Director writes "鼓起勇气又害羞,声音发颤" style instructions; sent to MiMo's director channel, never read aloud)
- Per-beat audio served with the scene response; frontend plays via hidden <audio> with typewriter synced to audio duration; mute toggle persisted via localStorage lazy initializer
- Graceful degradation: any TTS step failing → silent beat, game continues
- MOCK_IMAGE=true returns a sharp-generated placeholder PNG so local TTS iteration doesn't burn image tokens
- Recommended config in .env.example: MiMo Token Plan covers TEXT/VISION/TTS with one key (mimo-v2.5-pro for text, mimo-v2.5 omni for vision, mimo-v2.5-tts for TTS)

Squashed from #3:
- feat(tts): 小米 MiMo 逐 beat 配音 + 按 session 角色音色 + 自由文本配音指导
- feat(engine): MOCK_IMAGE 占位图便于本地测试
- fix(tts): address Copilot review on PR #3
- fix(tts): Copilot round-2 review feedback

Known limitation: Session.characters carries the full WAV reference audio (~200-300KB/character base64), which round-trips through every /api/scene, /api/vision, and /api/insert-beat request. This is intrinsic to MiMo's design→clone model (the voice identity IS the audio; there is no server-side voiceId). Fixing it requires server-side storage, which is out of scope for this change; the limitation is documented here for future hardening.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
This commit is contained in:
Zonghao Yuan
2026-05-28 20:45:21 +08:00
committed by GitHub
parent d1f13d51a3
commit fcd4e6c1ab
18 changed files with 787 additions and 62 deletions
+114 -5
View File
@@ -13,30 +13,66 @@ export type Phase =
const SHADOW =
"0 1px 0 rgba(45,24,16,0.05), 0 36px 64px -28px rgba(45,24,16,0.25), 0 8px 18px -6px rgba(45,24,16,0.10)";
// Per-character reveal interval (ms) the typewriter falls back to when it is
// not spacing characters across a known, non-zero audio duration.
const DEFAULT_CHAR_MS = 28;
// Lower bound (ms) on the per-character interval when text is spread across
// an audio duration: a short clip cannot make the text reveal faster than this.
const MIN_CHAR_MS = 30;
// Voice playback speed multiplier. >1 speeds up the (somewhat slow) MiMo voice
// while preserving pitch. Typewriter pacing is divided by the same factor.
const SPEECH_RATE = 1.2;
// If audio metadata never arrives within this window, give up waiting and
// let the typewriter run at default speed.
const AUDIO_WAIT_TIMEOUT_MS = 2500;
// ── Typewriter hook ────────────────────────────────────────────────────
// Returns the progressively-revealed text, a `done` flag, and a `skip()` that
// instantly completes the current text. Reset is keyed by `resetKey` (the beat
// id) rather than the text, so a new beat whose line happens to match the
// previous one still replays from scratch. `done` is derived synchronously
// (not from a post-paint effect) so a stale "done" frame never paints.
// previous one still replays from scratch.
//
// When `targetDurationMs` is provided we space characters to span that audio
// duration, keeping text and voice in lockstep. While `waitForAudio` is true
// and we don't yet know a duration, the typewriter holds (so text doesn't
// race ahead of an audio that's still loading).
function useTypewriter(
text: string,
resetKey: string,
speed = 28,
opts: { targetDurationMs?: number; waitForAudio: boolean } = {
waitForAudio: false,
},
): { shown: string; done: boolean; skip: () => void } {
const { targetDurationMs, waitForAudio } = opts;
const [displayed, setDisplayed] = useState("");
const [prevKey, setPrevKey] = useState(resetKey);
const timer = useRef<ReturnType<typeof setInterval> | null>(null);
// Sticky once the player has skipped this beat: prevents a late-arriving
// audio metadata event from re-triggering the effect and replaying the text.
const skippedRef = useRef(false);
// Render-phase reset (React "adjust state on prop change" pattern): when the
// beat changes, drop the old progress before this render commits.
if (resetKey !== prevKey) {
setPrevKey(resetKey);
setDisplayed("");
skippedRef.current = false;
}
useEffect(() => {
if (!text) return;
// `=== undefined` (not `!targetDurationMs`): 0 means "audio failed or
// timed out — run at default speed". The original truthy check stalled
// the typewriter forever on those fallback paths.
if (waitForAudio && targetDurationMs === undefined) return;
// If the player skipped, settle on the full text and don't restart even
// when audio metadata arrives late and re-triggers this effect.
if (skippedRef.current) {
setDisplayed(text);
return;
}
const speed =
targetDurationMs && text.length > 0
? Math.max(MIN_CHAR_MS, targetDurationMs / text.length)
: DEFAULT_CHAR_MS;
let i = 0;
timer.current = setInterval(() => {
i += 1;
@@ -50,13 +86,14 @@ function useTypewriter(
if (timer.current) clearInterval(timer.current);
timer.current = null;
};
}, [resetKey, text, speed]);
}, [resetKey, text, targetDurationMs, waitForAudio]);
const skip = useCallback(() => {
if (timer.current) {
clearInterval(timer.current);
timer.current = null;
}
skippedRef.current = true;
setDisplayed(text);
}, [text]);
@@ -123,6 +160,9 @@ function ChoiceButton({
// ── Main component ─────────────────────────────────────────────────────
export function PlayCanvas({
imageBase64,
audioBase64,
audioMime,
muted,
phase,
beat,
pendingClick,
@@ -132,6 +172,9 @@ export function PlayCanvas({
fullViewport = false,
}: {
imageBase64: string | null;
audioBase64: string | null;
audioMime: string | null;
muted: boolean;
phase: Phase;
beat: Beat | null;
pendingClick: { x: number; y: number } | null;
@@ -141,7 +184,11 @@ export function PlayCanvas({
fullViewport?: boolean;
}) {
const imgRef = useRef<HTMLImageElement>(null);
const audioRef = useRef<HTMLAudioElement>(null);
const [dims, setDims] = useState<{ w: number; h: number } | null>(null);
const [audioDurationMs, setAudioDurationMs] = useState<number | undefined>(
undefined,
);
const isChoiceBeat = beat?.next.type === "choice";
const choices: BeatChoice[] = isChoiceBeat
@@ -150,7 +197,56 @@ export function PlayCanvas({
const displayBody = beat?.speaker ? beat.line ?? "" : beat?.narration ?? "";
const { shown: typedBody, done: typingDone, skip: skipTypewriter } =
useTypewriter(displayBody, beat?.id ?? "", 30);
useTypewriter(displayBody, beat?.id ?? "", {
targetDurationMs: audioDurationMs,
waitForAudio: Boolean(audioBase64),
});
// ── Audio source change ──────────────────────────────────────────────
// A new clip starts with an unknown duration (undefined), which holds the
// typewriter while it is waiting for audio. If no duration has been recorded
// by the time AUDIO_WAIT_TIMEOUT_MS elapses, store 0 so the text is released
// instead of stalling. A duration that arrived in the meantime is kept.
useEffect(() => {
  setAudioDurationMs(undefined);
  if (!audioBase64) return;
  const fallbackTimer = setTimeout(() => {
    setAudioDurationMs((current) => (current === undefined ? 0 : current));
  }, AUDIO_WAIT_TIMEOUT_MS);
  // Cancel the pending fallback when the clip changes or the component unmounts.
  return () => clearTimeout(fallbackTimer);
}, [audioBase64]);
// ── Mute toggle ───────────────────────────────────────────────────────
// Mirrors the `muted` prop onto the <audio> element and re-applies the
// playback rate. When sound is enabled, a clip is present, and the element
// is currently paused, playback is (re)started.
useEffect(() => {
  const audio = audioRef.current;
  if (!audio) return;
  audio.muted = muted;
  audio.playbackRate = SPEECH_RATE;
  const shouldStart = !muted && Boolean(audioBase64) && audio.paused;
  if (!shouldStart) return;
  audio.play().catch(() => {
    // autoplay blocked — silent until next interaction
  });
}, [muted, audioBase64]);
// Metadata handler for the beat's <audio> element: applies the playback
// rate, records the effective clip duration for typewriter pacing, and
// starts playback unless muted.
function handleAudioMetadata() {
  const audio = audioRef.current;
  if (!audio) return;
  audio.playbackRate = SPEECH_RATE;
  // The clip plays SPEECH_RATE times faster than recorded, so the time the
  // typewriter has to fill is the raw duration divided by that factor.
  // A non-finite or non-positive duration is reported as 0.
  const rawSeconds = audio.duration;
  let effectiveMs = 0;
  if (Number.isFinite(rawSeconds)) {
    effectiveMs = Math.max(0, (rawSeconds * 1000) / SPEECH_RATE);
  }
  setAudioDurationMs(effectiveMs);
  if (muted) return;
  audio.play().catch(() => {
    // autoplay blocked
  });
}
// Error handler for the beat's <audio> element. Records a duration of 0
// (rather than leaving it undefined) so the typewriter stops waiting for
// audio and reveals the text at its default speed.
function handleAudioError() {
// Treat as zero duration so the typewriter runs at default speed.
setAudioDurationMs(0);
}
function handleImageClick(e: React.MouseEvent<HTMLImageElement>) {
if (phase !== "ready" || !imgRef.current || !beat) return;
@@ -197,6 +293,19 @@ export function PlayCanvas({
<div
className={`flex flex-col items-center ${fullViewport ? "w-full h-full justify-center" : "w-full"}`}
>
{/* Hidden audio element — voice playback for the current beat */}
{audioBase64 && (
<audio
key={audioBase64.slice(-48)}
ref={audioRef}
src={`data:${audioMime ?? "audio/wav"};base64,${audioBase64}`}
preload="auto"
onLoadedMetadata={handleAudioMetadata}
onError={handleAudioError}
className="hidden"
/>
)}
{imageBase64 ? (
<div
className="relative inline-block"