Files
infiplot-web/apps/web/app/play/page.tsx
T
Zonghao Yuan fcd4e6c1ab feat(tts): Xiaomi MiMo per-beat voice + MOCK_IMAGE testing aid (#3)
Adds optional Xiaomi MiMo TTS layer on top of the scene/beat engine and a MOCK_IMAGE flag for cheap local TTS iteration.

- Per-character voice provisioning via MiMo voice design → clone, reference audio persisted in session
- Per-line free-form delivery direction (Director writes "鼓起勇气又害羞,声音发颤" style instructions; sent to MiMo's director channel, never read aloud)
- Per-beat audio served with the scene response; frontend plays via hidden <audio> with typewriter synced to audio duration; mute toggle persisted via localStorage lazy initializer
- Graceful degradation: any TTS step failing → silent beat, game continues
- MOCK_IMAGE=true returns a sharp-generated placeholder PNG so local TTS iteration doesn't burn image tokens
- Recommended config in .env.example: MiMo Token Plan covers TEXT/VISION/TTS with one key (mimo-v2.5-pro for text, mimo-v2.5 omni for vision, mimo-v2.5-tts for TTS)

Squashed from #3:
- feat(tts): 小米 MiMo 逐 beat 配音 + 按 session 角色音色 + 自由文本配音指导
- feat(engine): MOCK_IMAGE 占位图便于本地测试
- fix(tts): address Copilot review on PR #3
- fix(tts): Copilot round-2 review feedback

Known limitation: Session.characters carries the full WAV reference audio (~200-300KB/character base64) and round-trips through every /api/scene, /api/vision, /api/insert-beat request. This is intrinsic to MiMo's design→clone model (voice identity IS the audio, no server-side voiceId). Fixing requires server-side storage which is out of scope; documented for future hardening.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
2026-05-28 20:45:21 +08:00

791 lines
26 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"use client";
import Link from "next/link";
import { useRouter, useSearchParams } from "next/navigation";
import {
Suspense,
useCallback,
useEffect,
useMemo,
useRef,
useState,
} from "react";
import { PlayCanvas, type Phase } from "@/components/PlayCanvas";
import { PRESETS } from "@/lib/presets";
import type {
Beat,
BeatAudio,
BeatChoice,
InsertBeatResponse,
Scene,
SceneExit,
SceneResponse,
Session,
StartResponse,
VisionResponse,
} from "@yume/types";
const MUTED_STORAGE_KEY = "yume:muted";
// ──────────────────────────────────────────────────────────────────────
// Prefetch pool — speculative SceneResponses keyed by choice path.
//
// Key format: "C1" → reached by choosing C1 from current scene.
// "C1/C2" → after C1, then C2 (recursive must-pass prefetch).
//
// When the player picks a change-scene choice, we keep that key's
// descendants (re-rooted) and abort the rest.
// ──────────────────────────────────────────────────────────────────────
const PREFETCH_MAX_DEPTH = 3;
/** One speculative `/api/scene` request held in the prefetch pool. */
type PrefetchEntry = {
// Settles with the parsed scene response; rejects on a non-OK status or abort.
promise: Promise<SceneResponse>;
// Controller whose signal is wired into the fetch, so the entry can be cancelled.
abort: AbortController;
};
/** One hop of a speculative choice path: the scene being left and the exit taken. */
type ScenePathStep = {
// Scene the player would be leaving on this hop.
fromScene: Scene;
// Beat ids recorded as visited in `fromScene` when the step was built.
fromVisitedBeats: string[];
// The change-scene choice taken; `choiceId` is what the pool key is built from.
exit: { choiceId: string; label: string; nextSceneSeed: string };
};
/**
 * Builds the prefetch-pool key for a choice path.
 *
 * @param steps Hops from the current scene, in order.
 * @returns The hops' choice ids joined with "/" (empty string for no hops).
 */
function pathKey(steps: ScenePathStep[]): string {
  const choiceIds: string[] = [];
  for (const step of steps) {
    choiceIds.push(step.exit.choiceId);
  }
  return choiceIds.join("/");
}
/**
 * Derives the session to send for a speculative scene request.
 *
 * The base session's last (still open) history entry is dropped and replaced
 * by one closed entry per step, each carrying that step's scene, visited
 * beats and a `choice` exit. The resulting history therefore has
 * `base.history.length - 1 + steps.length` entries. `base` is not mutated.
 *
 * @param base Session to branch from.
 * @param steps Hops to replay on top of the base history.
 * @returns A copy of `base` with the rewritten history.
 */
function buildSpeculativeSession(
  base: Session,
  steps: ScenePathStep[],
): Session {
  const closedEntries = steps.map(({ fromScene, fromVisitedBeats, exit }) => ({
    scene: fromScene,
    visitedBeatIds: fromVisitedBeats,
    exit: {
      kind: "choice" as const,
      choiceId: exit.choiceId,
      label: exit.label,
      nextSceneSeed: exit.nextSceneSeed,
    },
  }));
  const settledPrefix = base.history.slice(0, -1);
  return { ...base, history: [...settledPrefix, ...closedEntries] };
}
/**
 * Collects every change-scene choice offered anywhere in a scene.
 *
 * Choices are returned in first-seen order; when the same choice id appears
 * on more than one beat, only the first occurrence is kept.
 *
 * @param scene Scene whose beats are scanned.
 * @returns The distinct change-scene choices.
 */
function findAllChangeSceneChoices(scene: Scene): BeatChoice[] {
  const firstById = new Map<string, BeatChoice>();
  for (const beat of scene.beats) {
    if (beat.next.type !== "choice") continue;
    for (const choice of beat.next.choices) {
      if (choice.effect.kind !== "change-scene") continue;
      if (!firstById.has(choice.id)) firstById.set(choice.id, choice);
    }
  }
  return Array.from(firstById.values());
}
/**
 * Returns the scene's change-scene choice when there is exactly one.
 *
 * @param scene Scene to inspect.
 * @returns The single change-scene choice, or `null` when there are zero or
 *   several.
 */
function findSoleChangeSceneChoice(scene: Scene): BeatChoice | null {
  const [only, ...others] = findAllChangeSceneChoices(scene);
  if (only === undefined || others.length > 0) return null;
  return only;
}
/**
 * Starts a speculative `/api/scene` request for a choice path and registers
 * it in the pool under `pathKey(steps)`.
 *
 * Does nothing when `depth` has reached `PREFETCH_MAX_DEPTH` or when the pool
 * already holds an entry for the same key. When the fetched scene has exactly
 * one change-scene exit, the child scene is prefetched as well (one level
 * deeper), as long as this request is still registered under its own key.
 *
 * @param pool Prefetch pool to register the request in.
 * @param baseSession Session the path branches from.
 * @param steps Hops from `baseSession`'s current scene to the requested one.
 * @param depth Zero-based recursion depth of this request.
 */
function prefetchScenePath(
  pool: Map<string, PrefetchEntry>,
  baseSession: Session,
  steps: ScenePathStep[],
  depth: number,
): void {
  if (depth >= PREFETCH_MAX_DEPTH) return;
  const key = pathKey(steps);
  if (pool.has(key)) return;
  const specSession = buildSpeculativeSession(baseSession, steps);
  const abort = new AbortController();
  const promise = (async () => {
    const res = await fetch("/api/scene", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ session: specSession }),
      signal: abort.signal,
    });
    if (!res.ok) {
      const j = (await res.json().catch(() => ({}))) as { error?: string };
      throw new Error(j.error ?? res.statusText);
    }
    const data = (await res.json()) as SceneResponse;
    // If this entry was consumed or re-rooted while the request was in
    // flight, `key` no longer describes its position relative to the current
    // scene. Chaining from here would register the child under a stale key:
    // a wasted request that duplicates the prefetch issued on scene entry and
    // that a later choice with a matching id could pick up by mistake.
    const stillRegistered = pool.get(key)?.abort === abort;
    // Recursive: if the resulting scene has exactly one change-scene exit,
    // it is a must-pass node — prefetch its child too.
    if (stillRegistered && depth + 1 < PREFETCH_MAX_DEPTH) {
      const sole = findSoleChangeSceneChoice(data.scene);
      if (sole && sole.effect.kind === "change-scene") {
        const nextStep: ScenePathStep = {
          fromScene: data.scene,
          fromVisitedBeats: [data.scene.entryBeatId],
          exit: {
            choiceId: sole.id,
            label: sole.label,
            nextSceneSeed: sole.effect.nextSceneSeed,
          },
        };
        // Carry forward the registry that the parent prefetch result already
        // settled (it may include characters introduced by the intermediate
        // scene). Without this, the L2+ prefetch starts from the original
        // base.characters and a later transition through this survivor would
        // silently drop voices the player has already heard.
        const carriedBase: Session = {
          ...baseSession,
          characters: data.characters,
        };
        prefetchScenePath(pool, carriedBase, [...steps, nextStep], depth + 1);
      }
    }
    return data;
  })();
  // Speculative failures are surfaced only if the entry is later consumed;
  // this handler just prevents an unhandled-rejection report in the meantime.
  promise.catch(() => {});
  // The async body above has only run up to its first `await`, so the entry
  // is registered before the response handling (and its check) can execute.
  pool.set(key, { promise, abort });
}
/**
 * Takes the prefetch for the chosen exit out of the pool and re-roots the pool
 * at that choice.
 *
 * Entries keyed below the choice (`"<choiceId>/..."`) survive with the prefix
 * stripped; every other entry except the chosen one is aborted and dropped.
 *
 * @param pool Prefetch pool, mutated in place.
 * @param choiceId Id of the change-scene choice the player picked.
 * @returns The entry that was keyed exactly by `choiceId`, or `undefined`.
 */
function consumeChoice(
  pool: Map<string, PrefetchEntry>,
  choiceId: string,
): PrefetchEntry | undefined {
  const picked = pool.get(choiceId);
  const childPrefix = `${choiceId}/`;
  const rerooted: Array<[string, PrefetchEntry]> = [];
  pool.forEach((entry, key) => {
    if (key === choiceId) return;
    if (key.startsWith(childPrefix)) {
      rerooted.push([key.slice(childPrefix.length), entry]);
      return;
    }
    entry.abort.abort();
  });
  pool.clear();
  rerooted.forEach(([key, entry]) => {
    pool.set(key, entry);
  });
  return picked;
}
/**
 * Aborts every request in the prefetch pool and empties it.
 *
 * @param pool Prefetch pool, mutated in place.
 */
function clearPool(pool: Map<string, PrefetchEntry>): void {
  pool.forEach((entry) => {
    entry.abort.abort();
  });
  pool.clear();
}
// ──────────────────────────────────────────────────────────────────────
// Component
// ──────────────────────────────────────────────────────────────────────
function PlayInner() {
// Navigation handles: `router` is used for the redirect to "/", `params` for
// reading the `preset` / `custom` query parameters.
const router = useRouter();
const params = useSearchParams();
// Coarse UI state. Starts at "loading-first"; handlers move it to
// "transitioning", "vision-thinking" or "inserting-beat" and back to "ready".
const [phase, setPhase] = useState<Phase>("loading-first");
// Full session (history + characters) that is posted to the API routes.
const [session, setSession] = useState<Session | null>(null);
// Scene currently on screen and the id of the beat being shown within it.
const [currentScene, setCurrentScene] = useState<Scene | null>(null);
const [currentBeatId, setCurrentBeatId] = useState<string | null>(null);
// Image for the current scene, as delivered by the start/scene responses.
const [imageBase64, setImageBase64] = useState<string | null>(null);
// Audio clips for the current scene keyed by beat id; replaced on every scene
// change and extended when a beat is inserted.
const [beatAudioMap, setBeatAudioMap] = useState<Record<string, BeatAudio>>({});
// Lazy-initialize from localStorage so PlayCanvas never mounts with the
// wrong muted value (an effect-based read would briefly let audio play
// before the preference settled in a scenario where audio arrives early).
const [muted, setMuted] = useState<boolean>(() => {
if (typeof window === "undefined") return false;
try {
return window.localStorage.getItem(MUTED_STORAGE_KEY) === "1";
} catch {
return false;
}
});
// Coordinates of the background click being processed; null when idle.
const [pendingClick, setPendingClick] = useState<{
x: number;
y: number;
} | null>(null);
// Once set, the component renders only the error screen.
const [error, setError] = useState<string | null>(null);
// True while the chrome-less presentation layout is active.
const [presentation, setPresentation] = useState(false);
// Label of the most recent exit / freeform action, shown under the canvas.
const [lastExitLabel, setLastExitLabel] = useState<string | null>(null);
// Guards the bootstrap effect so the start request is issued only once.
const startedRef = useRef(false);
// Pool of speculative scene requests; lives for the whole mount.
const poolRef = useRef<Map<string, PrefetchEntry>>(new Map());
// Mirrors for use inside async handlers (closure-stable)
const sessionRef = useRef<Session | null>(null);
const currentSceneRef = useRef<Scene | null>(null);
const currentBeatRef = useRef<Beat | null>(null);
// Beat ids visited in the current scene, in order; reset on scene change.
const visitedBeatsRef = useRef<string[]>([]);
// Resolve the current beat object from its id; null while nothing is loaded
// or when the id is not present in the scene.
const currentBeat = useMemo<Beat | null>(() => {
if (!currentScene || !currentBeatId) return null;
return currentScene.beats.find((b) => b.id === currentBeatId) ?? null;
}, [currentScene, currentBeatId]);
// Audio for the current beat, if the map has a clip for it.
const currentBeatAudio = currentBeat ? beatAudioMap[currentBeat.id] : undefined;
const audioBase64 = currentBeatAudio?.base64 ?? null;
const audioMime = currentBeatAudio?.mime ?? null;
// Keep the ref mirrors in step with their state so async handlers can read
// the latest values without re-binding.
useEffect(() => {
  sessionRef.current = session;
}, [session]);
useEffect(() => {
  currentSceneRef.current = currentScene;
}, [currentScene]);
useEffect(() => {
  currentBeatRef.current = currentBeat;
}, [currentBeat]);
// Record each newly shown beat in the visited trail (consecutive repeats are
// ignored) and copy the trail into the session's last history entry.
useEffect(() => {
  if (!currentBeatId) return;
  const trail = visitedBeatsRef.current;
  if (trail[trail.length - 1] === currentBeatId) return;
  visitedBeatsRef.current = [...trail, currentBeatId];
  setSession((prev) => {
    if (!prev) return prev;
    const lastIndex = prev.history.length - 1;
    const history = prev.history.map((entry, idx) =>
      idx === lastIndex
        ? { ...entry, visitedBeatIds: [...visitedBeatsRef.current] }
        : entry,
    );
    return { ...prev, history };
  });
}, [currentBeatId]);
// ── Mute persistence (read is via the useState lazy initializer above) ─
// Flip the muted flag and store the new value ("1" / "0") in localStorage.
const toggleMuted = useCallback(() => {
  setMuted((wasMuted) => {
    const nowMuted = !wasMuted;
    const stored = nowMuted ? "1" : "0";
    try {
      window.localStorage.setItem(MUTED_STORAGE_KEY, stored);
    } catch {
      // Storage write failed; the in-memory toggle still applies.
    }
    return nowMuted;
  });
}, []);
// ── Presentation mode toggle ─────────────────────────────────────────
// Enter or leave presentation mode. Fullscreen is attempted on the way in and
// released on the way out; a failure of either call is ignored and the
// chrome-less layout is toggled regardless.
const togglePresentation = useCallback(async () => {
  if (presentation) {
    try {
      if (document.fullscreenElement) await document.exitFullscreen();
    } catch {
      // ignore
    }
    setPresentation(false);
    return;
  }
  try {
    if (!document.fullscreenElement) {
      await document.documentElement.requestFullscreen();
    }
  } catch {
    // ignore — fall through to chrome-less mode anyway
  }
  setPresentation(true);
}, [presentation]);
// Keyboard and fullscreen wiring: plain "f"/"F" toggles presentation mode,
// Escape leaves it, and leaving browser fullscreen by any means also leaves
// presentation mode.
useEffect(() => {
  const handleKeyDown = (e: KeyboardEvent) => {
    const isToggleKey = e.key === "f" || e.key === "F";
    if (isToggleKey) {
      // Leave modified shortcuts (e.g. Ctrl+F) to the browser.
      if (e.metaKey || e.ctrlKey || e.altKey) return;
      e.preventDefault();
      void togglePresentation();
      return;
    }
    if (e.key === "Escape" && presentation) {
      setPresentation(false);
    }
  };
  const handleFullscreenChange = () => {
    if (presentation && !document.fullscreenElement) setPresentation(false);
  };
  window.addEventListener("keydown", handleKeyDown);
  document.addEventListener("fullscreenchange", handleFullscreenChange);
  return () => {
    window.removeEventListener("keydown", handleKeyDown);
    document.removeEventListener("fullscreenchange", handleFullscreenChange);
  };
}, [togglePresentation, presentation]);
// ── Bootstrap: start session ─────────────────────────────────────────
// Resolve the start payload (from a preset id or from the custom payload in
// sessionStorage), request the first scene from /api/start and seed all state
// from the response. Redirects to "/" when no usable payload is found.
useEffect(() => {
  // Issue the start request at most once per mount, even if the effect re-fires.
  if (startedRef.current) return;
  startedRef.current = true;

  type StartPayload = { worldSetting: string; styleGuide: string };

  // Accept a parsed JSON value only if it has the two string fields that are
  // sent to /api/start and copied into the session. Anything else (null,
  // numbers, arrays, objects with missing fields) is rejected.
  const asStartPayload = (value: unknown): StartPayload | null => {
    if (typeof value !== "object" || value === null) return null;
    const candidate = value as Record<string, unknown>;
    if (
      typeof candidate.worldSetting !== "string" ||
      typeof candidate.styleGuide !== "string"
    ) {
      return null;
    }
    return value as StartPayload;
  };

  let payload: StartPayload | null = null;
  const presetId = params.get("preset");
  if (presetId) {
    const p = PRESETS.find((x) => x.id === presetId);
    if (p) payload = { worldSetting: p.worldSetting, styleGuide: p.styleGuide };
  } else if (params.get("custom") === "1") {
    try {
      // Both the storage read and the parse can throw; either way there is
      // no usable payload and we fall through to the redirect.
      const stored = sessionStorage.getItem("yume:custom");
      if (stored) payload = asStartPayload(JSON.parse(stored));
    } catch {
      payload = null;
    }
  }
  if (!payload) {
    router.replace("/");
    return;
  }
  const finalPayload = payload;
  fetch("/api/start", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(finalPayload),
  })
    .then(async (r) => {
      if (!r.ok) {
        const j = (await r.json().catch(() => ({}))) as { error?: string };
        throw new Error(j.error ?? r.statusText);
      }
      return (await r.json()) as StartResponse;
    })
    .then((data) => {
      const initial: Session = {
        id: data.sessionId,
        createdAt: Date.now(),
        worldSetting: finalPayload.worldSetting,
        styleGuide: finalPayload.styleGuide,
        history: [
          {
            scene: data.scene,
            visitedBeatIds: [data.scene.entryBeatId],
          },
        ],
        characters: data.characters,
      };
      // Seed the visited trail before the beat id is set so the visited-beat
      // effect sees the entry beat as already recorded.
      visitedBeatsRef.current = [data.scene.entryBeatId];
      setSession(initial);
      setCurrentScene(data.scene);
      setCurrentBeatId(data.scene.entryBeatId);
      setImageBase64(data.imageBase64);
      setBeatAudioMap(data.beatAudio ?? {});
      setPhase("ready");
    })
    .catch((e) => setError(String(e)));
}, [params, router]);
// ── Prefetch on scene entry: L1 + recursive L2/L3 for must-pass ──────
// On entering a scene, start a depth-0 prefetch for each of its change-scene
// exits; deeper levels are chained by prefetchScenePath itself.
useEffect(() => {
  if (!session || !currentScene) return;
  const pool = poolRef.current;
  for (const choice of findAllChangeSceneChoices(currentScene)) {
    const { effect } = choice;
    if (effect.kind !== "change-scene") continue;
    const firstHop: ScenePathStep = {
      fromScene: currentScene,
      // Snapshot of visited beats at prefetch start. Slight drift is OK.
      fromVisitedBeats: [...visitedBeatsRef.current],
      exit: {
        choiceId: choice.id,
        label: choice.label,
        nextSceneSeed: effect.nextSceneSeed,
      },
    };
    prefetchScenePath(pool, session, [firstHop], 0);
  }
}, [currentScene?.id, session?.id]);
// Abort all in-flight speculative prefetches when the page unmounts, so we
// stop paying for background scene/image generation. Empty deps → fires only
// on unmount; it must NOT run on scene transitions, which rely on
// consumeChoice keeping the re-rooted survivor prefetches alive.
useEffect(() => {
  // Capture the pool once; the same Map instance is used for the whole mount.
  const poolAtMount = poolRef.current;
  return () => clearPool(poolAtMount);
}, []);
// ── Handlers ──────────────────────────────────────────────────────────
// Step to the next beat when the current beat is a plain "continue" beat.
// Ignored unless the UI is in the "ready" phase.
function onAdvance() {
  if (phase !== "ready") return;
  const next = currentBeatRef.current?.next;
  if (next?.type === "continue") {
    setCurrentBeatId(next.nextBeatId);
  }
}
// Wait for the next scene (from a pool entry or a fresh request), close the
// current history entry with the exit taken, append the new scene and switch
// all display state over to it. An aborted request returns quietly to
// "ready"; any other failure is recorded as the page error.
async function performSceneTransition(
  source: PrefetchEntry | Promise<SceneResponse>,
  exit: SceneExit,
  visitedForCurrent: string[],
  exitLabel: string,
) {
  setPhase("transitioning");
  setPendingClick(null);
  try {
    const pending = "promise" in source ? source.promise : source;
    const result = await pending;
    const base = sessionRef.current;
    if (!base) throw new Error("Session lost mid-transition");
    const lastIndex = base.history.length - 1;
    const closedHistory = base.history.map((entry, idx) =>
      idx === lastIndex
        ? { ...entry, visitedBeatIds: visitedForCurrent, exit }
        : entry,
    );
    const entryBeatId = result.scene.entryBeatId;
    const newSession: Session = {
      ...base,
      history: [
        ...closedHistory,
        { scene: result.scene, visitedBeatIds: [entryBeatId] },
      ],
      characters: result.characters,
    };
    // Reset the visited trail before the beat id changes so the entry beat is
    // not appended a second time by the visited-beat effect.
    visitedBeatsRef.current = [entryBeatId];
    setSession(newSession);
    setCurrentScene(result.scene);
    setCurrentBeatId(entryBeatId);
    setImageBase64(result.imageBase64);
    setBeatAudioMap(result.beatAudio ?? {});
    setLastExitLabel(exitLabel);
    setPhase("ready");
  } catch (e) {
    const wasAborted = (e as { name?: string }).name === "AbortError";
    if (!wasAborted) setError(String(e));
    setPhase("ready");
  }
}
// Handle a choice click. "advance-beat" choices jump locally; change-scene
// choices transition via the prefetched result when the pool has one, and
// otherwise via a fresh /api/scene request.
function onSelectChoice(choice: BeatChoice) {
  if (phase !== "ready" || !session || !currentScene) return;
  if (choice.effect.kind === "advance-beat") {
    // Pure local jump. No network. No pool changes.
    setCurrentBeatId(choice.effect.targetBeatId);
    return;
  }
  const nextSceneSeed = choice.effect.nextSceneSeed;
  const visited = [...visitedBeatsRef.current];
  const exit: SceneExit = {
    kind: "choice",
    choiceId: choice.id,
    label: choice.label,
    nextSceneSeed,
  };

  // Warm path — consuming also re-roots the pool at this choice.
  const cached = consumeChoice(poolRef.current, choice.id);
  if (cached) {
    void performSceneTransition(cached, exit, visited, choice.label);
    return;
  }

  // Cold path — start a fresh fetch
  const hop: ScenePathStep = {
    fromScene: currentScene,
    fromVisitedBeats: visited,
    exit: { choiceId: choice.id, label: choice.label, nextSceneSeed },
  };
  const specSession = buildSpeculativeSession(session, [hop]);
  clearPool(poolRef.current);
  const requestScene = async (): Promise<SceneResponse> => {
    const res = await fetch("/api/scene", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ session: specSession }),
    });
    if (!res.ok) {
      const j = (await res.json().catch(() => ({}))) as { error?: string };
      throw new Error(j.error ?? res.statusText);
    }
    return (await res.json()) as SceneResponse;
  };
  void performSceneTransition(requestScene(), exit, visited, choice.label);
}
// Handle a click on the scene image. The click is first sent to /api/vision,
// whose `classify` result selects one of two paths:
//   - "insert-beat": fetch a new beat from /api/insert-beat and splice it into
//     the current scene, continuing back to the beat the player was on;
//   - anything else: treat the action as a freeform exit and transition to a
//     freshly requested scene.
// Ignored unless the UI is "ready" and a session, scene and image are present.
// Any failure is recorded as the page error.
async function onBackgroundClick(click: { x: number; y: number }) {
if (phase !== "ready" || !session || !currentScene || !imageBase64) return;
setPhase("vision-thinking");
setPendingClick(click);
try {
const visionRes = await fetch("/api/vision", {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({ session, prevImageBase64: imageBase64, click }),
});
if (!visionRes.ok) {
const j = (await visionRes.json().catch(() => ({}))) as {
error?: string;
};
throw new Error(j.error ?? visionRes.statusText);
}
const decision = (await visionRes.json()) as VisionResponse;
if (decision.classify === "insert-beat") {
setPhase("inserting-beat");
const insertRes = await fetch("/api/insert-beat", {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({
session,
freeformAction: decision.intent.freeformAction,
}),
});
if (!insertRes.ok) {
const j = (await insertRes.json().catch(() => ({}))) as {
error?: string;
};
throw new Error(j.error ?? insertRes.statusText);
}
const { partial, characters: insertChars, audio } =
(await insertRes.json()) as InsertBeatResponse;
// The inserted beat continues back to the beat that was on screen (or the
// scene's entry beat if none is recorded).
const fromBeatId =
currentBeatRef.current?.id ?? currentScene.entryBeatId;
// Client-generated id: timestamp plus a short random base-36 suffix.
const newBeatId = `b_ins_${Date.now()}_${Math.random()
.toString(36)
.slice(2, 6)}`;
const newBeat: Beat = {
id: newBeatId,
narration: partial.narration,
speaker: partial.speaker,
line: partial.line,
lineDelivery: partial.lineDelivery,
next: { type: "continue", nextBeatId: fromBeatId },
};
// Same scene with the new beat appended; existing beats are untouched.
const patched: Scene = {
...currentScene,
beats: [...currentScene.beats, newBeat],
};
// Store the patched scene in the last history entry and adopt the
// characters returned by the insert-beat response.
// NOTE(review): the prefetch pool is left as-is here, so its entries were
// requested with the session as it was before this insert — confirm that
// consuming one later (which replaces session.characters with the
// prefetched response's characters) is acceptable.
setSession((s) =>
s
? {
...s,
history: s.history.map((h, i, arr) =>
i === arr.length - 1 ? { ...h, scene: patched } : h,
),
characters: insertChars,
}
: s,
);
setCurrentScene(patched);
setCurrentBeatId(newBeatId);
if (audio) {
setBeatAudioMap((m) => ({ ...m, [newBeatId]: audio }));
}
setLastExitLabel(decision.intent.freeformAction);
setPhase("ready");
setPendingClick(null);
} else {
const exit: SceneExit = {
kind: "freeform",
action: decision.intent.freeformAction,
};
const visited = [...visitedBeatsRef.current];
// Read the session through the ref so the request reflects the latest
// state rather than the value captured when the handler started.
const base = sessionRef.current;
if (!base) {
setPhase("ready");
setPendingClick(null);
return;
}
// Close the current history entry with the freeform exit for the request.
const specSession: Session = {
...base,
history: base.history.map((h, i, arr) =>
i === arr.length - 1 ? { ...h, visitedBeatIds: visited, exit } : h,
),
};
// Every pooled prefetch is aborted before the fresh request is made.
clearPool(poolRef.current);
const promise = (async () => {
const res = await fetch("/api/scene", {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({ session: specSession }),
});
if (!res.ok) {
const j = (await res.json().catch(() => ({}))) as {
error?: string;
};
throw new Error(j.error ?? res.statusText);
}
return (await res.json()) as SceneResponse;
})();
await performSceneTransition(
promise,
exit,
visited,
decision.intent.freeformAction,
);
}
} catch (e) {
setError(String(e));
setPendingClick(null);
setPhase("ready");
}
}
// ── Render ────────────────────────────────────────────────────────────
// A recorded error replaces the whole page with the message and a link to "/".
if (error) {
return (
<div className="min-h-screen flex flex-col items-center justify-center px-8">
<div className="max-w-md text-center animate-fade-in">
<p className="text-[10px] smallcaps text-clay-500 mb-6">
· · · ·
</p>
<p className="font-serif italic text-clay-900 text-lg leading-[1.7] mb-10">
{error}
</p>
<Link
href="/"
className="text-[10px] smallcaps text-clay-700 hover:text-ember-500 transition-colors inline-flex items-center gap-3"
>
<i className="fa-solid fa-arrow-left text-[9px]" />
</Link>
</div>
</div>
);
}
// Presentation mode: only the canvas, inside a fixed black full-viewport layer.
if (presentation) {
return (
<div className="fixed inset-0 bg-black flex items-center justify-center z-50">
<PlayCanvas
imageBase64={imageBase64}
audioBase64={audioBase64}
audioMime={audioMime}
muted={muted}
phase={phase}
beat={currentBeat}
pendingClick={pendingClick}
onBackgroundClick={onBackgroundClick}
onAdvance={onAdvance}
onSelectChoice={onSelectChoice}
fullViewport
/>
</div>
);
}
// Header counters: number of history entries, and number of beats in the
// visited trail (which is reset on every scene change).
const sceneCount = session?.history.length ?? 0;
const beatCount = visitedBeatsRef.current.length;
// Default layout: header with counters, the canvas with a status line below
// it, and a footer holding the presentation and mute toggles.
return (
<div className="min-h-screen flex flex-col">
<header className="px-5 md:px-12 pt-6 md:pt-8 flex items-center justify-between">
<Link
href="/"
className="text-[10px] smallcaps text-clay-600 hover:text-clay-900 transition-colors flex items-center gap-2"
>
<i className="fa-solid fa-arrow-left text-[9px]" />
云梦
</Link>
<div className="flex items-center gap-3 text-[10px] smallcaps text-clay-500 num">
<span> · {String(sceneCount).padStart(3, "0")} · </span>
<span className="text-clay-300">·</span>
<span>{String(beatCount).padStart(3, "0")} · </span>
<span className="text-clay-300">·</span>
<span className="hidden sm:inline truncate max-w-[180px]">
{session?.id.slice(2, 14) ?? "—"}
</span>
</div>
</header>
<main className="flex-1 flex flex-col items-center justify-center px-4 md:px-8 py-6 md:py-10">
<PlayCanvas
imageBase64={imageBase64}
audioBase64={audioBase64}
audioMime={audioMime}
muted={muted}
phase={phase}
beat={currentBeat}
pendingClick={pendingClick}
onBackgroundClick={onBackgroundClick}
onAdvance={onAdvance}
onSelectChoice={onSelectChoice}
/>
<div className="mt-4 max-w-md w-full text-center min-h-[28px] flex items-center justify-center">
{phase === "loading-first" && (
<p className="text-[10px] smallcaps text-clay-500 animate-slow-pulse">
· · · · · ·
</p>
)}
{phase === "ready" && lastExitLabel && (
<p className="text-[9px] smallcaps text-clay-400 animate-fade-in">
<span className="mr-2"> · · ·</span>
<span className="text-clay-600">{lastExitLabel}</span>
</p>
)}
</div>
</main>
<footer className="px-5 md:px-12 pb-6 flex items-center justify-between">
<button
type="button"
onClick={() => void togglePresentation()}
className="text-[9px] smallcaps text-clay-400 hover:text-clay-700 transition-colors flex items-center gap-2"
aria-label="进入演示模式"
>
<i className="fa-solid fa-expand text-[10px]" />
F · ·
</button>
<div className="text-[9px] smallcaps text-clay-400 num"> · </div>
<button
type="button"
onClick={toggleMuted}
className="text-[9px] smallcaps text-clay-400 hover:text-clay-700 transition-colors flex items-center gap-2 w-[80px] justify-end"
aria-label={muted ? "取消静音" : "静音"}
>
<i
className={`fa-solid ${muted ? "fa-volume-xmark" : "fa-volume-high"} text-[10px]`}
/>
{muted ? "静 · 音" : "有 · 声"}
</button>
</footer>
</div>
);
}
/**
 * Page entry point: renders `PlayInner` inside a Suspense boundary, showing a
 * centered loading label while it is suspended.
 */
export default function PlayPage() {
  const loadingFallback = (
    <div className="min-h-screen flex items-center justify-center">
      <span className="text-[10px] smallcaps text-clay-500 animate-slow-pulse">
        载入中
      </span>
    </div>
  );
  return (
    <Suspense fallback={loadingFallback}>
      <PlayInner />
    </Suspense>
  );
}