Merge pull request #34 from zonghaoyuan/worktree-fix+fot-reduction

fix(web): reduce FOT by stripping redundant voice data from transport
This commit is contained in:
Zonghao Yuan
2026-06-05 00:25:51 +08:00
committed by GitHub
5 changed files with 118 additions and 47 deletions
+5 -1
View File
@@ -26,7 +26,11 @@ export async function POST(req: Request) {
try { try {
const config = loadEngineConfig(req.headers); const config = loadEngineConfig(req.headers);
const result = await requestBeatAudio(config, body); const result = await requestBeatAudio(config, body);
return NextResponse.json(result); if (!result.audio) return new Response(null, { status: 204 });
const binary = Buffer.from(result.audio.base64, "base64");
return new Response(binary, {
headers: { "Content-Type": result.audio.mime },
});
} catch (err) { } catch (err) {
// Engine already swallows synth errors and returns audio:null. Anything // Engine already swallows synth errors and returns audio:null. Anything
// that reaches here is config-level — surface so the client can log it. // that reaches here is config-level — surface so the client can log it.
+4 -1
View File
@@ -26,7 +26,10 @@ export async function POST(req: Request) {
// See StartRequest.clientTts — BYO clients synth in-browser, so drop server TTS. // See StartRequest.clientTts — BYO clients synth in-browser, so drop server TTS.
const config = body.clientTts === true ? { ...base, tts: undefined } : base; const config = body.clientTts === true ? { ...base, tts: undefined } : base;
const result = await requestInsertBeat(config, body); const result = await requestInsertBeat(config, body);
return NextResponse.json(result); return NextResponse.json({
...result,
characters: result.characters.map((c) => ({ ...c, voice: undefined })),
});
} catch (err) { } catch (err) {
const message = err instanceof Error ? err.message : "Unknown error"; const message = err instanceof Error ? err.message : "Unknown error";
return NextResponse.json({ error: message }, { status: 500 }); return NextResponse.json({ error: message }, { status: 500 });
+17 -2
View File
@@ -1,8 +1,17 @@
import { requestScene } from "@infiplot/engine"; import { requestScene } from "@infiplot/engine";
import type { SceneRequest } from "@infiplot/types"; import type { Character, SceneRequest } from "@infiplot/types";
import { NextResponse } from "next/server"; import { NextResponse } from "next/server";
import { loadEngineConfig } from "@/lib/config"; import { loadEngineConfig } from "@/lib/config";
/**
 * Blanks the `voice` field of every character the caller already knows about.
 *
 * @param characters - Characters to filter.
 * @param knownNames - Names of characters whose voice should be withheld.
 * @returns A new array. Characters whose name is in `knownNames` are shallow
 *   copies with `voice` set to `undefined`; all others are the original
 *   objects, untouched.
 */
function stripKnownVoices(
  characters: Character[],
  knownNames: Set<string>,
): Character[] {
  const result: Character[] = [];
  for (const character of characters) {
    if (!knownNames.has(character.name)) {
      // Unknown to the caller: pass through as-is, voice included.
      result.push(character);
      continue;
    }
    result.push({ ...character, voice: undefined });
  }
  return result;
}
export const runtime = "nodejs"; export const runtime = "nodejs";
// Capped at 60 for Vercel Hobby (300 allowed on Pro). The scene pipeline is // Capped at 60 for Vercel Hobby (300 allowed on Pro). The scene pipeline is
// Writer + CharDesigner×N + Cinematographer + Painter — happy path 9–12s; the // Writer + CharDesigner×N + Cinematographer + Painter — happy path 9–12s; the
@@ -27,7 +36,13 @@ export async function POST(req: Request) {
// See StartRequest.clientTts — BYO clients synth in-browser, so drop server TTS. // See StartRequest.clientTts — BYO clients synth in-browser, so drop server TTS.
const config = body.clientTts === true ? { ...base, tts: undefined } : base; const config = body.clientTts === true ? { ...base, tts: undefined } : base;
const result = await requestScene(config, body); const result = await requestScene(config, body);
return NextResponse.json(result); const knownNames = new Set(
(body.session.characters ?? []).map((c) => c.name),
);
return NextResponse.json({
...result,
characters: stripKnownVoices(result.characters, knownNames),
});
} catch (err) { } catch (err) {
const message = err instanceof Error ? err.message : "Unknown error"; const message = err instanceof Error ? err.message : "Unknown error";
return NextResponse.json({ error: message }, { status: 500 }); return NextResponse.json({ error: message }, { status: 500 });
+82 -31
View File
@@ -19,8 +19,6 @@ import { PRESETS } from "@/lib/presets";
import { provisionVoice, synthesize } from "@infiplot/tts-client"; import { provisionVoice, synthesize } from "@infiplot/tts-client";
import type { import type {
Beat, Beat,
BeatAudio,
BeatAudioResponse,
BeatChoice, BeatChoice,
Character, Character,
CharacterVoice, CharacterVoice,
@@ -39,6 +37,34 @@ import { getByoHeaders, isByoActive } from "@/lib/byoHeaders";
const MUTED_STORAGE_KEY = "infiplot:muted"; const MUTED_STORAGE_KEY = "infiplot:muted";
// ── FOT reduction helpers ──────────────────────────────────────────────
/**
 * Builds the copy of a session that is sent over the wire, with every
 * character's `voice` (and with it the bulky `voice.referenceAudioBase64`)
 * removed.
 *
 * The engine only needs character names and visualDescriptions for scene
 * generation; voice data is consumed solely by /api/beat-audio, which is handed
 * the voice directly rather than through the session. The client keeps its
 * voices locally and re-attaches them to the response with
 * mergeCharactersPreserveVoice.
 *
 * @param session - The full local session; it is not mutated.
 * @returns A shallow copy of `session` whose `characters` are shallow copies
 *   with `voice` set to `undefined` (so the key is omitted by JSON.stringify).
 */
function stripVoicesForTransport(session: Session): Session {
  const voicelessCharacters = session.characters.map((character) => {
    return { ...character, voice: undefined };
  });
  return { ...session, characters: voicelessCharacters };
}
/**
 * Combines the character list returned by the server with the voices the
 * client already holds.
 *
 * The server strips `voice` from characters the client already knows, so only
 * new characters arrive carrying one. For a character that exists locally, the
 * local voice is re-attached whenever the remote copy has none.
 *
 * @param local - Characters currently held by the client, matched by `name`.
 * @param remote - Characters from the server response; these decide the
 *   content and order of the result.
 * @returns One entry per `remote` character. Characters with no local
 *   counterpart are returned as-is; the rest are shallow copies whose `voice`
 *   is the remote voice if present, otherwise the local one.
 */
function mergeCharactersPreserveVoice(
  local: Character[],
  remote: Character[],
): Character[] {
  const heldByName = new Map<string, Character>();
  for (const held of local) {
    heldByName.set(held.name, held);
  }

  return remote.map((incoming) => {
    const held = heldByName.get(incoming.name);
    return held
      ? { ...incoming, voice: incoming.voice ?? held.voice }
      : incoming;
  });
}
// Consecutive silent (no-audio) beats before we surface the BYO-key nudge to a // Consecutive silent (no-audio) beats before we surface the BYO-key nudge to a
// non-BYO, unmuted player. Set high enough that one transient miss won't trip // non-BYO, unmuted player. Set high enough that one transient miss won't trip
// it, low enough to catch a scene that's clearly being rate-limited. // it, low enough to catch a scene that's clearly being rate-limited.
@@ -304,7 +330,7 @@ function prefetchScenePath(
"Content-Type": "application/json", "Content-Type": "application/json",
...getByoHeaders(), ...getByoHeaders(),
}, },
body: JSON.stringify({ session: specSession, clientTts }), body: JSON.stringify({ session: stripVoicesForTransport(specSession), clientTts }),
signal: abort.signal, signal: abort.signal,
}); });
if (!res.ok) { if (!res.ok) {
@@ -319,6 +345,12 @@ function prefetchScenePath(
// transition path awaits the same cached promise via getOrCreateBlobUrl. // transition path awaits the same cached promise via getOrCreateBlobUrl.
void getOrCreateBlobUrl(data.imageUrl); void getOrCreateBlobUrl(data.imageUrl);
// Re-attach locally-held voices the server stripped from known characters.
data.characters = mergeCharactersPreserveVoice(
baseSession.characters,
data.characters,
);
// Recursive: if the resulting scene has exactly one change-scene exit, // Recursive: if the resulting scene has exactly one change-scene exit,
// it is a must-pass node — prefetch its child too. // it is a must-pass node — prefetch its child too.
if (depth + 1 < PREFETCH_MAX_DEPTH) { if (depth + 1 < PREFETCH_MAX_DEPTH) {
@@ -435,7 +467,7 @@ function PlayInner() {
const [currentScene, setCurrentScene] = useState<Scene | null>(null); const [currentScene, setCurrentScene] = useState<Scene | null>(null);
const [currentBeatId, setCurrentBeatId] = useState<string | null>(null); const [currentBeatId, setCurrentBeatId] = useState<string | null>(null);
const [imageUrl, setImageUrl] = useState<string | null>(null); const [imageUrl, setImageUrl] = useState<string | null>(null);
const [beatAudioMap, setBeatAudioMap] = useState<Record<string, BeatAudio>>({}); const [beatAudioMap, setBeatAudioMap] = useState<Record<string, string>>({});
// Lazy-initialize 优先级:本局选择(homepage 的「语音配音」存到 sessionStorage:infiplot:custom) // Lazy-initialize 优先级:本局选择(homepage 的「语音配音」存到 sessionStorage:infiplot:custom)
// > 上次会话的粘性偏好(localStorage:infiplot:muted) > 默认非静音。 // > 上次会话的粘性偏好(localStorage:infiplot:muted) > 默认非静音。
// 这样首页选了「关闭」开始游戏,进来就是静音;选「开启」就不是静音;进入 play 页后用户自己 // 这样首页选了「关闭」开始游戏,进来就是静音;选「开启」就不是静音;进入 play 页后用户自己
@@ -519,9 +551,7 @@ function PlayInner() {
return currentScene.beats.find((b) => b.id === currentBeatId) ?? null; return currentScene.beats.find((b) => b.id === currentBeatId) ?? null;
}, [currentScene, currentBeatId]); }, [currentScene, currentBeatId]);
const currentBeatAudio = currentBeat ? beatAudioMap[currentBeat.id] : undefined; const audioSrc = (currentBeat ? beatAudioMap[currentBeat.id] : undefined) ?? null;
const audioBase64 = currentBeatAudio?.base64 ?? null;
const audioMime = currentBeatAudio?.mime ?? null;
useEffect(() => { useEffect(() => {
sessionRef.current = session; sessionRef.current = session;
@@ -597,7 +627,7 @@ function PlayInner() {
const abort = new AbortController(); const abort = new AbortController();
beatAudioAbortRef.current.set(beat.id, abort); beatAudioAbortRef.current.set(beat.id, abort);
try { try {
let audio: BeatAudio | null = null; let audioUrl: string | null = null;
if (byo) { if (byo) {
// Client-direct: provision (once per speaker, cached) + synth against // Client-direct: provision (once per speaker, cached) + synth against
// Xiaomi with the user's own key — no /api/beat-audio round-trip and // Xiaomi with the user's own key — no /api/beat-audio round-trip and
@@ -615,7 +645,7 @@ function PlayInner() {
beat.lineDelivery, beat.lineDelivery,
abort.signal, abort.signal,
); );
audio = { base64: out.audioBase64, mime: out.mimeType }; audioUrl = `data:${out.mimeType};base64,${out.audioBase64}`;
} else { } else {
const res = await fetch("/api/beat-audio", { const res = await fetch("/api/beat-audio", {
method: "POST", method: "POST",
@@ -629,24 +659,26 @@ function PlayInner() {
}), }),
signal: abort.signal, signal: abort.signal,
}); });
if (res.status === 204) {
setSilenceStrikes((n) => Math.min(n + 1, 99));
return;
}
if (!res.ok) { if (!res.ok) {
setSilenceStrikes((n) => Math.min(n + 1, 99)); setSilenceStrikes((n) => Math.min(n + 1, 99));
return; return;
} }
const json = (await res.json()) as BeatAudioResponse; const blob = await res.blob();
audio = json.audio; audioUrl = URL.createObjectURL(blob);
// Null audio usually means MiMo rate-limited or timed out the shared setSilenceStrikes(0);
// key — track the streak; a real clip resets it.
if (audio) setSilenceStrikes(0);
else setSilenceStrikes((n) => Math.min(n + 1, 99));
} }
// Skip the state write if we've been aborted between the await and // Skip the state write if we've been aborted between the await and
// here — beat ids are scene-local, so a late arrival from a prior // here — beat ids are scene-local, so a late arrival from a prior
// scene would otherwise overwrite the current scene's audio under the // scene would otherwise overwrite the current scene's audio under the
// same id. // same id.
if (audio && !abort.signal.aborted) { if (audioUrl && !abort.signal.aborted) {
const settled = audio; setBeatAudioMap((m) => ({ ...m, [beat.id]: audioUrl }));
setBeatAudioMap((m) => ({ ...m, [beat.id]: settled })); } else if (audioUrl?.startsWith("blob:")) {
URL.revokeObjectURL(audioUrl);
} }
} catch { } catch {
// aborted / network / Xiaomi rate-limit — silent fallback (no audio) // aborted / network / Xiaomi rate-limit — silent fallback (no audio)
@@ -685,7 +717,12 @@ function PlayInner() {
// scenes) so a late arrival would land under the wrong beat otherwise. // scenes) so a late arrival would land under the wrong beat otherwise.
useEffect(() => { useEffect(() => {
cancelBeatAudioFetches(); cancelBeatAudioFetches();
setBeatAudioMap({}); setBeatAudioMap((prev) => {
for (const url of Object.values(prev)) {
if (url.startsWith("blob:")) URL.revokeObjectURL(url);
}
return {};
});
prefetchSceneAudio(); prefetchSceneAudio();
}, [currentScene?.id, prefetchSceneAudio]); }, [currentScene?.id, prefetchSceneAudio]);
@@ -720,7 +757,12 @@ function PlayInner() {
if (prev === muted) return; if (prev === muted) return;
cancelBeatAudioFetches(); cancelBeatAudioFetches();
if (muted) return; if (muted) return;
setBeatAudioMap({}); setBeatAudioMap((prev) => {
for (const url of Object.values(prev)) {
if (url.startsWith("blob:")) URL.revokeObjectURL(url);
}
return {};
});
prefetchSceneAudio(); prefetchSceneAudio();
}, [muted, prefetchSceneAudio]); }, [muted, prefetchSceneAudio]);
@@ -738,7 +780,12 @@ function PlayInner() {
if (cfg) { if (cfg) {
setSilenceStrikes(0); setSilenceStrikes(0);
cancelBeatAudioFetches(); cancelBeatAudioFetches();
setBeatAudioMap({}); setBeatAudioMap((prev) => {
for (const url of Object.values(prev)) {
if (url.startsWith("blob:")) URL.revokeObjectURL(url);
}
return {};
});
prefetchSceneAudio(); prefetchSceneAudio();
} }
}, },
@@ -1042,7 +1089,10 @@ function PlayInner() {
visitedBeatIds: [result.scene.entryBeatId], visitedBeatIds: [result.scene.entryBeatId],
}, },
], ],
characters: result.characters, characters: mergeCharactersPreserveVoice(
base.characters,
result.characters,
),
storyState: result.storyState, storyState: result.storyState,
}; };
visitedBeatsRef.current = [result.scene.entryBeatId]; visitedBeatsRef.current = [result.scene.entryBeatId];
@@ -1121,7 +1171,7 @@ function PlayInner() {
...getByoHeaders(), ...getByoHeaders(),
}, },
body: JSON.stringify({ body: JSON.stringify({
session: specSession, session: stripVoicesForTransport(specSession),
clientTts: !!byoTtsRef.current, clientTts: !!byoTtsRef.current,
}), }),
}); });
@@ -1148,7 +1198,7 @@ function PlayInner() {
"Content-Type": "application/json", "Content-Type": "application/json",
...getByoHeaders(), ...getByoHeaders(),
}, },
body: JSON.stringify({ session, annotatedImageBase64 }), body: JSON.stringify({ session: stripVoicesForTransport(session), annotatedImageBase64 }),
}); });
if (!visionRes.ok) { if (!visionRes.ok) {
const j = (await visionRes.json().catch(() => ({}))) as { const j = (await visionRes.json().catch(() => ({}))) as {
@@ -1168,7 +1218,7 @@ function PlayInner() {
...getByoHeaders(), ...getByoHeaders(),
}, },
body: JSON.stringify({ body: JSON.stringify({
session, session: stripVoicesForTransport(session),
freeformAction: decision.intent.freeformAction, freeformAction: decision.intent.freeformAction,
clientTts: !!byoTtsRef.current, clientTts: !!byoTtsRef.current,
}), }),
@@ -1206,7 +1256,10 @@ function PlayInner() {
history: session.history.map((h, i, arr) => history: session.history.map((h, i, arr) =>
i === arr.length - 1 ? { ...h, scene: patched } : h, i === arr.length - 1 ? { ...h, scene: patched } : h,
), ),
characters: insertChars, characters: mergeCharactersPreserveVoice(
session.characters,
insertChars,
),
}; };
setSession(nextSession); setSession(nextSession);
setCurrentScene(patched); setCurrentScene(patched);
@@ -1252,7 +1305,7 @@ function PlayInner() {
...getByoHeaders(), ...getByoHeaders(),
}, },
body: JSON.stringify({ body: JSON.stringify({
session: specSession, session: stripVoicesForTransport(specSession),
clientTts: !!byoTtsRef.current, clientTts: !!byoTtsRef.current,
}), }),
}); });
@@ -1321,8 +1374,7 @@ function PlayInner() {
<div className="fixed inset-0 bg-black flex items-center justify-center z-50"> <div className="fixed inset-0 bg-black flex items-center justify-center z-50">
<PlayCanvas <PlayCanvas
imageUrl={imageUrl} imageUrl={imageUrl}
audioBase64={audioBase64} audioSrc={audioSrc}
audioMime={audioMime}
muted={muted} muted={muted}
phase={phase} phase={phase}
beat={currentBeat} beat={currentBeat}
@@ -1396,8 +1448,7 @@ function PlayInner() {
<main className="flex-1 flex flex-col items-center justify-center px-4 md:px-8 py-6 md:py-10"> <main className="flex-1 flex flex-col items-center justify-center px-4 md:px-8 py-6 md:py-10">
<PlayCanvas <PlayCanvas
imageUrl={imageUrl} imageUrl={imageUrl}
audioBase64={audioBase64} audioSrc={audioSrc}
audioMime={audioMime}
muted={muted} muted={muted}
phase={phase} phase={phase}
beat={currentBeat} beat={currentBeat}
+10 -12
View File
@@ -162,8 +162,7 @@ function ChoiceButton({
// ── Main component ───────────────────────────────────────────────────── // ── Main component ─────────────────────────────────────────────────────
export function PlayCanvas({ export function PlayCanvas({
imageUrl, imageUrl,
audioBase64, audioSrc,
audioMime,
muted, muted,
phase, phase,
beat, beat,
@@ -177,8 +176,7 @@ export function PlayCanvas({
aboveCanvasLeft, aboveCanvasLeft,
}: { }: {
imageUrl: string | null; imageUrl: string | null;
audioBase64: string | null; audioSrc: string | null;
audioMime: string | null;
muted: boolean; muted: boolean;
phase: Phase; phase: Phase;
beat: Beat | null; beat: Beat | null;
@@ -209,7 +207,7 @@ export function PlayCanvas({
const { shown: typedBody, done: typingDone, skip: skipTypewriter } = const { shown: typedBody, done: typingDone, skip: skipTypewriter } =
useTypewriter(displayBody, beat?.id ?? "", { useTypewriter(displayBody, beat?.id ?? "", {
targetDurationMs: audioDurationMs, targetDurationMs: audioDurationMs,
waitForAudio: Boolean(audioBase64), waitForAudio: Boolean(audioSrc),
}); });
// ── Audio source change ────────────────────────────────────────────── // ── Audio source change ──────────────────────────────────────────────
@@ -217,12 +215,12 @@ export function PlayCanvas({
// unblock the typewriter via timeout so text doesn't stall. // unblock the typewriter via timeout so text doesn't stall.
useEffect(() => { useEffect(() => {
setAudioDurationMs(undefined); setAudioDurationMs(undefined);
if (!audioBase64) return; if (!audioSrc) return;
const timer = setTimeout(() => { const timer = setTimeout(() => {
setAudioDurationMs((prev) => prev ?? 0); setAudioDurationMs((prev) => prev ?? 0);
}, AUDIO_WAIT_TIMEOUT_MS); }, AUDIO_WAIT_TIMEOUT_MS);
return () => clearTimeout(timer); return () => clearTimeout(timer);
}, [audioBase64]); }, [audioSrc]);
// ── Mute toggle ─────────────────────────────────────────────────────── // ── Mute toggle ───────────────────────────────────────────────────────
useEffect(() => { useEffect(() => {
@@ -230,12 +228,12 @@ export function PlayCanvas({
if (!el) return; if (!el) return;
el.muted = muted; el.muted = muted;
el.playbackRate = SPEECH_RATE; el.playbackRate = SPEECH_RATE;
if (!muted && audioBase64 && el.paused) { if (!muted && audioSrc && el.paused) {
el.play().catch(() => { el.play().catch(() => {
// autoplay blocked — silent until next interaction // autoplay blocked — silent until next interaction
}); });
} }
}, [muted, audioBase64]); }, [muted, audioSrc]);
function handleAudioMetadata() { function handleAudioMetadata() {
const el = audioRef.current; const el = audioRef.current;
@@ -341,11 +339,11 @@ export function PlayCanvas({
className={`flex flex-col items-center ${fullViewport ? "w-full h-full justify-center" : "w-full"}`} className={`flex flex-col items-center ${fullViewport ? "w-full h-full justify-center" : "w-full"}`}
> >
{/* Hidden audio element — voice playback for the current beat */} {/* Hidden audio element — voice playback for the current beat */}
{audioBase64 && ( {audioSrc && (
<audio <audio
key={audioBase64.slice(-48)} key={audioSrc.slice(-48)}
ref={audioRef} ref={audioRef}
src={`data:${audioMime ?? "audio/wav"};base64,${audioBase64}`} src={audioSrc}
preload="auto" preload="auto"
onLoadedMetadata={handleAudioMetadata} onLoadedMetadata={handleAudioMetadata}
onError={handleAudioError} onError={handleAudioError}