From e88e988de3185df9683fb51a1f8ecb628bed4542 Mon Sep 17 00:00:00 2001 From: yuanzonghao Date: Fri, 5 Jun 2026 00:08:02 +0800 Subject: [PATCH] fix(web): reduce FOT by stripping redundant voice data from transport MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three transport-only optimizations that cut per-session Vercel Fast Origin Transfer (FOT) by ~50-60%: P0 — Server strips voice data (including the bulky voice.referenceAudioBase64) from the characters it returns (defense-in-depth): /api/scene drops voice for characters already present in the request session, and /api/insert-beat drops voice for every returned character. P1 — Client strips all voice data from session before sending to /api/scene, /api/vision, and /api/insert-beat. Voices are retained locally and re-merged from responses via mergeCharactersPreserveVoice(). The engine only needs character names + visualDescriptions for scene generation. P3 — /api/beat-audio returns binary audio (Response with Content-Type) instead of JSON-wrapped base64, saving ~33% encoding overhead, and answers 204 No Content when no audio was produced. Client converts to blob URLs; PlayCanvas accepts a single audioSrc prop. Co-Authored-By: Claude Opus 4.6 --- app/api/beat-audio/route.ts | 6 +- app/api/insert-beat/route.ts | 5 +- app/api/scene/route.ts | 19 +++++- app/play/page.tsx | 113 +++++++++++++++++++++++++---------- components/PlayCanvas.tsx | 22 ++++--- 5 files changed, 118 insertions(+), 47 deletions(-) diff --git a/app/api/beat-audio/route.ts b/app/api/beat-audio/route.ts index d3cf81b..ace4684 100644 --- a/app/api/beat-audio/route.ts +++ b/app/api/beat-audio/route.ts @@ -26,7 +26,11 @@ export async function POST(req: Request) { try { const config = loadEngineConfig(req.headers); const result = await requestBeatAudio(config, body); - return NextResponse.json(result); + if (!result.audio) return new Response(null, { status: 204 }); + const binary = Buffer.from(result.audio.base64, "base64"); + return new Response(binary, { + headers: { "Content-Type": result.audio.mime }, + }); } catch (err) { // Engine already swallows synth errors and returns audio:null. 
Anything // that reaches here is config-level — surface so the client can log it. diff --git a/app/api/insert-beat/route.ts b/app/api/insert-beat/route.ts index 2366eb0..8dda7f1 100644 --- a/app/api/insert-beat/route.ts +++ b/app/api/insert-beat/route.ts @@ -26,7 +26,10 @@ export async function POST(req: Request) { // See StartRequest.clientTts — BYO clients synth in-browser, so drop server TTS. const config = body.clientTts === true ? { ...base, tts: undefined } : base; const result = await requestInsertBeat(config, body); - return NextResponse.json(result); + return NextResponse.json({ + ...result, + characters: result.characters.map((c) => ({ ...c, voice: undefined })), + }); } catch (err) { const message = err instanceof Error ? err.message : "Unknown error"; return NextResponse.json({ error: message }, { status: 500 }); diff --git a/app/api/scene/route.ts b/app/api/scene/route.ts index c3b177b..c59c406 100644 --- a/app/api/scene/route.ts +++ b/app/api/scene/route.ts @@ -1,8 +1,17 @@ import { requestScene } from "@infiplot/engine"; -import type { SceneRequest } from "@infiplot/types"; +import type { Character, SceneRequest } from "@infiplot/types"; import { NextResponse } from "next/server"; import { loadEngineConfig } from "@/lib/config"; +function stripKnownVoices( + characters: Character[], + knownNames: Set, +): Character[] { + return characters.map((c) => + knownNames.has(c.name) ? { ...c, voice: undefined } : c, + ); +} + export const runtime = "nodejs"; // Capped at 60 for Vercel Hobby (300 allowed on Pro). The scene pipeline is // Writer + CharDesigner×N + Cinematographer + Painter — happy path 9–12s; the @@ -27,7 +36,13 @@ export async function POST(req: Request) { // See StartRequest.clientTts — BYO clients synth in-browser, so drop server TTS. const config = body.clientTts === true ? 
{ ...base, tts: undefined } : base; const result = await requestScene(config, body); - return NextResponse.json(result); + const knownNames = new Set( + (body.session.characters ?? []).map((c) => c.name), + ); + return NextResponse.json({ + ...result, + characters: stripKnownVoices(result.characters, knownNames), + }); } catch (err) { const message = err instanceof Error ? err.message : "Unknown error"; return NextResponse.json({ error: message }, { status: 500 }); diff --git a/app/play/page.tsx b/app/play/page.tsx index 322fb31..da8beea 100644 --- a/app/play/page.tsx +++ b/app/play/page.tsx @@ -19,8 +19,6 @@ import { PRESETS } from "@/lib/presets"; import { provisionVoice, synthesize } from "@infiplot/tts-client"; import type { Beat, - BeatAudio, - BeatAudioResponse, BeatChoice, Character, CharacterVoice, @@ -39,6 +37,34 @@ import { getByoHeaders, isByoActive } from "@/lib/byoHeaders"; const MUTED_STORAGE_KEY = "infiplot:muted"; +// ── FOT reduction helpers ────────────────────────────────────────────── +// Strip bulky voice.referenceAudioBase64 from the session before sending it to +// the server. The engine only needs character names + visualDescriptions for +// scene generation; voice data is only used by /api/beat-audio (which receives +// the voice directly, not via session). The client retains voices locally and +// re-merges them from the response via mergeCharactersPreserveVoice. +function stripVoicesForTransport(session: Session): Session { + return { + ...session, + characters: session.characters.map((c) => ({ ...c, voice: undefined })), + }; +} + +// Merge server-returned characters with locally-held voices. The server strips +// voice from already-known characters (P0), so only NEW characters carry voice. +// For existing characters, re-attach the voice the client already holds. 
+function mergeCharactersPreserveVoice( + local: Character[], + remote: Character[], +): Character[] { + const localByName = new Map(local.map((c) => [c.name, c])); + return remote.map((c) => { + const prev = localByName.get(c.name); + if (!prev) return c; + return { ...c, voice: c.voice ?? prev.voice }; + }); +} + // Consecutive silent (no-audio) beats before we surface the BYO-key nudge to a // non-BYO, unmuted player. Set high enough that one transient miss won't trip // it, low enough to catch a scene that's clearly being rate-limited. @@ -304,7 +330,7 @@ function prefetchScenePath( "Content-Type": "application/json", ...getByoHeaders(), }, - body: JSON.stringify({ session: specSession, clientTts }), + body: JSON.stringify({ session: stripVoicesForTransport(specSession), clientTts }), signal: abort.signal, }); if (!res.ok) { @@ -319,6 +345,12 @@ function prefetchScenePath( // transition path awaits the same cached promise via getOrCreateBlobUrl. void getOrCreateBlobUrl(data.imageUrl); + // Re-attach locally-held voices the server stripped from known characters. + data.characters = mergeCharactersPreserveVoice( + baseSession.characters, + data.characters, + ); + // Recursive: if the resulting scene has exactly one change-scene exit, // it is a must-pass node — prefetch its child too. if (depth + 1 < PREFETCH_MAX_DEPTH) { @@ -435,7 +467,7 @@ function PlayInner() { const [currentScene, setCurrentScene] = useState(null); const [currentBeatId, setCurrentBeatId] = useState(null); const [imageUrl, setImageUrl] = useState(null); - const [beatAudioMap, setBeatAudioMap] = useState>({}); + const [beatAudioMap, setBeatAudioMap] = useState>({}); // Lazy-initialize 优先级:本局选择(homepage 的「语音配音」存到 sessionStorage:infiplot:custom) // > 上次会话的粘性偏好(localStorage:infiplot:muted) > 默认非静音。 // 这样首页选了「关闭」开始游戏,进来就是静音;选「开启」就不是静音;进入 play 页后用户自己 @@ -519,9 +551,7 @@ function PlayInner() { return currentScene.beats.find((b) => b.id === currentBeatId) ?? 
null; }, [currentScene, currentBeatId]); - const currentBeatAudio = currentBeat ? beatAudioMap[currentBeat.id] : undefined; - const audioBase64 = currentBeatAudio?.base64 ?? null; - const audioMime = currentBeatAudio?.mime ?? null; + const audioSrc = (currentBeat ? beatAudioMap[currentBeat.id] : undefined) ?? null; useEffect(() => { sessionRef.current = session; @@ -597,7 +627,7 @@ function PlayInner() { const abort = new AbortController(); beatAudioAbortRef.current.set(beat.id, abort); try { - let audio: BeatAudio | null = null; + let audioUrl: string | null = null; if (byo) { // Client-direct: provision (once per speaker, cached) + synth against // Xiaomi with the user's own key — no /api/beat-audio round-trip and @@ -615,7 +645,7 @@ function PlayInner() { beat.lineDelivery, abort.signal, ); - audio = { base64: out.audioBase64, mime: out.mimeType }; + audioUrl = `data:${out.mimeType};base64,${out.audioBase64}`; } else { const res = await fetch("/api/beat-audio", { method: "POST", @@ -629,24 +659,26 @@ function PlayInner() { }), signal: abort.signal, }); + if (res.status === 204) { + setSilenceStrikes((n) => Math.min(n + 1, 99)); + return; + } if (!res.ok) { setSilenceStrikes((n) => Math.min(n + 1, 99)); return; } - const json = (await res.json()) as BeatAudioResponse; - audio = json.audio; - // Null audio usually means MiMo rate-limited or timed out the shared - // key — track the streak; a real clip resets it. - if (audio) setSilenceStrikes(0); - else setSilenceStrikes((n) => Math.min(n + 1, 99)); + const blob = await res.blob(); + audioUrl = URL.createObjectURL(blob); + setSilenceStrikes(0); } // Skip the state write if we've been aborted between the await and // here — beat ids are scene-local, so a late arrival from a prior // scene would otherwise overwrite the current scene's audio under the // same id. 
- if (audio && !abort.signal.aborted) { - const settled = audio; - setBeatAudioMap((m) => ({ ...m, [beat.id]: settled })); + if (audioUrl && !abort.signal.aborted) { + setBeatAudioMap((m) => ({ ...m, [beat.id]: audioUrl })); + } else if (audioUrl?.startsWith("blob:")) { + URL.revokeObjectURL(audioUrl); } } catch { // aborted / network / Xiaomi rate-limit — silent fallback (no audio) @@ -685,7 +717,12 @@ function PlayInner() { // scenes) so a late arrival would land under the wrong beat otherwise. useEffect(() => { cancelBeatAudioFetches(); - setBeatAudioMap({}); + setBeatAudioMap((prev) => { + for (const url of Object.values(prev)) { + if (url.startsWith("blob:")) URL.revokeObjectURL(url); + } + return {}; + }); prefetchSceneAudio(); }, [currentScene?.id, prefetchSceneAudio]); @@ -720,7 +757,12 @@ function PlayInner() { if (prev === muted) return; cancelBeatAudioFetches(); if (muted) return; - setBeatAudioMap({}); + setBeatAudioMap((prev) => { + for (const url of Object.values(prev)) { + if (url.startsWith("blob:")) URL.revokeObjectURL(url); + } + return {}; + }); prefetchSceneAudio(); }, [muted, prefetchSceneAudio]); @@ -738,7 +780,12 @@ function PlayInner() { if (cfg) { setSilenceStrikes(0); cancelBeatAudioFetches(); - setBeatAudioMap({}); + setBeatAudioMap((prev) => { + for (const url of Object.values(prev)) { + if (url.startsWith("blob:")) URL.revokeObjectURL(url); + } + return {}; + }); prefetchSceneAudio(); } }, @@ -1042,7 +1089,10 @@ function PlayInner() { visitedBeatIds: [result.scene.entryBeatId], }, ], - characters: result.characters, + characters: mergeCharactersPreserveVoice( + base.characters, + result.characters, + ), storyState: result.storyState, }; visitedBeatsRef.current = [result.scene.entryBeatId]; @@ -1121,7 +1171,7 @@ function PlayInner() { ...getByoHeaders(), }, body: JSON.stringify({ - session: specSession, + session: stripVoicesForTransport(specSession), clientTts: !!byoTtsRef.current, }), }); @@ -1148,7 +1198,7 @@ function PlayInner() { 
"Content-Type": "application/json", ...getByoHeaders(), }, - body: JSON.stringify({ session, annotatedImageBase64 }), + body: JSON.stringify({ session: stripVoicesForTransport(session), annotatedImageBase64 }), }); if (!visionRes.ok) { const j = (await visionRes.json().catch(() => ({}))) as { @@ -1168,7 +1218,7 @@ function PlayInner() { ...getByoHeaders(), }, body: JSON.stringify({ - session, + session: stripVoicesForTransport(session), freeformAction: decision.intent.freeformAction, clientTts: !!byoTtsRef.current, }), @@ -1206,7 +1256,10 @@ function PlayInner() { history: session.history.map((h, i, arr) => i === arr.length - 1 ? { ...h, scene: patched } : h, ), - characters: insertChars, + characters: mergeCharactersPreserveVoice( + session.characters, + insertChars, + ), }; setSession(nextSession); setCurrentScene(patched); @@ -1252,7 +1305,7 @@ function PlayInner() { ...getByoHeaders(), }, body: JSON.stringify({ - session: specSession, + session: stripVoicesForTransport(specSession), clientTts: !!byoTtsRef.current, }), }); @@ -1321,8 +1374,7 @@ function PlayInner() {
{ setAudioDurationMs(undefined); - if (!audioBase64) return; + if (!audioSrc) return; const timer = setTimeout(() => { setAudioDurationMs((prev) => prev ?? 0); }, AUDIO_WAIT_TIMEOUT_MS); return () => clearTimeout(timer); - }, [audioBase64]); + }, [audioSrc]); // ── Mute toggle ─────────────────────────────────────────────────────── useEffect(() => { @@ -230,12 +228,12 @@ export function PlayCanvas({ if (!el) return; el.muted = muted; el.playbackRate = SPEECH_RATE; - if (!muted && audioBase64 && el.paused) { + if (!muted && audioSrc && el.paused) { el.play().catch(() => { // autoplay blocked — silent until next interaction }); } - }, [muted, audioBase64]); + }, [muted, audioSrc]); function handleAudioMetadata() { const el = audioRef.current; @@ -341,11 +339,11 @@ export function PlayCanvas({ className={`flex flex-col items-center ${fullViewport ? "w-full h-full justify-center" : "w-full"}`} > {/* Hidden audio element — voice playback for the current beat */} - {audioBase64 && ( + {audioSrc && (