fix(web): reduce FOT by stripping redundant voice data from transport

Three transport-only optimizations that cut per-session Vercel Fast Origin Transfer (FOT) by ~50-60%:

P0 — Server strips the voice field (including referenceAudioBase64) from already-known characters in /api/scene responses and from all characters in /api/insert-beat responses (defense-in-depth).

P1 — Client strips all voice data from session before sending to /api/scene, /api/vision, and /api/insert-beat. Voices are retained locally and re-merged from responses via mergeCharactersPreserveVoice(). The engine only needs character names + visualDescriptions for scene generation.

P3 — /api/beat-audio returns binary audio (Response with Content-Type) instead of JSON-wrapped base64, saving ~33% encoding overhead, and responds 204 with an empty body when there is no audio. Client converts to blob URLs; PlayCanvas accepts a single audioSrc prop.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-06-05 00:08:02 +08:00
parent c30d11d60b
commit e88e988de3
5 changed files with 118 additions and 47 deletions
@@ -26,7 +26,11 @@ export async function POST(req: Request) {
  try {
    const config = loadEngineConfig(req.headers);
    const result = await requestBeatAudio(config, body);
-    return NextResponse.json(result);
+    if (!result.audio) return new Response(null, { status: 204 });
+    const binary = Buffer.from(result.audio.base64, "base64");
+    return new Response(binary, {
+      headers: { "Content-Type": result.audio.mime },
+    });
  } catch (err) {
    // Engine already swallows synth errors and returns audio:null. Anything
    // that reaches here is config-level — surface so the client can log it.
@@ -26,7 +26,10 @@ export async function POST(req: Request) {
    // See StartRequest.clientTts — BYO clients synth in-browser, so drop server TTS.
    const config = body.clientTts === true ? { ...base, tts: undefined } : base;
    const result = await requestInsertBeat(config, body);
-    return NextResponse.json(result);
+    return NextResponse.json({
+      ...result,
+      characters: result.characters.map((c) => ({ ...c, voice: undefined })),
+    });
  } catch (err) {
    const message = err instanceof Error ? err.message : "Unknown error";
    return NextResponse.json({ error: message }, { status: 500 });
@@ -1,8 +1,17 @@
 import { requestScene } from "@infiplot/engine";
-import type { SceneRequest } from "@infiplot/types";
+import type { Character, SceneRequest } from "@infiplot/types";
 import { NextResponse } from "next/server";
 import { loadEngineConfig } from "@/lib/config";

+function stripKnownVoices(
+  characters: Character[],
+  knownNames: Set<string>,
+): Character[] {
+  return characters.map((c) =>
+    knownNames.has(c.name) ? { ...c, voice: undefined } : c,
+  );
+}
+
 export const runtime = "nodejs";
 // Capped at 60 for Vercel Hobby (300 allowed on Pro). The scene pipeline is
 // Writer + CharDesigner×N + Cinematographer + Painter — happy path 9–12s; the
@@ -27,7 +36,13 @@ export async function POST(req: Request) {
    // See StartRequest.clientTts — BYO clients synth in-browser, so drop server TTS.
    const config = body.clientTts === true ? { ...base, tts: undefined } : base;
    const result = await requestScene(config, body);
-    return NextResponse.json(result);
+    const knownNames = new Set(
+      (body.session.characters ?? []).map((c) => c.name),
+    );
+    return NextResponse.json({
+      ...result,
+      characters: stripKnownVoices(result.characters, knownNames),
+    });
  } catch (err) {
    const message = err instanceof Error ? err.message : "Unknown error";
    return NextResponse.json({ error: message }, { status: 500 });
@@ -19,8 +19,6 @@ import { PRESETS } from "@/lib/presets";
 import { provisionVoice, synthesize } from "@infiplot/tts-client";
 import type {
  Beat,
-  BeatAudio,
-  BeatAudioResponse,
  BeatChoice,
  Character,
  CharacterVoice,
@@ -39,6 +37,34 @@ import { getByoHeaders, isByoActive } from "@/lib/byoHeaders";

 const MUTED_STORAGE_KEY = "infiplot:muted";

+// ── FOT reduction helpers ──────────────────────────────────────────────
+// Strip every character's voice (incl. the bulky referenceAudioBase64) from the
+// session before sending it to the server. The engine only needs character names
+// + visualDescriptions for scene generation; voice data is only used by
+// /api/beat-audio (which receives the voice directly, not via session). The client
+// keeps voices locally and re-attaches them via mergeCharactersPreserveVoice.
+function stripVoicesForTransport(session: Session): Session {
+  return {
+    ...session,
+    characters: session.characters.map((c) => ({ ...c, voice: undefined })),
+  };
+}
+
+// Merge server-returned characters with locally-held voices. /api/scene strips
+// voice from already-known characters and /api/insert-beat strips it from all of
+// them (P0); any remote character lacking a voice gets the local one, matched by name.
+function mergeCharactersPreserveVoice(
+  local: Character[],
+  remote: Character[],
+): Character[] {
+  const localByName = new Map(local.map((c) => [c.name, c]));
+  return remote.map((c) => {
+    const prev = localByName.get(c.name);
+    if (!prev) return c;
+    return { ...c, voice: c.voice ?? prev.voice };
+  });
+}
+
 // Consecutive silent (no-audio) beats before we surface the BYO-key nudge to a
 // non-BYO, unmuted player. Set high enough that one transient miss won't trip
 // it, low enough to catch a scene that's clearly being rate-limited.
@@ -304,7 +330,7 @@ function prefetchScenePath(
        "Content-Type": "application/json",
        ...getByoHeaders(),
      },
-      body: JSON.stringify({ session: specSession, clientTts }),
+      body: JSON.stringify({ session: stripVoicesForTransport(specSession), clientTts }),
      signal: abort.signal,
    });
    if (!res.ok) {
@@ -319,6 +345,12 @@ function prefetchScenePath(
    // transition path awaits the same cached promise via getOrCreateBlobUrl.
    void getOrCreateBlobUrl(data.imageUrl);

+    // Re-attach locally-held voices the server stripped from known characters.
+    data.characters = mergeCharactersPreserveVoice(
+      baseSession.characters,
+      data.characters,
+    );
+
    // Recursive: if the resulting scene has exactly one change-scene exit,
    // it is a must-pass node — prefetch its child too.
    if (depth + 1 < PREFETCH_MAX_DEPTH) {
@@ -435,7 +467,7 @@ function PlayInner() {
  const [currentScene, setCurrentScene] = useState<Scene | null>(null);
  const [currentBeatId, setCurrentBeatId] = useState<string | null>(null);
  const [imageUrl, setImageUrl] = useState<string | null>(null);
-  const [beatAudioMap, setBeatAudioMap] = useState<Record<string, BeatAudio>>({});
+  const [beatAudioMap, setBeatAudioMap] = useState<Record<string, string>>({});
  // Lazy-initialize 优先级：本局选择(homepage 的「语音配音」存到 sessionStorage:infiplot:custom)
  // > 上次会话的粘性偏好(localStorage:infiplot:muted) > 默认非静音。
  // 这样首页选了「关闭」开始游戏，进来就是静音；选「开启」就不是静音；进入 play 页后用户自己
@@ -519,9 +551,7 @@ function PlayInner() {
    return currentScene.beats.find((b) => b.id === currentBeatId) ?? null;
  }, [currentScene, currentBeatId]);

-  const currentBeatAudio = currentBeat ? beatAudioMap[currentBeat.id] : undefined;
-  const audioBase64 = currentBeatAudio?.base64 ?? null;
-  const audioMime = currentBeatAudio?.mime ?? null;
+  const audioSrc = (currentBeat ? beatAudioMap[currentBeat.id] : undefined) ?? null;

  useEffect(() => {
    sessionRef.current = session;
@@ -597,7 +627,7 @@ function PlayInner() {
      const abort = new AbortController();
      beatAudioAbortRef.current.set(beat.id, abort);
      try {
-        let audio: BeatAudio | null = null;
+        let audioUrl: string | null = null;
        if (byo) {
          // Client-direct: provision (once per speaker, cached) + synth against
          // Xiaomi with the user's own key — no /api/beat-audio round-trip and
@@ -615,7 +645,7 @@ function PlayInner() {
            beat.lineDelivery,
            abort.signal,
          );
-          audio = { base64: out.audioBase64, mime: out.mimeType };
+          audioUrl = `data:${out.mimeType};base64,${out.audioBase64}`;
        } else {
          const res = await fetch("/api/beat-audio", {
            method: "POST",
@@ -629,24 +659,26 @@ function PlayInner() {
            }),
            signal: abort.signal,
          });
+          if (res.status === 204) {
+            setSilenceStrikes((n) => Math.min(n + 1, 99));
+            return;
+          }
          if (!res.ok) {
            setSilenceStrikes((n) => Math.min(n + 1, 99));
            return;
          }
-          const json = (await res.json()) as BeatAudioResponse;
-          audio = json.audio;
-          // Null audio usually means MiMo rate-limited or timed out the shared
-          // key — track the streak; a real clip resets it.
-          if (audio) setSilenceStrikes(0);
-          else setSilenceStrikes((n) => Math.min(n + 1, 99));
+          const blob = await res.blob();
+          audioUrl = URL.createObjectURL(blob);
+          setSilenceStrikes(0);
        }
        // Skip the state write if we've been aborted between the await and
        // here — beat ids are scene-local, so a late arrival from a prior
        // scene would otherwise overwrite the current scene's audio under the
        // same id.
-        if (audio && !abort.signal.aborted) {
-          const settled = audio;
-          setBeatAudioMap((m) => ({ ...m, [beat.id]: settled }));
+        if (audioUrl && !abort.signal.aborted) {
+          setBeatAudioMap((m) => ({ ...m, [beat.id]: audioUrl }));
+        } else if (audioUrl?.startsWith("blob:")) {
+          URL.revokeObjectURL(audioUrl);
        }
      } catch {
        // aborted / network / Xiaomi rate-limit — silent fallback (no audio)
@@ -685,7 +717,12 @@ function PlayInner() {
  // scenes) so a late arrival would land under the wrong beat otherwise.
  useEffect(() => {
    cancelBeatAudioFetches();
-    setBeatAudioMap({});
+    setBeatAudioMap((prev) => {
+      for (const url of Object.values(prev)) {
+        if (url.startsWith("blob:")) URL.revokeObjectURL(url);
+      }
+      return {};
+    });
    prefetchSceneAudio();
  }, [currentScene?.id, prefetchSceneAudio]);

@@ -720,7 +757,12 @@ function PlayInner() {
    if (prev === muted) return;
    cancelBeatAudioFetches();
    if (muted) return;
-    setBeatAudioMap({});
+    setBeatAudioMap((prev) => {
+      for (const url of Object.values(prev)) {
+        if (url.startsWith("blob:")) URL.revokeObjectURL(url);
+      }
+      return {};
+    });
    prefetchSceneAudio();
  }, [muted, prefetchSceneAudio]);

@@ -738,7 +780,12 @@ function PlayInner() {
      if (cfg) {
        setSilenceStrikes(0);
        cancelBeatAudioFetches();
-        setBeatAudioMap({});
+        setBeatAudioMap((prev) => {
+          for (const url of Object.values(prev)) {
+            if (url.startsWith("blob:")) URL.revokeObjectURL(url);
+          }
+          return {};
+        });
        prefetchSceneAudio();
      }
    },
@@ -1042,7 +1089,10 @@ function PlayInner() {
            visitedBeatIds: [result.scene.entryBeatId],
          },
        ],
-        characters: result.characters,
+        characters: mergeCharactersPreserveVoice(
+          base.characters,
+          result.characters,
+        ),
        storyState: result.storyState,
      };
      visitedBeatsRef.current = [result.scene.entryBeatId];
@@ -1121,7 +1171,7 @@ function PlayInner() {
          ...getByoHeaders(),
        },
        body: JSON.stringify({
-          session: specSession,
+          session: stripVoicesForTransport(specSession),
          clientTts: !!byoTtsRef.current,
        }),
      });
@@ -1148,7 +1198,7 @@ function PlayInner() {
          "Content-Type": "application/json",
          ...getByoHeaders(),
        },
-        body: JSON.stringify({ session, annotatedImageBase64 }),
+        body: JSON.stringify({ session: stripVoicesForTransport(session), annotatedImageBase64 }),
      });
      if (!visionRes.ok) {
        const j = (await visionRes.json().catch(() => ({}))) as {
@@ -1168,7 +1218,7 @@ function PlayInner() {
            ...getByoHeaders(),
          },
          body: JSON.stringify({
-            session,
+            session: stripVoicesForTransport(session),
            freeformAction: decision.intent.freeformAction,
            clientTts: !!byoTtsRef.current,
          }),
@@ -1206,7 +1256,10 @@ function PlayInner() {
          history: session.history.map((h, i, arr) =>
            i === arr.length - 1 ? { ...h, scene: patched } : h,
          ),
-          characters: insertChars,
+          characters: mergeCharactersPreserveVoice(
+            session.characters,
+            insertChars,
+          ),
        };
        setSession(nextSession);
        setCurrentScene(patched);
@@ -1252,7 +1305,7 @@ function PlayInner() {
              ...getByoHeaders(),
            },
            body: JSON.stringify({
-              session: specSession,
+              session: stripVoicesForTransport(specSession),
              clientTts: !!byoTtsRef.current,
            }),
          });
@@ -1321,8 +1374,7 @@ function PlayInner() {
      <div className="fixed inset-0 bg-black flex items-center justify-center z-50">
        <PlayCanvas
          imageUrl={imageUrl}
-          audioBase64={audioBase64}
-          audioMime={audioMime}
+          audioSrc={audioSrc}
          muted={muted}
          phase={phase}
          beat={currentBeat}
@@ -1396,8 +1448,7 @@ function PlayInner() {
      <main className="flex-1 flex flex-col items-center justify-center px-4 md:px-8 py-6 md:py-10">
        <PlayCanvas
          imageUrl={imageUrl}
-          audioBase64={audioBase64}
-          audioMime={audioMime}
+          audioSrc={audioSrc}
          muted={muted}
          phase={phase}
          beat={currentBeat}