From e261f4a346c1cd8b3b3ac79b93e94a17d9527d61 Mon Sep 17 00:00:00 2001
From: Zonghao Yuan <64521992+zonghaoyuan@users.noreply.github.com>
Date: Thu, 28 May 2026 23:43:51 +0800
Subject: [PATCH] feat: Runware FLUX.2 image + lazy per-beat TTS (#5)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reduce median scene-load latency from ~30-80s to ~17-25s by switching image generation to Runware FLUX.2 [klein] 9B KV and moving per-beat TTS synthesis off the scene response into a new lazy /api/beat-audio endpoint with hard timeout + abort support.

- feat(image): migrate to Runware FLUX.2 [klein] 9B KV — task-array API, $0.001/image, sub-second inference.
- feat(tts): slim /api/scene down to directScene + image + voicedesign provisioning; synthesize each beat lazily via /api/beat-audio with a 15s hard timeout and an AbortSignal threaded through to MiMo, so timed-out calls don't keep burning sockets/quota; on scene-id change the client aborts in-flight fetches and fans out one fetch per beat, with an identity check in `finally`, to prevent cross-scene beat-id collisions.
- refactor(tts): slim BeatAudioRequest to { beat, voice } — ~800KB per-beat upload dropped to ~160KB by sending only the speaker's voice instead of the full session.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
---
 README.md                            |   6 +-
 apps/web/.env.example                |  24 ++--
 apps/web/app/api/beat-audio/route.ts |  36 +++++
 apps/web/app/play/page.tsx           | 118 +++++++++++++---
 packages/ai-client/src/image.ts      |  99 +++++++-------
 packages/engine/src/index.ts         |   3 +-
 packages/engine/src/orchestrator.ts  | 133 +++++++++++-------
 packages/engine/src/voice.ts         | 197 ++++++++++++++++-----------
 packages/tts-client/src/xiaomi.ts    |   2 +
 packages/types/src/index.ts          |  27 +++-
 10 files changed, 431 insertions(+), 214 deletions(-)
 create mode 100644 apps/web/app/api/beat-audio/route.ts

diff --git a/README.md b/README.md
index c700021..5b8bc23 100644
--- a/README.md
+++ b/README.md
@@ -45,12 +45,12 @@ After deploy, set the nine environment variables (see below) in your Vercel proj
 
 ## Environment variables
 
-Three providers, all independently configurable. Any OpenAI-compatible chat / image endpoint works (OpenAI, Anthropic via OpenAI-compat proxy, Gemini, OpenRouter, DeepSeek, local Ollama, …).
+Three providers, all independently configurable. Text and Vision accept any OpenAI-compatible endpoint (OpenAI, Anthropic via OpenAI-compat proxy, Gemini, OpenRouter, DeepSeek, local Ollama, …). Image goes to **Runware** (its own task-array protocol, not OpenAI-compatible).
 
 | Provider | Variables | Recommended |
 |---|---|---|
 | Text · story director | `TEXT_BASE_URL` `TEXT_API_KEY` `TEXT_MODEL` | `claude-opus-4-7` via Anthropic |
-| Image · UI renderer   | `IMAGE_BASE_URL` `IMAGE_API_KEY` `IMAGE_MODEL` | `gpt-image-2` via OpenAI |
+| Image · UI renderer   | `IMAGE_BASE_URL` `IMAGE_API_KEY` `IMAGE_MODEL` | `runware:400@6` (FLUX.2 [klein] 9B KV) via [Runware](https://runware.ai) |
 | Vision · click reader | `VISION_BASE_URL` `VISION_API_KEY` `VISION_MODEL` | `gemini-3-flash` via Google |
 
 See `apps/web/.env.example` for the exact shape.
@@ -88,4 +88,4 @@ yume/
 
 ## Cost & limits
 
-Each **scene** costs roughly **\$0.15–0.25** in API fees with the recommended model trio (one text + one image call); tapping through a scene's beats is free. To keep transitions instant, the engine also **pre-generates scenes you might pick but don't** — so real spend runs somewhat higher than the scenes you actually see. There is no rate limiting or auth out of the box — if you make your deployment public, your bill will reflect that. Add limits (and consider lowering the prefetch depth) before sharing widely.
+With the recommended trio, the cost of each **scene** is dominated by the text-LLM call. The FLUX.2 [klein] 9B KV image is roughly **\$0.001** per scene (1792×1024, 4 steps, sub-second); the text call accounts for the rest. Tapping through a scene's beats is free. To keep transitions instant, the engine also **pre-generates scenes you might pick but don't** — so real spend runs somewhat higher than the scenes you actually see. There is no rate limiting or auth out of the box — if you make your deployment public, your bill will reflect that. Add limits (and consider lowering the prefetch depth) before sharing widely.
diff --git a/apps/web/.env.example b/apps/web/.env.example
index aa0d983..20b7700 100644
--- a/apps/web/.env.example
+++ b/apps/web/.env.example
@@ -1,12 +1,14 @@
 # =============================================================
 # 云梦 — AI 视觉小说
 # Recommended setup: Xiaomi MiMo Token Plan for TEXT / VISION / TTS
-# (one API key covers all three) + any image provider for IMAGE.
+# (one API key covers all three) + Runware for IMAGE (FLUX.2 [klein]).
 #
-# Any OpenAI-compatible endpoint works for any slot — OpenRouter,
-# OpenAI, Anthropic via OpenAI-compat proxy, Gemini, DeepSeek, etc.
-# Image generation uses the chat-completions + modalities API
-# (OpenRouter-style), NOT the legacy /images/generations endpoint.
+# TEXT / VISION / TTS use OpenAI-compatible endpoints (any OpenAI-
+# compatible host works: OpenRouter, OpenAI, Anthropic via proxy,
+# Gemini, DeepSeek, Ollama, ...).
+#
+# IMAGE uses Runware's own task-array protocol (not OpenAI-compatible);
+# the adapter posts an `imageInference` task to IMAGE_BASE_URL.
 # =============================================================
 
 # ---- 1. Text LLM · scene director ----------------------------------
@@ -18,10 +20,14 @@ TEXT_API_KEY=tp-xxx
 TEXT_MODEL=mimo-v2.5-pro
 
 # ---- 2. Image generator (renders the scene background) -------------
-# Any provider supporting chat-completions + modalities image output.
-IMAGE_BASE_URL=https://openrouter.ai/api/v1
-IMAGE_API_KEY=sk-or-v1-xxx
-IMAGE_MODEL=openai/gpt-5.4-image-2
+# Recommended: Runware + FLUX.2 [klein] 9B KV — distilled 4-step model,
+# sub-second inference at ~$0.0008/image. Sign up at https://runware.ai
+# AIR ids for FLUX.2 [klein] variants:
+#   runware:400@1  · 4B (smaller)
+#   runware:400@6  · 9B KV (recommended — fastest at 16:9)
+IMAGE_BASE_URL=https://api.runware.ai/v1
+IMAGE_API_KEY=runware-xxx
+IMAGE_MODEL=runware:400@6
 
 # ---- 3. Vision model · multimodal click interpretation -------------
 # Recommended: MiMo V2.5 omni — multimodal.
diff --git a/apps/web/app/api/beat-audio/route.ts b/apps/web/app/api/beat-audio/route.ts
new file mode 100644
index 0000000..a41fd33
--- /dev/null
+++ b/apps/web/app/api/beat-audio/route.ts
@@ -0,0 +1,36 @@
+import { requestBeatAudio } from "@yume/engine";
+import type { BeatAudioRequest } from "@yume/types";
+import { NextResponse } from "next/server";
+import { loadEngineConfig } from "@/lib/config";
+
+export const runtime = "nodejs";
+// The synth itself has a 15s per-call ceiling in the engine. 30s here just
+// covers JSON parsing + outbound network buffer.
+export const maxDuration = 30;
+
+/** Validates the { beat, voice } payload, then synthesizes that beat's audio. */
+export async function POST(req: Request) {
+  let body: BeatAudioRequest | null;
+  try {
+    body = (await req.json()) as BeatAudioRequest | null;
+  } catch {
+    return NextResponse.json({ error: "Invalid JSON" }, { status: 400 });
+  }
+
+  // `null` is valid JSON, so guard `body` itself before reaching into it.
+  if (!body?.beat?.id || !body.beat.line || !body.voice?.referenceAudioBase64) {
+    const error = "beat.id, beat.line and voice.referenceAudioBase64 are required";
+    return NextResponse.json({ error }, { status: 400 });
+  }
+
+  try {
+    const config = loadEngineConfig();
+    const result = await requestBeatAudio(config, body);
+    return NextResponse.json(result);
+  } catch (err) {
+    // Engine already swallows synth errors and returns audio:null. Anything
+    // that reaches here is config-level — surface so the client can log it.
+    const message = err instanceof Error ? err.message : "Unknown error";
+    return NextResponse.json({ error: message }, { status: 500 });
+  }
+}
diff --git a/apps/web/app/play/page.tsx b/apps/web/app/play/page.tsx
index 586f4ac..b71f21e 100644
--- a/apps/web/app/play/page.tsx
+++ b/apps/web/app/play/page.tsx
@@ -15,6 +15,7 @@ import { PRESETS } from "@/lib/presets";
 import type {
   Beat,
   BeatAudio,
+  BeatAudioResponse,
   BeatChoice,
   InsertBeatResponse,
   Scene,
@@ -215,6 +216,10 @@ function PlayInner() {
 
   const startedRef = useRef(false);
   const poolRef = useRef<Map<string, PrefetchEntry>>(new Map());
+  // Lazy per-beat audio fetches keyed by beat.id. Aborted when the scene
+  // changes so stale in-flight requests can't poison the new scene's map
+  // (beat ids like "b1" are scene-local and would collide across scenes).
+  const beatAudioAbortRef = useRef<Map<string, AbortController>>(new Map());
 
   // Mirrors for use inside async handlers (closure-stable)
   const sessionRef = useRef<Session | null>(null);
@@ -259,6 +264,79 @@ function PlayInner() {
     });
   }, [currentBeatId]);
 
+  // ── Lazy per-beat audio fetch ────────────────────────────────────────
+  // Returns silently on any failure — the UI never waits for audio, so a
+  // null result just means that beat plays without voice.
+  // Sends only the speaker's voice + the line to speak — NOT the whole
+  // session — so the per-beat payload stays small even with many characters
+  // (each voice.referenceAudioBase64 is ~160KB).
+  const fetchBeatAudio = useCallback(
+    async (
+      sess: Session,
+      beat: { id: string; speaker?: string; line?: string; lineDelivery?: string },
+    ): Promise<void> => {
+      if (!beat.speaker || !beat.line) return;
+      const speaker = sess.characters.find((c) => c.name === beat.speaker);
+      if (!speaker?.voice) return; // not yet provisioned — server can't synth anyway
+      if (beatAudioAbortRef.current.has(beat.id)) return;
+      const abort = new AbortController();
+      beatAudioAbortRef.current.set(beat.id, abort);
+      try {
+        const res = await fetch("/api/beat-audio", {
+          method: "POST",
+          headers: { "Content-Type": "application/json" },
+          body: JSON.stringify({
+            beat: { id: beat.id, line: beat.line, lineDelivery: beat.lineDelivery },
+            voice: speaker.voice,
+          }),
+          signal: abort.signal,
+        });
+        if (!res.ok) return;
+        const json = (await res.json()) as BeatAudioResponse;
+        // Skip the state write if we've been aborted between the .ok check and
+        // here — beat ids are scene-local, so a late arrival from a prior
+        // scene would otherwise overwrite the current scene's audio under the
+        // same id.
+        if (json.audio && !abort.signal.aborted) {
+          setBeatAudioMap((m) => ({ ...m, [beat.id]: json.audio as BeatAudio }));
+        }
+      } catch {
+        // aborted or network error — silent fallback
+      } finally {
+        // Only clear the slot if it's still ours. An aborted prior fetch
+        // running its finally late could otherwise delete the controller of a
+        // new fetch that took the same beat id, leaving the new one
+        // unabortable on the next scene change.
+        if (beatAudioAbortRef.current.get(beat.id) === abort) {
+          beatAudioAbortRef.current.delete(beat.id);
+        }
+      }
+    },
+    [],
+  );
+
+  function cancelBeatAudioFetches(): void {
+    for (const c of beatAudioAbortRef.current.values()) c.abort();
+    beatAudioAbortRef.current.clear();
+  }
+
+  // Fire one /api/beat-audio request per speaking beat each time the scene
+  // changes. Cancel any in-flight requests from the prior scene first —
+  // beat ids are scene-local ("b1" repeats across scenes) so a late arrival
+  // would land under the wrong beat in the audio map otherwise.
+  useEffect(() => {
+    cancelBeatAudioFetches();
+    setBeatAudioMap({});
+    const scene = currentScene;
+    const sess = sessionRef.current;
+    if (!scene || !sess) return;
+    for (const b of scene.beats) {
+      if (b.speaker && b.line) {
+        void fetchBeatAudio(sess, b);
+      }
+    }
+  }, [currentScene?.id, fetchBeatAudio]);
+
   // ── Mute persistence (read is via the useState lazy initializer above) ─
   const toggleMuted = useCallback(() => {
     setMuted((prev) => {
@@ -375,7 +453,8 @@ function PlayInner() {
         setCurrentScene(data.scene);
         setCurrentBeatId(data.scene.entryBeatId);
         setImageBase64(data.imageBase64);
-        setBeatAudioMap(data.beatAudio ?? {});
+        // beatAudioMap is populated lazily by the per-beat fetch effect once
+        // currentScene becomes non-null (see fetchBeatAudio).
         setPhase("ready");
       })
       .catch((e) => setError(String(e)));
@@ -410,8 +489,11 @@ function PlayInner() {
   // consumeChoice keeping the re-rooted survivor prefetches alive.
   useEffect(() => {
     const pool = poolRef.current;
+    const beatAborts = beatAudioAbortRef.current;
     return () => {
       clearPool(pool);
+      for (const c of beatAborts.values()) c.abort();
+      beatAborts.clear();
     };
   }, []);
 
@@ -459,7 +541,7 @@ function PlayInner() {
       setCurrentScene(result.scene);
       setCurrentBeatId(result.scene.entryBeatId);
       setImageBase64(result.imageBase64);
-      setBeatAudioMap(result.beatAudio ?? {});
+      // beatAudioMap reset + per-beat fetches kicked off by the scene effect.
       setLastExitLabel(exitLabel);
       setPhase("ready");
     } catch (e) {
@@ -559,7 +641,7 @@ function PlayInner() {
           };
           throw new Error(j.error ?? insertRes.statusText);
         }
-        const { partial, characters: insertChars, audio } =
+        const { partial, characters: insertChars } =
           (await insertRes.json()) as InsertBeatResponse;
 
         const fromBeatId =
@@ -581,21 +663,25 @@ function PlayInner() {
           beats: [...currentScene.beats, newBeat],
         };
 
-        setSession((s) =>
-          s
-            ? {
-                ...s,
-                history: s.history.map((h, i, arr) =>
-                  i === arr.length - 1 ? { ...h, scene: patched } : h,
-                ),
-                characters: insertChars,
-              }
-            : s,
-        );
+        const nextSession: Session = {
+          ...session,
+          history: session.history.map((h, i, arr) =>
+            i === arr.length - 1 ? { ...h, scene: patched } : h,
+          ),
+          characters: insertChars,
+        };
+        setSession(nextSession);
         setCurrentScene(patched);
         setCurrentBeatId(newBeatId);
-        if (audio) {
-          setBeatAudioMap((m) => ({ ...m, [newBeatId]: audio }));
+        // Insert-beat doesn't change scene.id, so the scene effect won't
+        // re-fire — manually kick off the audio fetch for the new beat.
+        if (newBeat.speaker && newBeat.line) {
+          void fetchBeatAudio(nextSession, {
+            id: newBeatId,
+            speaker: newBeat.speaker,
+            line: newBeat.line,
+            lineDelivery: newBeat.lineDelivery,
+          });
         }
         setLastExitLabel(decision.intent.freeformAction);
         setPhase("ready");
diff --git a/packages/ai-client/src/image.ts b/packages/ai-client/src/image.ts
index 72cddb6..837e8de 100644
--- a/packages/ai-client/src/image.ts
+++ b/packages/ai-client/src/image.ts
@@ -1,28 +1,43 @@
 import type { ProviderConfig } from "@yume/types";
 import { fetchWithRetry } from "./fetchWithRetry";
 
-type ImageUrlPart = { type: string; image_url?: { url?: string } };
-type ChatResponse = {
-  choices: {
-    message: {
-      content: string | ImageUrlPart[];
-      images?: ImageUrlPart[];
-    };
-  }[];
+// Runware uses its own task-array protocol (not OpenAI-compatible).
+// POST <baseUrl> with [{ taskType: "imageInference", ... }]; errors come
+// back as a 200 with `errors[]`, so we have to inspect the body either way.
+type RunwareImageResult = {
+  imageBase64Data?: string;
+};
+type RunwareError = {
+  code?: string;
+  message?: string;
+  parameter?: string;
+};
+type RunwareResponse = {
+  data?: RunwareImageResult[];
+  errors?: RunwareError[];
 };
 
 export async function generateImage(
   config: ProviderConfig,
   prompt: string,
 ): Promise<string> {
-  const url = `${config.baseUrl.replace(/\/$/, "")}/chat/completions`;
+  const url = config.baseUrl.replace(/\/$/, "");
 
-  const body = {
-    model: config.model,
-    modalities: ["image", "text"],
-    size: "1792x1024",
-    messages: [{ role: "user", content: prompt }],
-  };
+  const body = [
+    {
+      taskType: "imageInference",
+      taskUUID: crypto.randomUUID(),
+      model: config.model,
+      positivePrompt: prompt,
+      width: 1792,
+      height: 1024,
+      steps: 4,
+      CFGScale: 3.5,
+      numberResults: 1,
+      outputType: "base64Data",
+      outputFormat: "PNG",
+    },
+  ];
 
   const res = await fetchWithRetry(url, {
     method: "POST",
@@ -33,47 +48,27 @@ export async function generateImage(
     body: JSON.stringify(body),
   });
 
-  if (!res.ok) {
-    const text = await res.text();
+  const text = await res.text();
+  let json: RunwareResponse;
+  try {
+    json = JSON.parse(text) as RunwareResponse;
+  } catch {
     throw new Error(`Image API error ${res.status}: ${text.slice(0, 500)}`);
   }
 
-  const json = (await res.json()) as ChatResponse;
-  const msg = json.choices[0]?.message;
-  if (!msg) throw new Error("Image API returned no message");
-
-  // 1) OpenRouter-style: msg.images = [{ image_url: { url } }]
-  // 2) OpenAI multimodal: msg.content = [{ type: "image_url", image_url: { url } }]
-  const structured: ImageUrlPart[] = [];
-  if (msg.images) structured.push(...msg.images);
-  if (Array.isArray(msg.content)) structured.push(...msg.content);
-  for (const part of structured) {
-    const u = part.image_url?.url;
-    if (u) return await urlToBase64(u);
+  if (json.errors?.length) {
+    const e = json.errors[0]!;
+    throw new Error(
+      `Runware error [${e.code ?? "unknown"}]: ${e.message ?? "no message"}` +
+        (e.parameter ? ` (parameter: ${e.parameter})` : ""),
+    );
   }
 
-  // 3) provider-style: content is a string with markdown image ![alt](url)
-  //    or a bare URL fragment
-  if (typeof msg.content === "string") {
-    const md = msg.content.match(/!\[[^\]]*\]\((https?:\/\/[^\s)]+)\)/);
-    if (md?.[1]) return await urlToBase64(md[1]);
-    const bare = msg.content.match(/https?:\/\/\S+?\.(?:png|jpg|jpeg|webp)/i);
-    if (bare?.[0]) return await urlToBase64(bare[0]);
+  const b64 = json.data?.[0]?.imageBase64Data;
+  if (!b64) {
+    throw new Error(
+      `No image in Runware response: ${text.slice(0, 300)}`,
+    );
   }
-
-  throw new Error(
-    `No image found in response: ${JSON.stringify(msg).slice(0, 300)}`,
-  );
-}
-
-async function urlToBase64(url: string): Promise<string> {
-  if (url.startsWith("data:")) {
-    const idx = url.indexOf("base64,");
-    if (idx === -1) throw new Error("data URL is not base64-encoded");
-    return url.slice(idx + "base64,".length);
-  }
-  const res = await fetch(url);
-  if (!res.ok) throw new Error(`Failed to fetch image url: ${res.status}`);
-  const buf = await res.arrayBuffer();
-  return Buffer.from(buf).toString("base64");
+  return b64;
 }
diff --git a/packages/engine/src/index.ts b/packages/engine/src/index.ts
index 3dcf386..9d96a48 100644
--- a/packages/engine/src/index.ts
+++ b/packages/engine/src/index.ts
@@ -3,9 +3,10 @@ export {
   requestScene,
   visionDecide,
   requestInsertBeat,
+  requestBeatAudio,
 } from "./orchestrator";
 export { annotateClick } from "./annotate";
-export { voiceBeat, voiceScene } from "./voice";
+export { provisionVoicesForScene, synthesizeBeat } from "./voice";
 export type { SceneResult } from "./director";
 export type { InsertBeatPartial } from "@yume/types";
 export * from "./prompts";
diff --git a/packages/engine/src/orchestrator.ts b/packages/engine/src/orchestrator.ts
index 0f9c290..d75e17b 100644
--- a/packages/engine/src/orchestrator.ts
+++ b/packages/engine/src/orchestrator.ts
@@ -1,5 +1,6 @@
 import type {
-  BeatAudio,
+  BeatAudioRequest,
+  BeatAudioResponse,
   Character,
   EngineConfig,
   InsertBeatRequest,
@@ -18,12 +19,17 @@ import { directInsertBeat, directScene } from "./director";
 import { mockImageBase64 } from "./mockImage";
 import { render } from "./renderer";
 import { interpret } from "./vision";
-import { voiceBeat, voiceScene } from "./voice";
+import { provisionVoicesForScene, synthesizeBeat } from "./voice";
 
 function newSessionId(): string {
   return `s_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`;
 }
 
+// TEMP: per-phase timing for latency diagnosis. Remove after we have data.
+function tlog(label: string, t0: number): void {
+  console.log(`${label}: ${Date.now() - t0}ms`);
+}
+
 // Merge new character entries into the registry by name. If a name already
 // exists we preserve the existing voice (so a description revision never
 // silently re-provisions a voice the player has already heard).
@@ -46,30 +52,26 @@ async function renderImage(
   return render(config.image, scene, styleGuide);
 }
 
-async function runVoiceScene(
+async function provisionForScene(
   config: EngineConfig,
   session: Session,
   scene: Scene,
-): Promise<{
-  beatAudio?: Record<string, BeatAudio>;
-  characters: Character[];
-}> {
+): Promise<{ characters: Character[] }> {
   if (!config.tts) return { characters: session.characters };
-  const res = await voiceScene(config.tts, session, scene);
-  return {
-    beatAudio: Object.keys(res.beatAudio).length ? res.beatAudio : undefined,
-    characters: res.characters,
-  };
+  return provisionVoicesForScene(config.tts, session, scene);
 }
 
 // ──────────────────────────────────────────────────────────────────────
-//  startSession — first scene + image + per-beat voice
+//  startSession — first scene + image + voice provisioning. The actual
+//  per-beat synth runs lazily via requestBeatAudio so MiMo's tail
+//  latency never blocks the UI.
 // ──────────────────────────────────────────────────────────────────────
 
 export async function startSession(
   config: EngineConfig,
   req: StartRequest,
 ): Promise<StartResponse> {
+  const tTotal = Date.now();
   const session: Session = {
     id: newSessionId(),
     createdAt: Date.now(),
@@ -79,28 +81,41 @@ export async function startSession(
     characters: [],
   };
 
+  const tDirect = Date.now();
   const { scene, characterUpdates } = await directScene(config.text, session);
+  tlog("[start] directScene", tDirect);
+
   const preVoiceSession: Session = {
     ...session,
     characters: mergeCharacters(session.characters, characterUpdates),
   };
 
-  const [imageBase64, voiceRes] = await Promise.all([
-    renderImage(config, scene, preVoiceSession.styleGuide),
-    runVoiceScene(config, preVoiceSession, scene),
-  ]);
+  const tImage = Date.now();
+  const tProv = Date.now();
+  const imagePromise = renderImage(config, scene, preVoiceSession.styleGuide)
+    .then((r) => {
+      tlog("[start] renderImage", tImage);
+      return r;
+    });
+  const provPromise = provisionForScene(config, preVoiceSession, scene)
+    .then((r) => {
+      tlog("[start] provisionForScene", tProv);
+      return r;
+    });
+  const [imageBase64, provRes] = await Promise.all([imagePromise, provPromise]);
+
+  tlog("[start] TOTAL", tTotal);
 
   return {
     sessionId: session.id,
     scene,
     imageBase64,
-    characters: voiceRes.characters,
-    beatAudio: voiceRes.beatAudio,
+    characters: provRes.characters,
   };
 }
 
 // ──────────────────────────────────────────────────────────────────────
-//  requestScene — generate the NEXT scene + image + per-beat voice.
+//  requestScene — generate the NEXT scene + image + voice provisioning.
 //  Used both on real scene transitions and on speculative prefetch.
 // ──────────────────────────────────────────────────────────────────────
 
@@ -108,22 +123,37 @@ export async function requestScene(
   config: EngineConfig,
   req: SceneRequest,
 ): Promise<SceneResponse> {
+  const tTotal = Date.now();
+
+  const tDirect = Date.now();
   const { scene, characterUpdates } = await directScene(config.text, req.session);
+  tlog("[scene] directScene", tDirect);
+
   const preVoiceSession: Session = {
     ...req.session,
     characters: mergeCharacters(req.session.characters, characterUpdates),
   };
 
-  const [imageBase64, voiceRes] = await Promise.all([
-    renderImage(config, scene, preVoiceSession.styleGuide),
-    runVoiceScene(config, preVoiceSession, scene),
-  ]);
+  const tImage = Date.now();
+  const tProv = Date.now();
+  const imagePromise = renderImage(config, scene, preVoiceSession.styleGuide)
+    .then((r) => {
+      tlog("[scene] renderImage", tImage);
+      return r;
+    });
+  const provPromise = provisionForScene(config, preVoiceSession, scene)
+    .then((r) => {
+      tlog("[scene] provisionForScene", tProv);
+      return r;
+    });
+  const [imageBase64, provRes] = await Promise.all([imagePromise, provPromise]);
+
+  tlog("[scene] TOTAL", tTotal);
 
   return {
     scene,
     imageBase64,
-    characters: voiceRes.characters,
-    beatAudio: voiceRes.beatAudio,
+    characters: provRes.characters,
   };
 }
 
@@ -141,24 +171,27 @@ export async function visionDecide(
 }
 
 // ──────────────────────────────────────────────────────────────────────
-//  requestInsertBeat — generates a transient in-scene beat (no image regen)
-//  and voices the line if any.
+//  requestInsertBeat — generates a transient in-scene beat (no image
+//  regen, no voice). The client fires /api/beat-audio for the new beat
+//  after this returns.
 // ──────────────────────────────────────────────────────────────────────
 
 export async function requestInsertBeat(
   config: EngineConfig,
   req: InsertBeatRequest,
 ): Promise<InsertBeatResponse> {
+  const tTotal = Date.now();
+
+  const tDirect = Date.now();
   const partial = await directInsertBeat(
     config.text,
     req.session,
     req.freeformAction,
   );
+  tlog("[insert-beat] directInsertBeat", tDirect);
 
-  // INSERT_BEAT prompt forbids new characters — but if the director violates
-  // it, voiceBeat's name-inferred fallback would silently provision and persist
-  // the hallucinated speaker. Strip the speaker attribution and promote the
-  // line into narration so the player still sees the text (the client only
+  // INSERT_BEAT prompt forbids new characters — promote disallowed-speaker
+  // lines to narration so the player still sees the text (the client only
   // renders `line` when there is a `speaker`).
   if (
     partial.speaker &&
@@ -169,6 +202,7 @@ export async function requestInsertBeat(
     );
     const promotedNarration =
       [partial.narration, partial.line].filter(Boolean).join("\n") || undefined;
+    tlog("[insert-beat] TOTAL", tTotal);
     return {
       partial: {
         narration: promotedNarration,
@@ -180,23 +214,20 @@ export async function requestInsertBeat(
     };
   }
 
-  if (!config.tts) {
-    // Always echo characters so callers don't need a ?? fallback.
-    return { partial, characters: req.session.characters };
-  }
-
-  // Insert beats stay in-scene and (per the INSERT_BEAT prompt) reuse the
-  // registered cast, so we voice against the existing character set.
-  const voiceRes = await voiceBeat(
-    config.tts,
-    req.session,
-    req.session.characters,
-    partial,
-  );
-
-  return {
-    partial,
-    characters: voiceRes.characters,
-    audio: voiceRes.audio,
-  };
+  tlog("[insert-beat] TOTAL", tTotal);
+  return { partial, characters: req.session.characters };
+}
+
+// ──────────────────────────────────────────────────────────────────────
+//  requestBeatAudio — lazy per-beat synth. Returns audio:null on
+//  timeout / failure / TTS disabled, so the client just plays silent.
+// ──────────────────────────────────────────────────────────────────────
+
+export async function requestBeatAudio(
+  config: EngineConfig,
+  req: BeatAudioRequest,
+): Promise<BeatAudioResponse> {
+  if (!config.tts) return { audio: null };
+  const audio = await synthesizeBeat(config.tts, req.voice, req.beat);
+  return { audio };
 }
diff --git a/packages/engine/src/voice.ts b/packages/engine/src/voice.ts
index ac0649b..d61464b 100644
--- a/packages/engine/src/voice.ts
+++ b/packages/engine/src/voice.ts
@@ -8,12 +8,10 @@ import type {
   TtsConfig,
 } from "@yume/types";
 
-export type BeatLike = {
-  id?: string;
-  speaker?: string;
-  line?: string;
-  lineDelivery?: string;
-};
+// Per-beat synth budget. MiMo's median synth is 3–7s; the tail can spike
+// to 30–70s under concurrent load. Capping here means a single bad beat
+// degrades to silent in <15s instead of blocking the whole UI flow.
+const SYNTH_TIMEOUT_MS = 15000;
 
 // When the director references a speaker that was never registered, derive a
 // description from the name + world so the voice's gender/temperament is at
@@ -22,85 +20,130 @@ function inferredSpeakerDescription(name: string, session: Session): string {
   return `请根据角色名「${name}」推断其性别、年龄与气质，生成最贴合的音色。所属世界观：${session.worldSetting}`;
 }
 
-// Voice a single beat against a mutable character registry.
-// Returns the (possibly-extended) registry plus the audio if synthesized.
-// Narration-only beats and missing-line beats return no audio (VN convention).
-export async function voiceBeat(
-  cfg: TtsConfig,
-  session: Session,
-  characters: Character[],
-  beat: BeatLike,
-): Promise<{ audio?: BeatAudio; characters: Character[] }> {
-  if (!beat.speaker || !beat.line) {
-    return { characters };
-  }
-
-  const speakerName = beat.speaker;
-  const text = beat.line;
-  const delivery = beat.lineDelivery;
-
-  // Hoisted so the catch can return the in-progress registry even if synthesis
-  // fails after provisioning succeeded — otherwise the just-provisioned voice
-  // would be lost and the next beat for this speaker would pay to re-design it
-  // (extra cost, latency, and more 429 risk on rate-limited providers).
-  let nextCharacters: Character[] = characters;
-
+// Race `p` against a timer and clear the timer on either outcome; otherwise
+// every successful call would leave a reject closure pending in Node's timer
+// heap until `ms` elapses. On timeout, abort the supplied controller so the
+// underlying HTTP request is cancelled — otherwise MiMo's 30–70s tail keeps
+// the socket open and the quota burning long after we've returned audio:null.
+async function withTimeout<T>(
+  p: Promise<T>,
+  ms: number,
+  label: string,
+  ctrl: AbortController,
+): Promise<T> {
+  let timer: ReturnType<typeof setTimeout> | undefined;
   try {
-    const idx = characters.findIndex((c) => c.name === speakerName);
-    let voice: CharacterVoice | undefined;
-
-    if (idx !== -1 && characters[idx]?.voice) {
-      voice = characters[idx]!.voice;
-    } else if (idx !== -1) {
-      const target = characters[idx]!;
-      voice = await provisionVoice(cfg, target.description);
-      nextCharacters = characters.map((c, i) =>
-        i === idx ? { ...c, voice } : c,
-      );
-    } else {
-      const description = inferredSpeakerDescription(speakerName, session);
-      voice = await provisionVoice(cfg, description);
-      nextCharacters = [...characters, { name: speakerName, description, voice }];
-    }
-
-    const { audioBase64, mimeType } = await synthesize(
-      cfg,
-      voice,
-      text,
-      delivery,
-    );
-    return {
-      audio: { base64: audioBase64, mime: mimeType },
-      characters: nextCharacters,
-    };
-  } catch (err) {
-    const msg = err instanceof Error ? err.message : String(err);
-    console.error(`[voice] degraded: ${msg}`);
-    return { characters: nextCharacters };
+    return await Promise.race([
+      p,
+      new Promise<T>((_, reject) => {
+        timer = setTimeout(() => {
+          ctrl.abort();
+          reject(new Error(`${label} timed out after ${ms}ms`));
+        }, ms);
+      }),
+    ]);
+  } finally {
+    if (timer) clearTimeout(timer);
   }
 }
 
-// Voice every beat in a scene. Sequential by design: a single speaker
-// appearing in multiple beats must provision exactly once and share that
-// voice across calls — parallel synthesis would race and create duplicates.
-// With 2–6 beats × ~500ms per clone the total cost is well inside the image
-// generation budget (10s+), so the simplicity is worth it.
-export async function voiceScene(
+// Provision voices, in parallel, for every speaker in a scene that does not
+// have one yet. Does NOT synthesize per-beat audio — that happens lazily via
+// synthesizeBeat from the /api/beat-audio route. Returning the populated
+// registry lets the client fire per-beat synth without re-provisioning.
+//
+// Why dedupe before fanning out: the SAME voiceless speaker appearing in 3
+// beats must run voicedesign once; parallel design of the same speaker
+// would burn three voices' worth of budget and pick whichever raced last.
+export async function provisionVoicesForScene(
   cfg: TtsConfig,
   session: Session,
   scene: Scene,
-): Promise<{
-  beatAudio: Record<string, BeatAudio>;
-  characters: Character[];
-}> {
-  let characters = session.characters;
-  const beatAudio: Record<string, BeatAudio> = {};
+): Promise<{ characters: Character[] }> {
+  const tScene = Date.now();
+  const speakingBeats = scene.beats.filter(
+    (b): b is typeof b & { speaker: string; line: string } =>
+      Boolean(b.speaker && b.line),
+  );
 
-  for (const beat of scene.beats) {
-    const res = await voiceBeat(cfg, session, characters, beat);
-    characters = res.characters;
-    if (res.audio) beatAudio[beat.id] = res.audio;
+  let characters: Character[] = [...session.characters];
+  const toProvision = new Map<string, string>(); // name -> description
+  for (const b of speakingBeats) {
+    if (toProvision.has(b.speaker)) continue;
+    const existing = characters.find((c) => c.name === b.speaker);
+    if (existing?.voice) continue;
+    toProvision.set(
+      b.speaker,
+      existing?.description ?? inferredSpeakerDescription(b.speaker, session),
+    );
   }
 
-  return { beatAudio, characters };
+  if (toProvision.size === 0) {
+    console.log(
+      `[voice] provisionVoicesForScene total=${Date.now() - tScene}ms (no new speakers)`,
+    );
+    return { characters };
+  }
+
+  const tProvision = Date.now();
+  const provisioned = await Promise.all(
+    Array.from(toProvision.entries()).map(async ([name, description]) => {
+      try {
+        const voice = await provisionVoice(cfg, description);
+        return { name, description, voice };
+      } catch (err) {
+        const msg = err instanceof Error ? err.message : String(err);
+        console.error(`[voice] provision degraded for ${name}: ${msg}`);
+        return { name, description, voice: undefined };
+      }
+    }),
+  );
+  console.log(
+    `[voice] provision: ${toProvision.size} speakers parallel max=${Date.now() - tProvision}ms`,
+  );
+
+  for (const p of provisioned) {
+    if (!p.voice) continue;
+    const idx = characters.findIndex((c) => c.name === p.name);
+    if (idx === -1) {
+      characters.push({ name: p.name, description: p.description, voice: p.voice });
+    } else {
+      characters[idx] = { ...characters[idx]!, voice: p.voice };
+    }
+  }
+
+  console.log(
+    `[voice] provisionVoicesForScene total=${Date.now() - tScene}ms`,
+  );
+  return { characters };
+}
+
+// Synthesize audio for one beat. Caller is expected to have already
+// resolved the speaker's voice (from session.characters in the client) —
+// passing it directly here keeps the /api/beat-audio payload small and
+// makes this function pure with respect to session state.
+// Returns null on error or timeout; caller treats null as "play silent."
+export async function synthesizeBeat(
+  cfg: TtsConfig,
+  voice: CharacterVoice,
+  beat: { id: string; line: string; lineDelivery?: string },
+): Promise<BeatAudio | null> {
+  const t = Date.now();
+  const ctrl = new AbortController();
+  try {
+    const { audioBase64, mimeType } = await withTimeout(
+      synthesize(cfg, voice, beat.line, beat.lineDelivery, ctrl.signal),
+      SYNTH_TIMEOUT_MS,
+      `synth ${beat.id}`,
+      ctrl,
+    );
+    console.log(`  [voice ${beat.id}] synth=${Date.now() - t}ms`);
+    return { base64: audioBase64, mime: mimeType };
+  } catch (err) {
+    const msg = err instanceof Error ? err.message : String(err);
+    console.error(
+      `[voice] synth degraded for ${beat.id} (after ${Date.now() - t}ms): ${msg}`,
+    );
+    return null;
+  }
 }
diff --git a/packages/tts-client/src/xiaomi.ts b/packages/tts-client/src/xiaomi.ts
index 33dc953..d957f14 100644
--- a/packages/tts-client/src/xiaomi.ts
+++ b/packages/tts-client/src/xiaomi.ts
@@ -77,6 +77,7 @@ export async function xiaomiSynthesize(
   voice: CharacterVoice,
   text: string,
   delivery?: string,
+  signal?: AbortSignal,
 ): Promise<{ audioBase64: string; mimeType: string }> {
   const url = joinUrl(cfg.baseUrl, "/chat/completions");
 
@@ -99,6 +100,7 @@ export async function xiaomiSynthesize(
     method: "POST",
     headers: buildHeaders(cfg),
     body: JSON.stringify(body),
+    signal,
   });
 
   if (!res.ok) {
diff --git a/packages/types/src/index.ts b/packages/types/src/index.ts
index 9f372e8..5d0e86b 100644
--- a/packages/types/src/index.ts
+++ b/packages/types/src/index.ts
@@ -145,10 +145,8 @@ export type StartResponse = {
   sessionId: string;
   scene: Scene;
   imageBase64: string;
-  /** Post-voice character registry (with provisioned voices). */
+  /** Character registry with voice references provisioned for new speakers. */
   characters: Character[];
-  /** Per-beat synthesized audio, keyed by beat.id. */
-  beatAudio?: Record<string, BeatAudio>;
 };
 
 // /api/scene — generates the next Scene, given session whose latest
@@ -162,7 +160,27 @@ export type SceneResponse = {
   scene: Scene;
   imageBase64: string;
   characters: Character[];
-  beatAudio?: Record<string, BeatAudio>;
+};
+
+// /api/beat-audio — lazily synthesize one beat's voice. Client fires this
+// per beat after a scene loads; server has a per-call timeout so MiMo
+// tail-latency cannot block the UI. A null audio response means "play silent."
+//
+// Payload deliberately slim: just the line to speak and the speaker's voice
+// reference. The client extracts the voice from its local session.characters
+// before posting — sending the full Session would force ~160KB of base64 per
+// OTHER speaker plus the entire scene history to ride along for nothing.
+export type BeatAudioRequest = {
+  beat: {
+    id: string;
+    line: string;
+    lineDelivery?: string;
+  };
+  voice: CharacterVoice;
+};
+
+export type BeatAudioResponse = {
+  audio: BeatAudio | null;
 };
 
 // /api/vision — interprets a background click on the current image and
@@ -197,5 +215,4 @@ export type InsertBeatPartial = {
 export type InsertBeatResponse = {
   partial: InsertBeatPartial;
   characters: Character[];
-  audio?: BeatAudio;
 };