From e261f4a346c1cd8b3b3ac79b93e94a17d9527d61 Mon Sep 17 00:00:00 2001 From: Zonghao Yuan <64521992+zonghaoyuan@users.noreply.github.com> Date: Thu, 28 May 2026 23:43:51 +0800 Subject: [PATCH] feat: Runware FLUX.2 image + lazy per-beat TTS (#5) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reduce median scene-load latency from ~30-80s to ~17-25s by switching image generation to Runware FLUX.2 [klein] 9B KV and moving per-beat TTS synthesis off the scene response into a new lazy /api/beat-audio endpoint with hard timeout + abort support. - feat(image): migrate to Runware FLUX.2 [klein] 9B KV — task-array API, $0.001/image, sub-second inference. - feat(tts): split /api/scene into directScene + image + voicedesign-provisioning; lazily synth per beat via /api/beat-audio with 15s hard timeout + AbortSignal threaded to MiMo so timed-out calls don't keep burning sockets/quota; client fans out per-beat fetches on scene-id change with abort + identity-check finally to prevent cross-scene beat-id collisions. - refactor(tts): slim BeatAudioRequest to { beat, voice } — ~800KB per-beat upload dropped to ~160KB by sending only the speaker's voice instead of the full session.
🤖 Generated with [Claude Code](https://claude.com/claude-code) --- README.md | 6 +- apps/web/.env.example | 24 ++-- apps/web/app/api/beat-audio/route.ts | 36 +++++ apps/web/app/play/page.tsx | 118 +++++++++++++--- packages/ai-client/src/image.ts | 99 +++++++------- packages/engine/src/index.ts | 3 +- packages/engine/src/orchestrator.ts | 133 +++++++++++------- packages/engine/src/voice.ts | 197 ++++++++++++++++----------- packages/tts-client/src/xiaomi.ts | 2 + packages/types/src/index.ts | 27 +++- 10 files changed, 431 insertions(+), 214 deletions(-) create mode 100644 apps/web/app/api/beat-audio/route.ts diff --git a/README.md b/README.md index c700021..5b8bc23 100644 --- a/README.md +++ b/README.md @@ -45,12 +45,12 @@ After deploy, set the nine environment variables (see below) in your Vercel proj ## Environment variables -Three providers, all independently configurable. Any OpenAI-compatible chat / image endpoint works (OpenAI, Anthropic via OpenAI-compat proxy, Gemini, OpenRouter, DeepSeek, local Ollama, …). +Three providers, all independently configurable. Text and Vision accept any OpenAI-compatible endpoint (OpenAI, Anthropic via OpenAI-compat proxy, Gemini, OpenRouter, DeepSeek, local Ollama, …). Image goes to **Runware** (its own task-array protocol, not OpenAI-compatible). | Provider | Variables | Recommended | |---|---|---| | Text · story director | `TEXT_BASE_URL` `TEXT_API_KEY` `TEXT_MODEL` | `claude-opus-4-7` via Anthropic | -| Image · UI renderer | `IMAGE_BASE_URL` `IMAGE_API_KEY` `IMAGE_MODEL` | `gpt-image-2` via OpenAI | +| Image · UI renderer | `IMAGE_BASE_URL` `IMAGE_API_KEY` `IMAGE_MODEL` | `runware:400@6` (FLUX.2 [klein] 9B KV) via [Runware](https://runware.ai) | | Vision · click reader | `VISION_BASE_URL` `VISION_API_KEY` `VISION_MODEL` | `gemini-3-flash` via Google | See `apps/web/.env.example` for the exact shape.
@@ -88,4 +88,4 @@ yume/ ## Cost & limits -Each **scene** costs roughly **\$0.15–0.25** in API fees with the recommended model trio (one text + one image call); tapping through a scene's beats is free. To keep transitions instant, the engine also **pre-generates scenes you might pick but don't** — so real spend runs somewhat higher than the scenes you actually see. There is no rate limiting or auth out of the box — if you make your deployment public, your bill will reflect that. Add limits (and consider lowering the prefetch depth) before sharing widely. +With the recommended trio, each **scene** is dominated by the text-LLM call. The FLUX.2 [klein] 9B KV image is roughly **\$0.001** per scene (1792×1024, 4 steps, sub-second); the text call is the rest. Tapping through a scene's beats is free. To keep transitions instant, the engine also **pre-generates scenes you might pick but don't** — so real spend runs somewhat higher than the scenes you actually see. There is no rate limiting or auth out of the box — if you make your deployment public, your bill will reflect that. Add limits (and consider lowering the prefetch depth) before sharing widely. diff --git a/apps/web/.env.example b/apps/web/.env.example index aa0d983..20b7700 100644 --- a/apps/web/.env.example +++ b/apps/web/.env.example @@ -1,12 +1,14 @@ # ============================================================= # 云梦 — AI 视觉小说 # Recommended setup: Xiaomi MiMo Token Plan for TEXT / VISION / TTS -# (one API key covers all three) + any image provider for IMAGE. +# (one API key covers all three) + Runware for IMAGE (FLUX.2 [klein]). # -# Any OpenAI-compatible endpoint works for any slot — OpenRouter, -# OpenAI, Anthropic via OpenAI-compat proxy, Gemini, DeepSeek, etc. -# Image generation uses the chat-completions + modalities API -# (OpenRouter-style), NOT the legacy /images/generations endpoint.
+# TEXT / VISION / TTS use OpenAI-compatible endpoints (any OpenAI- +# compatible host works: OpenRouter, OpenAI, Anthropic via proxy, +# Gemini, DeepSeek, Ollama, ...). +# +# IMAGE uses Runware's own task-array protocol (not OpenAI-compatible); +# the adapter posts an `imageInference` task to IMAGE_BASE_URL. # ============================================================= # ---- 1. Text LLM ยท scene director ---------------------------------- @@ -18,10 +20,14 @@ TEXT_API_KEY=tp-xxx TEXT_MODEL=mimo-v2.5-pro # ---- 2. Image generator (renders the scene background) ------------- -# Any provider supporting chat-completions + modalities image output. -IMAGE_BASE_URL=https://openrouter.ai/api/v1 -IMAGE_API_KEY=sk-or-v1-xxx -IMAGE_MODEL=openai/gpt-5.4-image-2 +# Recommended: Runware + FLUX.2 [klein] 9B KV โ€” distilled 4-step model, +# sub-second inference at ~$0.0008/image. Sign up at https://runware.ai +# AIR ids for FLUX.2 [klein] variants: +# runware:400@1 ยท 4B (smaller) +# runware:400@6 ยท 9B KV (recommended โ€” fastest at 16:9) +IMAGE_BASE_URL=https://api.runware.ai/v1 +IMAGE_API_KEY=runware-xxx +IMAGE_MODEL=runware:400@6 # ---- 3. Vision model ยท multimodal click interpretation ------------- # Recommended: MiMo V2.5 omni โ€” multimodal. diff --git a/apps/web/app/api/beat-audio/route.ts b/apps/web/app/api/beat-audio/route.ts new file mode 100644 index 0000000..a41fd33 --- /dev/null +++ b/apps/web/app/api/beat-audio/route.ts @@ -0,0 +1,36 @@ +import { requestBeatAudio } from "@yume/engine"; +import type { BeatAudioRequest } from "@yume/types"; +import { NextResponse } from "next/server"; +import { loadEngineConfig } from "@/lib/config"; + +export const runtime = "nodejs"; +// The synth itself has a 15s per-call ceiling in the engine. 30s here just +// covers JSON parsing + outbound network buffer. 
+export const maxDuration = 30; + +export async function POST(req: Request) { + let body: BeatAudioRequest; + try { + body = (await req.json()) as BeatAudioRequest; + } catch { + return NextResponse.json({ error: "Invalid JSON" }, { status: 400 }); + } + + if (!body.beat?.id || !body.beat?.line || !body.voice?.referenceAudioBase64) { + return NextResponse.json( + { error: "beat.id, beat.line and voice.referenceAudioBase64 are required" }, + { status: 400 }, + ); + } + + try { + const config = loadEngineConfig(); + const result = await requestBeatAudio(config, body); + return NextResponse.json(result); + } catch (err) { + // Engine already swallows synth errors and returns audio:null. Anything + // that reaches here is config-level โ€” surface so the client can log it. + const message = err instanceof Error ? err.message : "Unknown error"; + return NextResponse.json({ error: message }, { status: 500 }); + } +} diff --git a/apps/web/app/play/page.tsx b/apps/web/app/play/page.tsx index 586f4ac..b71f21e 100644 --- a/apps/web/app/play/page.tsx +++ b/apps/web/app/play/page.tsx @@ -15,6 +15,7 @@ import { PRESETS } from "@/lib/presets"; import type { Beat, BeatAudio, + BeatAudioResponse, BeatChoice, InsertBeatResponse, Scene, @@ -215,6 +216,10 @@ function PlayInner() { const startedRef = useRef(false); const poolRef = useRef>(new Map()); + // Lazy per-beat audio fetches keyed by beat.id. Aborted when the scene + // changes so stale in-flight requests can't poison the new scene's map + // (beat ids like "b1" are scene-local and would collide across scenes). 
+ const beatAudioAbortRef = useRef>(new Map()); // Mirrors for use inside async handlers (closure-stable) const sessionRef = useRef(null); @@ -259,6 +264,79 @@ function PlayInner() { }); }, [currentBeatId]); + // โ”€โ”€ Lazy per-beat audio fetch โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + // Returns silently on any failure โ€” the UI never waits for audio, so a + // null result just means that beat plays without voice. + // Sends only the speaker's voice + the line to speak โ€” NOT the whole + // session โ€” so the per-beat payload stays small even with many characters + // (each voice.referenceAudioBase64 is ~160KB). + const fetchBeatAudio = useCallback( + async ( + sess: Session, + beat: { id: string; speaker?: string; line?: string; lineDelivery?: string }, + ): Promise => { + if (!beat.speaker || !beat.line) return; + const speaker = sess.characters.find((c) => c.name === beat.speaker); + if (!speaker?.voice) return; // not yet provisioned โ€” server can't synth anyway + if (beatAudioAbortRef.current.has(beat.id)) return; + const abort = new AbortController(); + beatAudioAbortRef.current.set(beat.id, abort); + try { + const res = await fetch("/api/beat-audio", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + beat: { id: beat.id, line: beat.line, lineDelivery: beat.lineDelivery }, + voice: speaker.voice, + }), + signal: abort.signal, + }); + if (!res.ok) return; + const json = (await res.json()) as BeatAudioResponse; + // Skip the state write if we've been aborted between the .ok check and + // here โ€” beat ids are scene-local, so a late arrival from a prior + // scene would otherwise overwrite the current scene's audio under the + // same id. 
+ if (json.audio && !abort.signal.aborted) { + setBeatAudioMap((m) => ({ ...m, [beat.id]: json.audio as BeatAudio })); + } + } catch { + // aborted or network error โ€” silent fallback + } finally { + // Only clear the slot if it's still ours. An aborted prior fetch + // running its finally late could otherwise delete the controller of a + // new fetch that took the same beat id, leaving the new one + // unabortable on the next scene change. + if (beatAudioAbortRef.current.get(beat.id) === abort) { + beatAudioAbortRef.current.delete(beat.id); + } + } + }, + [], + ); + + function cancelBeatAudioFetches(): void { + for (const c of beatAudioAbortRef.current.values()) c.abort(); + beatAudioAbortRef.current.clear(); + } + + // Fire one /api/beat-audio request per speaking beat each time the scene + // changes. Cancel any in-flight requests from the prior scene first โ€” + // beat ids are scene-local ("b1" repeats across scenes) so a late arrival + // would land under the wrong beat in the audio map otherwise. + useEffect(() => { + cancelBeatAudioFetches(); + setBeatAudioMap({}); + const scene = currentScene; + const sess = sessionRef.current; + if (!scene || !sess) return; + for (const b of scene.beats) { + if (b.speaker && b.line) { + void fetchBeatAudio(sess, b); + } + } + }, [currentScene?.id, fetchBeatAudio]); + // โ”€โ”€ Mute persistence (read is via the useState lazy initializer above) โ”€ const toggleMuted = useCallback(() => { setMuted((prev) => { @@ -375,7 +453,8 @@ function PlayInner() { setCurrentScene(data.scene); setCurrentBeatId(data.scene.entryBeatId); setImageBase64(data.imageBase64); - setBeatAudioMap(data.beatAudio ?? {}); + // beatAudioMap is populated lazily by the per-beat fetch effect once + // currentScene becomes non-null (see fetchBeatAudio). setPhase("ready"); }) .catch((e) => setError(String(e))); @@ -410,8 +489,11 @@ function PlayInner() { // consumeChoice keeping the re-rooted survivor prefetches alive. 
useEffect(() => { const pool = poolRef.current; + const beatAborts = beatAudioAbortRef.current; return () => { clearPool(pool); + for (const c of beatAborts.values()) c.abort(); + beatAborts.clear(); }; }, []); @@ -459,7 +541,7 @@ function PlayInner() { setCurrentScene(result.scene); setCurrentBeatId(result.scene.entryBeatId); setImageBase64(result.imageBase64); - setBeatAudioMap(result.beatAudio ?? {}); + // beatAudioMap reset + per-beat fetches kicked off by the scene effect. setLastExitLabel(exitLabel); setPhase("ready"); } catch (e) { @@ -559,7 +641,7 @@ function PlayInner() { }; throw new Error(j.error ?? insertRes.statusText); } - const { partial, characters: insertChars, audio } = + const { partial, characters: insertChars } = (await insertRes.json()) as InsertBeatResponse; const fromBeatId = @@ -581,21 +663,25 @@ function PlayInner() { beats: [...currentScene.beats, newBeat], }; - setSession((s) => - s - ? { - ...s, - history: s.history.map((h, i, arr) => - i === arr.length - 1 ? { ...h, scene: patched } : h, - ), - characters: insertChars, - } - : s, - ); + const nextSession: Session = { + ...session, + history: session.history.map((h, i, arr) => + i === arr.length - 1 ? { ...h, scene: patched } : h, + ), + characters: insertChars, + }; + setSession(nextSession); setCurrentScene(patched); setCurrentBeatId(newBeatId); - if (audio) { - setBeatAudioMap((m) => ({ ...m, [newBeatId]: audio })); + // Insert-beat doesn't change scene.id, so the scene effect won't + // re-fire โ€” manually kick off the audio fetch for the new beat. 
+ if (newBeat.speaker && newBeat.line) { + void fetchBeatAudio(nextSession, { + id: newBeatId, + speaker: newBeat.speaker, + line: newBeat.line, + lineDelivery: newBeat.lineDelivery, + }); } setLastExitLabel(decision.intent.freeformAction); setPhase("ready"); diff --git a/packages/ai-client/src/image.ts b/packages/ai-client/src/image.ts index 72cddb6..837e8de 100644 --- a/packages/ai-client/src/image.ts +++ b/packages/ai-client/src/image.ts @@ -1,28 +1,43 @@ import type { ProviderConfig } from "@yume/types"; import { fetchWithRetry } from "./fetchWithRetry"; -type ImageUrlPart = { type: string; image_url?: { url?: string } }; -type ChatResponse = { - choices: { - message: { - content: string | ImageUrlPart[]; - images?: ImageUrlPart[]; - }; - }[]; +// Runware uses its own task-array protocol (not OpenAI-compatible). +// POST with [{ taskType: "imageInference", ... }]; errors come +// back as a 200 with `errors[]`, so we have to inspect the body either way. +type RunwareImageResult = { + imageBase64Data?: string; +}; +type RunwareError = { + code?: string; + message?: string; + parameter?: string; +}; +type RunwareResponse = { + data?: RunwareImageResult[]; + errors?: RunwareError[]; }; export async function generateImage( config: ProviderConfig, prompt: string, ): Promise { - const url = `${config.baseUrl.replace(/\/$/, "")}/chat/completions`; + const url = config.baseUrl.replace(/\/$/, ""); - const body = { - model: config.model, - modalities: ["image", "text"], - size: "1792x1024", - messages: [{ role: "user", content: prompt }], - }; + const body = [ + { + taskType: "imageInference", + taskUUID: crypto.randomUUID(), + model: config.model, + positivePrompt: prompt, + width: 1792, + height: 1024, + steps: 4, + CFGScale: 3.5, + numberResults: 1, + outputType: "base64Data", + outputFormat: "PNG", + }, + ]; const res = await fetchWithRetry(url, { method: "POST", @@ -33,47 +48,27 @@ export async function generateImage( body: JSON.stringify(body), }); - if (!res.ok) { 
- const text = await res.text(); + const text = await res.text(); + let json: RunwareResponse; + try { + json = JSON.parse(text) as RunwareResponse; + } catch { throw new Error(`Image API error ${res.status}: ${text.slice(0, 500)}`); } - const json = (await res.json()) as ChatResponse; - const msg = json.choices[0]?.message; - if (!msg) throw new Error("Image API returned no message"); - - // 1) OpenRouter-style: msg.images = [{ image_url: { url } }] - // 2) OpenAI multimodal: msg.content = [{ type: "image_url", image_url: { url } }] - const structured: ImageUrlPart[] = []; - if (msg.images) structured.push(...msg.images); - if (Array.isArray(msg.content)) structured.push(...msg.content); - for (const part of structured) { - const u = part.image_url?.url; - if (u) return await urlToBase64(u); + if (json.errors?.length) { + const e = json.errors[0]!; + throw new Error( + `Runware error [${e.code ?? "unknown"}]: ${e.message ?? "no message"}` + + (e.parameter ? ` (parameter: ${e.parameter})` : ""), + ); } - // 3) provider-style: content is a string with markdown image ![alt](url) - // or a bare URL fragment - if (typeof msg.content === "string") { - const md = msg.content.match(/!\[[^\]]*\]\((https?:\/\/[^\s)]+)\)/); - if (md?.[1]) return await urlToBase64(md[1]); - const bare = msg.content.match(/https?:\/\/\S+?\.(?:png|jpg|jpeg|webp)/i); - if (bare?.[0]) return await urlToBase64(bare[0]); + const b64 = json.data?.[0]?.imageBase64Data; + if (!b64) { + throw new Error( + `No image in Runware response: ${text.slice(0, 300)}`, + ); } - - throw new Error( - `No image found in response: ${JSON.stringify(msg).slice(0, 300)}`, - ); -} - -async function urlToBase64(url: string): Promise { - if (url.startsWith("data:")) { - const idx = url.indexOf("base64,"); - if (idx === -1) throw new Error("data URL is not base64-encoded"); - return url.slice(idx + "base64,".length); - } - const res = await fetch(url); - if (!res.ok) throw new Error(`Failed to fetch image url: 
${res.status}`); - const buf = await res.arrayBuffer(); - return Buffer.from(buf).toString("base64"); + return b64; } diff --git a/packages/engine/src/index.ts b/packages/engine/src/index.ts index 3dcf386..9d96a48 100644 --- a/packages/engine/src/index.ts +++ b/packages/engine/src/index.ts @@ -3,9 +3,10 @@ export { requestScene, visionDecide, requestInsertBeat, + requestBeatAudio, } from "./orchestrator"; export { annotateClick } from "./annotate"; -export { voiceBeat, voiceScene } from "./voice"; +export { provisionVoicesForScene, synthesizeBeat } from "./voice"; export type { SceneResult } from "./director"; export type { InsertBeatPartial } from "@yume/types"; export * from "./prompts"; diff --git a/packages/engine/src/orchestrator.ts b/packages/engine/src/orchestrator.ts index 0f9c290..d75e17b 100644 --- a/packages/engine/src/orchestrator.ts +++ b/packages/engine/src/orchestrator.ts @@ -1,5 +1,6 @@ import type { - BeatAudio, + BeatAudioRequest, + BeatAudioResponse, Character, EngineConfig, InsertBeatRequest, @@ -18,12 +19,17 @@ import { directInsertBeat, directScene } from "./director"; import { mockImageBase64 } from "./mockImage"; import { render } from "./renderer"; import { interpret } from "./vision"; -import { voiceBeat, voiceScene } from "./voice"; +import { provisionVoicesForScene, synthesizeBeat } from "./voice"; function newSessionId(): string { return `s_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`; } +// TEMP: per-phase timing for latency diagnosis. Remove after we have data. +function tlog(label: string, t0: number): void { + console.log(`${label}: ${Date.now() - t0}ms`); +} + // Merge new character entries into the registry by name. If a name already // exists we preserve the existing voice (so a description revision never // silently re-provisions a voice the player has already heard). 
@@ -46,30 +52,26 @@ async function renderImage( return render(config.image, scene, styleGuide); } -async function runVoiceScene( +async function provisionForScene( config: EngineConfig, session: Session, scene: Scene, -): Promise<{ - beatAudio?: Record; - characters: Character[]; -}> { +): Promise<{ characters: Character[] }> { if (!config.tts) return { characters: session.characters }; - const res = await voiceScene(config.tts, session, scene); - return { - beatAudio: Object.keys(res.beatAudio).length ? res.beatAudio : undefined, - characters: res.characters, - }; + return provisionVoicesForScene(config.tts, session, scene); } // โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ -// startSession โ€” first scene + image + per-beat voice +// startSession โ€” first scene + image + voice provisioning. The actual +// per-beat synth runs lazily via requestBeatAudio so MiMo's tail +// latency never blocks the UI. 
// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ export async function startSession( config: EngineConfig, req: StartRequest, ): Promise { + const tTotal = Date.now(); const session: Session = { id: newSessionId(), createdAt: Date.now(), @@ -79,28 +81,41 @@ export async function startSession( characters: [], }; + const tDirect = Date.now(); const { scene, characterUpdates } = await directScene(config.text, session); + tlog("[start] directScene", tDirect); + const preVoiceSession: Session = { ...session, characters: mergeCharacters(session.characters, characterUpdates), }; - const [imageBase64, voiceRes] = await Promise.all([ - renderImage(config, scene, preVoiceSession.styleGuide), - runVoiceScene(config, preVoiceSession, scene), - ]); + const tImage = Date.now(); + const tProv = Date.now(); + const imagePromise = renderImage(config, scene, preVoiceSession.styleGuide) + .then((r) => { + tlog("[start] renderImage", tImage); + return r; + }); + const provPromise = provisionForScene(config, preVoiceSession, scene) + .then((r) => { + tlog("[start] provisionForScene", tProv); + return r; + }); + const [imageBase64, provRes] = await Promise.all([imagePromise, provPromise]); + + tlog("[start] TOTAL", tTotal); return { sessionId: session.id, scene, imageBase64, - characters: voiceRes.characters, - beatAudio: voiceRes.beatAudio, + characters: provRes.characters, }; } // โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ -// requestScene โ€” generate the NEXT scene + image + per-beat voice. +// requestScene โ€” generate the NEXT scene + image + voice provisioning. // Used both on real scene transitions and on speculative prefetch. 
// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ @@ -108,22 +123,37 @@ export async function requestScene( config: EngineConfig, req: SceneRequest, ): Promise { + const tTotal = Date.now(); + + const tDirect = Date.now(); const { scene, characterUpdates } = await directScene(config.text, req.session); + tlog("[scene] directScene", tDirect); + const preVoiceSession: Session = { ...req.session, characters: mergeCharacters(req.session.characters, characterUpdates), }; - const [imageBase64, voiceRes] = await Promise.all([ - renderImage(config, scene, preVoiceSession.styleGuide), - runVoiceScene(config, preVoiceSession, scene), - ]); + const tImage = Date.now(); + const tProv = Date.now(); + const imagePromise = renderImage(config, scene, preVoiceSession.styleGuide) + .then((r) => { + tlog("[scene] renderImage", tImage); + return r; + }); + const provPromise = provisionForScene(config, preVoiceSession, scene) + .then((r) => { + tlog("[scene] provisionForScene", tProv); + return r; + }); + const [imageBase64, provRes] = await Promise.all([imagePromise, provPromise]); + + tlog("[scene] TOTAL", tTotal); return { scene, imageBase64, - characters: voiceRes.characters, - beatAudio: voiceRes.beatAudio, + characters: provRes.characters, }; } @@ -141,24 +171,27 @@ export async function visionDecide( } // โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ -// requestInsertBeat โ€” generates a transient in-scene beat (no image regen) -// and voices the line if any. +// requestInsertBeat โ€” generates a transient in-scene beat (no image +// regen, no voice). The client fires /api/beat-audio for the new beat +// after this returns. 
// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ export async function requestInsertBeat( config: EngineConfig, req: InsertBeatRequest, ): Promise { + const tTotal = Date.now(); + + const tDirect = Date.now(); const partial = await directInsertBeat( config.text, req.session, req.freeformAction, ); + tlog("[insert-beat] directInsertBeat", tDirect); - // INSERT_BEAT prompt forbids new characters โ€” but if the director violates - // it, voiceBeat's name-inferred fallback would silently provision and persist - // the hallucinated speaker. Strip the speaker attribution and promote the - // line into narration so the player still sees the text (the client only + // INSERT_BEAT prompt forbids new characters โ€” promote disallowed-speaker + // lines to narration so the player still sees the text (the client only // renders `line` when there is a `speaker`). if ( partial.speaker && @@ -169,6 +202,7 @@ export async function requestInsertBeat( ); const promotedNarration = [partial.narration, partial.line].filter(Boolean).join("\n") || undefined; + tlog("[insert-beat] TOTAL", tTotal); return { partial: { narration: promotedNarration, @@ -180,23 +214,20 @@ export async function requestInsertBeat( }; } - if (!config.tts) { - // Always echo characters so callers don't need a ?? fallback. - return { partial, characters: req.session.characters }; - } - - // Insert beats stay in-scene and (per the INSERT_BEAT prompt) reuse the - // registered cast, so we voice against the existing character set. 
- const voiceRes = await voiceBeat( - config.tts, - req.session, - req.session.characters, - partial, - ); - - return { - partial, - characters: voiceRes.characters, - audio: voiceRes.audio, - }; + tlog("[insert-beat] TOTAL", tTotal); + return { partial, characters: req.session.characters }; +} + +// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +// requestBeatAudio โ€” lazy per-beat synth. Returns audio:null on +// timeout / failure / TTS disabled, so the client just plays silent. +// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +export async function requestBeatAudio( + config: EngineConfig, + req: BeatAudioRequest, +): Promise { + if (!config.tts) return { audio: null }; + const audio = await synthesizeBeat(config.tts, req.voice, req.beat); + return { audio }; } diff --git a/packages/engine/src/voice.ts b/packages/engine/src/voice.ts index ac0649b..d61464b 100644 --- a/packages/engine/src/voice.ts +++ b/packages/engine/src/voice.ts @@ -8,12 +8,10 @@ import type { TtsConfig, } from "@yume/types"; -export type BeatLike = { - id?: string; - speaker?: string; - line?: string; - lineDelivery?: string; -}; +// Per-beat synth budget. MiMo's median synth is 3โ€“7s; the tail can spike +// to 30โ€“70s under concurrent load. Capping here means a single bad beat +// degrades to silent in <15s instead of blocking the whole UI flow. 
+const SYNTH_TIMEOUT_MS = 15000; // When the director references a speaker that was never registered, derive a // description from the name + world so the voice's gender/temperament is at @@ -22,85 +20,130 @@ function inferredSpeakerDescription(name: string, session: Session): string { return `่ฏทๆ นๆฎ่ง’่‰ฒๅใ€Œ${name}ใ€ๆŽจๆ–ญๅ…ถๆ€งๅˆซใ€ๅนด้พ„ไธŽๆฐ”่ดจ๏ผŒ็”Ÿๆˆๆœ€่ดดๅˆ็š„้Ÿณ่‰ฒใ€‚ๆ‰€ๅฑžไธ–็•Œ่ง‚๏ผš${session.worldSetting}`; } -// Voice a single beat against a mutable character registry. -// Returns the (possibly-extended) registry plus the audio if synthesized. -// Narration-only beats and missing-line beats return no audio (VN convention). -export async function voiceBeat( - cfg: TtsConfig, - session: Session, - characters: Character[], - beat: BeatLike, -): Promise<{ audio?: BeatAudio; characters: Character[] }> { - if (!beat.speaker || !beat.line) { - return { characters }; - } - - const speakerName = beat.speaker; - const text = beat.line; - const delivery = beat.lineDelivery; - - // Hoisted so the catch can return the in-progress registry even if synthesis - // fails after provisioning succeeded โ€” otherwise the just-provisioned voice - // would be lost and the next beat for this speaker would pay to re-design it - // (extra cost, latency, and more 429 risk on rate-limited providers). - let nextCharacters: Character[] = characters; - +// Race the work against a timer; on either outcome clear the timer (otherwise +// the success path leaks a 15s-pending reject closure into Node's timer heap, +// per-synth call). On timeout, abort the supplied controller so the underlying +// HTTP request is cancelled โ€” otherwise MiMo's 30-70s tail keeps the socket +// open and the quota burning long after we've returned audio:null. 
+async function withTimeout( + p: Promise, + ms: number, + label: string, + ctrl: AbortController, +): Promise { + let timer: ReturnType | undefined; try { - const idx = characters.findIndex((c) => c.name === speakerName); - let voice: CharacterVoice | undefined; - - if (idx !== -1 && characters[idx]?.voice) { - voice = characters[idx]!.voice; - } else if (idx !== -1) { - const target = characters[idx]!; - voice = await provisionVoice(cfg, target.description); - nextCharacters = characters.map((c, i) => - i === idx ? { ...c, voice } : c, - ); - } else { - const description = inferredSpeakerDescription(speakerName, session); - voice = await provisionVoice(cfg, description); - nextCharacters = [...characters, { name: speakerName, description, voice }]; - } - - const { audioBase64, mimeType } = await synthesize( - cfg, - voice, - text, - delivery, - ); - return { - audio: { base64: audioBase64, mime: mimeType }, - characters: nextCharacters, - }; - } catch (err) { - const msg = err instanceof Error ? err.message : String(err); - console.error(`[voice] degraded: ${msg}`); - return { characters: nextCharacters }; + return await Promise.race([ + p, + new Promise((_, reject) => { + timer = setTimeout(() => { + ctrl.abort(); + reject(new Error(`${label} timed out after ${ms}ms`)); + }, ms); + }), + ]); + } finally { + if (timer) clearTimeout(timer); } } -// Voice every beat in a scene. Sequential by design: a single speaker -// appearing in multiple beats must provision exactly once and share that -// voice across calls โ€” parallel synthesis would race and create duplicates. -// With 2โ€“6 beats ร— ~500ms per clone the total cost is well inside the image -// generation budget (10s+), so the simplicity is worth it. -export async function voiceScene( +// Provision voices for all unseen speakers in a scene, in parallel. +// Does NOT synthesize per-beat audio โ€” that happens lazily via +// synthesizeBeat from the /api/beat-audio route. 
Returning the populated +// registry lets the client fire per-beat synth without re-provisioning. +// +// Why dedupe before fanning out: the SAME unseen speaker appearing in 3 +// beats must run voicedesign once; parallel design of the same speaker +// would burn three voices' worth of budget and pick whichever raced last. +export async function provisionVoicesForScene( cfg: TtsConfig, session: Session, scene: Scene, -): Promise<{ - beatAudio: Record; - characters: Character[]; -}> { - let characters = session.characters; - const beatAudio: Record = {}; +): Promise<{ characters: Character[] }> { + const tScene = Date.now(); + const speakingBeats = scene.beats.filter( + (b): b is typeof b & { speaker: string; line: string } => + Boolean(b.speaker && b.line), + ); - for (const beat of scene.beats) { - const res = await voiceBeat(cfg, session, characters, beat); - characters = res.characters; - if (res.audio) beatAudio[beat.id] = res.audio; + let characters: Character[] = [...session.characters]; + const toProvision = new Map(); // name -> description + for (const b of speakingBeats) { + if (toProvision.has(b.speaker)) continue; + const existing = characters.find((c) => c.name === b.speaker); + if (existing?.voice) continue; + toProvision.set( + b.speaker, + existing?.description ?? inferredSpeakerDescription(b.speaker, session), + ); } - return { beatAudio, characters }; + if (toProvision.size === 0) { + console.log( + `[voice] provisionVoicesForScene total=${Date.now() - tScene}ms (no new speakers)`, + ); + return { characters }; + } + + const tProvision = Date.now(); + const provisioned = await Promise.all( + Array.from(toProvision.entries()).map(async ([name, description]) => { + try { + const voice = await provisionVoice(cfg, description); + return { name, description, voice }; + } catch (err) { + const msg = err instanceof Error ? 
err.message : String(err); + console.error(`[voice] provision degraded for ${name}: ${msg}`); + return { name, description, voice: undefined }; + } + }), + ); + console.log( + `[voice] provision: ${toProvision.size} speakers parallel max=${Date.now() - tProvision}ms`, + ); + + for (const p of provisioned) { + if (!p.voice) continue; + const idx = characters.findIndex((c) => c.name === p.name); + if (idx === -1) { + characters.push({ name: p.name, description: p.description, voice: p.voice }); + } else { + characters[idx] = { ...characters[idx]!, voice: p.voice }; + } + } + + console.log( + `[voice] provisionVoicesForScene total=${Date.now() - tScene}ms`, + ); + return { characters }; +} + +// Synthesize audio for one beat. Caller is expected to have already +// resolved the speaker's voice (from session.characters in the client) — +// passing it directly here keeps the /api/beat-audio payload small and +// makes this function pure with respect to session state. +// Returns null on error or timeout; caller treats null as "play silent." +export async function synthesizeBeat( + cfg: TtsConfig, + voice: CharacterVoice, + beat: { id: string; line: string; lineDelivery?: string }, +): Promise { + const t = Date.now(); + const ctrl = new AbortController(); + try { + const { audioBase64, mimeType } = await withTimeout( + synthesize(cfg, voice, beat.line, beat.lineDelivery, ctrl.signal), + SYNTH_TIMEOUT_MS, + `synth ${beat.id}`, + ctrl, + ); + console.log(` [voice ${beat.id}] synth=${Date.now() - t}ms`); + return { base64: audioBase64, mime: mimeType }; + } catch (err) { + const msg = err instanceof Error ? 
err.message : String(err); + console.error( + `[voice] synth degraded for ${beat.id} (after ${Date.now() - t}ms): ${msg}`, + ); + return null; + } } diff --git a/packages/tts-client/src/xiaomi.ts b/packages/tts-client/src/xiaomi.ts index 33dc953..d957f14 100644 --- a/packages/tts-client/src/xiaomi.ts +++ b/packages/tts-client/src/xiaomi.ts @@ -77,6 +77,7 @@ export async function xiaomiSynthesize( voice: CharacterVoice, text: string, delivery?: string, + signal?: AbortSignal, ): Promise<{ audioBase64: string; mimeType: string }> { const url = joinUrl(cfg.baseUrl, "/chat/completions"); @@ -99,6 +100,7 @@ export async function xiaomiSynthesize( method: "POST", headers: buildHeaders(cfg), body: JSON.stringify(body), + signal, }); if (!res.ok) { diff --git a/packages/types/src/index.ts b/packages/types/src/index.ts index 9f372e8..5d0e86b 100644 --- a/packages/types/src/index.ts +++ b/packages/types/src/index.ts @@ -145,10 +145,8 @@ export type StartResponse = { sessionId: string; scene: Scene; imageBase64: string; - /** Post-voice character registry (with provisioned voices). */ + /** Character registry with voice references provisioned for new speakers. */ characters: Character[]; - /** Per-beat synthesized audio, keyed by beat.id. */ - beatAudio?: Record; }; // /api/scene — generates the next Scene, given session whose latest @@ -162,7 +160,27 @@ export type SceneResponse = { scene: Scene; imageBase64: string; characters: Character[]; - beatAudio?: Record; +}; + +// /api/beat-audio — lazily synthesize one beat's voice. Client fires this +// per beat after a scene loads; server has a per-call timeout so MiMo +// tail-latency cannot block the UI. A null audio response means "play silent." +// +// Payload deliberately slim: just the line to speak and the speaker's voice +// reference. 
The client extracts the voice from its local session.characters +// before posting — sending the full Session would force ~160KB of base64 per +// OTHER speaker plus the entire scene history to ride along for nothing. +export type BeatAudioRequest = { + beat: { + id: string; + line: string; + lineDelivery?: string; + }; + voice: CharacterVoice; +}; + +export type BeatAudioResponse = { + audio: BeatAudio | null; }; // /api/vision — interprets a background click on the current image and @@ -197,5 +215,4 @@ export type InsertBeatPartial = { export type InsertBeatResponse = { partial: InsertBeatPartial; characters: Character[]; - audio?: BeatAudio; };