feat: Runware FLUX.2 image + lazy per-beat TTS (#5)
Reduce median scene-load latency from ~30-80s to ~17-25s by switching image generation to Runware FLUX.2 [klein] 9B KV and moving per-beat TTS synthesis off the scene response into a new lazy /api/beat-audio endpoint with hard timeout + abort support.
- feat(image): migrate to Runware FLUX.2 [klein] 9B KV — task-array API, $0.001/image, sub-second inference.
- feat(tts): split /api/scene into directScene + image + voicedesign-provisioning, and lazily synthesize each beat via /api/beat-audio. The endpoint enforces a 15s hard timeout and threads an AbortSignal through to MiMo so timed-out calls don't keep burning sockets/quota. On scene-id change the client fans out per-beat fetches, aborting the previous scene's in-flight requests and using an identity check in `finally` to prevent cross-scene beat-id collisions.
- refactor(tts): slim BeatAudioRequest to { beat, voice } — ~800KB per-beat upload dropped to ~160KB by sending only the speaker's voice instead of the full session.
🤖 Generated with [Claude Code](https://claude.com/claude-code)
This commit is contained in:
+15
-9
@@ -1,12 +1,14 @@
|
||||
# =============================================================
|
||||
# 云梦 — AI 视觉小说
|
||||
# Recommended setup: Xiaomi MiMo Token Plan for TEXT / VISION / TTS
|
||||
# (one API key covers all three) + any image provider for IMAGE.
|
||||
# (one API key covers all three) + Runware for IMAGE (FLUX.2 [klein]).
|
||||
#
|
||||
# Any OpenAI-compatible endpoint works for any slot — OpenRouter,
|
||||
# OpenAI, Anthropic via OpenAI-compat proxy, Gemini, DeepSeek, etc.
|
||||
# Image generation uses the chat-completions + modalities API
|
||||
# (OpenRouter-style), NOT the legacy /images/generations endpoint.
|
||||
# TEXT / VISION / TTS use OpenAI-compatible endpoints (any OpenAI-
|
||||
# compatible host works: OpenRouter, OpenAI, Anthropic via proxy,
|
||||
# Gemini, DeepSeek, Ollama, ...).
|
||||
#
|
||||
# IMAGE uses Runware's own task-array protocol (not OpenAI-compatible);
|
||||
# the adapter posts an `imageInference` task to IMAGE_BASE_URL.
|
||||
# =============================================================
|
||||
|
||||
# ---- 1. Text LLM · scene director ----------------------------------
|
||||
@@ -18,10 +20,14 @@ TEXT_API_KEY=tp-xxx
|
||||
TEXT_MODEL=mimo-v2.5-pro
|
||||
|
||||
# ---- 2. Image generator (renders the scene background) -------------
|
||||
# Any provider supporting chat-completions + modalities image output.
|
||||
IMAGE_BASE_URL=https://openrouter.ai/api/v1
|
||||
IMAGE_API_KEY=sk-or-v1-xxx
|
||||
IMAGE_MODEL=openai/gpt-5.4-image-2
|
||||
# Recommended: Runware + FLUX.2 [klein] 9B KV — distilled 4-step model,
|
||||
# sub-second inference at ~$0.0008/image. Sign up at https://runware.ai
|
||||
# AIR ids for FLUX.2 [klein] variants:
|
||||
# runware:400@1 · 4B (smaller)
|
||||
# runware:400@6 · 9B KV (recommended — fastest at 16:9)
|
||||
IMAGE_BASE_URL=https://api.runware.ai/v1
|
||||
IMAGE_API_KEY=runware-xxx
|
||||
IMAGE_MODEL=runware:400@6
|
||||
|
||||
# ---- 3. Vision model · multimodal click interpretation -------------
|
||||
# Recommended: MiMo V2.5 omni — multimodal.
|
||||
|
||||
@@ -0,0 +1,36 @@
|
||||
import { requestBeatAudio } from "@yume/engine";
|
||||
import type { BeatAudioRequest } from "@yume/types";
|
||||
import { NextResponse } from "next/server";
|
||||
import { loadEngineConfig } from "@/lib/config";
|
||||
|
||||
// Next.js route segment config: run this handler on the Node.js runtime.
export const runtime = "nodejs";

// The synth itself has a 15s per-call ceiling in the engine. 30s here just
// covers JSON parsing + outbound network buffer.
export const maxDuration = 30;

/**
 * POST /api/beat-audio — synthesize audio for a single beat.
 *
 * Expects a JSON body shaped like `BeatAudioRequest`; `beat.id`, `beat.line`
 * and `voice.referenceAudioBase64` must all be present and truthy.
 *
 * Responses:
 * - 400 `{ error }` when the body is not valid JSON, is JSON `null`, or is
 *   missing any required field.
 * - 500 `{ error }` when `loadEngineConfig` or `requestBeatAudio` throws.
 * - Otherwise the value returned by `requestBeatAudio`, serialized as JSON.
 */
export async function POST(req: Request) {
  let body: BeatAudioRequest | null;
  try {
    body = (await req.json()) as BeatAudioRequest | null;
  } catch {
    return NextResponse.json({ error: "Invalid JSON" }, { status: 400 });
  }

  // `null` is valid JSON, so it gets past the parse step above. Guard it
  // explicitly: dereferencing `body.beat` on null would throw outside any
  // try/catch and crash the request instead of returning the intended 400.
  if (
    body == null ||
    !body.beat?.id ||
    !body.beat?.line ||
    !body.voice?.referenceAudioBase64
  ) {
    return NextResponse.json(
      { error: "beat.id, beat.line and voice.referenceAudioBase64 are required" },
      { status: 400 },
    );
  }

  try {
    const config = loadEngineConfig();
    const result = await requestBeatAudio(config, body);
    return NextResponse.json(result);
  } catch (err) {
    // Engine already swallows synth errors and returns audio:null. Anything
    // that reaches here is config-level — surface so the client can log it.
    const message = err instanceof Error ? err.message : "Unknown error";
    return NextResponse.json({ error: message }, { status: 500 });
  }
}
|
||||
+102
-16
@@ -15,6 +15,7 @@ import { PRESETS } from "@/lib/presets";
|
||||
import type {
|
||||
Beat,
|
||||
BeatAudio,
|
||||
BeatAudioResponse,
|
||||
BeatChoice,
|
||||
InsertBeatResponse,
|
||||
Scene,
|
||||
@@ -215,6 +216,10 @@ function PlayInner() {
|
||||
|
||||
const startedRef = useRef(false);
|
||||
const poolRef = useRef<Map<string, PrefetchEntry>>(new Map());
|
||||
// Lazy per-beat audio fetches keyed by beat.id. Aborted when the scene
|
||||
// changes so stale in-flight requests can't poison the new scene's map
|
||||
// (beat ids like "b1" are scene-local and would collide across scenes).
|
||||
const beatAudioAbortRef = useRef<Map<string, AbortController>>(new Map());
|
||||
|
||||
// Mirrors for use inside async handlers (closure-stable)
|
||||
const sessionRef = useRef<Session | null>(null);
|
||||
@@ -259,6 +264,79 @@ function PlayInner() {
|
||||
});
|
||||
}, [currentBeatId]);
|
||||
|
||||
// ── Lazy per-beat audio fetch ────────────────────────────────────────
// Requests audio for a single beat from /api/beat-audio and, on success,
// stores it in the beat-audio map under the beat's id. Every failure path
// is silent: the UI never waits for audio, so a missing entry just means
// the beat plays without voice.
// The request carries only the line to speak plus the speaker's own voice
// — NOT the whole session — so the per-beat payload stays small even with
// many characters (each voice.referenceAudioBase64 is ~160KB).
const fetchBeatAudio = useCallback(
  async (
    sess: Session,
    beat: { id: string; speaker?: string; line?: string; lineDelivery?: string },
  ): Promise<void> => {
    const { id: beatId, speaker: speakerName, line, lineDelivery } = beat;
    if (!speakerName || !line) return;

    // No provisioned voice yet means the server couldn't synth anyway.
    const voice = sess.characters.find((c) => c.name === speakerName)?.voice;
    if (!voice) return;

    // A request for this beat id is already in flight — don't duplicate it.
    if (beatAudioAbortRef.current.has(beatId)) return;

    const controller = new AbortController();
    beatAudioAbortRef.current.set(beatId, controller);

    try {
      const response = await fetch("/api/beat-audio", {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({
          beat: { id: beatId, line, lineDelivery },
          voice,
        }),
        signal: controller.signal,
      });
      if (!response.ok) return;

      const { audio } = (await response.json()) as BeatAudioResponse;
      // Beat ids are scene-local, so a response that arrives after this
      // request was aborted must not be written: it would overwrite the
      // current scene's audio stored under the same id.
      if (!audio || controller.signal.aborted) return;
      setBeatAudioMap((prev) => ({ ...prev, [beatId]: audio as BeatAudio }));
    } catch {
      // aborted or network error — silent fallback
    } finally {
      // Release the slot only while it still holds this request's
      // controller. A late-running finally from an aborted request could
      // otherwise remove the controller of a newer request that reused the
      // beat id, leaving that one unabortable on the next scene change.
      const slotOwner = beatAudioAbortRef.current.get(beatId);
      if (slotOwner === controller) {
        beatAudioAbortRef.current.delete(beatId);
      }
    }
  },
  [],
);
|
||||
|
||||
// Aborts every in-flight per-beat audio request and empties the tracking map.
function cancelBeatAudioFetches(): void {
  const inFlight = beatAudioAbortRef.current;
  inFlight.forEach((controller) => controller.abort());
  inFlight.clear();
}
|
||||
|
||||
// On every scene-id change: drop the previous scene's in-flight audio
// requests, reset the audio map, then request audio for each beat of the
// new scene that has both a speaker and a line. The cancel step comes first
// because beat ids are scene-local ("b1" repeats across scenes), so a late
// response would otherwise land under the wrong beat in the audio map.
useEffect(() => {
  cancelBeatAudioFetches();
  setBeatAudioMap({});

  const scene = currentScene;
  const activeSession = sessionRef.current;
  if (!scene || !activeSession) return;

  scene.beats
    .filter((beat) => beat.speaker && beat.line)
    .forEach((beat) => {
      void fetchBeatAudio(activeSession, beat);
    });
}, [currentScene?.id, fetchBeatAudio]);
|
||||
|
||||
// ── Mute persistence (read is via the useState lazy initializer above) ─
|
||||
const toggleMuted = useCallback(() => {
|
||||
setMuted((prev) => {
|
||||
@@ -375,7 +453,8 @@ function PlayInner() {
|
||||
setCurrentScene(data.scene);
|
||||
setCurrentBeatId(data.scene.entryBeatId);
|
||||
setImageBase64(data.imageBase64);
|
||||
setBeatAudioMap(data.beatAudio ?? {});
|
||||
// beatAudioMap is populated lazily by the per-beat fetch effect once
|
||||
// currentScene becomes non-null (see fetchBeatAudio).
|
||||
setPhase("ready");
|
||||
})
|
||||
.catch((e) => setError(String(e)));
|
||||
@@ -410,8 +489,11 @@ function PlayInner() {
|
||||
// consumeChoice keeping the re-rooted survivor prefetches alive.
|
||||
useEffect(() => {
  // Capture both containers when the effect runs so the cleanup closure
  // works on these exact objects.
  const prefetchPool = poolRef.current;
  const inFlightAudio = beatAudioAbortRef.current;

  return () => {
    clearPool(prefetchPool);
    // Abort any per-beat audio requests still pending, then forget them.
    inFlightAudio.forEach((controller) => controller.abort());
    inFlightAudio.clear();
  };
}, []);
|
||||
|
||||
@@ -459,7 +541,7 @@ function PlayInner() {
|
||||
setCurrentScene(result.scene);
|
||||
setCurrentBeatId(result.scene.entryBeatId);
|
||||
setImageBase64(result.imageBase64);
|
||||
setBeatAudioMap(result.beatAudio ?? {});
|
||||
// beatAudioMap reset + per-beat fetches kicked off by the scene effect.
|
||||
setLastExitLabel(exitLabel);
|
||||
setPhase("ready");
|
||||
} catch (e) {
|
||||
@@ -559,7 +641,7 @@ function PlayInner() {
|
||||
};
|
||||
throw new Error(j.error ?? insertRes.statusText);
|
||||
}
|
||||
const { partial, characters: insertChars, audio } =
|
||||
const { partial, characters: insertChars } =
|
||||
(await insertRes.json()) as InsertBeatResponse;
|
||||
|
||||
const fromBeatId =
|
||||
@@ -581,21 +663,25 @@ function PlayInner() {
|
||||
beats: [...currentScene.beats, newBeat],
|
||||
};
|
||||
|
||||
setSession((s) =>
|
||||
s
|
||||
? {
|
||||
...s,
|
||||
history: s.history.map((h, i, arr) =>
|
||||
i === arr.length - 1 ? { ...h, scene: patched } : h,
|
||||
),
|
||||
characters: insertChars,
|
||||
}
|
||||
: s,
|
||||
);
|
||||
const nextSession: Session = {
|
||||
...session,
|
||||
history: session.history.map((h, i, arr) =>
|
||||
i === arr.length - 1 ? { ...h, scene: patched } : h,
|
||||
),
|
||||
characters: insertChars,
|
||||
};
|
||||
setSession(nextSession);
|
||||
setCurrentScene(patched);
|
||||
setCurrentBeatId(newBeatId);
|
||||
if (audio) {
|
||||
setBeatAudioMap((m) => ({ ...m, [newBeatId]: audio }));
|
||||
// Insert-beat doesn't change scene.id, so the scene effect won't
|
||||
// re-fire — manually kick off the audio fetch for the new beat.
|
||||
if (newBeat.speaker && newBeat.line) {
|
||||
void fetchBeatAudio(nextSession, {
|
||||
id: newBeatId,
|
||||
speaker: newBeat.speaker,
|
||||
line: newBeat.line,
|
||||
lineDelivery: newBeat.lineDelivery,
|
||||
});
|
||||
}
|
||||
setLastExitLabel(decision.intent.freeformAction);
|
||||
setPhase("ready");
|
||||
|
||||
Reference in New Issue
Block a user