feat: Runware FLUX.2 image + lazy per-beat TTS (#5)
Reduce median scene-load latency from ~30-80s to ~17-25s by switching image generation to Runware FLUX.2 [klein] 9B KV and moving per-beat TTS synthesis off the scene response into a new lazy /api/beat-audio endpoint with hard timeout + abort support.
- feat(image): migrate to Runware FLUX.2 [klein] 9B KV — task-array API, $0.001/image, sub-second inference.
- feat(tts): split /api/scene into directScene + image + voice-design provisioning; synthesize each beat lazily via /api/beat-audio with a 15s hard timeout and an AbortSignal threaded through to MiMo, so timed-out calls don't keep burning sockets/quota; the client fans out per-beat fetches on scene-id change, with abort plus an identity check in `finally` to prevent cross-scene beat-id collisions.
- refactor(tts): slim BeatAudioRequest to { beat, voice } — ~800KB per-beat upload dropped to ~160KB by sending only the speaker's voice instead of the full session.
🤖 Generated with [Claude Code](https://claude.com/claude-code)
This commit is contained in:
@@ -3,9 +3,10 @@ export {
|
||||
requestScene,
|
||||
visionDecide,
|
||||
requestInsertBeat,
|
||||
requestBeatAudio,
|
||||
} from "./orchestrator";
|
||||
export { annotateClick } from "./annotate";
|
||||
export { voiceBeat, voiceScene } from "./voice";
|
||||
export { provisionVoicesForScene, synthesizeBeat } from "./voice";
|
||||
export type { SceneResult } from "./director";
|
||||
export type { InsertBeatPartial } from "@yume/types";
|
||||
export * from "./prompts";
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import type {
|
||||
BeatAudio,
|
||||
BeatAudioRequest,
|
||||
BeatAudioResponse,
|
||||
Character,
|
||||
EngineConfig,
|
||||
InsertBeatRequest,
|
||||
@@ -18,12 +19,17 @@ import { directInsertBeat, directScene } from "./director";
|
||||
import { mockImageBase64 } from "./mockImage";
|
||||
import { render } from "./renderer";
|
||||
import { interpret } from "./vision";
|
||||
import { voiceBeat, voiceScene } from "./voice";
|
||||
import { provisionVoicesForScene, synthesizeBeat } from "./voice";
|
||||
|
||||
// Build a session id of the form `s_<epoch-ms>_<suffix>`.
//
// The suffix is six base-36 characters derived from Math.random(). A short
// base-36 expansion (0.5 -> "0.i", 0 -> "0") would otherwise yield a suffix
// of fewer than six characters — or none at all — so it is right-padded with
// "0" to keep every id the same shape. Math.random() is not a cryptographic
// source; the id is a label, not a secret.
function newSessionId(): string {
  const suffix = Math.random().toString(36).slice(2, 8).padEnd(6, "0");
  return `s_${Date.now()}_${suffix}`;
}
|
||||
|
||||
// TEMP: per-phase timing for latency diagnosis. Remove after we have data.
//
// Prints "<label>: <elapsed>ms" to stdout, where elapsed is the wall-clock
// distance between `t0` (a Date.now() reading taken by the caller when the
// phase started) and the moment this function runs.
function tlog(label: string, t0: number): void {
  const elapsedMs = Date.now() - t0;
  console.log(label + ": " + elapsedMs + "ms");
}
|
||||
|
||||
// Merge new character entries into the registry by name. If a name already
|
||||
// exists we preserve the existing voice (so a description revision never
|
||||
// silently re-provisions a voice the player has already heard).
|
||||
@@ -46,30 +52,26 @@ async function renderImage(
|
||||
return render(config.image, scene, styleGuide);
|
||||
}
|
||||
|
||||
async function runVoiceScene(
|
||||
async function provisionForScene(
|
||||
config: EngineConfig,
|
||||
session: Session,
|
||||
scene: Scene,
|
||||
): Promise<{
|
||||
beatAudio?: Record<string, BeatAudio>;
|
||||
characters: Character[];
|
||||
}> {
|
||||
): Promise<{ characters: Character[] }> {
|
||||
if (!config.tts) return { characters: session.characters };
|
||||
const res = await voiceScene(config.tts, session, scene);
|
||||
return {
|
||||
beatAudio: Object.keys(res.beatAudio).length ? res.beatAudio : undefined,
|
||||
characters: res.characters,
|
||||
};
|
||||
return provisionVoicesForScene(config.tts, session, scene);
|
||||
}
|
||||
|
||||
// ──────────────────────────────────────────────────────────────────────
|
||||
// startSession — first scene + image + per-beat voice
|
||||
// startSession — first scene + image + voice provisioning. The actual
|
||||
// per-beat synth runs lazily via requestBeatAudio so MiMo's tail
|
||||
// latency never blocks the UI.
|
||||
// ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
export async function startSession(
|
||||
config: EngineConfig,
|
||||
req: StartRequest,
|
||||
): Promise<StartResponse> {
|
||||
const tTotal = Date.now();
|
||||
const session: Session = {
|
||||
id: newSessionId(),
|
||||
createdAt: Date.now(),
|
||||
@@ -79,28 +81,41 @@ export async function startSession(
|
||||
characters: [],
|
||||
};
|
||||
|
||||
const tDirect = Date.now();
|
||||
const { scene, characterUpdates } = await directScene(config.text, session);
|
||||
tlog("[start] directScene", tDirect);
|
||||
|
||||
const preVoiceSession: Session = {
|
||||
...session,
|
||||
characters: mergeCharacters(session.characters, characterUpdates),
|
||||
};
|
||||
|
||||
const [imageBase64, voiceRes] = await Promise.all([
|
||||
renderImage(config, scene, preVoiceSession.styleGuide),
|
||||
runVoiceScene(config, preVoiceSession, scene),
|
||||
]);
|
||||
const tImage = Date.now();
|
||||
const tProv = Date.now();
|
||||
const imagePromise = renderImage(config, scene, preVoiceSession.styleGuide)
|
||||
.then((r) => {
|
||||
tlog("[start] renderImage", tImage);
|
||||
return r;
|
||||
});
|
||||
const provPromise = provisionForScene(config, preVoiceSession, scene)
|
||||
.then((r) => {
|
||||
tlog("[start] provisionForScene", tProv);
|
||||
return r;
|
||||
});
|
||||
const [imageBase64, provRes] = await Promise.all([imagePromise, provPromise]);
|
||||
|
||||
tlog("[start] TOTAL", tTotal);
|
||||
|
||||
return {
|
||||
sessionId: session.id,
|
||||
scene,
|
||||
imageBase64,
|
||||
characters: voiceRes.characters,
|
||||
beatAudio: voiceRes.beatAudio,
|
||||
characters: provRes.characters,
|
||||
};
|
||||
}
|
||||
|
||||
// ──────────────────────────────────────────────────────────────────────
|
||||
// requestScene — generate the NEXT scene + image + per-beat voice.
|
||||
// requestScene — generate the NEXT scene + image + voice provisioning.
|
||||
// Used both on real scene transitions and on speculative prefetch.
|
||||
// ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
@@ -108,22 +123,37 @@ export async function requestScene(
|
||||
config: EngineConfig,
|
||||
req: SceneRequest,
|
||||
): Promise<SceneResponse> {
|
||||
const tTotal = Date.now();
|
||||
|
||||
const tDirect = Date.now();
|
||||
const { scene, characterUpdates } = await directScene(config.text, req.session);
|
||||
tlog("[scene] directScene", tDirect);
|
||||
|
||||
const preVoiceSession: Session = {
|
||||
...req.session,
|
||||
characters: mergeCharacters(req.session.characters, characterUpdates),
|
||||
};
|
||||
|
||||
const [imageBase64, voiceRes] = await Promise.all([
|
||||
renderImage(config, scene, preVoiceSession.styleGuide),
|
||||
runVoiceScene(config, preVoiceSession, scene),
|
||||
]);
|
||||
const tImage = Date.now();
|
||||
const tProv = Date.now();
|
||||
const imagePromise = renderImage(config, scene, preVoiceSession.styleGuide)
|
||||
.then((r) => {
|
||||
tlog("[scene] renderImage", tImage);
|
||||
return r;
|
||||
});
|
||||
const provPromise = provisionForScene(config, preVoiceSession, scene)
|
||||
.then((r) => {
|
||||
tlog("[scene] provisionForScene", tProv);
|
||||
return r;
|
||||
});
|
||||
const [imageBase64, provRes] = await Promise.all([imagePromise, provPromise]);
|
||||
|
||||
tlog("[scene] TOTAL", tTotal);
|
||||
|
||||
return {
|
||||
scene,
|
||||
imageBase64,
|
||||
characters: voiceRes.characters,
|
||||
beatAudio: voiceRes.beatAudio,
|
||||
characters: provRes.characters,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -141,24 +171,27 @@ export async function visionDecide(
|
||||
}
|
||||
|
||||
// ──────────────────────────────────────────────────────────────────────
|
||||
// requestInsertBeat — generates a transient in-scene beat (no image regen)
|
||||
// and voices the line if any.
|
||||
// requestInsertBeat — generates a transient in-scene beat (no image
|
||||
// regen, no voice). The client fires /api/beat-audio for the new beat
|
||||
// after this returns.
|
||||
// ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
export async function requestInsertBeat(
|
||||
config: EngineConfig,
|
||||
req: InsertBeatRequest,
|
||||
): Promise<InsertBeatResponse> {
|
||||
const tTotal = Date.now();
|
||||
|
||||
const tDirect = Date.now();
|
||||
const partial = await directInsertBeat(
|
||||
config.text,
|
||||
req.session,
|
||||
req.freeformAction,
|
||||
);
|
||||
tlog("[insert-beat] directInsertBeat", tDirect);
|
||||
|
||||
// INSERT_BEAT prompt forbids new characters — but if the director violates
|
||||
// it, voiceBeat's name-inferred fallback would silently provision and persist
|
||||
// the hallucinated speaker. Strip the speaker attribution and promote the
|
||||
// line into narration so the player still sees the text (the client only
|
||||
// INSERT_BEAT prompt forbids new characters — promote disallowed-speaker
|
||||
// lines to narration so the player still sees the text (the client only
|
||||
// renders `line` when there is a `speaker`).
|
||||
if (
|
||||
partial.speaker &&
|
||||
@@ -169,6 +202,7 @@ export async function requestInsertBeat(
|
||||
);
|
||||
const promotedNarration =
|
||||
[partial.narration, partial.line].filter(Boolean).join("\n") || undefined;
|
||||
tlog("[insert-beat] TOTAL", tTotal);
|
||||
return {
|
||||
partial: {
|
||||
narration: promotedNarration,
|
||||
@@ -180,23 +214,20 @@ export async function requestInsertBeat(
|
||||
};
|
||||
}
|
||||
|
||||
if (!config.tts) {
|
||||
// Always echo characters so callers don't need a ?? fallback.
|
||||
return { partial, characters: req.session.characters };
|
||||
}
|
||||
|
||||
// Insert beats stay in-scene and (per the INSERT_BEAT prompt) reuse the
|
||||
// registered cast, so we voice against the existing character set.
|
||||
const voiceRes = await voiceBeat(
|
||||
config.tts,
|
||||
req.session,
|
||||
req.session.characters,
|
||||
partial,
|
||||
);
|
||||
|
||||
return {
|
||||
partial,
|
||||
characters: voiceRes.characters,
|
||||
audio: voiceRes.audio,
|
||||
};
|
||||
tlog("[insert-beat] TOTAL", tTotal);
|
||||
return { partial, characters: req.session.characters };
|
||||
}
|
||||
|
||||
// ──────────────────────────────────────────────────────────────────────
// requestBeatAudio — lazy per-beat synthesis for a single beat.
// Resolves to { audio: null } when TTS is not configured, and passes
// through whatever synthesizeBeat yields otherwise (which is itself
// null on timeout or failure), so the client can simply play silence.
// ──────────────────────────────────────────────────────────────────────

export async function requestBeatAudio(
  config: EngineConfig,
  req: BeatAudioRequest,
): Promise<BeatAudioResponse> {
  const { tts } = config;
  if (tts) {
    // The request already carries the speaker's resolved voice, so no
    // session lookup is needed here.
    return { audio: await synthesizeBeat(tts, req.voice, req.beat) };
  }
  return { audio: null };
}
|
||||
|
||||
+120
-77
@@ -8,12 +8,10 @@ import type {
|
||||
TtsConfig,
|
||||
} from "@yume/types";
|
||||
|
||||
export type BeatLike = {
|
||||
id?: string;
|
||||
speaker?: string;
|
||||
line?: string;
|
||||
lineDelivery?: string;
|
||||
};
|
||||
// Per-beat synth budget. MiMo's median synth is 3–7s; the tail can spike
|
||||
// to 30–70s under concurrent load. Capping here means a single bad beat
|
||||
// degrades to silent in <15s instead of blocking the whole UI flow.
|
||||
const SYNTH_TIMEOUT_MS = 15000;
|
||||
|
||||
// When the director references a speaker that was never registered, derive a
|
||||
// description from the name + world so the voice's gender/temperament is at
|
||||
@@ -22,85 +20,130 @@ function inferredSpeakerDescription(name: string, session: Session): string {
|
||||
return `请根据角色名「${name}」推断其性别、年龄与气质,生成最贴合的音色。所属世界观:${session.worldSetting}`;
|
||||
}
|
||||
|
||||
// Voice a single beat against a mutable character registry.
|
||||
// Returns the (possibly-extended) registry plus the audio if synthesized.
|
||||
// Narration-only beats and missing-line beats return no audio (VN convention).
|
||||
export async function voiceBeat(
|
||||
cfg: TtsConfig,
|
||||
session: Session,
|
||||
characters: Character[],
|
||||
beat: BeatLike,
|
||||
): Promise<{ audio?: BeatAudio; characters: Character[] }> {
|
||||
if (!beat.speaker || !beat.line) {
|
||||
return { characters };
|
||||
}
|
||||
|
||||
const speakerName = beat.speaker;
|
||||
const text = beat.line;
|
||||
const delivery = beat.lineDelivery;
|
||||
|
||||
// Hoisted so the catch can return the in-progress registry even if synthesis
|
||||
// fails after provisioning succeeded — otherwise the just-provisioned voice
|
||||
// would be lost and the next beat for this speaker would pay to re-design it
|
||||
// (extra cost, latency, and more 429 risk on rate-limited providers).
|
||||
let nextCharacters: Character[] = characters;
|
||||
|
||||
// Race the work against a timer; on either outcome clear the timer (otherwise
|
||||
// the success path leaks a 15s-pending reject closure into Node's timer heap,
|
||||
// per-synth call). On timeout, abort the supplied controller so the underlying
|
||||
// HTTP request is cancelled — otherwise MiMo's 30-70s tail keeps the socket
|
||||
// open and the quota burning long after we've returned audio:null.
|
||||
async function withTimeout<T>(
|
||||
p: Promise<T>,
|
||||
ms: number,
|
||||
label: string,
|
||||
ctrl: AbortController,
|
||||
): Promise<T> {
|
||||
let timer: ReturnType<typeof setTimeout> | undefined;
|
||||
try {
|
||||
const idx = characters.findIndex((c) => c.name === speakerName);
|
||||
let voice: CharacterVoice | undefined;
|
||||
|
||||
if (idx !== -1 && characters[idx]?.voice) {
|
||||
voice = characters[idx]!.voice;
|
||||
} else if (idx !== -1) {
|
||||
const target = characters[idx]!;
|
||||
voice = await provisionVoice(cfg, target.description);
|
||||
nextCharacters = characters.map((c, i) =>
|
||||
i === idx ? { ...c, voice } : c,
|
||||
);
|
||||
} else {
|
||||
const description = inferredSpeakerDescription(speakerName, session);
|
||||
voice = await provisionVoice(cfg, description);
|
||||
nextCharacters = [...characters, { name: speakerName, description, voice }];
|
||||
}
|
||||
|
||||
const { audioBase64, mimeType } = await synthesize(
|
||||
cfg,
|
||||
voice,
|
||||
text,
|
||||
delivery,
|
||||
);
|
||||
return {
|
||||
audio: { base64: audioBase64, mime: mimeType },
|
||||
characters: nextCharacters,
|
||||
};
|
||||
} catch (err) {
|
||||
const msg = err instanceof Error ? err.message : String(err);
|
||||
console.error(`[voice] degraded: ${msg}`);
|
||||
return { characters: nextCharacters };
|
||||
return await Promise.race([
|
||||
p,
|
||||
new Promise<T>((_, reject) => {
|
||||
timer = setTimeout(() => {
|
||||
ctrl.abort();
|
||||
reject(new Error(`${label} timed out after ${ms}ms`));
|
||||
}, ms);
|
||||
}),
|
||||
]);
|
||||
} finally {
|
||||
if (timer) clearTimeout(timer);
|
||||
}
|
||||
}
|
||||
|
||||
// Voice every beat in a scene. Sequential by design: a single speaker
|
||||
// appearing in multiple beats must provision exactly once and share that
|
||||
// voice across calls — parallel synthesis would race and create duplicates.
|
||||
// With 2–6 beats × ~500ms per clone the total cost is well inside the image
|
||||
// generation budget (10s+), so the simplicity is worth it.
|
||||
export async function voiceScene(
|
||||
// Provision voices for all unseen speakers in a scene, in parallel.
|
||||
// Does NOT synthesize per-beat audio — that happens lazily via
|
||||
// synthesizeBeat from the /api/beat-audio route. Returning the populated
|
||||
// registry lets the client fire per-beat synth without re-provisioning.
|
||||
//
|
||||
// Why dedupe before fanning out: the SAME unseen speaker appearing in 3
|
||||
// beats must run voicedesign once; parallel design of the same speaker
|
||||
// would burn three voices' worth of budget and pick whichever raced last.
|
||||
export async function provisionVoicesForScene(
|
||||
cfg: TtsConfig,
|
||||
session: Session,
|
||||
scene: Scene,
|
||||
): Promise<{
|
||||
beatAudio: Record<string, BeatAudio>;
|
||||
characters: Character[];
|
||||
}> {
|
||||
let characters = session.characters;
|
||||
const beatAudio: Record<string, BeatAudio> = {};
|
||||
): Promise<{ characters: Character[] }> {
|
||||
const tScene = Date.now();
|
||||
const speakingBeats = scene.beats.filter(
|
||||
(b): b is typeof b & { speaker: string; line: string } =>
|
||||
Boolean(b.speaker && b.line),
|
||||
);
|
||||
|
||||
for (const beat of scene.beats) {
|
||||
const res = await voiceBeat(cfg, session, characters, beat);
|
||||
characters = res.characters;
|
||||
if (res.audio) beatAudio[beat.id] = res.audio;
|
||||
let characters: Character[] = [...session.characters];
|
||||
const toProvision = new Map<string, string>(); // name -> description
|
||||
for (const b of speakingBeats) {
|
||||
if (toProvision.has(b.speaker)) continue;
|
||||
const existing = characters.find((c) => c.name === b.speaker);
|
||||
if (existing?.voice) continue;
|
||||
toProvision.set(
|
||||
b.speaker,
|
||||
existing?.description ?? inferredSpeakerDescription(b.speaker, session),
|
||||
);
|
||||
}
|
||||
|
||||
return { beatAudio, characters };
|
||||
if (toProvision.size === 0) {
|
||||
console.log(
|
||||
`[voice] provisionVoicesForScene total=${Date.now() - tScene}ms (no new speakers)`,
|
||||
);
|
||||
return { characters };
|
||||
}
|
||||
|
||||
const tProvision = Date.now();
|
||||
const provisioned = await Promise.all(
|
||||
Array.from(toProvision.entries()).map(async ([name, description]) => {
|
||||
try {
|
||||
const voice = await provisionVoice(cfg, description);
|
||||
return { name, description, voice };
|
||||
} catch (err) {
|
||||
const msg = err instanceof Error ? err.message : String(err);
|
||||
console.error(`[voice] provision degraded for ${name}: ${msg}`);
|
||||
return { name, description, voice: undefined };
|
||||
}
|
||||
}),
|
||||
);
|
||||
console.log(
|
||||
`[voice] provision: ${toProvision.size} speakers parallel max=${Date.now() - tProvision}ms`,
|
||||
);
|
||||
|
||||
for (const p of provisioned) {
|
||||
if (!p.voice) continue;
|
||||
const idx = characters.findIndex((c) => c.name === p.name);
|
||||
if (idx === -1) {
|
||||
characters.push({ name: p.name, description: p.description, voice: p.voice });
|
||||
} else {
|
||||
characters[idx] = { ...characters[idx]!, voice: p.voice };
|
||||
}
|
||||
}
|
||||
|
||||
console.log(
|
||||
`[voice] provisionVoicesForScene total=${Date.now() - tScene}ms`,
|
||||
);
|
||||
return { characters };
|
||||
}
|
||||
|
||||
// Synthesize audio for one beat. Caller is expected to have already
// resolved the speaker's voice (from session.characters in the client) —
// passing it directly here keeps the /api/beat-audio payload small and
// makes this function pure with respect to session state.
//
// Parameters:
//   cfg   — TTS configuration forwarded to synthesize().
//   voice — the speaker's already-provisioned voice.
//   beat  — id (used only for log labels), the line to speak, and an
//           optional delivery hint.
//
// Returns the encoded audio, or null when there is nothing to speak, on
// error, or on timeout; caller treats null as "play silence." Never throws
// for synthesis failures — they are logged and degraded to null.
export async function synthesizeBeat(
  cfg: TtsConfig,
  voice: CharacterVoice,
  beat: { id: string; line: string; lineDelivery?: string },
): Promise<BeatAudio | null> {
  // A beat without spoken text has nothing to synthesize. The scene-level
  // provisioning pass already skips such beats; doing the same here avoids
  // spending a network round-trip (and up to the full timeout budget) on a
  // request that cannot produce useful audio. The typeof check covers
  // payloads that arrive from JSON without the field.
  if (typeof beat.line !== "string" || beat.line.trim() === "") return null;

  const startedAt = Date.now();
  // Handed to both synthesize() (as a signal) and withTimeout() (as the
  // controller) so that a timeout can cancel the in-flight request.
  const ctrl = new AbortController();
  try {
    const { audioBase64, mimeType } = await withTimeout(
      synthesize(cfg, voice, beat.line, beat.lineDelivery, ctrl.signal),
      SYNTH_TIMEOUT_MS,
      `synth ${beat.id}`,
      ctrl,
    );
    console.log(` [voice ${beat.id}] synth=${Date.now() - startedAt}ms`);
    return { base64: audioBase64, mime: mimeType };
  } catch (err) {
    const msg = err instanceof Error ? err.message : String(err);
    console.error(
      `[voice] synth degraded for ${beat.id} (after ${Date.now() - startedAt}ms): ${msg}`,
    );
    return null;
  }
}
|
||||
|
||||
Reference in New Issue
Block a user