import { chat } from "@infiplot/ai-client"; import type { Character, EngineConfig, InsertBeatPartial, ProviderConfig, Scene, Session, StoryState, StoryStatePatch, } from "@infiplot/types"; import type { CharacterCard } from "./agents/characterDesigner"; import { designCharacterCard, provisionCharacterVoice, provisionVoiceForName, renderCharacterPortrait, } from "./agents/characterDesigner"; import { runCinematographer } from "./agents/cinematographer"; import { runPainter } from "./agents/painter"; import { collectActiveCharacterNames, isPovName, normalizeSpeakerName, POV_DISPLAY_NAME, runWriter, } from "./agents/writer"; import { parseJsonLoose } from "./jsonParser"; import { INSERT_BEAT_SYSTEM, buildInsertBeatUserMessage } from "./prompts"; // ══════════════════════════════════════════════════════════════════════ // director.ts — multi-agent orchestrator for one full Scene generation. // // Critical path (per Scene call): // // Writer LLM (~3s, serial) // │ // ├─ CharacterCard LLM × N (parallel per new char — TEXT only) // ├─ Cinematographer LLM (parallel with the cards) // │ // └─ wait for cards + cinema // │ // ├─ entry-beat portraits ──┐ (block the Painter — its refs) // ▼ │ // Painter — generateImage │ (overlapped, NOT on the paint path): // with referenceImages ├─ non-entry-beat portraits // │ └─ ALL voice provisioning + orphan voices // ▼ // await the overlapped work, fold into the registry // │ // ▼ // return { scene, sceneImageUrl, characters, storyState } // // Two deliberate decouplings unlock the parallelism: // 1. The Cinematographer only POSITIONS named characters, so it needs no // visualDescription and runs alongside the card LLMs. // 2. The Painter only needs visualDescription TEXT (all on-stage) + the // entry-beat characters' PORTRAITS (its referenceImages). Voices are // never needed to paint, and non-entry portraits are never referenced — // so both overlap the (longest) paint call instead of blocking it. // ══════════════════════════════════════════════════════════════════════ function newSceneId(): string { return `scene_${Date.now()}_${Math.random().toString(36).slice(2, 6)}`; } function tlog(label: string, t0: number): void { console.log(`${label}: ${Date.now() - t0}ms`); } // Merge a freshly-designed Character into a registry, preserving any // previously-set voice/portrait that the new design didn't fill in (so // re-designing a known character can't silently drop their voice or wipe // out an already-generated portrait UUID). Match by name. export function mergeCharacters( existing: Character[], updates: Character[], ): Character[] { if (updates.length === 0) return existing; const byName = new Map(existing.map((c) => [c.name, c])); for (const u of updates) { const prev = byName.get(u.name); if (!prev) { byName.set(u.name, u); continue; } // Preserve any prior provisioned resource that the new design omitted. byName.set(u.name, { ...u, voice: u.voice ?? prev.voice, visualDescription: u.visualDescription ?? prev.visualDescription, basePortraitUrl: u.basePortraitUrl ?? prev.basePortraitUrl, basePortraitUuid: u.basePortraitUuid ?? prev.basePortraitUuid, voiceDescription: u.voiceDescription || prev.voiceDescription, }); } return Array.from(byName.values()); } // Pick a reference to the prior scene image when sceneKey matches a prior // scene — used by the Painter as one of the `referenceImages` (NOT as a // seedImage, because FLUX.2 [klein] 9B KV does not support seedImage). // // Prefer URL over UUID for the same reason painter.collectReferenceImages // does: the UUID returned by `imageInference` isn't always recognized by // Runware's `referenceImages` pipeline, surfacing as `failedToTransferImage`. // The URL is Runware's own CDN link — they can always fetch it. UUID is kept // as a backstop. Returns undefined when no prior scene shares the sceneKey. function pickPriorSceneReference( session: Session, currentSceneKey: string | undefined, ): { priorSceneReference?: string; priorSceneKey?: string } { if (!currentSceneKey) return {}; for (let i = session.history.length - 1; i >= 0; i--) { const prior = session.history[i]!.scene; if (prior.sceneKey === currentSceneKey) { const ref = prior.imageUrl ?? prior.imageUuid; if (ref) { return { priorSceneReference: ref, priorSceneKey: prior.sceneKey }; } } } return {}; } // Merge the Writer's volatile story-memory patch onto the carried StoryState. // The stable spine (logline/genreTags/protagonist/castNotes) is preserved; // only the volatile fields the Writer is allowed to rewrite are overwritten, // and only when the patch actually provided them. A missing carried state // (legacy session from before the Architect existed) degrades to an empty // spine rather than throwing. function applyStoryStatePatch( base: StoryState | undefined, patch: StoryStatePatch | undefined, ): StoryState { const start: StoryState = base ?? { logline: "", genreTags: "", protagonist: "", synopsis: "" }; if (!patch) return start; return { ...start, synopsis: patch.synopsis ?? start.synopsis, openThreads: patch.openThreads ?? start.openThreads, relationships: patch.relationships ?? start.relationships, nextHook: patch.nextHook ?? start.nextHook, }; } export type SceneResult = { scene: Scene; sceneImageUrl: string; characters: Character[]; storyState: StoryState; }; // ────────────────────────────────────────────────────────────────────── // directScene — the multi-agent pipeline. Used by orchestrator's // startSession and requestScene. // ────────────────────────────────────────────────────────────────────── export async function directScene( config: EngineConfig, session: Session, ): Promise { const tTotal = Date.now(); // Stage 1 — Writer (serial; everything downstream needs sceneSummary + // beats[] to know who's on stage and what to compose around). const tWriter = Date.now(); const writerOut = await runWriter(config.text, session); tlog("[directScene] Writer", tWriter); // Identify NEW characters introduced by this scene that need to be // designed (LLM + portrait + voice). Existing characters in the registry // are skipped — their cards / portraits / voices persist across scenes. const allActiveNames = collectActiveCharacterNames(writerOut.beats); const newCharNames = allActiveNames.filter( (n) => !session.characters.some((c) => c.name === n), ); // Find the entry beat for the Cinematographer (which characters are // on-screen in the establishing shot). const entryBeat = writerOut.beats.find((b) => b.id === writerOut.entryBeatId); const entryBeatActive = entryBeat?.activeCharacters ?? []; // For sceneKey-based visual continuity, look up the prior matching scene's // image to slot into Painter's referenceImages (max 4 of which include // character portraits too). const { priorSceneReference, priorSceneKey } = pickPriorSceneReference( session, writerOut.sceneKey, ); // ── Stage 2 — character cards (LLM) ∥ Cinematographer ────────────────── // Both are cheap LLM calls and neither needs the other's output, so they // run concurrently. The cards give us each new character's visualDescription // TEXT; portraits + voices are deferred to Stage 3 so they can overlap the // paint instead of blocking it. const tParallel = Date.now(); const cardPromises = newCharNames.map((name) => designCharacterCard(config, session, name).catch((err): CharacterCard => { const msg = err instanceof Error ? err.message : String(err); console.error(`[directScene] designCharacterCard(${name}) failed: ${msg}`); // Last-resort fallback: a name + generic voice card so the speaker isn't // unknown. No visualDescription → no portrait is attempted for them. return { name, voiceDescription: `请根据角色名「${name}」推断其性别、年龄与气质。所属世界观:${session.worldSetting}`, }; }), ); const cinemaPromise = runCinematographer(config.text, { sceneSummary: writerOut.sceneSummary, styleGuide: session.styleGuide, entryBeatActive, entryBeatSpeaker: entryBeat?.speaker, priorSceneKey, currentSceneKey: writerOut.sceneKey, }); const [cards, cinemaOut] = await Promise.all([ Promise.all(cardPromises), cinemaPromise, ]); tlog("[directScene] CharacterCards+Cinematographer parallel", tParallel); // Working registry: existing characters + new cards. visualDescription text // is present now; portraits + voices fill in over the next two phases. let characters = mergeCharacters( session.characters, cards.map((c) => ({ name: c.name, voiceDescription: c.voiceDescription, visualDescription: c.visualDescription, })), ); // ── Stage 3 — portraits + voices, scheduled around the Painter ───────── const tProvision = Date.now(); // Entry-beat character names: the ONLY portraits the Painter references // (collectReferenceImages slots in the entry beat's speaker + activeChars). const entryNames = new Set(); if (entryBeat?.speaker && !isPovName(entryBeat.speaker)) { entryNames.add(entryBeat.speaker); } for (const c of entryBeatActive) { if (!isPovName(c.name)) entryNames.add(c.name); } type NamedPortrait = { name: string; basePortraitUrl?: string; basePortraitUuid?: string; }; // Kick off portrait gen for every NEW char that has a visualDescription. // Entry-beat portraits block the Painter; the rest overlap it. const entryPortraitPromises: Promise[] = []; const restPortraitPromises: Promise[] = []; for (const card of cards) { const vd = card.visualDescription; if (!vd) continue; const p = renderCharacterPortrait( config, card.name, vd, session.styleGuide, ).then((res): NamedPortrait => ({ name: card.name, ...res })); (entryNames.has(card.name) ? entryPortraitPromises : restPortraitPromises).push(p); } // Kick off voice provisioning for every NEW char (never on the paint path). const voicePromises = cards.map((card) => provisionCharacterVoice(config, card.voiceDescription, card.name).then( (voice): Character => ({ name: card.name, voiceDescription: card.voiceDescription, voice, }), ), ); // Edge case: a speaker the Writer referenced without listing in any beat's // activeCharacters. collectActiveCharacterNames already includes speakers, // so this is a rare defensive net. Provision a voice only (never on-screen). const speakerNames = new Set( writerOut.beats.map((b) => b.speaker).filter((n): n is string => Boolean(n)), ); const orphanSpeakers = [...speakerNames].filter( // Pattern B: "你" (player) is a valid speaker but never gets a Character // record — TTS is intentionally skipped on the client. (n) => !isPovName(n) && !characters.some((c) => c.name === n) && !cards.some((c) => c.name === n), ); const orphanPromises = orphanSpeakers.map((n) => provisionVoiceForName(config, session, n), ); // Block the Painter ONLY on entry-beat portraits (its referenceImages). const entryPortraits = await Promise.all(entryPortraitPromises); characters = mergeCharacters( characters, entryPortraits.map((p) => ({ name: p.name, voiceDescription: "", // preserved from the card by mergeCharacters basePortraitUrl: p.basePortraitUrl, basePortraitUuid: p.basePortraitUuid, })), ); tlog("[directScene] entry-beat portraits", tProvision); // ── Stage 4 — Painter (depends on cinemaOut + on-stage visual cards + // entry portraits). On-stage = everyone named in any beat, so the archetype // block covers anyone the player might encounter in this scene. const onStageCharacters = characters.filter((c) => allActiveNames.includes(c.name), ); const tPainter = Date.now(); const painted = await runPainter( config, { integratedPrompt: cinemaOut.integratedPrompt, styleGuide: session.styleGuide, onStageCharacters, priorSceneImage: priorSceneReference, styleReferenceImage: session.styleReferenceImage, }, entryBeat, ); tlog("[directScene] Painter", tPainter); // Fold in the work that overlapped the paint: remaining portraits, all // voices, and any orphan-speaker voices. Awaited before returning so the // session the client persists is fully provisioned for later scenes. const tOverlap = Date.now(); const [restPortraits, voicedChars, orphanChars] = await Promise.all([ Promise.all(restPortraitPromises), Promise.all(voicePromises), Promise.all(orphanPromises), ]); characters = mergeCharacters( characters, restPortraits.map((p) => ({ name: p.name, voiceDescription: "", basePortraitUrl: p.basePortraitUrl, basePortraitUuid: p.basePortraitUuid, })), ); characters = mergeCharacters(characters, voicedChars); if (orphanChars.length > 0) { characters = mergeCharacters(characters, orphanChars); } tlog("[directScene] overlapped portraits+voices", tOverlap); const scene: Scene = { id: newSceneId(), // scenePrompt is the cinematographer's English compositional output; // the Writer's sceneSummary stays in the session log via beats[]/ // history. Keeping the original field name preserves compat with // anything that already reads scene.scenePrompt (e.g., insert-beat // user prompt). scenePrompt: cinemaOut.integratedPrompt, beats: writerOut.beats, entryBeatId: writerOut.entryBeatId, sceneKey: writerOut.sceneKey, imageUuid: painted.kind === "real" ? painted.imageUuid : undefined, imageUrl: painted.imageUrl, }; // Merge the Writer's volatile memory rewrite onto the carried bible so the // throughline survives the next scene cut (orchestrator returns it; the // client persists it back into the session). const storyState = applyStoryStatePatch( session.storyState, writerOut.storyStatePatch, ); tlog("[directScene] TOTAL", tTotal); return { scene, sceneImageUrl: painted.imageUrl, characters, storyState }; } // ────────────────────────────────────────────────────────────────────── // directInsertBeat — single-agent path for vision-driven in-scene // exploration. Generates ONE transient beat with NO new image, NO new // characters. Multi-agent pipeline doesn't apply here (no rendering, no // character introduction allowed by the prompt). // ────────────────────────────────────────────────────────────────────── export async function directInsertBeat( config: ProviderConfig, session: Session, freeformAction: string, ): Promise { const raw = await chat( config, [ { role: "system", content: INSERT_BEAT_SYSTEM }, { role: "user", content: buildInsertBeatUserMessage(session, freeformAction), }, ], { temperature: 0.9, responseFormat: "json_object", tag: "insert-beat" }, ); const parsed = parseJsonLoose(raw); const narration = parsed.narration?.trim() || undefined; const rawSpeaker = parsed.speaker?.trim() || undefined; // Pattern B (mirrors Writer): normalize POV variants → "你"; NPCs pass through. const speaker = rawSpeaker ? normalizeSpeakerName(rawSpeaker) : undefined; const line = parsed.line?.trim() || undefined; // lineDelivery is only meaningful for NPC speakers (TTS). For POV ("你") // TTS is intentionally skipped on the client, so lineDelivery is dropped. const lineDelivery = line && speaker !== POV_DISPLAY_NAME ? parsed.lineDelivery?.trim() || undefined : undefined; if (!narration && !speaker && !line) { return { narration: "(你停下脚步,环视片刻。)" }; } return { narration, speaker, line, lineDelivery }; }