import { chat } from "@infiplot/ai-client";
import { coerceOrientation } from "@infiplot/types";
import type {
Beat,
BeatChoice,
Character,
CharacterIntent,
EngineConfig,
InsertBeatMulti,
InsertBeatPartial,
ProviderConfig,
Scene,
SceneStreamEvent,
Session,
StoryState,
StoryStatePatch,
WriterScenePlan,
} from "@infiplot/types";
import type { CharacterCard } from "./agents/characterDesigner";
import {
designCharacterCard,
provisionCharacterVoice,
provisionVoiceForName,
renderCharacterPortrait,
} from "./agents/characterDesigner";
import { runCinematographer } from "./agents/cinematographer";
import { runPainter } from "./agents/painter";
import type { WriterBeatsOutput } from "./agents/writer";
import {
coercePlanFromRaw,
isPovName,
normalizeSpeakerName,
POV_DISPLAY_NAME,
runWriterStream,
} from "./agents/writer";
import { routeTaggedStream } from "./stream";
import { splitProseToBeats } from "./stream/proseSplitter";
import { parseJsonLoose } from "./jsonParser";
import { INSERT_BEAT_SYSTEM, buildInsertBeatUserMessage } from "./prompts";
// ══════════════════════════════════════════════════════════════════════
// director.ts — multi-agent orchestrator for one full Scene generation.
//
// Critical path (per Scene call):
//
// Writer PHASE A — plan LLM (scene skeleton only, serial)
// │
// ├──────────────────────────┬───────────────────────────────────────┐
// ▼ ▼ │
// Writer PHASE B image pipeline (concurrent): │
// beats LLM CharacterCard LLM × N ∥ Cinematographer │
// (full dialogue, → entry-beat portraits (block Painter) │
// overlapped) → Painter (generateImage w/ refs) │
// │ → await overlapped: rest portraits+voices │
// └──────────────────────────► await Phase B ◄────────────────────────┘
// ▼
// assemble Scene → { scene, sceneImageUrl, characters, storyState }
//
// Why split the Writer (the latency win): the image pipeline only needs the
// scene SUMMARY + entry roster + cast (Phase A) — NOT the dialogue (Phase B).
// Writing beats used to sit serially in FRONT of the image; now it overlaps
// it, so the floor is max(beats, image) instead of beats + image.
//
// The decouplings that unlock the rest of the parallelism:
// 1. The Cinematographer only POSITIONS named characters, so it needs no
// visualDescription and runs alongside the card LLMs.
// 2. The Painter only needs visualDescription TEXT (all on-stage) + the
// entry-beat characters' PORTRAITS (its referenceImages). Voices are
// never needed to paint, and non-entry portraits are never referenced —
// so both overlap the (longest) paint call instead of blocking it.
// ══════════════════════════════════════════════════════════════════════
/**
 * Builds a scene identifier of the form `scene_<epoch-ms>_<suffix>`, where
 * the suffix is up to four base-36 characters taken from `Math.random()`.
 */
function newSceneId(): string {
  const timestamp = Date.now();
  const suffix = Math.random().toString(36).slice(2, 6);
  return ["scene", timestamp, suffix].join("_");
}
/**
 * Logs how many milliseconds have elapsed since `t0`, prefixed by `label`.
 *
 * @param label Text printed before the elapsed time.
 * @param t0    Start timestamp in epoch milliseconds (a `Date.now()` value).
 */
function tlog(label: string, t0: number): void {
  const elapsedMs = Date.now() - t0;
  console.log(label + ": " + elapsedMs + "ms");
}
/**
 * Merges freshly designed/provisioned characters into a registry, matching
 * entries by `name`.
 *
 * An update for a name that is not yet in the registry is appended as-is.
 * An update for a known name replaces the entry, but every provisioned
 * resource the update leaves unset (voice, StepFun voice id, portrait,
 * visual/voice description, persona fields) is carried over from the
 * previous entry, so a partial update (e.g. portrait-only) cannot silently
 * wipe out a voice or an already-generated portrait.
 *
 * @param existing Current registry. Returned unchanged (same reference) when
 *                 `updates` is empty; never mutated.
 * @param updates  New or partial character records to fold in, in order.
 * @returns The merged registry: existing entries keep their position, new
 *          names are appended in the order they first appear.
 */
export function mergeCharacters(
  existing: Character[],
  updates: Character[],
): Character[] {
  if (updates.length === 0) return existing;

  const byName = new Map<string, Character>(
    existing.map((c): [string, Character] => [c.name, c]),
  );

  for (const u of updates) {
    const prev = byName.get(u.name);
    if (!prev) {
      byName.set(u.name, u);
      continue;
    }
    // Preserve any prior provisioned resource that the new design omitted.
    byName.set(u.name, {
      ...u,
      voice: u.voice ?? prev.voice,
      // The StepFun catalog pick belongs with the voice: a later update that
      // carries no voice data must not drop it.
      stepfunVoiceId: u.stepfunVoiceId ?? prev.stepfunVoiceId,
      visualDescription: u.visualDescription ?? prev.visualDescription,
      basePortraitUrl: u.basePortraitUrl ?? prev.basePortraitUrl,
      basePortraitUuid: u.basePortraitUuid ?? prev.basePortraitUuid,
      // `||` (not `??`) on purpose: portrait-only updates pass an empty
      // string here, which must count as "not provided".
      voiceDescription: u.voiceDescription || prev.voiceDescription,
      // Paradigm D: preserve persona fields when later designs omit them
      // (same logic as portrait/voice preservation).
      persona: u.persona ?? prev.persona,
      personalityTraits: u.personalityTraits ?? prev.personalityTraits,
      speakingStyle: u.speakingStyle ?? prev.speakingStyle,
      sampleDialogue: u.sampleDialogue ?? prev.sampleDialogue,
      relationshipToPlayer: u.relationshipToPlayer ?? prev.relationshipToPlayer,
      secrets: u.secrets ?? prev.secrets,
    });
  }

  return Array.from(byName.values());
}
/**
 * Finds a reference image from the most recent prior scene that shares
 * `currentSceneKey`. The Painter uses it as one of its `referenceImages`
 * (NOT as a seedImage, because FLUX.2 [klein] 9B KV does not support
 * seedImage).
 *
 * The scene's URL is preferred over its UUID, for the same reason
 * painter.collectReferenceImages prefers it: the UUID returned by
 * `imageInference` isn't always recognized by Runware's `referenceImages`
 * pipeline (surfacing as `failedToTransferImage`), whereas the URL is
 * Runware's own CDN link. The UUID is only a backstop.
 *
 * @param session         Session whose `history` is scanned newest-first.
 * @param currentSceneKey Scene key of the scene being generated.
 * @returns The reference and the matched key, or `{}` when the key is empty
 *          or no matching prior scene carries a URL or UUID.
 */
function pickPriorSceneReference(
  session: Session,
  currentSceneKey: string | undefined,
): { priorSceneReference?: string; priorSceneKey?: string } {
  if (!currentSceneKey) return {};

  let idx = session.history.length;
  while (idx-- > 0) {
    const { scene } = session.history[idx]!;
    if (scene.sceneKey !== currentSceneKey) continue;

    const reference = scene.imageUrl ?? scene.imageUuid;
    // A matching scene without any image is skipped; keep looking further back.
    if (!reference) continue;

    return { priorSceneReference: reference, priorSceneKey: scene.sceneKey };
  }
  return {};
}
/**
 * Overlays the Writer's volatile story-memory patch onto the carried
 * StoryState.
 *
 * The stable spine (logline/genreTags/protagonist/castNotes) is kept as-is;
 * only `synopsis`, `openThreads`, `relationships` and `nextHook` can be
 * rewritten, and each only when the patch actually supplies it.
 *
 * @param base  Carried state. When missing (legacy session from before the
 *              Architect existed) an empty spine is used instead of throwing.
 * @param patch Volatile fields produced for this scene, if any.
 * @returns `base` (or the empty spine) itself when there is no patch,
 *          otherwise a new object with the patch applied.
 */
function applyStoryStatePatch(
  base: StoryState | undefined,
  patch: StoryStatePatch | undefined,
): StoryState {
  const emptySpine: StoryState = {
    logline: "",
    genreTags: "",
    protagonist: "",
    synopsis: "",
  };
  const carried: StoryState = base ?? emptySpine;
  if (!patch) return carried;

  const { synopsis, openThreads, relationships, nextHook } = patch;
  return {
    ...carried,
    synopsis: synopsis ?? carried.synopsis,
    openThreads: openThreads ?? carried.openThreads,
    relationships: relationships ?? carried.relationships,
    nextHook: nextHook ?? carried.nextHook,
  };
}
/**
 * Shape of the object `directScene` returns once a scene has been fully
 * assembled.
 */
export type SceneResult = {
/** The assembled scene (beats, entry beat id, scene key, image, orientation). */
scene: Scene;
/** URL of the painted scene image; `directScene` fills it from the Painter's `imageUrl`. */
sceneImageUrl: string;
/** Character registry after merging this scene's cards, portraits and voices. */
characters: Character[];
/** Story memory after applying this scene's volatile patch. */
storyState: StoryState;
};
/**
 * Absolute-worst-case plan, used when the Writer stream produced no usable
 * plan at all (the stream router degraded with nothing extractable).
 * It has an empty cast and no entry speaker, so the pipeline stays alive
 * without designing or voicing any character.
 *
 * @returns A fresh plan object on every call.
 */
function minimalFallbackPlan(): WriterScenePlan {
  const fallback: WriterScenePlan = {
    sceneSummary: "未指定场景概要",
    sceneKey: undefined,
    entryBeatId: "b1",
    cast: [],
    entryActiveCharacters: [],
    entrySpeaker: undefined,
  };
  return fallback;
}
// ──────────────────────────────────────────────────────────────────────
// directScene — the multi-agent pipeline. Used by orchestrator's
// startSession and requestScene.
// ──────────────────────────────────────────────────────────────────────
/**
 * Generates one complete Scene from a single Writer stream.
 *
 * The Writer stream is routed into a plan segment, a story (prose) segment
 * and a choices segment. As soon as the plan is available the image pipeline
 * (character cards ∥ Cinematographer → entry portraits → Painter) starts,
 * overlapping with the still-streaming story. Voices and non-entry portraits
 * are provisioned in the background and collected after the paint call.
 *
 * @param config  Engine configuration; `config.text` drives the Writer and
 *                the Cinematographer, the full config is handed to the
 *                character designer, portrait renderer, voice provisioning
 *                and the Painter.
 * @param session Session being continued (characters, history, style guide,
 *                orientation, carried story state).
 * @param emit    Optional sink for progressive events. This function emits
 *                `plan`, `beat`, `voice`, `background` and `choices` events.
 * @returns The assembled scene, its image URL, the merged character registry
 *          and the updated story state.
 * @throws Re-throws a failure of the Writer stream routing, and any failure
 *         of the awaited pipeline stages (Cinematographer, portraits,
 *         Painter, voices). Throws an `Error` when the Writer yields no
 *         beats at all.
 */
export async function directScene(
  config: EngineConfig,
  session: Session,
  emit?: (event: SceneStreamEvent) => void,
): Promise<SceneResult> {
  const tTotal = Date.now();

  // Background promises are awaited much later than they are created. If one
  // rejects in the meantime (or this function throws before reaching the
  // await), the rejection would otherwise be reported as unhandled. A no-op
  // handler marks it as observed; the real `await` still sees the rejection.
  const markHandled = (p: Promise<unknown>): void => {
    void p.catch(() => undefined);
  };

  // ══════════════════════════════════════════════════════════════════════
  // Paradigm D — single Writer stream + StreamRouter dispatch
  //
  // One LLM call produces the plan segment, then the story segment, then the
  // choices segment. The StreamRouter cuts the segments apart; the end of the
  // plan segment resolves the plan deferred, unlocking the downstream image
  // pipeline IN PARALLEL with the still-streaming story. Prose is split into
  // Beat[] after routing completes.
  // ══════════════════════════════════════════════════════════════════════

  // ── Step 1 — kick off the Writer stream + routing ─────────────────
  const tStream = Date.now();
  const writerResult = runWriterStream(config.text, session);

  // Deferred that settles when onPlan fires, when routing completes without
  // a plan (degraded fallback), or — rejected — when routing itself fails
  // before any plan arrived.
  let planSettled = false;
  let resolvePlan!: (p: WriterScenePlan) => void;
  let rejectPlan!: (reason: unknown) => void;
  const planPromise = new Promise<WriterScenePlan>((res, rej) => {
    resolvePlan = res;
    rejectPlan = rej;
  });

  // Closure-captured coerced plan so onStoryComplete can split+emit beats
  // DURING streaming (before painter finishes → text-first progressive play).
  let coercedPlanRef: WriterScenePlan | undefined;
  let earlyBeatsOut: WriterBeatsOutput | undefined;
  // Opening-scene story bible from the Writer's plan (replaces the old
  // Architect). Undefined on subsequent scenes (carried StoryState wins).
  let bibleFromPlan: WriterScenePlan["storyBible"];

  const routingPromise = routeTaggedStream(writerResult.textStream, {
    onPlan: (rawPlan) => {
      try {
        const coerced = coercePlanFromRaw(rawPlan as unknown as Record<string, unknown>);
        coercedPlanRef = coerced;
        if (coerced.storyBible) bibleFromPlan = coerced.storyBible;
        planSettled = true;
        emit?.({ type: "plan", plan: coerced });
        resolvePlan(coerced);
      } catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        console.warn(`[directScene] onPlan failed: ${msg}`);
        planSettled = true;
        // If coercion succeeded and only the emit threw, keep the real plan so
        // the image pipeline and the beat splitter agree on the same plan.
        resolvePlan(coercedPlanRef ?? minimalFallbackPlan());
      }
    },
    onStoryComplete: (rawStory) => {
      // Segments are ordered (plan before story), so the plan is already coerced.
      const p = coercedPlanRef ?? minimalFallbackPlan();
      try {
        const out = splitProseToBeats(rawStory, p);
        earlyBeatsOut = out;
        for (const b of out.beats) emit?.({ type: "beat", beat: b });
      } catch {
        // split failure → Step 6 re-splits from rawStorySegment
      }
    },
  }).then(
    (result) => {
      // If the plan never fired (no plan segment), settle the deferred from
      // the degraded extraction or a minimal fallback.
      if (!planSettled) {
        let extracted: WriterScenePlan;
        try {
          extracted = result.plan
            ? coercePlanFromRaw(result.plan as unknown as Record<string, unknown>)
            : minimalFallbackPlan();
        } catch (err) {
          // Coercion can throw (onPlan guards the same call). Without this
          // guard the deferred would never settle and Step 2 would hang.
          const msg = err instanceof Error ? err.message : String(err);
          console.warn(`[directScene] degraded plan coercion failed: ${msg}`);
          extracted = minimalFallbackPlan();
        }
        if (extracted.storyBible) bibleFromPlan = extracted.storyBible;
        planSettled = true;
        resolvePlan(extracted);
      }
      return result;
    },
    (err: unknown) => {
      // Routing failed outright. If no plan was delivered yet, fail Step 2
      // immediately instead of leaving it waiting forever.
      if (!planSettled) {
        planSettled = true;
        rejectPlan(err);
      }
      throw err;
    },
  );
  markHandled(routingPromise);

  // ── Step 2 — await plan (settles when the plan segment closes — EARLY) ──
  const plan = await planPromise;
  tlog("[directScene] plan (stream → )", tStream);

  // From here the pipeline is structurally identical to the old Phase A
  // flow: plan drives character design + cinematographer + painter, all
  // overlapping with the Writer's still-streaming story.
  const newCharNames = plan.cast.filter(
    (n) => !session.characters.some((c) => c.name === n),
  );
  const entryBeatActive = plan.entryActiveCharacters;
  const entryBeatSpeaker = plan.entrySpeaker;
  const entryBeatForPaint: Beat = {
    id: plan.entryBeatId,
    speaker: entryBeatSpeaker,
    activeCharacters: entryBeatActive.length > 0 ? entryBeatActive : undefined,
    next: { type: "continue", nextBeatId: plan.entryBeatId },
  };
  const { priorSceneReference, priorSceneKey } = pickPriorSceneReference(
    session,
    plan.sceneKey,
  );

  // ── Step 3 — character cards (LLM) ∥ Cinematographer (parallel) ───
  // CharacterDesigner now receives the Writer's intent for each character
  // (paradigm D: media translator, not inventor).
  const tParallel = Date.now();
  const findIntent = (name: string): CharacterIntent | undefined =>
    plan.characterIntents?.find((ci) => ci.name === name);
  const cardPromises = newCharNames.map((name) =>
    designCharacterCard(config, session, name, findIntent(name)).catch(
      (err): CharacterCard => {
        // A failed card degrades to a name-only card so the scene still builds.
        const msg = err instanceof Error ? err.message : String(err);
        console.error(`[directScene] designCharacterCard(${name}) failed: ${msg}`);
        return {
          name,
          voiceDescription: `请根据角色名「${name}」推断其性别、年龄与气质。所属世界观:${session.worldSetting}`,
        };
      },
    ),
  );
  const cinemaPromise = runCinematographer(config.text, {
    sceneSummary: plan.sceneSummary,
    styleGuide: session.styleGuide,
    entryBeatActive,
    entryBeatSpeaker,
    priorSceneKey,
    currentSceneKey: plan.sceneKey,
  });
  const [cards, cinemaOut] = await Promise.all([
    Promise.all(cardPromises),
    cinemaPromise,
  ]);
  tlog("[directScene] CharacterCards+Cinematographer parallel", tParallel);

  let characters = mergeCharacters(
    session.characters,
    cards.map((c) => ({
      name: c.name,
      voiceDescription: c.voiceDescription,
      visualDescription: c.visualDescription,
    })),
  );

  // ── Step 4 — portraits + voices, scheduled around Painter ─────────
  const tProvision = Date.now();
  // Names whose portraits the Painter references: the entry beat's speaker
  // and active characters, excluding the player's point-of-view character.
  const entryNames = new Set<string>();
  if (entryBeatSpeaker && !isPovName(entryBeatSpeaker)) {
    entryNames.add(entryBeatSpeaker);
  }
  for (const c of entryBeatActive) {
    if (!isPovName(c.name)) entryNames.add(c.name);
  }

  type NamedPortrait = {
    name: string;
    basePortraitUrl?: string;
    basePortraitUuid?: string;
  };
  const entryPortraitPromises: Promise<NamedPortrait>[] = [];
  const restPortraitPromises: Promise<NamedPortrait>[] = [];
  for (const card of cards) {
    const vd = card.visualDescription;
    if (!vd) continue;
    const p = renderCharacterPortrait(
      config,
      card.name,
      vd,
      session.styleGuide,
    ).then((res): NamedPortrait => ({ name: card.name, ...res }));
    // Entry-beat portraits block the Painter; the rest overlap with it.
    (entryNames.has(card.name) ? entryPortraitPromises : restPortraitPromises).push(p);
  }

  // Kick off voice provisioning for every NEW char (never on the paint path).
  // On the StepFun path, thread the LLM-selected stepfunVoiceId from the card
  // into provision — it lets stepfunProvision honor the catalog pick instead
  // of falling back to the keyword scorer (same network cost: still zero).
  const voicePromises = cards.map((card) =>
    provisionCharacterVoice(config, card.voiceDescription, card.name, {
      stepfunVoiceId: card.stepfunVoiceId,
    }).then(
      (voice): Character => {
        const result: Character = {
          name: card.name,
          voiceDescription: card.voiceDescription,
          voice,
          stepfunVoiceId: card.stepfunVoiceId,
        };
        if (voice) emit?.({ type: "voice", name: card.name, voice });
        return result;
      },
    ),
  );

  // These are not awaited until after the Painter returns.
  restPortraitPromises.forEach(markHandled);
  voicePromises.forEach(markHandled);

  const entryPortraits = await Promise.all(entryPortraitPromises);
  characters = mergeCharacters(
    characters,
    entryPortraits.map((p) => ({
      name: p.name,
      voiceDescription: "",
      basePortraitUrl: p.basePortraitUrl,
      basePortraitUuid: p.basePortraitUuid,
    })),
  );
  tlog("[directScene] entry-beat portraits", tProvision);

  // ── Step 5 — Painter ──────────────────────────────────────────────
  const onStageCharacters = characters.filter((c) => plan.cast.includes(c.name));
  const orientation = coerceOrientation(session.orientation);
  const tPainter = Date.now();
  const painted = await runPainter(
    config,
    {
      integratedPrompt: cinemaOut.integratedPrompt,
      styleGuide: session.styleGuide,
      onStageCharacters,
      priorSceneImage: priorSceneReference,
      styleReferenceImage: session.styleReferenceImage,
      orientation,
    },
    entryBeatForPaint,
  );
  tlog("[directScene] Painter", tPainter);

  // Emit background as soon as it's painted — the client can swap the
  // placeholder for the real scene image while beats/voices are still settling.
  emit?.({ type: "background", imageUrl: painted.imageUrl, sceneKey: plan.sceneKey });

  // Overlapped: rest portraits + voices
  const tOverlap = Date.now();
  const [restPortraits, voicedChars] = await Promise.all([
    Promise.all(restPortraitPromises),
    Promise.all(voicePromises),
  ]);
  characters = mergeCharacters(
    characters,
    restPortraits.map((p) => ({
      name: p.name,
      voiceDescription: "",
      basePortraitUrl: p.basePortraitUrl,
      basePortraitUuid: p.basePortraitUuid,
    })),
  );
  characters = mergeCharacters(characters, voicedChars);
  tlog("[directScene] overlapped portraits+voices", tOverlap);

  // ── Step 6 — await routing completion + split prose into beats ────
  // routeTaggedStream ran concurrently with the entire image pipeline.
  // onStoryComplete likely already fired (splitting + emitting beats for
  // progressive playback); this await retrieves the final result + rawStorySegment.
  const streamResult = await routingPromise;

  // Reuse early-split beats when available (onStoryComplete path); otherwise
  // split from rawStorySegment (degrade / onStoryComplete missed).
  const beatsOut: WriterBeatsOutput = earlyBeatsOut
    ?? splitProseToBeats(streamResult.rawStorySegment ?? "", plan);
  let beats = beatsOut.beats;

  // If earlyBeatsOut was missed but rawStorySegment is available, emit beats
  // now (late but still before done — the client gets them for rendering).
  if (!earlyBeatsOut && beats.length > 0) {
    for (const b of beats) emit?.({ type: "beat", beat: b });
  }

  // Emit choices (from streamResult or from the last beat's choice exits).
  if (streamResult.choices?.length) {
    emit?.({ type: "choices", choices: streamResult.choices });
  }

  // ── C1-ext: merge the choices segment into the last beat's `next` ────
  // The Writer's choices segment produces scene-level exits that are NOT
  // embedded in the beats graph. Attach them to the final beat so the player
  // can actually pick them.
  //
  // IMPORTANT: Only change-scene exits are valid here. The prose paradigm
  // assigns beat ids automatically (b1, b2, ...) in proseSplitter — the LLM
  // has no knowledge of these ids, so any advance-beat targetBeatId it emits
  // in the choices segment will point at the wrong beat, causing a loop.
  if (streamResult.choices?.length && beats.length > 0) {
    const validChoices = streamResult.choices.filter(
      (c): c is BeatChoice =>
        typeof c.label === "string" &&
        c.label.length > 0 &&
        c.effect != null &&
        c.effect.kind === "change-scene",
    );
    if (validChoices.length > 0) {
      const withIds = validChoices.map((c, i) => ({
        ...c,
        id: c.id || `sc${i + 1}`,
      }));
      const lastIdx = beats.length - 1;
      const last = beats[lastIdx]!;
      const existing =
        last.next.type === "choice" ? last.next.choices : [];
      // A lone "继续" choice is a placeholder: replace it rather than
      // offering it next to the real exits.
      const isFallbackOnly =
        existing.length <= 1 &&
        existing.every((c) => c.label === "继续");
      const merged = isFallbackOnly ? withIds : [...existing, ...withIds];
      // De-duplicate by label, keeping the first occurrence.
      const seen = new Set<string>();
      const deduped = merged.filter((c) => {
        if (seen.has(c.label)) return false;
        seen.add(c.label);
        return true;
      });
      beats = beats.map((b, i) =>
        i === lastIdx
          ? { ...b, next: { type: "choice" as const, choices: deduped } }
          : b,
      );
    }
  }

  if (streamResult.degraded) {
    console.warn("[directScene] Writer stream was degraded — beats may be fallback");
  }

  // A scene needs at least one beat to have an entry point. Fail with a
  // clear message rather than a TypeError from indexing an empty array.
  const firstBeat = beats[0];
  if (!firstBeat) {
    throw new Error("[directScene] Writer produced no beats — cannot assemble a Scene");
  }
  const entryBeatId = beats.some((b) => b.id === plan.entryBeatId)
    ? plan.entryBeatId
    : firstBeat.id;

  // Orphan-speaker voices (defensive net — should be rare).
  const orphanSpeakers = [
    ...new Set(beats.map((b) => b.speaker).filter((n): n is string => Boolean(n))),
  ].filter((n) => !isPovName(n) && !characters.some((c) => c.name === n));
  if (orphanSpeakers.length > 0) {
    const orphanChars = await Promise.all(
      orphanSpeakers.map((n) => provisionVoiceForName(config, session, n)),
    );
    characters = mergeCharacters(characters, orphanChars);
    // Emit orphan voices so the client can preload their audio.
    for (const oc of orphanChars) {
      if (oc.voice) emit?.({ type: "voice", name: oc.name, voice: oc.voice });
    }
  }

  const scene: Scene = {
    id: newSceneId(),
    scenePrompt: cinemaOut.integratedPrompt,
    beats,
    entryBeatId,
    sceneKey: plan.sceneKey,
    imageUuid: painted.kind === "real" ? painted.imageUuid : undefined,
    imageUrl: painted.imageUrl,
    orientation,
  };

  // storyState: opening scene seeds the stable spine from the Writer's
  // storyBible (replacing the old Architect); subsequent scenes carry the
  // existing spine. Volatile fields always come from this scene's patch.
  const baseStoryState: StoryState | undefined = session.storyState
    ?? (bibleFromPlan
      ? {
          logline: bibleFromPlan.logline,
          genreTags: bibleFromPlan.genreTags,
          protagonist: bibleFromPlan.protagonist,
          castNotes: bibleFromPlan.castNotes,
          synopsis: "",
        }
      : undefined);
  const storyState = applyStoryStatePatch(
    baseStoryState,
    beatsOut.storyStatePatch,
  );

  tlog("[directScene] TOTAL", tTotal);
  return { scene, sceneImageUrl: painted.imageUrl, characters, storyState };
}
// ──────────────────────────────────────────────────────────────────────
// directInsertBeat — single-agent path for in-scene exploration.
// Generates 1-3 beats with NO new image, NO new characters, plus
// follow-up choices so the player isn't dumped back to the old options.
// ──────────────────────────────────────────────────────────────────────
/**
 * Normalizes one raw, LLM-produced beat object into an `InsertBeatPartial`.
 *
 * Every field is accepted only when it is a string that is non-empty after
 * trimming; anything else becomes `undefined`. The speaker is passed through
 * `normalizeSpeakerName`. `lineDelivery` is kept only when there is a line
 * and the speaker is not the point-of-view display name.
 *
 * @param raw Untrusted key/value object for a single beat.
 * @returns The cleaned beat, or `null` when it has no narration, no speaker
 *          and no line.
 */
function coerceBeatPartial(raw: Record<string, unknown>): InsertBeatPartial | null {
  // Trimmed string, or undefined for non-strings and blank strings.
  const cleanText = (value: unknown): string | undefined =>
    typeof value === "string" ? value.trim() || undefined : undefined;

  const narration = cleanText(raw.narration);
  const rawSpeaker = cleanText(raw.speaker);
  const speaker = rawSpeaker ? normalizeSpeakerName(rawSpeaker) : undefined;
  const line = cleanText(raw.line);
  const lineDelivery =
    line && speaker !== POV_DISPLAY_NAME ? cleanText(raw.lineDelivery) : undefined;

  if (!narration && !speaker && !line) return null;
  return { narration, speaker, line, lineDelivery };
}
/**
 * Generates a short in-scene continuation for a freeform player action:
 * up to three beats and up to two follow-up choices, with no new image and
 * no new characters.
 *
 * The model reply is parsed leniently. The multi-beat shape
 * `{ beats: [...], choices: [...] }` is preferred; a reply without a
 * non-empty `beats` array is treated as a legacy single beat. Entries are
 * validated first and capped afterwards, so an invalid leading entry does
 * not crowd out a valid later one. If nothing usable remains, a single
 * fallback narration beat is returned.
 *
 * @param config         Provider configuration passed to `chat`.
 * @param session        Current session, used to build the user message.
 * @param freeformAction The player's freeform input.
 * @returns The beats (always at least one) and, when any are valid, choices.
 */
export async function directInsertBeat(
  config: ProviderConfig,
  session: Session,
  freeformAction: string,
): Promise<{ beats: InsertBeatPartial[]; choices?: { label: string; effect: string }[] }> {
  // Used whenever the reply contains no usable beat; a fresh object per call.
  const fallbackBeat = (): InsertBeatPartial => ({ narration: "(你停下脚步,环视片刻。)" });

  const raw = await chat(
    config,
    [
      { role: "system", content: INSERT_BEAT_SYSTEM },
      {
        role: "user",
        content: buildInsertBeatUserMessage(session, freeformAction),
      },
    ],
    { temperature: 0.9, tag: "insert-beat" },
  );
  const parsed = parseJsonLoose(raw);

  // New multi-beat format: { beats: [...], choices: [...] }
  if (Array.isArray(parsed.beats) && parsed.beats.length > 0) {
    const beats = parsed.beats
      // Skip null / non-object entries instead of crashing on them.
      .filter((b: unknown) => typeof b === "object" && b !== null)
      .map((b) => coerceBeatPartial(b as Record<string, unknown>))
      .filter((b): b is InsertBeatPartial => b !== null)
      .slice(0, 3);
    if (beats.length === 0) {
      beats.push(fallbackBeat());
    }
    const choices = Array.isArray(parsed.choices)
      ? parsed.choices
          .filter((c) => c && typeof c.label === "string" && c.label.trim() && typeof c.effect === "string" && c.effect.trim())
          .map((c) => ({ label: c.label.trim(), effect: c.effect.trim() }))
          .slice(0, 2)
      : undefined;
    return { beats, choices: choices?.length ? choices : undefined };
  }

  // Legacy single-beat fallback
  const single = coerceBeatPartial(parsed as Record<string, unknown>);
  return { beats: [single ?? fallbackBeat()] };
}