Files
infiplot-web/lib/engine/director.ts
T
DESKTOP-I1T6TF3\Q 37c911f510 chore(engine): log prompt-cache hit/miss per chat call
Add a `tag` option to chat() and have it print one `[cache] <tag>
hit=X miss=Y rate=Z%` line per call. Three Usage-shape variants are
probed in order so the same logger works across providers:

  - DeepSeek (v3+):  usage.prompt_cache_hit_tokens / *_miss_tokens
  - OpenAI / o-series: usage.prompt_tokens_details.cached_tokens
  - Anthropic:        usage.cache_read_input_tokens / *_creation_*

When none of them are present (MiMo / local Ollama / others) we still
print prompt + completion totals so the cost baseline is visible.

Tag every callsite so the log is greppable:
  architect / writer / character-designer / cinematographer / insert-beat

This is the prerequisite for the prefix-cache reordering work that
follows — without per-agent visibility there's no way to tell if a
prompt rearrangement actually moved the needle.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-03 10:42:33 +08:00

430 lines
17 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { chat } from "@infiplot/ai-client";
import type {
Character,
EngineConfig,
InsertBeatPartial,
ProviderConfig,
Scene,
Session,
StoryState,
StoryStatePatch,
} from "@infiplot/types";
import type { CharacterCard } from "./agents/characterDesigner";
import {
designCharacterCard,
provisionCharacterVoice,
provisionVoiceForName,
renderCharacterPortrait,
} from "./agents/characterDesigner";
import { runCinematographer } from "./agents/cinematographer";
import { runPainter } from "./agents/painter";
import {
collectActiveCharacterNames,
isPovName,
normalizeSpeakerName,
POV_DISPLAY_NAME,
runWriter,
} from "./agents/writer";
import { parseJsonLoose } from "./jsonParser";
import { INSERT_BEAT_SYSTEM, buildInsertBeatUserMessage } from "./prompts";
// ══════════════════════════════════════════════════════════════════════
// director.ts — multi-agent orchestrator for one full Scene generation.
//
// Critical path (per Scene call):
//
// Writer LLM (~3s, serial)
// │
// ├─ CharacterCard LLM × N (parallel per new char — TEXT only)
// ├─ Cinematographer LLM (parallel with the cards)
// │
// └─ wait for cards + cinema
// │
// ├─ entry-beat portraits ──┐ (block the Painter — its refs)
// ▼ │
// Painter — generateImage │ (overlapped, NOT on the paint path):
// with referenceImages ├─ non-entry-beat portraits
// │ └─ ALL voice provisioning + orphan voices
// ▼
// await the overlapped work, fold into the registry
// │
// ▼
// return { scene, sceneImageUrl, characters, storyState }
//
// Two deliberate decouplings unlock the parallelism:
// 1. The Cinematographer only POSITIONS named characters, so it needs no
// visualDescription and runs alongside the card LLMs.
// 2. The Painter only needs visualDescription TEXT (all on-stage) + the
// entry-beat characters' PORTRAITS (its referenceImages). Voices are
// never needed to paint, and non-entry portraits are never referenced —
// so both overlap the (longest) paint call instead of blocking it.
// ══════════════════════════════════════════════════════════════════════
// Builds a scene id of the form `scene_<epoch ms>_<random base-36 chars>`.
// The random part is taken from Math.random() and is at most 4 characters.
function newSceneId(): string {
  const createdAt = Date.now();
  const randomSuffix = Math.random().toString(36).slice(2, 6);
  return ["scene", createdAt, randomSuffix].join("_");
}
// Logs "<label>: <elapsed>ms", where elapsed is the wall-clock time since
// `t0` (a Date.now() timestamp captured by the caller).
function tlog(label: string, t0: number): void {
  const elapsedMs = Date.now() - t0;
  console.log(label + ": " + elapsedMs + "ms");
}
// Folds `updates` into the `existing` character registry, matching by name.
//
// - A name that is not in the registry yet is appended as-is.
// - A name that is already known is replaced by the update, except that
//   voice, visualDescription, basePortraitUrl and basePortraitUuid fall back
//   to the previous value when the update leaves them null/undefined, and
//   voiceDescription falls back when the update's value is empty. This keeps
//   a re-design (or a portrait-only / voice-only partial update) from
//   silently dropping resources that were already provisioned.
//
// Returns the `existing` array itself when there is nothing to merge;
// otherwise a new array in registry order (known characters first, in their
// original positions, then newcomers in update order).
export function mergeCharacters(
  existing: Character[],
  updates: Character[],
): Character[] {
  if (updates.length === 0) return existing;

  const registry = new Map<string, Character>();
  for (const known of existing) {
    registry.set(known.name, known);
  }

  for (const incoming of updates) {
    const previous = registry.get(incoming.name);
    const merged: Character = previous
      ? {
          ...incoming,
          // Carry over anything the update did not provide.
          voice: incoming.voice ?? previous.voice,
          visualDescription:
            incoming.visualDescription ?? previous.visualDescription,
          basePortraitUrl: incoming.basePortraitUrl ?? previous.basePortraitUrl,
          basePortraitUuid:
            incoming.basePortraitUuid ?? previous.basePortraitUuid,
          // `||` on purpose: callers pass "" as a "keep the old one" marker.
          voiceDescription:
            incoming.voiceDescription || previous.voiceDescription,
        }
      : incoming;
    registry.set(incoming.name, merged);
  }

  return [...registry.values()];
}
// Pick a reference to the prior scene image when sceneKey matches a prior
// scene — used by the Painter as one of the `referenceImages` (NOT as a
// seedImage, because FLUX.2 [klein] 9B KV does not support seedImage).
//
// Prefer URL over UUID for the same reason painter.collectReferenceImages
// does: the UUID returned by `imageInference` isn't always recognized by
// Runware's `referenceImages` pipeline, surfacing as `failedToTransferImage`.
// The URL is Runware's own CDN link — they can always fetch it. UUID is kept
// as a backstop.
//
// History is scanned newest-first. A matching scene that carries no usable
// reference is skipped in favour of an older match.
//
// Returns `{}` when `currentSceneKey` is empty/undefined or when no prior
// scene with that sceneKey has an image reference; otherwise both
// `priorSceneReference` and `priorSceneKey` are set.
function pickPriorSceneReference(
  session: Session,
  currentSceneKey: string | undefined,
): { priorSceneReference?: string; priorSceneKey?: string } {
  if (!currentSceneKey) return {};
  for (let i = session.history.length - 1; i >= 0; i--) {
    const prior = session.history[i]?.scene;
    if (!prior || prior.sceneKey !== currentSceneKey) continue;
    // `||` rather than `??`: an empty-string URL is not a usable reference,
    // so it must fall through to the UUID backstop instead of masking it.
    const ref = prior.imageUrl || prior.imageUuid;
    if (ref) {
      return { priorSceneReference: ref, priorSceneKey: prior.sceneKey };
    }
  }
  return {};
}
// Applies the Writer's story-memory patch on top of the carried StoryState.
//
// Only the volatile fields (synopsis, openThreads, relationships, nextHook)
// are ever taken from the patch, and each one only when the patch actually
// supplies a non-null value; every other field of the carried state is
// copied through unchanged.
//
// When there is no carried state (legacy session from before the Architect
// existed) an empty spine is used instead of throwing. When there is no
// patch, the carried state (or that empty spine) is returned as-is.
function applyStoryStatePatch(
  base: StoryState | undefined,
  patch: StoryStatePatch | undefined,
): StoryState {
  const emptySpine: StoryState = {
    logline: "",
    genreTags: "",
    protagonist: "",
    synopsis: "",
  };
  const carried: StoryState = base ?? emptySpine;
  if (!patch) return carried;

  const { synopsis, openThreads, relationships, nextHook } = patch;
  return {
    ...carried,
    synopsis: synopsis ?? carried.synopsis,
    openThreads: openThreads ?? carried.openThreads,
    relationships: relationships ?? carried.relationships,
    nextHook: nextHook ?? carried.nextHook,
  };
}
// Everything directScene hands back for one generated Scene.
export type SceneResult = {
// The assembled Scene (id, scenePrompt, beats, entry beat id, sceneKey and
// image fields).
scene: Scene;
// URL of the painted scene image; directScene fills this with the same
// value it stores in scene.imageUrl.
sceneImageUrl: string;
// Character registry after this scene's new cards, portraits and voices
// have been merged into the session's existing characters.
characters: Character[];
// Carried StoryState with the Writer's patch applied.
storyState: StoryState;
};
// ──────────────────────────────────────────────────────────────────────
// directScene — the multi-agent pipeline. Used by orchestrator's
// startSession and requestScene.
// ──────────────────────────────────────────────────────────────────────
// Generates one full Scene from the current session.
//
// Order of work: Writer → (character cards ∥ Cinematographer) → entry-beat
// portraits → Painter → fold in the remaining portraits and all voices.
// Portrait/voice/orphan-voice promises are STARTED before the Painter is
// awaited and only COLLECTED afterwards, which is what lets them overlap the
// paint call — do not reorder the kick-off and await statements.
//
// Parameters:
//   config  — engine configuration; `config.text` is passed to the Writer
//             and Cinematographer, the whole object to the other agents.
//   session — current session; read for characters, history, styleGuide,
//             worldSetting and storyState.
//
// Returns the new scene, its image URL, the merged character registry and
// the patched story state.
//
// Errors: only designCharacterCard failures are caught here (replaced by a
// fallback card). There is no other try/catch, so a rejection from any other
// awaited call propagates to the caller.
//
// NOTE(review): the rest-portrait, voice and orphan-voice promises have no
// rejection handler attached until the Promise.all after the Painter. If one
// of them can reject while the Painter (or the entry-portrait await) is still
// pending, or if an earlier await throws first, that rejection is unobserved
// at that moment — confirm these helpers never reject, or attach handlers
// at creation time.
export async function directScene(
config: EngineConfig,
session: Session,
): Promise<SceneResult> {
const tTotal = Date.now();
// Stage 1 — Writer (serial; everything downstream needs sceneSummary +
// beats[] to know who's on stage and what to compose around).
const tWriter = Date.now();
const writerOut = await runWriter(config.text, session);
tlog("[directScene] Writer", tWriter);
// Identify NEW characters introduced by this scene that need to be
// designed (LLM + portrait + voice). Existing characters in the registry
// are skipped — their cards / portraits / voices persist across scenes.
const allActiveNames = collectActiveCharacterNames(writerOut.beats);
const newCharNames = allActiveNames.filter(
(n) => !session.characters.some((c) => c.name === n),
);
// Find the entry beat for the Cinematographer (which characters are
// on-screen in the establishing shot).
// entryBeat is undefined when no beat carries entryBeatId; the code below
// then treats the entry beat as having no speaker and no active characters.
const entryBeat = writerOut.beats.find((b) => b.id === writerOut.entryBeatId);
const entryBeatActive = entryBeat?.activeCharacters ?? [];
// For sceneKey-based visual continuity, look up the prior matching scene's
// image to slot into Painter's referenceImages (max 4 of which include
// character portraits too).
const { priorSceneReference, priorSceneKey } = pickPriorSceneReference(
session,
writerOut.sceneKey,
);
// ── Stage 2 — character cards (LLM) ∥ Cinematographer ──────────────────
// Both are cheap LLM calls and neither needs the other's output, so they
// run concurrently. The cards give us each new character's visualDescription
// TEXT; portraits + voices are deferred to Stage 3 so they can overlap the
// paint instead of blocking it.
const tParallel = Date.now();
const cardPromises = newCharNames.map((name) =>
designCharacterCard(config, session, name).catch((err): CharacterCard => {
const msg = err instanceof Error ? err.message : String(err);
console.error(`[directScene] designCharacterCard(${name}) failed: ${msg}`);
// Last-resort fallback: a name + generic voice card so the speaker isn't
// unknown. No visualDescription → no portrait is attempted for them.
return {
name,
voiceDescription: `请根据角色名「${name}」推断其性别、年龄与气质。所属世界观:${session.worldSetting}`,
};
}),
);
const cinemaPromise = runCinematographer(config.text, {
sceneSummary: writerOut.sceneSummary,
styleGuide: session.styleGuide,
entryBeatActive,
entryBeatSpeaker: entryBeat?.speaker,
priorSceneKey,
currentSceneKey: writerOut.sceneKey,
});
// Card failures were already converted to fallback cards above, so the only
// rejection this await can surface is the Cinematographer's.
const [cards, cinemaOut] = await Promise.all([
Promise.all(cardPromises),
cinemaPromise,
]);
tlog("[directScene] CharacterCards+Cinematographer parallel", tParallel);
// Working registry: existing characters + new cards. visualDescription text
// is present now; portraits + voices fill in over the next two phases.
let characters = mergeCharacters(
session.characters,
cards.map((c) => ({
name: c.name,
voiceDescription: c.voiceDescription,
visualDescription: c.visualDescription,
})),
);
// ── Stage 3 — portraits + voices, scheduled around the Painter ─────────
const tProvision = Date.now();
// Entry-beat character names: the ONLY portraits the Painter references
// (collectReferenceImages slots in the entry beat's speaker + activeChars).
const entryNames = new Set<string>();
if (entryBeat?.speaker && !isPovName(entryBeat.speaker)) {
entryNames.add(entryBeat.speaker);
}
for (const c of entryBeatActive) {
if (!isPovName(c.name)) entryNames.add(c.name);
}
type NamedPortrait = {
name: string;
basePortraitUrl?: string;
basePortraitUuid?: string;
};
// Kick off portrait gen for every NEW char that has a visualDescription.
// Entry-beat portraits block the Painter; the rest overlap it.
const entryPortraitPromises: Promise<NamedPortrait>[] = [];
const restPortraitPromises: Promise<NamedPortrait>[] = [];
for (const card of cards) {
const vd = card.visualDescription;
if (!vd) continue;
const p = renderCharacterPortrait(
config,
card.name,
vd,
session.styleGuide,
).then((res): NamedPortrait => ({ name: card.name, ...res }));
(entryNames.has(card.name) ? entryPortraitPromises : restPortraitPromises).push(p);
}
// Kick off voice provisioning for every NEW char (never on the paint path).
const voicePromises = cards.map((card) =>
provisionCharacterVoice(config, card.voiceDescription, card.name).then(
(voice): Character => ({
name: card.name,
voiceDescription: card.voiceDescription,
voice,
}),
),
);
// Edge case: a speaker the Writer referenced without listing in any beat's
// activeCharacters. collectActiveCharacterNames already includes speakers,
// so this is a rare defensive net. Provision a voice only (never on-screen).
const speakerNames = new Set(
writerOut.beats.map((b) => b.speaker).filter((n): n is string => Boolean(n)),
);
const orphanSpeakers = [...speakerNames].filter(
// Pattern B: "你" (player) is a valid speaker but never gets a Character
// record — TTS is intentionally skipped on the client.
(n) =>
!isPovName(n) &&
!characters.some((c) => c.name === n) &&
!cards.some((c) => c.name === n),
);
const orphanPromises = orphanSpeakers.map((n) =>
provisionVoiceForName(config, session, n),
);
// Block the Painter ONLY on entry-beat portraits (its referenceImages).
const entryPortraits = await Promise.all(entryPortraitPromises);
characters = mergeCharacters(
characters,
entryPortraits.map((p) => ({
name: p.name,
voiceDescription: "", // preserved from the card by mergeCharacters
basePortraitUrl: p.basePortraitUrl,
basePortraitUuid: p.basePortraitUuid,
})),
);
// Measured from tProvision, so this also includes the synchronous kick-off
// of the rest-portrait, voice and orphan-voice promises above.
tlog("[directScene] entry-beat portraits", tProvision);
// ── Stage 4 — Painter (depends on cinemaOut + on-stage visual cards +
// entry portraits). On-stage = everyone named in any beat, so the archetype
// block covers anyone the player might encounter in this scene.
const onStageCharacters = characters.filter((c) =>
allActiveNames.includes(c.name),
);
const tPainter = Date.now();
const painted = await runPainter(
config,
{
integratedPrompt: cinemaOut.integratedPrompt,
styleGuide: session.styleGuide,
onStageCharacters,
priorSceneImage: priorSceneReference,
},
entryBeat,
);
tlog("[directScene] Painter", tPainter);
// Fold in the work that overlapped the paint: remaining portraits, all
// voices, and any orphan-speaker voices. Awaited before returning so the
// session the client persists is fully provisioned for later scenes.
const tOverlap = Date.now();
const [restPortraits, voicedChars, orphanChars] = await Promise.all([
Promise.all(restPortraitPromises),
Promise.all(voicePromises),
Promise.all(orphanPromises),
]);
// Merge order: portraits first, then voices, then orphan voices. Each merge
// keeps fields the update leaves unset, so the three passes accumulate.
characters = mergeCharacters(
characters,
restPortraits.map((p) => ({
name: p.name,
voiceDescription: "",
basePortraitUrl: p.basePortraitUrl,
basePortraitUuid: p.basePortraitUuid,
})),
);
characters = mergeCharacters(characters, voicedChars);
if (orphanChars.length > 0) {
characters = mergeCharacters(characters, orphanChars);
}
tlog("[directScene] overlapped portraits+voices", tOverlap);
const scene: Scene = {
id: newSceneId(),
// scenePrompt is the cinematographer's English compositional output;
// the Writer's sceneSummary stays in the session log via beats[]/
// history. Keeping the original field name preserves compat with
// anything that already reads scene.scenePrompt (e.g., insert-beat
// user prompt).
scenePrompt: cinemaOut.integratedPrompt,
beats: writerOut.beats,
entryBeatId: writerOut.entryBeatId,
sceneKey: writerOut.sceneKey,
// The UUID is only recorded when the Painter reports kind "real"; the URL
// is stored regardless of kind.
imageUuid: painted.kind === "real" ? painted.imageUuid : undefined,
imageUrl: painted.imageUrl,
};
// Merge the Writer's volatile memory rewrite onto the carried bible so the
// throughline survives the next scene cut (orchestrator returns it; the
// client persists it back into the session).
const storyState = applyStoryStatePatch(
session.storyState,
writerOut.storyStatePatch,
);
tlog("[directScene] TOTAL", tTotal);
return { scene, sceneImageUrl: painted.imageUrl, characters, storyState };
}
// ──────────────────────────────────────────────────────────────────────
// directInsertBeat — single-agent path for vision-driven in-scene
// exploration. Generates ONE transient beat with NO new image, NO new
// characters. Multi-agent pipeline doesn't apply here (no rendering, no
// character introduction allowed by the prompt).
// ──────────────────────────────────────────────────────────────────────
// Reduces one field of the parsed model output to a non-empty trimmed string.
// Anything that is not a string (missing key, null, number, object, array) or
// that is blank after trimming yields undefined, so malformed JSON from the
// model can never throw on `.trim()`.
function cleanInsertBeatText(value: unknown): string | undefined {
  if (typeof value !== "string") return undefined;
  const trimmed = value.trim();
  return trimmed === "" ? undefined : trimmed;
}

// Generates ONE transient insert beat for a freeform in-scene action.
//
// Parameters:
//   config         — provider configuration passed straight to chat().
//   session        — current session, used to build the user message.
//   freeformAction — the player's freeform action text.
//
// Returns the beat's narration / speaker / line / lineDelivery, each either a
// non-empty trimmed string or undefined. If the model produced none of
// narration, speaker or line (including when its JSON is not an object or the
// fields have the wrong type), a fixed fallback narration is returned instead.
//
// Errors from chat() or parseJsonLoose() are not caught here and propagate
// to the caller.
export async function directInsertBeat(
  config: ProviderConfig,
  session: Session,
  freeformAction: string,
): Promise<InsertBeatPartial> {
  const raw = await chat(
    config,
    [
      { role: "system", content: INSERT_BEAT_SYSTEM },
      {
        role: "user",
        content: buildInsertBeatUserMessage(session, freeformAction),
      },
    ],
    { temperature: 0.9, responseFormat: "json_object", tag: "insert-beat" },
  );

  // The generic on parseJsonLoose is only a compile-time claim; the model can
  // return anything, so treat the result as unknown and validate each field.
  const parsed: unknown = parseJsonLoose<InsertBeatPartial>(raw);
  const fields: Partial<Record<keyof InsertBeatPartial, unknown>> =
    typeof parsed === "object" && parsed !== null
      ? (parsed as Partial<Record<keyof InsertBeatPartial, unknown>>)
      : {};

  const narration = cleanInsertBeatText(fields.narration);
  const rawSpeaker = cleanInsertBeatText(fields.speaker);
  // Pattern B (mirrors Writer): normalize POV variants → "你"; NPCs pass through.
  const speaker = rawSpeaker ? normalizeSpeakerName(rawSpeaker) : undefined;
  const line = cleanInsertBeatText(fields.line);
  // lineDelivery is only meaningful for NPC speakers (TTS). For POV ("你")
  // TTS is intentionally skipped on the client, so lineDelivery is dropped.
  const lineDelivery =
    line && speaker !== POV_DISPLAY_NAME
      ? cleanInsertBeatText(fields.lineDelivery)
      : undefined;

  if (!narration && !speaker && !line) {
    return { narration: "(你停下脚步,环视片刻。)" };
  }
  return { narration, speaker, line, lineDelivery };
}