Files
infiplot-web/lib/engine/director.ts
T
Zonghao Yuan 0e4c2ebef4 feat(engine): merge cloudflare-migration — paradigm D engine, BYOK proxy, story persistence (#95)
Squash-merge the cloudflare-migration branch (7 commits by Kai ki) into
staging with conflict resolution, feature integration, and bug fixes.

Engine:
- Paradigm D: single-stream Writer replacing dual-phase Plan/Beats
- Delete Architect agent; story bible generated via Writer <plan> tag
- Modular prompt architecture (segments/registry/builder)
- StreamRouter for tagged stream splitting (<plan>/<story>/<choices>)

Infrastructure:
- Cloudflare Workers deployment (wrangler.jsonc, OpenNext adapter)
- D1 database schema + Drizzle ORM (scaffolded, not yet active)
- R2 storage helpers (scaffolded, not yet active)
- Story persistence API routes + client-side persistence

BYOK (Bring Your Own Key):
- /api/llm/user-proxy with SSRF-protected LLM proxy (+ requireUser auth)
- CORS-aware fetch in ai-client: auto-detect CORS failure, fallback to
  server proxy transparently via OpenAI SDK custom fetch
- BYO config support added to classify-freeform and vision routes
- SettingsModal CORS privacy notice (keys never logged/stored)

SSE streaming:
- engineClient.ts: fetchSSE helper for progressive scene events
- startSession/requestScene accept optional emit callback
- Fix SSE error event field name (error → message) in scene/start routes

i18n integration:
- Wire buildLanguageDirective into paradigm D's prompt builder
- Update corsNotice i18n keys (zh-CN/en/ja) with CORS proxy privacy text
- Preserve Session.language + LanguageSwitcher from i18n commit

Co-authored-by: Kai ki <155355644+zbf1009@users.noreply.github.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-06-18 18:05:38 +08:00

607 lines
24 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { chat } from "@infiplot/ai-client";
import { coerceOrientation } from "@infiplot/types";
import type {
Beat,
BeatChoice,
Character,
CharacterIntent,
EngineConfig,
InsertBeatPartial,
ProviderConfig,
Scene,
SceneStreamEvent,
Session,
StoryState,
StoryStatePatch,
WriterScenePlan,
} from "@infiplot/types";
import type { CharacterCard } from "./agents/characterDesigner";
import {
designCharacterCard,
provisionCharacterVoice,
provisionVoiceForName,
renderCharacterPortrait,
} from "./agents/characterDesigner";
import { runCinematographer } from "./agents/cinematographer";
import { runPainter } from "./agents/painter";
import type { WriterBeatsOutput } from "./agents/writer";
import {
coercePlanFromRaw,
isPovName,
normalizeSpeakerName,
POV_DISPLAY_NAME,
runWriterStream,
} from "./agents/writer";
import { routeTaggedStream } from "./stream";
import { splitProseToBeats } from "./stream/proseSplitter";
import { parseJsonLoose } from "./jsonParser";
import { INSERT_BEAT_SYSTEM, buildInsertBeatUserMessage } from "./prompts";
// ══════════════════════════════════════════════════════════════════════
// director.ts — multi-agent orchestrator for one full Scene generation.
//
// Critical path (per Scene call, paradigm D — see directScene):
//
//   Writer stream — ONE LLM call emitting <plan> → <story> → <choices>
//     │
//     │  </plan> closes (early)
//     ├─────────────────────────────┐
//     ▼                             ▼
//   <story> keeps streaming       image pipeline (concurrent):
//   (prose → beats once the         CharacterCard LLM × N ∥ Cinematographer
//    story segment closes)          → entry-beat portraits (block Painter)
//     │                             → Painter (generateImage w/ refs)
//     │                             → await overlapped: rest portraits+voices
//     └────────► await stream routing ◄────┘
//                      ▼
//   assemble Scene → { scene, sceneImageUrl, characters, storyState }
//
// Why the pipeline gates on the plan only (the latency win): the image
// pipeline only needs the scene SUMMARY + entry roster + cast (the <plan>) —
// NOT the dialogue (the <story>). Writing beats used to sit serially in FRONT
// of the image; now it overlaps it, so the floor is max(story, image) instead
// of story + image. (Earlier revisions achieved this with two Writer calls,
// "Phase A" plan + "Phase B" beats; the single tagged stream replaced them.)
//
// The decouplings that unlock the rest of the parallelism:
// 1. The Cinematographer only POSITIONS named characters, so it needs no
//    visualDescription and runs alongside the card LLMs.
// 2. The Painter only needs visualDescription TEXT (all on-stage) + the
//    entry-beat characters' PORTRAITS (its referenceImages). Voices are
//    never needed to paint, and non-entry portraits are never referenced —
//    so both overlap the (longest) paint call instead of blocking it.
// ══════════════════════════════════════════════════════════════════════
// Build a scene id of the form `scene_<epoch ms>_<random base-36 suffix>`.
// The suffix is up to four characters taken from Math.random(), so ids made
// within the same millisecond are still very unlikely to collide.
function newSceneId(): string {
  const createdAt = Date.now();
  const suffix = Math.random().toString(36).slice(2, 6);
  return ["scene", createdAt, suffix].join("_");
}
// Timing log helper: prints "<label>: <elapsed>ms", where elapsed is the
// wall-clock time since `t0` (a Date.now() timestamp taken by the caller).
function tlog(label: string, t0: number): void {
  const elapsedMs = Date.now() - t0;
  console.log(label + ": " + elapsedMs + "ms");
}
/**
 * Merge freshly-designed / freshly-provisioned characters into a registry.
 *
 * Characters are matched by `name`. A name that is not in `existing` is
 * appended as-is. For a known name the update wins field-by-field, but any
 * previously provisioned resource that the update omitted is carried over,
 * so a partial update (e.g. one that only carries a portrait) can never
 * silently drop a voice, a voice-catalog pick, or an already-rendered
 * portrait.
 *
 * @param existing Current registry. Never mutated.
 * @param updates  New or partial characters to fold in, applied in order.
 * @returns `existing` itself when `updates` is empty; otherwise a new array
 *          in registry order with brand-new names appended.
 */
export function mergeCharacters(
  existing: Character[],
  updates: Character[],
): Character[] {
  if (updates.length === 0) return existing;
  const byName = new Map<string, Character>(
    existing.map((c): [string, Character] => [c.name, c]),
  );
  for (const u of updates) {
    const prev = byName.get(u.name);
    if (!prev) {
      byName.set(u.name, u);
      continue;
    }
    // Preserve any prior provisioned resource that the new design omitted.
    byName.set(u.name, {
      ...u,
      voice: u.voice ?? prev.voice,
      // The catalog pick travels with the voice: keep it when a later partial
      // update (portrait-only, for instance) does not restate it.
      stepfunVoiceId: u.stepfunVoiceId ?? prev.stepfunVoiceId,
      visualDescription: u.visualDescription ?? prev.visualDescription,
      basePortraitUrl: u.basePortraitUrl ?? prev.basePortraitUrl,
      basePortraitUuid: u.basePortraitUuid ?? prev.basePortraitUuid,
      // `||` on purpose: partial updates pass voiceDescription as "" and that
      // placeholder must not overwrite a real description.
      voiceDescription: u.voiceDescription || prev.voiceDescription,
      // Paradigm D: preserve persona fields when later designs omit them
      // (same logic as portrait/voice preservation).
      persona: u.persona ?? prev.persona,
      personalityTraits: u.personalityTraits ?? prev.personalityTraits,
      speakingStyle: u.speakingStyle ?? prev.speakingStyle,
      sampleDialogue: u.sampleDialogue ?? prev.sampleDialogue,
      relationshipToPlayer: u.relationshipToPlayer ?? prev.relationshipToPlayer,
      secrets: u.secrets ?? prev.secrets,
    });
  }
  return Array.from(byName.values());
}
// Find an image of the most recent earlier scene that shares `currentSceneKey`.
// The Painter receives it as one of its `referenceImages` — NOT as a
// seedImage, because FLUX.2 [klein] 9B KV does not support seedImage.
//
// The URL is preferred over the UUID, for the same reason as in
// painter.collectReferenceImages: the UUID returned by `imageInference` isn't
// always recognized by Runware's `referenceImages` pipeline and surfaces as
// `failedToTransferImage`, whereas the URL is Runware's own CDN link and can
// always be fetched. The UUID is only a backstop.
//
// History is walked newest → oldest; a matching scene that has no usable image
// reference is skipped in favour of an older match. Returns an empty object
// when there is no scene key or no prior scene yields a reference.
function pickPriorSceneReference(
  session: Session,
  currentSceneKey: string | undefined,
): { priorSceneReference?: string; priorSceneKey?: string } {
  if (!currentSceneKey) return {};
  const newestFirst = [...session.history].reverse();
  for (const entry of newestFirst) {
    const candidate = entry!.scene;
    if (candidate.sceneKey !== currentSceneKey) continue;
    const reference = candidate.imageUrl ?? candidate.imageUuid;
    if (!reference) continue;
    return { priorSceneReference: reference, priorSceneKey: candidate.sceneKey };
  }
  return {};
}
// Overlay the Writer's volatile story-memory patch on the carried StoryState.
//
// The stable spine (logline / genreTags / protagonist / castNotes) always
// survives untouched. Only the volatile fields — synopsis, openThreads,
// relationships, nextHook — can be rewritten, and each one only when the patch
// actually supplies a value for it. When no state was carried at all (legacy
// session from before the Architect existed) the merge starts from an empty
// spine rather than throwing. Without a patch the starting state is returned
// unchanged.
function applyStoryStatePatch(
  base: StoryState | undefined,
  patch: StoryStatePatch | undefined,
): StoryState {
  const carried: StoryState = base ?? {
    logline: "",
    genreTags: "",
    protagonist: "",
    synopsis: "",
  };
  if (!patch) return carried;

  const { synopsis, openThreads, relationships, nextHook } = patch;
  return {
    ...carried,
    synopsis: synopsis ?? carried.synopsis,
    openThreads: openThreads ?? carried.openThreads,
    relationships: relationships ?? carried.relationships,
    nextHook: nextHook ?? carried.nextHook,
  };
}
// Everything directScene hands back to its caller for one generated scene.
export type SceneResult = {
// The assembled scene: beats, entry beat id, scene key, image refs, orientation.
scene: Scene;
// URL of the painted scene image (directScene sets it to the same value as
// scene.imageUrl).
sceneImageUrl: string;
// Character registry after merging this scene's new designs, portraits and
// voices into the session's existing characters.
characters: Character[];
// Story memory after applying this scene's Writer patch to the carried (or
// story-bible-seeded) state.
storyState: StoryState;
};
// Last-resort plan used when the Writer stream yielded no usable <plan> at
// all (StreamRouter degraded and nothing could be extracted). It names no
// cast and no entry roster, and points the entry at beat "b1", which is just
// enough to keep the rest of the pipeline running.
function minimalFallbackPlan(): WriterScenePlan {
  const emptyPlan: WriterScenePlan = {
    sceneSummary: "未指定场景概要",
    sceneKey: undefined,
    entryBeatId: "b1",
    cast: [],
    entryActiveCharacters: [],
    entrySpeaker: undefined,
  };
  return emptyPlan;
}
// ──────────────────────────────────────────────────────────────────────
// directScene — the multi-agent pipeline. Used by orchestrator's
// startSession and requestScene.
//
// Parameters:
//   config  — engine configuration; `config.text` drives the Writer and the
//             Cinematographer, the full config is handed to the character
//             designer, voice provisioning and the Painter.
//   session — current session (characters, history, styleGuide, storyState…).
//   emit    — optional progress callback. Events emitted here, as they become
//             available: "plan", "beat", "voice", "background", "choices".
//
// Returns the assembled scene, its image URL, the merged character registry
// and the updated story state.
//
// Rejects when the Writer stream fails, when any non-guarded agent call
// fails, or when the stream yields no beats at all (nothing to play).
// ──────────────────────────────────────────────────────────────────────
export async function directScene(
  config: EngineConfig,
  session: Session,
  emit?: (event: SceneStreamEvent) => void,
): Promise<SceneResult> {
  const tTotal = Date.now();

  // ══════════════════════════════════════════════════════════════════════
  // Paradigm D — single Writer stream + StreamRouter dispatch
  //
  // One LLM call produces <plan> → <story> → <choices>. StreamRouter
  // cuts the tags; </plan> closure resolves the plan deferred, unlocking
  // the downstream image pipeline IN PARALLEL with the still-streaming
  // <story>. Prose is split into Beat[] after routing completes.
  // ══════════════════════════════════════════════════════════════════════

  // ── Step 1 — kick off the Writer stream + routing ─────────────────
  const tStream = Date.now();
  const writerResult = runWriterStream(config.text, session);

  // Deferred that settles when onPlan fires (or when routing completes
  // without a plan — degraded fallback). It REJECTS when routing fails before
  // any plan was seen, so the await in Step 2 can never hang.
  let planSettled = false;
  let resolvePlan!: (p: WriterScenePlan) => void;
  let rejectPlan!: (reason: unknown) => void;
  const planPromise = new Promise<WriterScenePlan>((res, rej) => {
    resolvePlan = res;
    rejectPlan = rej;
  });

  // Closure-captured coerced plan so onStoryComplete can split+emit beats
  // DURING streaming (before painter finishes → text-first progressive play).
  let coercedPlanRef: WriterScenePlan | undefined;
  let earlyBeatsOut: WriterBeatsOutput | undefined;
  // Opening-scene story bible from the Writer's <plan> (replaces the old
  // Architect). Undefined on subsequent scenes (carried StoryState wins).
  let bibleFromPlan: WriterScenePlan["storyBible"];

  const routingPromise = routeTaggedStream(writerResult.textStream, {
    onPlan: (rawPlan) => {
      try {
        const coerced = coercePlanFromRaw(rawPlan as unknown as Record<string, unknown>);
        coercedPlanRef = coerced;
        if (coerced.storyBible) bibleFromPlan = coerced.storyBible;
        planSettled = true;
        emit?.({ type: "plan", plan: coerced });
        resolvePlan(coerced);
      } catch (err) {
        console.warn("[directScene] onPlan failed — using minimal fallback plan", err);
        planSettled = true;
        resolvePlan(minimalFallbackPlan());
      }
    },
    onStoryComplete: (rawStory) => {
      // Tags are ordered (plan before story), so the plan is already coerced.
      const p = coercedPlanRef ?? minimalFallbackPlan();
      try {
        const out = splitProseToBeats(rawStory, p);
        earlyBeatsOut = out;
        for (const b of out.beats) emit?.({ type: "beat", beat: b });
      } catch (err) {
        // split failure → Step 6 re-splits from rawStorySegment
        console.warn("[directScene] early beat split failed — will re-split after routing", err);
      }
    },
  }).then((result) => {
    // If plan never fired (stream error / no plan tag), settle the deferred
    // from the degraded extraction or a minimal fallback.
    if (!planSettled) {
      let extracted: WriterScenePlan;
      try {
        extracted = result.plan
          ? coercePlanFromRaw(result.plan as unknown as Record<string, unknown>)
          : minimalFallbackPlan();
      } catch (err) {
        // A throwing coercion here used to leave the deferred pending forever.
        console.warn("[directScene] degraded plan coercion failed — using minimal fallback plan", err);
        extracted = minimalFallbackPlan();
      }
      if (extracted.storyBible) bibleFromPlan = extracted.storyBible;
      planSettled = true;
      resolvePlan(extracted);
    }
    return result;
  });

  // Routing is only awaited in Step 6. Attaching a handler now (a) propagates
  // a pre-plan stream failure to planPromise instead of hanging Step 2, and
  // (b) keeps a mid-story failure from surfacing as an unhandled rejection
  // while the image pipeline runs. The Step 6 await still rethrows the error.
  routingPromise.catch((err: unknown) => {
    if (planSettled) return;
    planSettled = true;
    rejectPlan(err);
  });

  // ── Step 2 — await plan (settles at </plan> close — EARLY) ────────
  const plan = await planPromise;
  tlog("[directScene] plan (stream → </plan>)", tStream);

  // From here the pipeline is structurally identical to the old Phase A
  // flow: plan drives character design + cinematographer + painter, all
  // overlapping with the Writer's still-streaming <story>.
  const newCharNames = plan.cast.filter(
    (n) => !session.characters.some((c) => c.name === n),
  );
  const entryBeatActive = plan.entryActiveCharacters;
  const entryBeatSpeaker = plan.entrySpeaker;
  // Synthetic stand-in for the entry beat: the real beats may not exist yet,
  // and the Painter only needs the entry roster.
  const entryBeatForPaint: Beat = {
    id: plan.entryBeatId,
    speaker: entryBeatSpeaker,
    activeCharacters: entryBeatActive.length > 0 ? entryBeatActive : undefined,
    next: { type: "continue", nextBeatId: plan.entryBeatId },
  };
  const { priorSceneReference, priorSceneKey } = pickPriorSceneReference(
    session,
    plan.sceneKey,
  );

  // ── Step 3 — character cards (LLM) ∥ Cinematographer (parallel) ───
  // CharacterDesigner now receives the Writer's intent for each character
  // (paradigm D: media translator, not inventor).
  const tParallel = Date.now();
  const findIntent = (name: string): CharacterIntent | undefined =>
    plan.characterIntents?.find((ci) => ci.name === name);
  const cardPromises = newCharNames.map((name) =>
    designCharacterCard(config, session, name, findIntent(name)).catch(
      (err): CharacterCard => {
        // A failed card degrades to a name-only card (no visualDescription →
        // no portrait) instead of failing the whole scene.
        const msg = err instanceof Error ? err.message : String(err);
        console.error(`[directScene] designCharacterCard(${name}) failed: ${msg}`);
        return {
          name,
          voiceDescription: `请根据角色名「${name}」推断其性别、年龄与气质。所属世界观:${session.worldSetting}`,
        };
      },
    ),
  );
  const cinemaPromise = runCinematographer(config.text, {
    sceneSummary: plan.sceneSummary,
    styleGuide: session.styleGuide,
    entryBeatActive,
    entryBeatSpeaker,
    priorSceneKey,
    currentSceneKey: plan.sceneKey,
  });
  const [cards, cinemaOut] = await Promise.all([
    Promise.all(cardPromises),
    cinemaPromise,
  ]);
  tlog("[directScene] CharacterCards+Cinematographer parallel", tParallel);

  let characters = mergeCharacters(
    session.characters,
    cards.map((c) => ({
      name: c.name,
      voiceDescription: c.voiceDescription,
      visualDescription: c.visualDescription,
    })),
  );

  // ── Step 4 — portraits + voices, scheduled around Painter ─────────
  const tProvision = Date.now();
  // Non-POV characters present in the entry beat: only their portraits are
  // referenced by the Painter, so only those block the paint call.
  const entryNames = new Set<string>();
  if (entryBeatSpeaker && !isPovName(entryBeatSpeaker)) {
    entryNames.add(entryBeatSpeaker);
  }
  for (const c of entryBeatActive) {
    if (!isPovName(c.name)) entryNames.add(c.name);
  }

  type NamedPortrait = {
    name: string;
    basePortraitUrl?: string;
    basePortraitUuid?: string;
  };
  const entryPortraitPromises: Promise<NamedPortrait>[] = [];
  const restPortraitPromises: Promise<NamedPortrait>[] = [];
  for (const card of cards) {
    const vd = card.visualDescription;
    if (!vd) continue;
    const p = renderCharacterPortrait(
      config,
      card.name,
      vd,
      session.styleGuide,
    ).then((res): NamedPortrait => ({ name: card.name, ...res }));
    (entryNames.has(card.name) ? entryPortraitPromises : restPortraitPromises).push(p);
  }

  // Kick off voice provisioning for every NEW char (never on the paint path).
  // On the StepFun path, thread the LLM-selected stepfunVoiceId from the card
  // into provision — it lets stepfunProvision honor the catalog pick instead
  // of falling back to the keyword scorer (same network cost: still zero).
  const voicePromises = cards.map((card) =>
    provisionCharacterVoice(config, card.voiceDescription, card.name, {
      stepfunVoiceId: card.stepfunVoiceId,
    }).then(
      (voice): Character => {
        const result: Character = {
          name: card.name,
          voiceDescription: card.voiceDescription,
          voice,
          stepfunVoiceId: card.stepfunVoiceId,
        };
        if (voice) emit?.({ type: "voice", name: card.name, voice });
        return result;
      },
    ),
  );

  // Rest portraits and voices are only awaited AFTER the Painter. Attach
  // no-op handlers now so that a failure earlier on the paint path cannot
  // leave them behind as unhandled rejections; the Promise.all further down
  // still observes (and rethrows) any rejection.
  const overlappedWork: Promise<unknown>[] = [...restPortraitPromises, ...voicePromises];
  for (const pending of overlappedWork) {
    pending.catch(() => undefined);
  }

  const entryPortraits = await Promise.all(entryPortraitPromises);
  characters = mergeCharacters(
    characters,
    entryPortraits.map((p) => ({
      name: p.name,
      voiceDescription: "",
      basePortraitUrl: p.basePortraitUrl,
      basePortraitUuid: p.basePortraitUuid,
    })),
  );
  tlog("[directScene] entry-beat portraits", tProvision);

  // ── Step 5 — Painter ──────────────────────────────────────────────
  const onStageCharacters = characters.filter((c) => plan.cast.includes(c.name));
  const orientation = coerceOrientation(session.orientation);
  const tPainter = Date.now();
  const painted = await runPainter(
    config,
    {
      integratedPrompt: cinemaOut.integratedPrompt,
      styleGuide: session.styleGuide,
      onStageCharacters,
      priorSceneImage: priorSceneReference,
      styleReferenceImage: session.styleReferenceImage,
      orientation,
    },
    entryBeatForPaint,
  );
  tlog("[directScene] Painter", tPainter);

  // Emit background as soon as it's painted — the client can swap the
  // placeholder for the real scene image while beats/voices are still settling.
  emit?.({ type: "background", imageUrl: painted.imageUrl, sceneKey: plan.sceneKey });

  // Overlapped: rest portraits + voices
  const tOverlap = Date.now();
  const [restPortraits, voicedChars] = await Promise.all([
    Promise.all(restPortraitPromises),
    Promise.all(voicePromises),
  ]);
  characters = mergeCharacters(
    characters,
    restPortraits.map((p) => ({
      name: p.name,
      voiceDescription: "",
      basePortraitUrl: p.basePortraitUrl,
      basePortraitUuid: p.basePortraitUuid,
    })),
  );
  characters = mergeCharacters(characters, voicedChars);
  tlog("[directScene] overlapped portraits+voices", tOverlap);

  // ── Step 6 — await routing completion + split prose into beats ────
  // routeTaggedStream ran concurrently with the entire image pipeline.
  // onStoryComplete likely already fired (splitting + emitting beats for
  // progressive playback); this await retrieves the final result + rawStorySegment.
  const streamResult = await routingPromise;

  // Reuse early-split beats when available (onStoryComplete path); otherwise
  // split from rawStorySegment (degrade / onStoryComplete missed).
  const beatsOut: WriterBeatsOutput = earlyBeatsOut
    ?? splitProseToBeats(streamResult.rawStorySegment ?? "", plan);
  let beats = beatsOut.beats;

  // If earlyBeatsOut was missed but rawStorySegment is available, emit beats
  // now (late but still before done — the client gets them for rendering).
  if (!earlyBeatsOut && beats.length > 0) {
    for (const b of beats) emit?.({ type: "beat", beat: b });
  }

  // Emit choices (from streamResult or from the last beat's choice exits).
  if (streamResult.choices?.length) {
    emit?.({ type: "choices", choices: streamResult.choices });
  }

  // ── C1-ext: merge <choices> segment into the last beat's `next` ────
  // The Writer's <choices> segment produces scene-level exits that are NOT
  // embedded in the beats graph. Attach them to the final beat so the player
  // can actually pick them.
  //
  // IMPORTANT: Only change-scene exits are valid here. The prose paradigm
  // assigns beat ids automatically (b1, b2, ...) in proseSplitter — the LLM
  // has no knowledge of these ids, so any advance-beat targetBeatId it emits
  // in <choices> will point at the wrong beat, causing a loop.
  if (streamResult.choices?.length && beats.length > 0) {
    const validChoices = streamResult.choices.filter(
      (c): c is BeatChoice =>
        typeof c.label === "string" &&
        c.label.length > 0 &&
        c.effect != null &&
        c.effect.kind === "change-scene",
    );
    if (validChoices.length > 0) {
      const withIds = validChoices.map((c, i) => ({
        ...c,
        id: c.id || `sc${i + 1}`,
      }));
      const lastIdx = beats.length - 1;
      const last = beats[lastIdx]!;
      const existing =
        last.next.type === "choice" ? last.next.choices : [];
      // A lone "继续" choice is treated as a placeholder and is replaced by
      // the scene-level exits rather than merged with them.
      const isFallbackOnly =
        existing.length <= 1 &&
        existing.every((c) => c.label === "继续");
      const merged = isFallbackOnly ? withIds : [...existing, ...withIds];
      // De-duplicate by label, first occurrence wins.
      const seen = new Set<string>();
      const deduped = merged.filter((c) => {
        if (seen.has(c.label)) return false;
        seen.add(c.label);
        return true;
      });
      beats = beats.map((b, i) =>
        i === lastIdx
          ? { ...b, next: { type: "choice" as const, choices: deduped } }
          : b,
      );
    }
  }

  if (streamResult.degraded) {
    console.warn("[directScene] Writer stream was degraded — beats may be fallback");
  }

  // A scene needs at least one beat to enter. Fail with a descriptive error
  // instead of the opaque TypeError that `beats[0]!.id` raised on an empty list.
  const firstBeat = beats[0];
  if (!firstBeat) {
    throw new Error("[directScene] Writer stream produced no beats — cannot assemble a scene");
  }
  const entryBeatId = beats.some((b) => b.id === plan.entryBeatId)
    ? plan.entryBeatId
    : firstBeat.id;

  // Orphan-speaker voices (defensive net — should be rare).
  const orphanSpeakers = [
    ...new Set(beats.map((b) => b.speaker).filter((n): n is string => Boolean(n))),
  ].filter((n) => !isPovName(n) && !characters.some((c) => c.name === n));
  if (orphanSpeakers.length > 0) {
    const orphanChars = await Promise.all(
      orphanSpeakers.map((n) => provisionVoiceForName(config, session, n)),
    );
    characters = mergeCharacters(characters, orphanChars);
    // Emit orphan voices so the client can preload their audio.
    for (const oc of orphanChars) {
      if (oc.voice) emit?.({ type: "voice", name: oc.name, voice: oc.voice });
    }
  }

  const scene: Scene = {
    id: newSceneId(),
    scenePrompt: cinemaOut.integratedPrompt,
    beats,
    entryBeatId,
    sceneKey: plan.sceneKey,
    imageUuid: painted.kind === "real" ? painted.imageUuid : undefined,
    imageUrl: painted.imageUrl,
    orientation,
  };

  // storyState: opening scene seeds the stable spine from the Writer's
  // storyBible (replacing the old Architect); subsequent scenes carry the
  // existing spine. Volatile fields always come from this scene's patch.
  const baseStoryState: StoryState | undefined = session.storyState
    ?? (bibleFromPlan
      ? {
          logline: bibleFromPlan.logline,
          genreTags: bibleFromPlan.genreTags,
          protagonist: bibleFromPlan.protagonist,
          castNotes: bibleFromPlan.castNotes,
          synopsis: "",
        }
      : undefined);
  const storyState = applyStoryStatePatch(
    baseStoryState,
    beatsOut.storyStatePatch,
  );

  tlog("[directScene] TOTAL", tTotal);
  return { scene, sceneImageUrl: painted.imageUrl, characters, storyState };
}
// ──────────────────────────────────────────────────────────────────────
// directInsertBeat — single-agent path for vision-driven in-scene
// exploration. Generates ONE transient beat with NO new image, NO new
// characters. Multi-agent pipeline doesn't apply here (no rendering, no
// character introduction allowed by the prompt).
//
// Parameters:
//   config         — provider config for the single chat call.
//   session        — current session, used to build the user message.
//   freeformAction — the player's free-text action.
//
// Returns the beat fields the model produced (each trimmed; empty or
// non-string values are dropped). When the model produced nothing usable, a
// fixed narration-only beat is returned instead. Rejects if the chat call or
// the JSON parse fails.
// ──────────────────────────────────────────────────────────────────────
export async function directInsertBeat(
  config: ProviderConfig,
  session: Session,
  freeformAction: string,
): Promise<InsertBeatPartial> {
  const raw = await chat(
    config,
    [
      { role: "system", content: INSERT_BEAT_SYSTEM },
      {
        role: "user",
        content: buildInsertBeatUserMessage(session, freeformAction),
      },
    ],
    { temperature: 0.9, tag: "insert-beat" },
  );

  // The parsed value is model output, so its shape is not guaranteed: it may
  // be null / a primitive, and any field may be a number, object or array.
  // Treat everything that is not a string as absent instead of letting
  // `.trim()` throw on it.
  const parsed: unknown = parseJsonLoose<InsertBeatPartial>(raw);
  const fields: Partial<Record<keyof InsertBeatPartial, unknown>> =
    typeof parsed === "object" && parsed !== null
      ? (parsed as Partial<Record<keyof InsertBeatPartial, unknown>>)
      : {};
  const cleanText = (value: unknown): string | undefined =>
    typeof value === "string" ? value.trim() || undefined : undefined;

  const narration = cleanText(fields.narration);
  const rawSpeaker = cleanText(fields.speaker);
  // Pattern B (mirrors Writer): normalize POV variants → "你"; NPCs pass through.
  const speaker = rawSpeaker ? normalizeSpeakerName(rawSpeaker) : undefined;
  const line = cleanText(fields.line);
  // lineDelivery is only meaningful for NPC speakers (TTS). For POV ("你")
  // TTS is intentionally skipped on the client, so lineDelivery is dropped.
  const lineDelivery =
    line && speaker !== POV_DISPLAY_NAME
      ? cleanText(fields.lineDelivery)
      : undefined;

  if (!narration && !speaker && !line) {
    return { narration: "(你停下脚步,环视片刻。)" };
  }
  return { narration, speaker, line, lineDelivery };
}