feat(engine): merge cloudflare-migration — paradigm D engine, BYOK proxy, story persistence (#95)

Squash-merge the cloudflare-migration branch (7 commits by Kai ki) into
staging with conflict resolution, feature integration, and bug fixes.

Engine:
- Paradigm D: single-stream Writer replacing dual-phase Plan/Beats
- Delete Architect agent; story bible generated via Writer <plan> tag
- Modular prompt architecture (segments/registry/builder)
- StreamRouter for tagged stream splitting (<plan>/<story>/<choices>)

Infrastructure:
- Cloudflare Workers deployment (wrangler.jsonc, OpenNext adapter)
- D1 database schema + Drizzle ORM (scaffolded, not yet active)
- R2 storage helpers (scaffolded, not yet active)
- Story persistence API routes + client-side persistence

BYOK (Bring Your Own Key):
- /api/llm/user-proxy with SSRF-protected LLM proxy (+ requireUser auth)
- CORS-aware fetch in ai-client: auto-detect CORS failure, fallback to
  server proxy transparently via OpenAI SDK custom fetch
- BYO config support added to classify-freeform and vision routes
- SettingsModal CORS privacy notice (keys never logged/stored)

SSE streaming:
- engineClient.ts: fetchSSE helper for progressive scene events
- startSession/requestScene accept optional emit callback
- Fix SSE error event field name (error → message) in scene/start routes

i18n integration:
- Wire buildLanguageDirective into paradigm D's prompt builder
- Update corsNotice i18n keys (zh-CN/en/ja) with CORS proxy privacy text
- Preserve Session.language + LanguageSwitcher from i18n commit

Co-authored-by: Kai ki <155355644+zbf1009@users.noreply.github.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Zonghao Yuan
2026-06-18 18:05:38 +08:00
committed by GitHub
parent 05bd7e229c
commit 0e4c2ebef4
78 changed files with 7396 additions and 919 deletions
+227 -100
View File
@@ -2,15 +2,18 @@ import { chat } from "@infiplot/ai-client";
import { coerceOrientation } from "@infiplot/types";
import type {
Beat,
BeatChoice,
Character,
CharacterIntent,
EngineConfig,
InsertBeatPartial,
ProviderConfig,
Scene,
SceneStreamEvent,
Session,
StoryState,
StoryStatePatch,
WriterPlan,
WriterScenePlan,
} from "@infiplot/types";
import type { CharacterCard } from "./agents/characterDesigner";
import {
@@ -23,13 +26,14 @@ import { runCinematographer } from "./agents/cinematographer";
import { runPainter } from "./agents/painter";
import type { WriterBeatsOutput } from "./agents/writer";
import {
coercePlanFromRaw,
isPovName,
normalizeSpeakerName,
POV_DISPLAY_NAME,
runWriterBeats,
runWriterPlan,
synthesizeFallbackBeats,
runWriterStream,
} from "./agents/writer";
import { routeTaggedStream } from "./stream";
import { splitProseToBeats } from "./stream/proseSplitter";
import { parseJsonLoose } from "./jsonParser";
import { INSERT_BEAT_SYSTEM, buildInsertBeatUserMessage } from "./prompts";
@@ -97,6 +101,14 @@ export function mergeCharacters(
basePortraitUrl: u.basePortraitUrl ?? prev.basePortraitUrl,
basePortraitUuid: u.basePortraitUuid ?? prev.basePortraitUuid,
voiceDescription: u.voiceDescription || prev.voiceDescription,
// Paradigm D: preserve persona fields when later designs omit them
// (same logic as portrait/voice preservation).
persona: u.persona ?? prev.persona,
personalityTraits: u.personalityTraits ?? prev.personalityTraits,
speakingStyle: u.speakingStyle ?? prev.speakingStyle,
sampleDialogue: u.sampleDialogue ?? prev.sampleDialogue,
relationshipToPlayer: u.relationshipToPlayer ?? prev.relationshipToPlayer,
secrets: u.secrets ?? prev.secrets,
});
}
return Array.from(byName.values());
@@ -157,6 +169,19 @@ export type SceneResult = {
storyState: StoryState;
};
/**
 * Builds the last-resort scene plan used when the Writer stream yielded no
 * usable <plan> at all (the router degraded and nothing could be extracted).
 *
 * The result is intentionally empty: a placeholder summary, no cast, no entry
 * roster, and an entry beat id of "b1". Its only job is to let the rest of
 * the pipeline keep running instead of stalling on a missing plan.
 *
 * A fresh object (with fresh arrays) is produced on every call so callers can
 * mutate the result without affecting later fallbacks.
 *
 * @returns A bare-bones plan with no scene key and no entry speaker.
 */
function minimalFallbackPlan(): WriterScenePlan {
  // `sceneKey` / `entrySpeaker` are spelled out as `undefined` on purpose so
  // the returned object carries the same own keys, in the same order, as a
  // fully populated plan would.
  const placeholderPlan: WriterScenePlan = {
    sceneSummary: "未指定场景概要",
    sceneKey: undefined,
    entryBeatId: "b1",
    cast: [],
    entryActiveCharacters: [],
    entrySpeaker: undefined,
  };
  return placeholderPlan;
}
// ──────────────────────────────────────────────────────────────────────
// directScene — the multi-agent pipeline. Used by orchestrator's
// startSession and requestScene.
@@ -165,48 +190,89 @@ export type SceneResult = {
export async function directScene(
config: EngineConfig,
session: Session,
emit?: (event: SceneStreamEvent) => void,
): Promise<SceneResult> {
const tTotal = Date.now();
// ── Phase A — Writer PLAN (serial). The image pipeline needs the scene
// summary + entry roster + cast to start, but NOT the dialogue beats. This
// call is small (skeleton only), so it returns fast and unblocks everything.
const tPlan = Date.now();
const plan = await runWriterPlan(config.text, session);
tlog("[directScene] Phase A (plan)", tPlan);
// ══════════════════════════════════════════════════════════════════════
// Paradigm D — single Writer stream + StreamRouter dispatch
//
// One LLM call produces <plan> → <story> → <choices>. StreamRouter
// cuts the tags; </plan> closure resolves the plan deferred, unlocking
// the downstream image pipeline IN PARALLEL with the still-streaming
// <story>. Prose is split into Beat[] after routing completes.
// ══════════════════════════════════════════════════════════════════════
// ── Phase B — Writer BEATS, launched NOW so its (longer) output overlaps the
// ENTIRE image pipeline below. Only needed to assemble the final Scene, so we
// await it last. A failure degrades to a single playable beat from the plan.
const tBeats = Date.now();
const beatsPromise: Promise<WriterBeatsOutput> = runWriterBeats(
config.text,
session,
plan,
)
.then((out) => {
tlog("[directScene] Phase B (beats)", tBeats);
return out;
})
.catch((err): WriterBeatsOutput => {
const msg = err instanceof Error ? err.message : String(err);
console.error(
`[directScene] Phase B (beats) failed, using fallback: ${msg}`,
);
return { beats: synthesizeFallbackBeats(plan), storyStatePatch: undefined };
});
// ── Step 1 — kick off the Writer stream + routing ─────────────────
const tStream = Date.now();
const writerResult = runWriterStream(config.text, session);
// Deferred that settles when onPlan fires (or when routing completes
// without a plan — degraded fallback).
let planSettled = false;
let resolvePlan!: (p: WriterScenePlan) => void;
const planPromise = new Promise<WriterScenePlan>((res) => {
resolvePlan = res;
});
// Closure-captured coerced plan so onStoryComplete can split+emit beats
// DURING streaming (before painter finishes → text-first progressive play).
let coercedPlanRef: WriterScenePlan | undefined;
let earlyBeatsOut: WriterBeatsOutput | undefined;
// Opening-scene story bible from the Writer's <plan> (replaces the old
// Architect). Undefined on subsequent scenes (carried StoryState wins).
let bibleFromPlan: WriterScenePlan["storyBible"];
const routingPromise = routeTaggedStream(writerResult.textStream, {
onPlan: (rawPlan) => {
try {
const coerced = coercePlanFromRaw(rawPlan as unknown as Record<string, unknown>);
coercedPlanRef = coerced;
if (coerced.storyBible) bibleFromPlan = coerced.storyBible;
planSettled = true;
emit?.({ type: "plan", plan: coerced });
resolvePlan(coerced);
} catch {
planSettled = true;
resolvePlan(minimalFallbackPlan());
}
},
onStoryComplete: (rawStory) => {
// Tags are ordered (plan before story), so the plan is already coerced.
const p = coercedPlanRef ?? minimalFallbackPlan();
try {
const out = splitProseToBeats(rawStory, p);
earlyBeatsOut = out;
for (const b of out.beats) emit?.({ type: "beat", beat: b });
} catch {
// split failure → Step 6 re-splits from rawStorySegment
}
},
}).then((result) => {
// If plan never fired (stream error / no plan tag), settle the deferred
// from the degraded extraction or a minimal fallback.
if (!planSettled) {
const extracted = result.plan
? coercePlanFromRaw(result.plan as unknown as Record<string, unknown>)
: minimalFallbackPlan();
if (extracted.storyBible) bibleFromPlan = extracted.storyBible;
resolvePlan(extracted);
}
return result;
});
// ── Step 2 — await plan (settles at </plan> close — EARLY) ────────
const plan = await planPromise;
tlog("[directScene] plan (stream → </plan>)", tStream);
// From here the pipeline is structurally identical to the old Phase A
// flow: plan drives character design + cinematographer + painter, all
// overlapping with the Writer's still-streaming <story>.
// NEW characters to design come from the PLAN's cast (so design fires in
// parallel with Phase B, not after the beats are written). Existing
// characters keep their cards / portraits / voices across scenes.
const newCharNames = plan.cast.filter(
(n) => !session.characters.some((c) => c.name === n),
);
// Entry-beat composition is the PLAN's (Phase B is constrained to honor it).
// The Painter needs a Beat-shaped object for reference collection, but the
// real beat isn't written until Phase B — so synthesize one from the plan
// (collectReferenceImages only reads speaker + activeCharacters).
const entryBeatActive = plan.entryActiveCharacters;
const entryBeatSpeaker = plan.entrySpeaker;
const entryBeatForPaint: Beat = {
@@ -216,32 +282,30 @@ export async function directScene(
next: { type: "continue", nextBeatId: plan.entryBeatId },
};
// For sceneKey-based visual continuity, look up the prior matching scene's
// image to slot into Painter's referenceImages (max 4 of which include
// character portraits too).
const { priorSceneReference, priorSceneKey } = pickPriorSceneReference(
session,
plan.sceneKey,
);
// ── Stage 2 — character cards (LLM) ∥ Cinematographer ──────────────────
// Both are cheap LLM calls and neither needs the other's output, so they
// run concurrently. The cards give us each new character's visualDescription
// TEXT; portraits + voices are deferred to Stage 3 so they can overlap the
// paint instead of blocking it.
// ── Step 3 — character cards (LLM) ∥ Cinematographer (parallel) ───
// CharacterDesigner now receives the Writer's intent for each character
// (paradigm D: media translator, not inventor).
const tParallel = Date.now();
const findIntent = (name: string): CharacterIntent | undefined =>
plan.characterIntents?.find((ci) => ci.name === name);
const cardPromises = newCharNames.map((name) =>
designCharacterCard(config, session, name).catch((err): CharacterCard => {
const msg = err instanceof Error ? err.message : String(err);
console.error(`[directScene] designCharacterCard(${name}) failed: ${msg}`);
// Last-resort fallback: a name + generic voice card so the speaker isn't
// unknown. No visualDescription → no portrait is attempted for them.
return {
name,
voiceDescription: `请根据角色名「${name}」推断其性别、年龄与气质。所属世界观:${session.worldSetting}`,
};
}),
designCharacterCard(config, session, name, findIntent(name)).catch(
(err): CharacterCard => {
const msg = err instanceof Error ? err.message : String(err);
console.error(`[directScene] designCharacterCard(${name}) failed: ${msg}`);
return {
name,
voiceDescription: `请根据角色名「${name}」推断其性别、年龄与气质。所属世界观:${session.worldSetting}`,
};
},
),
);
const cinemaPromise = runCinematographer(config.text, {
@@ -259,8 +323,6 @@ export async function directScene(
]);
tlog("[directScene] CharacterCards+Cinematographer parallel", tParallel);
// Working registry: existing characters + new cards. visualDescription text
// is present now; portraits + voices fill in over the next two phases.
let characters = mergeCharacters(
session.characters,
cards.map((c) => ({
@@ -270,11 +332,9 @@ export async function directScene(
})),
);
// ── Stage 3 — portraits + voices, scheduled around the Painter ─────────
// ── Step 4 — portraits + voices, scheduled around Painter ─────────
const tProvision = Date.now();
// Entry-beat character names: the ONLY portraits the Painter references
// (collectReferenceImages slots in the entry beat's speaker + activeChars).
const entryNames = new Set<string>();
if (entryBeatSpeaker && !isPovName(entryBeatSpeaker)) {
entryNames.add(entryBeatSpeaker);
@@ -288,8 +348,6 @@ export async function directScene(
basePortraitUrl?: string;
basePortraitUuid?: string;
};
// Kick off portrait gen for every NEW char that has a visualDescription.
// Entry-beat portraits block the Painter; the rest overlap it.
const entryPortraitPromises: Promise<NamedPortrait>[] = [];
const restPortraitPromises: Promise<NamedPortrait>[] = [];
for (const card of cards) {
@@ -308,42 +366,37 @@ export async function directScene(
// On the StepFun path, thread the LLM-selected stepfunVoiceId from the card
// into provision — it lets stepfunProvision honor the catalog pick instead
// of falling back to the keyword scorer (same network cost: still zero).
// ALSO persist it onto the Character so the client can echo it back on a
// StepFun server (where it skips the ~220KB voice payload) and the server
// resolveVoice honors the LLM pick at synth time instead of re-scoring.
const voicePromises = cards.map((card) =>
provisionCharacterVoice(config, card.voiceDescription, card.name, {
stepfunVoiceId: card.stepfunVoiceId,
}).then(
(voice): Character => ({
name: card.name,
voiceDescription: card.voiceDescription,
voice,
stepfunVoiceId: card.stepfunVoiceId,
}),
(voice): Character => {
const result: Character = {
name: card.name,
voiceDescription: card.voiceDescription,
voice,
stepfunVoiceId: card.stepfunVoiceId,
};
if (voice) emit?.({ type: "voice", name: card.name, voice });
return result;
},
),
);
// Block the Painter ONLY on entry-beat portraits (its referenceImages).
const entryPortraits = await Promise.all(entryPortraitPromises);
characters = mergeCharacters(
characters,
entryPortraits.map((p) => ({
name: p.name,
voiceDescription: "", // preserved from the card by mergeCharacters
voiceDescription: "",
basePortraitUrl: p.basePortraitUrl,
basePortraitUuid: p.basePortraitUuid,
})),
);
tlog("[directScene] entry-beat portraits", tProvision);
// ── Stage 4 — Painter (depends on cinemaOut + on-stage visual cards +
// entry portraits). On-stage = the plan's cast (everyone who'll appear),
// filtered to those now in the registry, so the archetype block covers them.
// ── Step 5 — Painter ──────────────────────────────────────────────
const onStageCharacters = characters.filter((c) => plan.cast.includes(c.name));
// Session-locked orientation (set at session start). Threads into both the
// Painter prompt's framing rules and the generated image's pixel dimensions.
const orientation = coerceOrientation(session.orientation);
const tPainter = Date.now();
@@ -361,9 +414,11 @@ export async function directScene(
);
tlog("[directScene] Painter", tPainter);
// Fold in the work that overlapped the paint: remaining portraits + all
// voices. Awaited before returning so the session the client persists is
// fully provisioned for later scenes.
// Emit background as soon as it's painted — the client can swap the
// placeholder for the real scene image while beats/voices are still settling.
emit?.({ type: "background", imageUrl: painted.imageUrl, sceneKey: plan.sceneKey });
// Overlapped: rest portraits + voices
const tOverlap = Date.now();
const [restPortraits, voicedChars] = await Promise.all([
Promise.all(restPortraitPromises),
@@ -381,20 +436,82 @@ export async function directScene(
characters = mergeCharacters(characters, voicedChars);
tlog("[directScene] overlapped portraits+voices", tOverlap);
// ── Await Phase B — it overlapped the whole image pipeline above. ──────
const beatsOut = await beatsPromise;
const beats = beatsOut.beats;
// ── Step 6 — await routing completion + split prose into beats ────
// routeTaggedStream ran concurrently with the entire image pipeline.
// onStoryComplete likely already fired (splitting + emitting beats for
// progressive playback); this await retrieves the final result + rawStorySegment.
const streamResult = await routingPromise;
// Reuse early-split beats when available (onStoryComplete path); otherwise
// split from rawStorySegment (degrade / onStoryComplete missed).
const beatsOut: WriterBeatsOutput = earlyBeatsOut
?? splitProseToBeats(streamResult.rawStorySegment ?? "", plan);
let beats = beatsOut.beats;
// If earlyBeatsOut was missed but rawStorySegment is available, emit beats
// now (late but still before done — the client gets them for rendering).
if (!earlyBeatsOut && beats.length > 0) {
for (const b of beats) emit?.({ type: "beat", beat: b });
}
// Emit choices when the stream result carries any (parsed from the Writer's <choices> segment).
if (streamResult.choices?.length) {
emit?.({ type: "choices", choices: streamResult.choices });
}
// ── C1-ext: merge <choices> segment into the last beat's `next` ────
// The Writer's <choices> segment produces scene-level exits that are NOT
// embedded in the beats graph. Attach them to the final beat so the player
// can actually pick them.
//
// IMPORTANT: Only change-scene exits are valid here. The prose paradigm
// assigns beat ids automatically (b1, b2, ...) in proseSplitter — the LLM
// has no knowledge of these ids, so any advance-beat targetBeatId it emits
// in <choices> will point at the wrong beat, causing a loop.
if (streamResult.choices?.length && beats.length > 0) {
const validChoices = streamResult.choices.filter(
(c): c is BeatChoice =>
typeof c.label === "string" &&
c.label.length > 0 &&
c.effect != null &&
c.effect.kind === "change-scene",
);
if (validChoices.length > 0) {
const withIds = validChoices.map((c, i) => ({
...c,
id: c.id || `sc${i + 1}`,
}));
const lastIdx = beats.length - 1;
const last = beats[lastIdx]!;
const existing =
last.next.type === "choice" ? last.next.choices : [];
const isFallbackOnly =
existing.length <= 1 &&
existing.every((c) => c.label === "继续");
const merged = isFallbackOnly ? withIds : [...existing, ...withIds];
const seen = new Set<string>();
const deduped = merged.filter((c) => {
if (seen.has(c.label)) return false;
seen.add(c.label);
return true;
});
beats = beats.map((b, i) =>
i === lastIdx
? { ...b, next: { type: "choice" as const, choices: deduped } }
: b,
);
}
}
if (streamResult.degraded) {
console.warn("[directScene] Writer stream was degraded — beats may be fallback");
}
// entryBeatId is guaranteed present (runWriterBeats pins it onto a beat), but
// keep the defensive fallback for the synthesized-fallback path.
const entryBeatId = beats.some((b) => b.id === plan.entryBeatId)
? plan.entryBeatId
: beats[0]!.id;
// Orphan-speaker voices: a beat speaker Phase B used that isn't in the
// registry. Should be rare — the prompt constrains speakers to the cast, and
// every cast member was provisioned above — so this is a defensive net,
// serial but skipped entirely (zero latency) in the common case.
// Orphan-speaker voices (defensive net — should be rare).
const orphanSpeakers = [
...new Set(beats.map((b) => b.speaker).filter((n): n is string => Boolean(n))),
].filter((n) => !isPovName(n) && !characters.some((c) => c.name === n));
@@ -403,15 +520,14 @@ export async function directScene(
orphanSpeakers.map((n) => provisionVoiceForName(config, session, n)),
);
characters = mergeCharacters(characters, orphanChars);
// Emit orphan voices so the client can preload their audio.
for (const oc of orphanChars) {
if (oc.voice) emit?.({ type: "voice", name: oc.name, voice: oc.voice });
}
}
const scene: Scene = {
id: newSceneId(),
// scenePrompt is the cinematographer's English compositional output;
// the Writer's sceneSummary stays in the session log via beats[]/
// history. Keeping the original field name preserves compat with
// anything that already reads scene.scenePrompt (e.g., insert-beat
// user prompt).
scenePrompt: cinemaOut.integratedPrompt,
beats,
entryBeatId,
@@ -421,11 +537,22 @@ export async function directScene(
orientation,
};
// Merge the Writer's volatile memory rewrite onto the carried bible so the
// throughline survives the next scene cut (orchestrator returns it; the
// client persists it back into the session).
// storyState: opening scene seeds the stable spine from the Writer's
// storyBible (replacing the old Architect); subsequent scenes carry the
// existing spine. Volatile fields always come from this scene's patch.
const baseStoryState: StoryState | undefined = session.storyState
?? (bibleFromPlan
? {
logline: bibleFromPlan.logline,
genreTags: bibleFromPlan.genreTags,
protagonist: bibleFromPlan.protagonist,
castNotes: bibleFromPlan.castNotes,
synopsis: "",
}
: undefined);
const storyState = applyStoryStatePatch(
session.storyState,
baseStoryState,
beatsOut.storyStatePatch,
);