From 15ce03a9124e385ff0135bdcf4f402a1eeadeba5 Mon Sep 17 00:00:00 2001 From: yuanzonghao Date: Tue, 2 Jun 2026 11:44:55 +0800 Subject: [PATCH] feat(engine): Architect agent + cross-scene StoryState coherence Add a dedicated Architect LLM call at session start that expands the terse world/style prompt into a persistent story bible (logline, genre, second- person protagonist, cast, engineered opening hook). The bible seeds a StoryState the Writer reads and patches every scene, carried + merged across cuts (applyStoryStatePatch) so the story keeps a spine from beat one instead of jumping between scenes. - prompts: inject web-novel / short-drama / galgame craft into Writer + Architect; Writer emits storyStatePatch to update the running bible - director: parallelize voice + non-entry portraits with the Painter (only entry-beat portraits block paint) to offset Architect latency - architect: chat/parse guarded so a malformed response never aborts start - types: StoryState / StoryStatePatch; required on Start/SceneResponse Co-Authored-By: Claude Opus 4.7 --- .gitignore | 2 + apps/web/app/play/page.tsx | 3 + packages/engine/src/agents/architect.ts | 90 +++++++ .../engine/src/agents/characterDesigner.ts | 79 +++--- packages/engine/src/agents/writer.ts | 39 +++ packages/engine/src/director.ts | 226 ++++++++++++++---- packages/engine/src/index.ts | 1 + packages/engine/src/orchestrator.ts | 18 +- packages/engine/src/prompts.ts | 166 ++++++++++++- packages/types/src/index.ts | 61 +++++ 10 files changed, 576 insertions(+), 109 deletions(-) create mode 100644 packages/engine/src/agents/architect.ts diff --git a/.gitignore b/.gitignore index 41397b6..46e678a 100644 --- a/.gitignore +++ b/.gitignore @@ -20,3 +20,5 @@ npm-debug.log* pnpm-debug.log* repomix-output.xml + +users.md diff --git a/apps/web/app/play/page.tsx b/apps/web/app/play/page.tsx index 3e9c5df..8d0a010 100644 --- a/apps/web/app/play/page.tsx +++ b/apps/web/app/play/page.tsx @@ -187,6 +187,7 @@ function prefetchScenePath( const carriedBase: Session = { ...baseSession, characters: data.characters, + storyState: data.storyState, }; prefetchScenePath(pool, carriedBase, [...steps, nextStep], depth + 1); } @@ -539,6 +540,7 @@ function PlayInner() { }, ], characters: data.characters, + storyState: data.storyState, }; visitedBeatsRef.current = [data.scene.entryBeatId]; setSession(initial); @@ -635,6 +637,7 @@ function PlayInner() { }, ], characters: result.characters, + storyState: result.storyState, }; visitedBeatsRef.current = [result.scene.entryBeatId]; setSession(newSession); diff --git a/packages/engine/src/agents/architect.ts b/packages/engine/src/agents/architect.ts new file mode 100644 index 0000000..d3349e0 --- /dev/null +++ b/packages/engine/src/agents/architect.ts @@ -0,0 +1,90 @@ +import { chat } from "@infiplot/ai-client"; +import type { ProviderConfig, Session, StoryState } from "@infiplot/types"; +import { parseJsonLoose } from "../jsonParser"; +import { ARCHITECT_SYSTEM, buildArchitectUserMessage } from "../prompts"; + +// ────────────────────────────────────────────────────────────────────── +// Architect agent — ONE LLM call at session start. +// +// Expands the user's (often terse) world + style prompt into a real story +// bible: a second-person protagonist with a want and a flaw, a single +// central dramatic question (logline), a genre frame that anchors the +// 爽点 rhythm, an engineered cold-open for scene 1 (nextHook), and a small +// intentional cast. Seeds the StoryState that the Writer reads and updates +// every scene — so the story has a spine from beat one instead of being +// improvised cold. +// +// Everything is best-effort coerced with fallbacks: a malformed LLM +// response can never abort session start — worst case the Writer just gets +// a thinner bible and improvises more. +// ────────────────────────────────────────────────────────────────────── + +type RawStoryState = { + logline?: unknown; + genreTags?: unknown; + protagonist?: unknown; + castNotes?: unknown; + synopsis?: unknown; + openThreads?: unknown; + relationships?: unknown; + nextHook?: unknown; +}; + +function str(raw: unknown): string { + return typeof raw === "string" ? raw.trim() : ""; +} + +function strArray(raw: unknown): string[] | undefined { + if (!Array.isArray(raw)) return undefined; + const out = raw + .map((x) => (typeof x === "string" ? x.trim() : "")) + .filter((x) => x.length > 0); + return out.length > 0 ? out : undefined; +} + +export async function runArchitect( + config: ProviderConfig, + session: Session, +): Promise { + try { + const raw = await chat( + config, + [ + { role: "system", content: ARCHITECT_SYSTEM }, + { role: "user", content: buildArchitectUserMessage(session) }, + ], + { temperature: 0.85, responseFormat: "json_object" }, + ); + + const parsed = parseJsonLoose(raw); + + return { + // Stable spine — fall back to the raw world/style prompt so the bible is + // never wholly empty even if the model returns garbage. + logline: str(parsed.logline) || session.worldSetting, + genreTags: str(parsed.genreTags), + protagonist: + str(parsed.protagonist) || + "你是这个故事的主角(第二人称视角,永不出现在画面里)。", + castNotes: str(parsed.castNotes) || undefined, + // Volatile seeds — the opening Writer will rewrite these via its patch. + synopsis: str(parsed.synopsis) || "故事即将开始。", + openThreads: strArray(parsed.openThreads), + relationships: strArray(parsed.relationships), + nextHook: str(parsed.nextHook) || undefined, + }; + } catch (err) { + // chat() or parseJsonLoose() can throw (network / unrepairable JSON). + // The Architect is best-effort: never let it abort session start — return + // a minimal bible seeded from the raw prompt and let the Writer improvise. + const msg = err instanceof Error ? err.message : String(err); + console.error(`[architect] failed, using minimal bible: ${msg}`); + return { + logline: session.worldSetting, + genreTags: "", + protagonist: + "你是这个故事的主角(第二人称视角,永不出现在画面里)。", + synopsis: "故事即将开始。", + }; + } +} diff --git a/packages/engine/src/agents/characterDesigner.ts b/packages/engine/src/agents/characterDesigner.ts index 8d7056b..152a975 100644 --- a/packages/engine/src/agents/characterDesigner.ts +++ b/packages/engine/src/agents/characterDesigner.ts @@ -15,25 +15,20 @@ import { } from "../prompts"; // ────────────────────────────────────────────────────────────────────── -// CharacterDesigner agent — designs ONE new character end-to-end. +// CharacterDesigner agent — designs ONE new character. // -// Pipeline (per character, all the slow parts are parallelized): +// Exposed as three GRANULAR stages so the director can schedule the slow +// parts around the Painter (a voice is never needed to paint a scene, and +// only entry-beat characters' portraits are referenced by the Painter): // -// 1. LLM call — designs BOTH visual + voice cards in one shot -// (intentional: same agent thinks about who this character IS, -// which keeps appearance and vocal personality coherent) +// 1. designCharacterCard — ONE LLM call → visual + voice TEXT cards +// (intentional bundling: the same agent thinks about who this character +// IS, keeping appearance and vocal personality coherent) +// 2. renderCharacterPortrait — base portrait image (Runware URL + UUID) +// 3. provisionCharacterVoice — Xiaomi MiMo voicedesign → reference audio // -// 2. In parallel: -// a. Image gen — base portrait (Runware returns URL + UUID in one shot; -// no separate upload round-trip is needed for cheap re-reference) -// b. Voice provisioning — Xiaomi MiMo voicedesign from voiceDescription -// → reference audio for later voiceclone synth -// -// 3. Returns merged Character ready to be added to session.characters -// -// Each step degrades gracefully — if image gen fails we return the -// character without a portrait; if voice gen fails we return without -// voice. The game keeps running even when sub-components fail. +// Each step degrades gracefully — if image gen fails the character just has +// no portrait; if voice gen fails it has no voice. The game keeps running. // ────────────────────────────────────────────────────────────────────── type CharacterDesignOutput = { @@ -77,7 +72,7 @@ async function runDesignLLM( // // In mock mode we return the data URI as basePortraitUrl with no UUID // (Painter is short-circuited anyway, so the lack of a UUID is moot). -async function renderPortrait( +export async function renderCharacterPortrait( config: EngineConfig, charName: string, visualDescription: string, @@ -101,7 +96,7 @@ async function renderPortrait( } } -async function provisionVoiceSafe( +export async function provisionCharacterVoice( config: EngineConfig, voiceDescription: string, charName: string, @@ -116,45 +111,31 @@ async function provisionVoiceSafe( } } -// Single-character design pipeline. Called by the orchestrator once per -// NEW character name; multiple characters in the same scene run their -// pipelines in parallel at the orchestrator level. -export async function designCharacter( +// The cheap first stage: design the visual + voice TEXT cards in one LLM +// call. The director then schedules renderCharacterPortrait / +// provisionCharacterVoice around the Painter. Multiple new characters in the +// same scene run this stage in parallel at the director level. +export type CharacterCard = { + name: string; + visualDescription?: string; + voiceDescription: string; +}; + +export async function designCharacterCard( config: EngineConfig, session: Session, charName: string, -): Promise { - const tTotal = Date.now(); - - // Step 1 — LLM design (visual + voice). Must complete first. +): Promise { const tDesign = Date.now(); const design = await runDesignLLM(config, session, charName); tlog(`[charDesigner ${charName}] design LLM`, tDesign); - const visualDescription = design.visualDescription?.trim(); - const voiceDescription = - design.voiceDescription?.trim() || - `请根据角色名「${charName}」推断其性别、年龄与气质,生成最贴合的音色。所属世界观:${session.worldSetting}`; - - // Step 2 — parallel: portrait + voice provisioning. - const tProvision = Date.now(); - const portraitPromise = visualDescription - ? renderPortrait(config, charName, visualDescription, session.styleGuide) - : Promise.resolve({} as Awaited>); - const voicePromise = provisionVoiceSafe(config, voiceDescription, charName); - - const [portrait, voice] = await Promise.all([portraitPromise, voicePromise]); - tlog(`[charDesigner ${charName}] portrait+voice parallel`, tProvision); - - tlog(`[charDesigner ${charName}] TOTAL`, tTotal); - return { name: charName, - voiceDescription, - visualDescription, - basePortraitUrl: portrait.basePortraitUrl, - basePortraitUuid: portrait.basePortraitUuid, - voice, + visualDescription: design.visualDescription?.trim() || undefined, + voiceDescription: + design.voiceDescription?.trim() || + `请根据角色名「${charName}」推断其性别、年龄与气质,生成最贴合的音色。所属世界观:${session.worldSetting}`, }; } @@ -169,6 +150,6 @@ export async function provisionVoiceForName( charName: string, ): Promise { const voiceDescription = `请根据角色名「${charName}」推断其性别、年龄与气质,生成最贴合的音色。所属世界观:${session.worldSetting}`; - const voice = await provisionVoiceSafe(config, voiceDescription, charName); + const voice = await provisionCharacterVoice(config, voiceDescription, charName); return { name: charName, voiceDescription, voice }; } diff --git a/packages/engine/src/agents/writer.ts b/packages/engine/src/agents/writer.ts index 0015817..97a5e4f 100644 --- a/packages/engine/src/agents/writer.ts +++ b/packages/engine/src/agents/writer.ts @@ -7,6 +7,7 @@ import type { BeatNext, ProviderConfig, Session, + StoryStatePatch, } from "@infiplot/types"; import { parseJsonLoose } from "../jsonParser"; import { WRITER_SYSTEM, buildWriterUserMessage } from "../prompts"; @@ -28,6 +29,9 @@ export type WriterOutput = { sceneKey?: string; entryBeatId: string; beats: Beat[]; + /** Rewritten volatile story memory — merged onto the carried StoryState by + * the director. Absent when the model omitted it (rare; bible just stales). */ + storyStatePatch?: StoryStatePatch; }; // Raw shapes — what the LLM produces before validation / coercion. @@ -59,11 +63,18 @@ type RawBeat = { activeCharacters?: RawActiveCharacter[]; next?: RawNext; }; +type RawStoryStatePatch = { + synopsis?: unknown; + openThreads?: unknown; + relationships?: unknown; + nextHook?: unknown; +}; type RawScene = { sceneSummary?: string; sceneKey?: string; entryBeatId?: string; beats?: RawBeat[]; + storyStatePatch?: RawStoryStatePatch; }; // ────────────────────────────────────────────────────────────────────── @@ -321,6 +332,33 @@ function normalizeSceneKey(raw: string | undefined): string | undefined { return slug.length > 0 ? slug : undefined; } +function coerceStringArray(raw: unknown): string[] | undefined { + if (!Array.isArray(raw)) return undefined; + const out = raw + .map((x) => (typeof x === "string" ? x.trim() : "")) + .filter((x) => x.length > 0); + return out.length > 0 ? out : undefined; +} + +// Pull the volatile story-memory rewrite out of the Writer's JSON. Only +// non-empty fields are kept; an all-empty/absent patch returns undefined so +// the director leaves the carried StoryState untouched. +function coerceStoryStatePatch( + raw: RawStoryStatePatch | undefined, +): StoryStatePatch | undefined { + if (!raw || typeof raw !== "object") return undefined; + const patch: StoryStatePatch = {}; + const synopsis = typeof raw.synopsis === "string" ? raw.synopsis.trim() : ""; + if (synopsis) patch.synopsis = synopsis; + const openThreads = coerceStringArray(raw.openThreads); + if (openThreads) patch.openThreads = openThreads; + const relationships = coerceStringArray(raw.relationships); + if (relationships) patch.relationships = relationships; + const nextHook = typeof raw.nextHook === "string" ? raw.nextHook.trim() : ""; + if (nextHook) patch.nextHook = nextHook; + return Object.keys(patch).length > 0 ? patch : undefined; +} + export async function runWriter( config: ProviderConfig, session: Session, @@ -359,6 +397,7 @@ export async function runWriter( sceneKey: normalizeSceneKey(parsed.sceneKey), entryBeatId, beats, + storyStatePatch: coerceStoryStatePatch(parsed.storyStatePatch), }; } diff --git a/packages/engine/src/director.ts b/packages/engine/src/director.ts index 9630a1b..5bfa156 100644 --- a/packages/engine/src/director.ts +++ b/packages/engine/src/director.ts @@ -6,8 +6,16 @@ import type { ProviderConfig, Scene, Session, + StoryState, + StoryStatePatch, } from "@infiplot/types"; -import { designCharacter, provisionVoiceForName } from "./agents/characterDesigner"; +import type { CharacterCard } from "./agents/characterDesigner"; +import { + designCharacterCard, + provisionCharacterVoice, + provisionVoiceForName, + renderCharacterPortrait, +} from "./agents/characterDesigner"; import { runCinematographer } from "./agents/cinematographer"; import { runPainter } from "./agents/painter"; import { @@ -27,26 +35,29 @@ import { INSERT_BEAT_SYSTEM, buildInsertBeatUserMessage } from "./prompts"; // // Writer LLM (~3s, serial) // │ -// ├─ CharacterDesigner LLM × N (parallel per new char) -// │ │ -// │ ├─ portrait gen (Runware returns URL + UUID in one call) -// │ └─ voice provisioning (parallel within agent) +// ├─ CharacterCard LLM × N (parallel per new char — TEXT only) +// ├─ Cinematographer LLM (parallel with the cards) // │ -// ├─ Cinematographer LLM (parallel with all of the above) +// └─ wait for cards + cinema // │ -// └─ wait for all parallel branches +// ├─ entry-beat portraits ──┐ (block the Painter — its refs) +// ▼ │ +// Painter — generateImage │ (overlapped, NOT on the paint path): +// with referenceImages ├─ non-entry-beat portraits +// │ └─ ALL voice provisioning + orphan voices +// ▼ +// await the overlapped work, fold into the registry // │ // ▼ -// Painter — generateImage with referenceImages (UUID/URL refs only; -// no base64 to upload, since outputType=URL gives both back) -// │ -// ▼ -// return { scene, sceneImageUrl, characters } +// return { scene, sceneImageUrl, characters, storyState } // -// The Cinematographer intentionally does NOT depend on CharacterDesigner -// output — it only positions named characters in the frame, not their -// appearance. This unlocks the parallelism that makes the full pipeline -// ~9-12s instead of ~15-18s serial. +// Two deliberate decouplings unlock the parallelism: +// 1. The Cinematographer only POSITIONS named characters, so it needs no +// visualDescription and runs alongside the card LLMs. +// 2. The Painter only needs visualDescription TEXT (all on-stage) + the +// entry-beat characters' PORTRAITS (its referenceImages). Voices are +// never needed to paint, and non-entry portraits are never referenced — +// so both overlap the (longest) paint call instead of blocking it. // ══════════════════════════════════════════════════════════════════════ function newSceneId(): string { @@ -112,10 +123,33 @@ function pickPriorSceneReference( return {}; } +// Merge the Writer's volatile story-memory patch onto the carried StoryState. +// The stable spine (logline/genreTags/protagonist/castNotes) is preserved; +// only the volatile fields the Writer is allowed to rewrite are overwritten, +// and only when the patch actually provided them. A missing carried state +// (legacy session from before the Architect existed) degrades to an empty +// spine rather than throwing. +function applyStoryStatePatch( + base: StoryState | undefined, + patch: StoryStatePatch | undefined, +): StoryState { + const start: StoryState = + base ?? { logline: "", genreTags: "", protagonist: "", synopsis: "" }; + if (!patch) return start; + return { + ...start, + synopsis: patch.synopsis ?? start.synopsis, + openThreads: patch.openThreads ?? start.openThreads, + relationships: patch.relationships ?? start.relationships, + nextHook: patch.nextHook ?? start.nextHook, + }; +} + export type SceneResult = { scene: Scene; sceneImageUrl: string; characters: Character[]; + storyState: StoryState; }; // ────────────────────────────────────────────────────────────────────── @@ -156,17 +190,19 @@ export async function directScene( writerOut.sceneKey, ); - // Stage 2 — parallel: CharacterDesigner(s) and Cinematographer. - // Cinematographer doesn't need character visualDescriptions (those are - // appended at Painter stage), so it runs concurrently with chardesign. + // ── Stage 2 — character cards (LLM) ∥ Cinematographer ────────────────── + // Both are cheap LLM calls and neither needs the other's output, so they + // run concurrently. The cards give us each new character's visualDescription + // TEXT; portraits + voices are deferred to Stage 3 so they can overlap the + // paint instead of blocking it. const tParallel = Date.now(); - const designPromises = newCharNames.map((name) => - designCharacter(config, session, name).catch((err): Character => { + const cardPromises = newCharNames.map((name) => + designCharacterCard(config, session, name).catch((err): CharacterCard => { const msg = err instanceof Error ? err.message : String(err); - console.error(`[directScene] designCharacter(${name}) failed: ${msg}`); - // Last-resort fallback: register with name only so the speaker isn't - // unknown. Caller may try voice provisioning later or skip. + console.error(`[directScene] designCharacterCard(${name}) failed: ${msg}`); + // Last-resort fallback: a name + generic voice card so the speaker isn't + // unknown. No visualDescription → no portrait is attempted for them. return { name, voiceDescription: `请根据角色名「${name}」推断其性别、年龄与气质。所属世界观:${session.worldSetting}`, @@ -183,40 +219,102 @@ export async function directScene( currentSceneKey: writerOut.sceneKey, }); - const [designedChars, cinemaOut] = await Promise.all([ - Promise.all(designPromises), + const [cards, cinemaOut] = await Promise.all([ + Promise.all(cardPromises), cinemaPromise, ]); - tlog("[directScene] CharacterDesigner+Cinematographer parallel", tParallel); + tlog("[directScene] CharacterCards+Cinematographer parallel", tParallel); - // Merge new chars into a working registry that we'll pass to the Painter. - const characters = mergeCharacters(session.characters, designedChars); + // Working registry: existing characters + new cards. visualDescription text + // is present now; portraits + voices fill in over the next two phases. + let characters = mergeCharacters( + session.characters, + cards.map((c) => ({ + name: c.name, + voiceDescription: c.voiceDescription, + visualDescription: c.visualDescription, + })), + ); - // Edge case: a speaker referenced by the Writer might not have been in - // `activeCharacters` of any beat (LLM oversight), so they got skipped by - // newCharNames. Catch them here and at least provision a voice so the - // beat-audio path doesn't render silent. No portrait — they weren't - // visible in the scene, so visual consistency doesn't matter for them. + // ── Stage 3 — portraits + voices, scheduled around the Painter ───────── + const tProvision = Date.now(); + + // Entry-beat character names: the ONLY portraits the Painter references + // (collectReferenceImages slots in the entry beat's speaker + activeChars). + const entryNames = new Set(); + if (entryBeat?.speaker && !isPovName(entryBeat.speaker)) { + entryNames.add(entryBeat.speaker); + } + for (const c of entryBeatActive) { + if (!isPovName(c.name)) entryNames.add(c.name); + } + + type NamedPortrait = { + name: string; + basePortraitUrl?: string; + basePortraitUuid?: string; + }; + // Kick off portrait gen for every NEW char that has a visualDescription. + // Entry-beat portraits block the Painter; the rest overlap it. + const entryPortraitPromises: Promise[] = []; + const restPortraitPromises: Promise[] = []; + for (const card of cards) { + const vd = card.visualDescription; + if (!vd) continue; + const p = renderCharacterPortrait( + config, + card.name, + vd, + session.styleGuide, + ).then((res): NamedPortrait => ({ name: card.name, ...res })); + (entryNames.has(card.name) ? entryPortraitPromises : restPortraitPromises).push(p); + } + + // Kick off voice provisioning for every NEW char (never on the paint path). + const voicePromises = cards.map((card) => + provisionCharacterVoice(config, card.voiceDescription, card.name).then( + (voice): Character => ({ + name: card.name, + voiceDescription: card.voiceDescription, + voice, + }), + ), + ); + + // Edge case: a speaker the Writer referenced without listing in any beat's + // activeCharacters. collectActiveCharacterNames already includes speakers, + // so this is a rare defensive net. Provision a voice only (never on-screen). const speakerNames = new Set( writerOut.beats.map((b) => b.speaker).filter((n): n is string => Boolean(n)), ); const orphanSpeakers = [...speakerNames].filter( // Pattern B: "你" (player) is a valid speaker but never gets a Character - // record — TTS is intentionally skipped on the client. Filter POV out so - // provisionVoiceForName isn't accidentally invoked for the player. - (n) => !isPovName(n) && !characters.some((c) => c.name === n), + // record — TTS is intentionally skipped on the client. + (n) => + !isPovName(n) && + !characters.some((c) => c.name === n) && + !cards.some((c) => c.name === n), + ); + const orphanPromises = orphanSpeakers.map((n) => + provisionVoiceForName(config, session, n), ); - if (orphanSpeakers.length > 0) { - const orphans = await Promise.all( - orphanSpeakers.map((n) => provisionVoiceForName(config, session, n)), - ); - const merged = mergeCharacters(characters, orphans); - characters.splice(0, characters.length, ...merged); - } - // Stage 3 — Painter (depends on cinemaOut + characters). - // On-stage characters for THIS scene are the ones in any beat — pass them - // all so the archetype block covers anyone the player might encounter. + // Block the Painter ONLY on entry-beat portraits (its referenceImages). + const entryPortraits = await Promise.all(entryPortraitPromises); + characters = mergeCharacters( + characters, + entryPortraits.map((p) => ({ + name: p.name, + voiceDescription: "", // preserved from the card by mergeCharacters + basePortraitUrl: p.basePortraitUrl, + basePortraitUuid: p.basePortraitUuid, + })), + ); + tlog("[directScene] entry-beat portraits", tProvision); + + // ── Stage 4 — Painter (depends on cinemaOut + on-stage visual cards + + // entry portraits). On-stage = everyone named in any beat, so the archetype + // block covers anyone the player might encounter in this scene. const onStageCharacters = characters.filter((c) => allActiveNames.includes(c.name), ); @@ -234,6 +332,30 @@ export async function directScene( ); tlog("[directScene] Painter", tPainter); + // Fold in the work that overlapped the paint: remaining portraits, all + // voices, and any orphan-speaker voices. Awaited before returning so the + // session the client persists is fully provisioned for later scenes. + const tOverlap = Date.now(); + const [restPortraits, voicedChars, orphanChars] = await Promise.all([ + Promise.all(restPortraitPromises), + Promise.all(voicePromises), + Promise.all(orphanPromises), + ]); + characters = mergeCharacters( + characters, + restPortraits.map((p) => ({ + name: p.name, + voiceDescription: "", + basePortraitUrl: p.basePortraitUrl, + basePortraitUuid: p.basePortraitUuid, + })), + ); + characters = mergeCharacters(characters, voicedChars); + if (orphanChars.length > 0) { + characters = mergeCharacters(characters, orphanChars); + } + tlog("[directScene] overlapped portraits+voices", tOverlap); + const scene: Scene = { id: newSceneId(), // scenePrompt is the cinematographer's English compositional output; @@ -249,9 +371,17 @@ export async function directScene( imageUrl: painted.imageUrl, }; + // Merge the Writer's volatile memory rewrite onto the carried bible so the + // throughline survives the next scene cut (orchestrator returns it; the + // client persists it back into the session). + const storyState = applyStoryStatePatch( + session.storyState, + writerOut.storyStatePatch, + ); + tlog("[directScene] TOTAL", tTotal); - return { scene, sceneImageUrl: painted.imageUrl, characters }; + return { scene, sceneImageUrl: painted.imageUrl, characters, storyState }; } // ────────────────────────────────────────────────────────────────────── diff --git a/packages/engine/src/index.ts b/packages/engine/src/index.ts index 86adea9..8fac699 100644 --- a/packages/engine/src/index.ts +++ b/packages/engine/src/index.ts @@ -9,6 +9,7 @@ export { annotateClick } from "./annotate"; export { synthesizeBeat } from "./voice"; export { mergeCharacters } from "./director"; export type { SceneResult } from "./director"; +export { runArchitect } from "./agents/architect"; export type { WriterOutput } from "./agents/writer"; export type { CinematographerOutput } from "./agents/cinematographer"; export type { InsertBeatPartial } from "@infiplot/types"; diff --git a/packages/engine/src/orchestrator.ts b/packages/engine/src/orchestrator.ts index 49a2098..2b332e0 100644 --- a/packages/engine/src/orchestrator.ts +++ b/packages/engine/src/orchestrator.ts @@ -12,6 +12,7 @@ import type { VisionRequest, VisionResponse, } from "@infiplot/types"; +import { runArchitect } from "./agents/architect"; import { annotateClick } from "./annotate"; import { directInsertBeat, directScene } from "./director"; import { synthesizeBeat } from "./voice"; @@ -49,7 +50,18 @@ export async function startSession( characters: [], }; - const { scene, sceneImageUrl, characters } = await directScene(config, session); + // Stage 0 — Architect: expand the terse world/style prompt into a story + // bible BEFORE the first scene. Serial by necessity (the opening Writer + // reads session.storyState), but it gives the whole story a spine from beat + // one — the latency is offset by the director's portrait/voice overlap win. + const tArchitect = Date.now(); + session.storyState = await runArchitect(config.text, session); + tlog("[start] Architect", tArchitect); + + const { scene, sceneImageUrl, characters, storyState } = await directScene( + config, + session, + ); tlog("[start] TOTAL", tTotal); @@ -58,6 +70,7 @@ export async function startSession( scene, imageUrl: sceneImageUrl, characters, + storyState, }; } @@ -71,7 +84,7 @@ export async function requestScene( ): Promise { const tTotal = Date.now(); - const { scene, sceneImageUrl, characters } = await directScene( + const { scene, sceneImageUrl, characters, storyState } = await directScene( config, req.session, ); @@ -82,6 +95,7 @@ export async function requestScene( scene, imageUrl: sceneImageUrl, characters, + storyState, }; } diff --git a/packages/engine/src/prompts.ts b/packages/engine/src/prompts.ts index ec25743..ef4e358 100644 --- a/packages/engine/src/prompts.ts +++ b/packages/engine/src/prompts.ts @@ -3,20 +3,106 @@ import type { Character, Scene, Session, + StoryState, } from "@infiplot/types"; // ══════════════════════════════════════════════════════════════════════ // Multi-agent scene generation pipeline: -// Writer (编剧) — narrative + beats[] + per-beat activeCharacters +// Architect (总编剧) — ONE-TIME at session start: the story bible +// (protagonist / logline / genre / opening hook / +// planned cast) → seeds StoryState +// Writer (编剧) — narrative + beats[] + per-beat activeCharacters, +// reads StoryState and emits a StoryStatePatch // CharacterDesigner — per-new-character visual + voice cards // Cinematographer (分镜导演) — sceneKey + English compositional prompt // Painter (画师) — FLUX rendering with character archetypes // // Each agent owns one system prompt + one user-message builder below. -// All four agents see the same world / style guide, but each only reads -// the slice of session state it needs to make its decision. +// All agents see the same world / style guide, but each only reads the +// slice of session state it needs to make its decision. // ══════════════════════════════════════════════════════════════════════ +// ────────────────────────────────────────────────────────────────────── +// Shared — render the StoryState bible into a compact prompt block read +// by the Writer (and Architect, on revisions). Keeping one renderer means +// the bible looks identical to every agent that consumes it. +// ────────────────────────────────────────────────────────────────────── + +export function renderStoryState(s: StoryState | undefined): string { + if (!s) return ""; + const lines: string[] = ["【故事档案 / 主线记忆】"]; + if (s.logline) lines.push(`主线(中心钩子):${s.logline}`); + if (s.genreTags) lines.push(`题材基调:${s.genreTags}`); + if (s.protagonist) lines.push(`主角「你」:${s.protagonist}`); + if (s.castNotes) lines.push(`核心配角:\n${s.castNotes}`); + if (s.synopsis) lines.push(`已发生(梗概):${s.synopsis}`); + if (s.relationships?.length) { + lines.push(`当前关系/情绪:\n${s.relationships.map((r) => `- ${r}`).join("\n")}`); + } + if (s.openThreads?.length) { + lines.push(`未收的悬念/伏笔:\n${s.openThreads.map((t) => `- ${t}`).join("\n")}`); + } + if (s.nextHook) lines.push(`接下来要往哪走(下一个钩子方向):${s.nextHook}`); + return lines.join("\n"); +} + +// ────────────────────────────────────────────────────────────────────── +// 0. Architect (总编剧) — ONE LLM call at session start. +// +// Turns the (often terse) user world + style prompt into a real story +// bible: a second-person protagonist with a want and a flaw, a single +// central dramatic question, a genre frame that anchors the 爽点 rhythm, +// an engineered opening hook (前3秒冷开场), and a small intentional cast. +// Everything downstream — Writer, CharacterDesigner — reads this so the +// story has a spine from beat one instead of being improvised cold. +// ────────────────────────────────────────────────────────────────────── + +export const ARCHITECT_SYSTEM = `你是一部交互视觉小说的「总编剧 / 故事架构师」。玩家只给了你一句到几句的世界观和画风,你要在开拍前把它扩写成一份**故事档案(story bible)**,为后续每一幕定下脊梁。你不写具体台词、不写分镜、不设计立绘——你只搭骨架。 + +你深谙网文(番茄)、短剧(红果)与视觉小说(galgame)的爆款心法: +- **开篇即钩子**:黄金三章 / 前3秒法则。开场不铺垫世界观,直接抛出冲突、悬念或一个反常的瞬间。 +- **代入感**:主角是第二人称「你」,是玩家的化身——要让玩家一进场就清楚"我是谁、我此刻卡在什么处境里、我想要什么"。 +- **题材锚定爽点**:先选定一个清晰的题材框架(如 甜宠 / 校园暗恋 / 悬疑追凶 / 复仇逆袭 / 救赎治愈),它决定了情绪回报的节奏与类型。 +- **戏剧问题**:整部故事由一个悬而未决的中心问题驱动(她到底是谁?你能否在记忆消失前查明真相?这场暗恋会走向哪里?)。 +- **人设要鲜明且有反差**:每个核心角色一个强标签 + 一个反差面(外冷内热 / 傲娇 / 看似柔弱实则腹黑)。 + +你要产出(全部用中文,except 不需要英文): +- logline:一句话主线 / 中心戏剧问题,必须带钩子,让人想看下去 +- genreTags:题材+基调标签,斜杠分隔,如 "甜宠 / 校园 / 慢热治愈带点伤感" +- protagonist:第二人称主角卡。包含:你是谁、你此刻正卡在什么具体处境里(要有即时张力)、你想要什么、一个软肋或秘密。50–120 字。 +- castNotes:2–3 个核心配角,每行一个「名字:一句话人设(强标签+反差)+ 与你的关系/张力」。给真实好记的中文名字(不要"神秘女子"这种占位)。 +- synopsis:开场此刻的情境梗概(故事尚未展开,就写"故事从……开始"),1–3 句。 +- openThreads:开场就埋下的 1–3 个悬念/问题(数组)。 +- nextHook:**第一幕**应当如何冷开场——具体描述开场那个抓人的瞬间/冲突(这会直接指导编剧写开场)。要画面感强、有张力。 + +设计硬规则: +- 主角「你」永不出现在画面里(第二人称 POV),所以 castNotes 里**不要**把"你/主角"当成一个角色。 +- 配角名字要符合世界观(年代、地域、文化)。 +- 一切服从玩家给的世界观与画风,不要擅自跑题;玩家信息少时,做最贴合、最有戏的合理扩写。 + +必须输出严格 JSON: +{ + "logline": "...", + "genreTags": "...", + "protagonist": "...", + "castNotes": "夏海:表面开朗的天台诗人,实则在用诗逃避家里的变故;与你是同班转学的邻座,对你有种说不清的在意。\\n班主任老周:…", + "synopsis": "...", + "openThreads": ["...", "..."], + "nextHook": "第一幕冷开场:……" +} + +不要输出 JSON 以外的任何文本。`; + +export function buildArchitectUserMessage(session: Session): string { + const parts: string[] = []; + parts.push(`世界观:${session.worldSetting}`); + parts.push(`画风:${session.styleGuide}`); + parts.push( + "\n请据此产出这部交互剧的故事档案(story bible),严格以 JSON 格式返回。", + ); + return parts.join("\n"); +} + // ────────────────────────────────────────────────────────────────────── // 1. Writer (编剧) — drives the narrative. // @@ -27,7 +113,26 @@ import type { // session.characters. // ────────────────────────────────────────────────────────────────────── -export const WRITER_SYSTEM = `你是一个交互视觉小说的「编剧」。每次基于世界观、画风、玩家历史、已登记角色,写出**一个完整场景的剧本**:场景背景概要 + 一组对话节拍 beats。你只负责**剧情和台词**——不设计角色形象、不写出图提示词、不做镜头调度,这些由其他 agent 完成。 +export const WRITER_SYSTEM = `你是一部交互视觉小说的「编剧」。每次基于【故事档案 / 主线记忆】、世界观、画风、玩家历史、已登记角色,写出**一个完整场景的剧本**:场景背景概要 + 一组对话节拍 beats,并在最后更新主线记忆。你只负责**剧情和台词**——不设计角色形象、不写出图提示词、不做镜头调度,这些由其他 agent 完成。 + +═══════════════════════════════════════════════════════════════════ +爆款心法(番茄网文 / 红果短剧 / galgame 的叙事手感)—— 必须贯彻 +═══════════════════════════════════════════════════════════════════ +- **每个场景都要有钩子**:开头 1–2 个 beat 内就抛出新信息、悬念、冲突或情绪冲击,绝不平铺直叙地交代背景;结尾 beat 留一个让玩家"想知道接下来"的扣子。 +- **兑现爽点 / 情绪回报**:按题材给观众想要的情绪(甜宠的心动、暗恋的暧昧拉扯、逆袭的扬眉吐气、悬疑的真相一角)。让玩家这一场"有所得"。 +- **反转与反差**:适时打破预期——以为是 A 结果是 B、角色露出与第一印象相反的一面;但反转要可信、要扣主线。 +- **快节奏、入戏快**:进场即冲突,少铺陈,删掉一切"为完整而存在"却不推进情绪的对话。 +- **show, don't tell**:用动作、神态、潜台词、环境细节传递情绪,别直接旁白"她很难过"——让玩家自己读出来。 +- **人设鲜明有反差**:每个角色一个强标签 + 一个反差面,台词紧贴其腔调(傲娇嘴硬心软、外冷内热、看似柔弱实则强势)。 +- **选择要有分量**:choice 只出现在真正的岔路口,每个选项都要让玩家感到"通向不同的东西"(情绪指向不同 / 关系走向不同),别给等价的废选项。 + +═══════════════════════════════════════════════════════════════════ +连贯性铁律(跨场景切换不能跳戏 —— 最重要) +═══════════════════════════════════════════════════════════════════ +- 你会收到【故事档案 / 主线记忆】和上一场的结尾。**新场景必须从上一刻自然承接**——承接上一场的情绪、地点逻辑、人物状态与未收的悬念。 +- 若给了「转场种子 nextSceneSeed」,把它当作"下一场的命题"去兑现,而不是另起炉灶;开场要让玩家感到"这正是我上一个动作 / 选择导致的结果"。 +- 沿用主线记忆里的人物关系与情绪温度——别让刚告白的人下一场形同陌路,也别凭空遗忘已埋的伏笔。 +- 推进、但别重置:每一场都让主线问题往前走一点(关系变化 / 真相揭露一角 / 新悬念浮现)。 一个场景包含: - sceneSummary:当前场景的中文概要(地点、时间、氛围、关键事件——给后续的分镜导演看) @@ -98,6 +203,13 @@ sceneKey 设计原则(重要 — 用于跨场景视觉一致性): 例:speaker="你" line="学姐,这把伞你拿着。" - 同一个 beat 可以同时有 narration(心理活动 / 动作)和 speaker="你" + line(说出口的话) +更新主线记忆(storyStatePatch)—— 写完这一场后必做: +- synopsis:把这一场并入后的整体梗概,**压缩**到 3–5 句(别越写越长,旧细节该丢就丢) +- relationships:每个核心角色此刻与「你」的关系 / 情绪温度,每条一句(如 "夏海:暗恋升温,刚向你说了一半的告白被打断") +- openThreads:仍未收的悬念 / 伏笔——已收束的可移除、新埋的加入(但至少保留一条正在推进的主线,别把列表清空) +- nextHook:基于这一场的结尾,下一场应往哪走(给"下一次的你"一个明确命题,接住本场留下的扣子) +这些字段是写给"未来的你"的连贯性记忆,请认真写。 + 必须输出严格 JSON,结构如下: { "sceneSummary": "中文场景概要:地点+时间+氛围+关键事件", @@ -149,13 +261,26 @@ sceneKey 设计原则(重要 — 用于跨场景视觉一致性): ] } } - ] + ], + "storyStatePatch": { + "synopsis": "把这一场并入后的滚动梗概,压缩到 3–5 句", + "relationships": ["夏海:暗恋升温,刚向你说了一半的告白被打断"], + "openThreads": ["夏海没说完的那句话到底是什么", "她书包里掉出的那张旧照片"], + "nextHook": "下一场:放学后的天台,她把你单独叫上去,要把话说完" + } } 不要输出 JSON 以外的任何文本。`; export function buildWriterUserMessage(session: Session): string { const parts: string[] = []; + + const bible = renderStoryState(session.storyState); + if (bible) { + parts.push(bible); + parts.push(""); + } + parts.push(`世界观:${session.worldSetting}`); parts.push(`画风:${session.styleGuide}`); @@ -173,7 +298,9 @@ export function buildWriterUserMessage(session: Session): string { } if (session.history.length === 0) { - parts.push("\n这是故事的开场。请生成第一个场景,严格以 JSON 格式返回。"); + parts.push( + "\n这是故事的开场。请按【故事档案】里的 nextHook 把第一幕的冷开场写出来——开场即抓人,别花笔墨铺垫世界观。写完后更新 storyStatePatch。严格以 JSON 格式返回。", + ); return parts.join("\n"); } @@ -210,22 +337,40 @@ export function buildWriterUserMessage(session: Session): string { }); const last = session.history.at(-1); + + // The exact last moment the player stopped on — the new scene must continue + // seamlessly from this emotional beat, not reset to a neutral state. + if (last) { + const lastBeatId = last.visitedBeatIds.at(-1) ?? last.scene.entryBeatId; + const lastBeat = last.scene.beats.find((b) => b.id === lastBeatId); + if (lastBeat) { + const frag: string[] = []; + if (lastBeat.narration) frag.push(`旁白:${lastBeat.narration}`); + if (lastBeat.line) frag.push(`${lastBeat.speaker ?? "?"}:${lastBeat.line}`); + if (frag.length) { + parts.push( + `\n上一刻(玩家停留的最后一个画面,新场景要从这里的情绪无缝承接):\n ${frag.join(" / ")}`, + ); + } + } + } + const lastExit = last?.exit; if (lastExit) { if (lastExit.kind === "choice") { parts.push( - `\n请基于「玩家在上一场选择了:${lastExit.label}」,生成下一个场景(参考种子:${lastExit.nextSceneSeed})。`, + `\n承接「玩家在上一场选择了:${lastExit.label}」无缝续写下一个场景(转场命题:${lastExit.nextSceneSeed})。开场要让玩家感到这正是上一步的结果,并延续此刻的情绪。`, ); } else { parts.push( - `\n请基于「玩家自由动作:${lastExit.action}」,生成下一个场景。`, + `\n承接「玩家自由动作:${lastExit.action}」无缝续写下一个场景,延续此刻的情绪与处境。`, ); } } else { - parts.push("\n请生成下一个场景。"); + parts.push("\n无缝续写下一个场景,延续上一刻的情绪。"); } - parts.push("严格以 JSON 格式返回。"); + parts.push("写完后别忘了更新 storyStatePatch。严格以 JSON 格式返回。"); return parts.join("\n"); } @@ -506,6 +651,7 @@ export const INSERT_BEAT_SYSTEM = `你是视觉小说编剧。玩家在当前场 - narration 与 line 加起来 ≤80 字 - 不要打破当前场景的物理状态(玩家仍在原地、对面仍是同一个角色) - 不要生成选项或下一步指引 —— 玩家点击会自然回到原 beat +- 这个 beat 也要"有所得"——给玩家一个新细节、一丝潜台词或情绪波动(show, don't tell),别写成无意义的空台词 speaker 字段允许的取值**只有两种**(与主路径 Writer 一致 — Pattern B galgame 标准): 1. **已登记角色**里的 NPC 真名(**绝不允许引入新角色**) diff --git a/packages/types/src/index.ts b/packages/types/src/index.ts index e98503f..9c7da00 100644 --- a/packages/types/src/index.ts +++ b/packages/types/src/index.ts @@ -139,6 +139,53 @@ export type BeatAudio = { mime: string; }; +// ────────────────────────────────────────────────────────────────────── +// StoryState — the persistent "story bible" + evolving narrative memory. +// +// Created once at session start by the Architect agent (rich opening +// planning), then carried across every scene and incrementally updated by +// the Writer. This is the single throughline that keeps tone, cast, and +// stakes coherent across scene cuts — without it each Writer call would +// re-derive the whole arc from a flat beat log and drift. +// +// Split into STABLE fields (set by the Architect, rarely change) and +// VOLATILE fields (rewritten each scene via StoryStatePatch). +// ────────────────────────────────────────────────────────────────────── + +export type StoryState = { + // ── Stable (Architect-authored; persists unless deliberately revised) ── + /** One-line central dramatic question / 主线钩子. */ + logline: string; + /** Genre + tone tags anchoring the 爽点 framework, e.g. "甜宠 / 校园 / 慢热治愈". */ + genreTags: string; + /** Second-person protagonist card: who 你 are, the immediate situation, the + * core want, and a flaw/secret. The audience proxy — never rendered. */ + protagonist: string; + /** Key supporting cast and their relationship/tension with 你 (one per line). */ + castNotes?: string; + + // ── Volatile (rewritten each scene by the Writer's StoryStatePatch) ── + /** Rolling, compressed synopsis of what has happened so far (~3-5 句). */ + synopsis: string; + /** Unresolved hooks / mysteries / questions still owed to the player. */ + openThreads?: string[]; + /** Current relationship/emotion state per character, e.g. + * "夏海:好感升温,刚向你告白了一半". */ + relationships?: string[]; + /** Where the story is heading next — the conflict/reversal/suspense the + * next scene should drive toward. Seeds the next scene's hook. */ + nextHook?: string; +}; + +/** The volatile subset the Writer rewrites after each scene. Stable fields + * (logline/genreTags/protagonist/castNotes) are preserved by the merge. */ +export type StoryStatePatch = { + synopsis?: string; + openThreads?: string[]; + relationships?: string[]; + nextHook?: string; +}; + // ────────────────────────────────────────────────────────────────────── // Session // ────────────────────────────────────────────────────────────────────── @@ -151,6 +198,13 @@ export type Session = { history: SceneHistoryEntry[]; /** Character registry — accumulates across scenes; voices + portraits persist for reuse. */ characters: Character[]; + /** + * Persistent story bible + evolving narrative memory. Set at session start + * by the Architect, carried by the client across every /api/scene call, and + * updated by the Writer each scene. Optional for back-compat with any + * session payload created before this field existed. + */ + storyState?: StoryState; }; // ────────────────────────────────────────────────────────────────────── @@ -207,6 +261,9 @@ export type StartResponse = { imageUrl: string; /** Character registry with voice references + visual cards provisioned. */ characters: Character[]; + /** Story bible created by the Architect + updated by the opening scene's + * Writer. The client persists this into the session for later /api/scene calls. */ + storyState: StoryState; }; // /api/scene — generates the next Scene, given session whose latest @@ -221,6 +278,10 @@ export type SceneResponse = { /** Public CDN URL (or data URI in MOCK_IMAGE mode) for the rendered scene background. */ imageUrl: string; characters: Character[]; + /** Story bible after this scene's Writer applied its update. The client + * must persist this back into the session so the throughline survives the + * next scene cut. */ + storyState: StoryState; }; // /api/beat-audio — lazily synthesize one beat's voice. Client fires this