From 3bf5c92841c5e5309fde0e8a86e0908af855e44f Mon Sep 17 00:00:00 2001 From: yuanzonghao Date: Thu, 4 Jun 2026 11:17:34 +0800 Subject: [PATCH] perf(engine): split Writer into Phase A (plan) + Phase B (beats) The Writer was the serial long pole: a single LLM call wrote the scene skeleton AND the full beats[] graph before anything downstream could start, so variable-length beat generation blew up tail latency. Split it into two calls: - Phase A (runWriterPlan): minimal skeleton the image pipeline needs (sceneSummary, sceneKey, entryBeatId, cast, entry roster, entry speaker). Serial, on the critical path, kept lightweight. - Phase B (runWriterBeats): full beats[] + storyStatePatch, written to honor the plan. Launched immediately, overlaps the ENTIRE image pipeline (cards / cinematographer / portraits / painter), awaited last. Critical path becomes PhaseA + max(imagePipeline, PhaseB), so the long beat-writing is hidden behind image gen. A Phase B failure degrades to a single playable beat synthesized from the plan. 
Paired distinct-payload A/B (6 content-matched stories, baseline vs split): - median end-to-end 42.6s -> 32.2s (-24%) - mean 46.4s -> 33.1s (-29%) - worst case 74.7s -> 37.6s (halved) - no content regression: total Writer output tokens 12858 -> 13699 Co-Authored-By: Claude Opus 4.7 --- lib/engine/agents/writer.ts | 196 +++++++++++++++++++++++++--------- lib/engine/director.ts | 179 ++++++++++++++++++------------- lib/engine/index.ts | 2 +- lib/engine/prompts.ts | 203 +++++++++++++++++++++++++++--------- lib/types/index.ts | 37 +++++++ 5 files changed, 443 insertions(+), 174 deletions(-) diff --git a/lib/engine/agents/writer.ts b/lib/engine/agents/writer.ts index ce04981..e2df125 100644 --- a/lib/engine/agents/writer.ts +++ b/lib/engine/agents/writer.ts @@ -8,26 +8,30 @@ import type { ProviderConfig, Session, StoryStatePatch, + WriterPlan, } from "@infiplot/types"; import { parseJsonLoose } from "../jsonParser"; -import { WRITER_SYSTEM, buildWriterUserMessage } from "../prompts"; +import { + WRITER_BEATS_SYSTEM, + WRITER_PLAN_SYSTEM, + buildWriterBeatsUserMessage, + buildWriterPlanUserMessage, +} from "../prompts"; // ────────────────────────────────────────────────────────────────────── -// Writer agent — owns the narrative half of scene generation. +// Writer agent — owns the narrative half of scene generation, in TWO phases. // -// Output: { sceneSummary, sceneKey, entryBeatId, beats[] } -// Each beat carries activeCharacters[] (names + poses) the -// Cinematographer reads when composing the establishing shot. +// Phase A — runWriterPlan: the scene skeleton (WriterPlan) the image pipeline +// needs (sceneSummary + sceneKey + entry roster + full cast). No dialogue, +// so it returns fast and unblocks the Cinematographer + character design. +// Phase B — runWriterBeats: the full beats[] graph + storyStatePatch, written +// to honor the plan and overlapped with the (longer) image pipeline. 
// -// Character DESIGN (visual + voice) is NOT this agent's job — -// it only names characters; the CharacterDesigner picks up any -// unknown name from beats[].activeCharacters. +// Character DESIGN (visual + voice) is NOT this agent's job — it only NAMES +// characters (Phase A's cast); the CharacterDesigner picks up unknown names. // ────────────────────────────────────────────────────────────────────── -export type WriterOutput = { - sceneSummary: string; - sceneKey?: string; - entryBeatId: string; +export type WriterBeatsOutput = { beats: Beat[]; /** Rewritten volatile story memory — merged onto the carried StoryState by * the director. Absent when the model omitted it (rare; bible just stales). */ @@ -69,10 +73,17 @@ type RawStoryStatePatch = { relationships?: unknown; nextHook?: unknown; }; -type RawScene = { +// Phase A raw shape (skeleton only — no beats). +type RawPlan = { sceneSummary?: string; sceneKey?: string; entryBeatId?: string; + cast?: unknown; + entrySpeaker?: string; + entryActiveCharacters?: RawActiveCharacter[]; +}; +// Phase B raw shape (beats + memory only — plan fields come from runWriterPlan). +type RawBeats = { beats?: RawBeat[]; storyStatePatch?: RawStoryStatePatch; }; @@ -359,26 +370,119 @@ function coerceStoryStatePatch( return Object.keys(patch).length > 0 ? patch : undefined; } -export async function runWriter( +// Phase A — dedupe + clean the planned cast. Drops the POV player (never +// designed) and any blank/duplicate name. Order is preserved. +function coerceCast(raw: unknown): string[] { + if (!Array.isArray(raw)) return []; + const seen = new Set(); + const out: string[] = []; + for (const x of raw) { + const name = typeof x === "string" ? x.trim() : ""; + if (!name || isPovName(name) || seen.has(name)) continue; + seen.add(name); + out.push(name); + } + return out; +} + +// Rename one beat's id and repoint every INTERNAL reference (continue targets, +// advance-beat targets) so the graph stays intact. 
Only called when `to` is +// absent from the scene, so it can't introduce a duplicate id. +function renameBeatId(beats: Beat[], from: string, to: string): Beat[] { + if (from === to) return beats; + return beats.map((b): Beat => { + const id = b.id === from ? to : b.id; + let next = b.next; + if (next.type === "continue" && next.nextBeatId === from) { + next = { type: "continue", nextBeatId: to }; + } else if (next.type === "choice") { + next = { + type: "choice", + choices: next.choices.map((c) => + c.effect.kind === "advance-beat" && c.effect.targetBeatId === from + ? { ...c, effect: { kind: "advance-beat" as const, targetBeatId: to } } + : c, + ), + }; + } + return { ...b, id, next }; + }); +} + +// ── Phase A — plan the scene skeleton. Fast (small output): just enough for +// the Cinematographer + character design + Painter to start before the +// dialogue exists. The cast is unioned with the entry roster/speaker so a +// character named in the entry but omitted from `cast` still gets designed. +export async function runWriterPlan( config: ProviderConfig, session: Session, -): Promise { +): Promise { const raw = await chat( config, [ - { role: "system", content: WRITER_SYSTEM }, - { role: "user", content: buildWriterUserMessage(session) }, + { role: "system", content: WRITER_PLAN_SYSTEM }, + { role: "user", content: buildWriterPlanUserMessage(session) }, ], - { temperature: 0.9, responseFormat: "json_object", tag: "writer" }, + { temperature: 0.9, responseFormat: "json_object", tag: "writer-plan" }, ); - const parsed = parseJsonLoose(raw); + const parsed = parseJsonLoose(raw); + + const entryActiveCharacters = + coerceActiveCharacters(parsed.entryActiveCharacters) ?? []; + + // Normalize POV variants → "你"; NPC names pass through. "你" is a valid entry + // speaker (Pattern B — player talking), but is never a designed cast member. + const rawEntrySpeaker = parsed.entrySpeaker?.trim() || undefined; + const entrySpeaker = rawEntrySpeaker + ? 
normalizeSpeakerName(rawEntrySpeaker) + : undefined; + + const cast = coerceCast(parsed.cast); + const castSet = new Set(cast); + const addToCast = (name: string): void => { + if (!isPovName(name) && !castSet.has(name)) { + castSet.add(name); + cast.push(name); + } + }; + for (const c of entryActiveCharacters) addToCast(c.name); + if (entrySpeaker) addToCast(entrySpeaker); + + return { + sceneSummary: parsed.sceneSummary?.trim() || "未指定场景概要", + sceneKey: normalizeSceneKey(parsed.sceneKey), + entryBeatId: parsed.entryBeatId?.trim() || "b1", + cast, + entryActiveCharacters, + entrySpeaker, + }; +} + +// ── Phase B — expand the plan into the full beats[] graph + storyStatePatch. +// Overlapped with the image pipeline by the director. The plan's entry id is +// pinned onto a real beat so the already-painted entry frame resolves. +export async function runWriterBeats( + config: ProviderConfig, + session: Session, + plan: WriterPlan, +): Promise { + const raw = await chat( + config, + [ + { role: "system", content: WRITER_BEATS_SYSTEM }, + { role: "user", content: buildWriterBeatsUserMessage(session, plan) }, + ], + { temperature: 0.9, responseFormat: "json_object", tag: "writer-beats" }, + ); + + const parsed = parseJsonLoose(raw); const rawBeats = Array.isArray(parsed.beats) ? parsed.beats : []; if (rawBeats.length === 0) { - throw new Error("Writer returned no beats"); + throw new Error("Writer (beats) returned no beats"); } - const beats = ensureUniqueChoiceIds( + let beats = ensureUniqueChoiceIds( repairBeats( ensureUniqueBeatIds( rawBeats.map((b, i) => coerceBeat(b, i, rawBeats.length)), @@ -386,40 +490,36 @@ export async function runWriter( ), ); - const declaredEntry = parsed.entryBeatId?.trim(); - const entryBeatId = - declaredEntry && beats.some((b) => b.id === declaredEntry) - ? declaredEntry - : beats[0]!.id; + // The Painter already composed the entry frame from plan.entryBeatId + its + // roster, so the scene's entry MUST resolve to that id. 
If Phase B ignored + // it, rename the first beat to it (no collision — id is absent by the guard). + if (!beats.some((b) => b.id === plan.entryBeatId)) { + beats = renameBeatId(beats, beats[0]!.id, plan.entryBeatId); + } return { - sceneSummary: parsed.sceneSummary?.trim() || "未指定场景概要", - sceneKey: normalizeSceneKey(parsed.sceneKey), - entryBeatId, beats, storyStatePatch: coerceStoryStatePatch(parsed.storyStatePatch), }; } -// Surface the set of character names introduced by this scene's beats, -// so the orchestrator can decide which ones need the CharacterDesigner to -// fire. Pulls names from both `speaker` fields AND `activeCharacters` -// (a character can be on-screen without speaking). -// -// Excludes POV ("你" / 玩家 / 主角 / ...) entirely — the player is never -// designed (no portrait, no voice, no archetype). -export function collectActiveCharacterNames(beats: Beat[]): string[] { - const seen = new Set(); - for (const b of beats) { - if (b.speaker && !isPovName(b.speaker)) seen.add(b.speaker); - if (b.activeCharacters) { - for (const c of b.activeCharacters) { - if (!isPovName(c.name)) seen.add(c.name); - } - } - } - return Array.from(seen); +// Phase B fallback — when runWriterBeats fails entirely, keep the scene +// playable with a single entry beat synthesized from the plan: narrate the +// planned summary and offer one change-scene exit so the player can advance. +export function synthesizeFallbackBeats(plan: WriterPlan): Beat[] { + const id = plan.entryBeatId || "b1"; + return [ + { + id, + narration: plan.sceneSummary, + activeCharacters: + plan.entryActiveCharacters.length > 0 + ? plan.entryActiveCharacters + : undefined, + next: { type: "choice", choices: [fallbackExitChoice(id)] }, + }, + ]; } -// Re-export POV constants for downstream filters (director's orphanSpeakers). +// Re-export POV constants for downstream filters (director's orphan voices). 
export { POV_DISPLAY_NAME, POV_VARIANTS, isPovName, normalizeSpeakerName }; diff --git a/lib/engine/director.ts b/lib/engine/director.ts index 8049023..316f627 100644 --- a/lib/engine/director.ts +++ b/lib/engine/director.ts @@ -1,5 +1,6 @@ import { chat } from "@infiplot/ai-client"; import type { + Beat, Character, EngineConfig, InsertBeatPartial, @@ -8,6 +9,7 @@ import type { Session, StoryState, StoryStatePatch, + WriterPlan, } from "@infiplot/types"; import type { CharacterCard } from "./agents/characterDesigner"; import { @@ -18,12 +20,14 @@ import { } from "./agents/characterDesigner"; import { runCinematographer } from "./agents/cinematographer"; import { runPainter } from "./agents/painter"; +import type { WriterBeatsOutput } from "./agents/writer"; import { - collectActiveCharacterNames, isPovName, normalizeSpeakerName, POV_DISPLAY_NAME, - runWriter, + runWriterBeats, + runWriterPlan, + synthesizeFallbackBeats, } from "./agents/writer"; import { parseJsonLoose } from "./jsonParser"; import { INSERT_BEAT_SYSTEM, buildInsertBeatUserMessage } from "./prompts"; @@ -33,25 +37,25 @@ import { INSERT_BEAT_SYSTEM, buildInsertBeatUserMessage } from "./prompts"; // // Critical path (per Scene call): // -// Writer LLM (~3s, serial) +// Writer PHASE A — plan LLM (scene skeleton only, serial) // │ -// ├─ CharacterCard LLM × N (parallel per new char — TEXT only) -// ├─ Cinematographer LLM (parallel with the cards) -// │ -// └─ wait for cards + cinema -// │ -// ├─ entry-beat portraits ──┐ (block the Painter — its refs) -// ▼ │ -// Painter — generateImage │ (overlapped, NOT on the paint path): -// with referenceImages ├─ non-entry-beat portraits -// │ └─ ALL voice provisioning + orphan voices +// ├──────────────────────────┬───────────────────────────────────────┐ +// ▼ ▼ │ +// Writer PHASE B image pipeline (concurrent): │ +// beats LLM CharacterCard LLM × N ∥ Cinematographer │ +// (full dialogue, → entry-beat portraits (block Painter) │ +// overlapped) → Painter 
(generateImage w/ refs) │ +// │ → await overlapped: rest portraits+voices │ +// └──────────────────────────► await Phase B ◄────────────────────────┘ // ▼ -// await the overlapped work, fold into the registry -// │ -// ▼ -// return { scene, sceneImageUrl, characters, storyState } +// assemble Scene → { scene, sceneImageUrl, characters, storyState } // -// Two deliberate decouplings unlock the parallelism: +// Why split the Writer (the latency win): the image pipeline only needs the +// scene SUMMARY + entry roster + cast (Phase A) — NOT the dialogue (Phase B). +// Writing beats used to sit serially in FRONT of the image; now it overlaps +// it, so the floor is max(beats, image) instead of beats + image. +// +// The decouplings that unlock the rest of the parallelism: // 1. The Cinematographer only POSITIONS named characters, so it needs no // visualDescription and runs alongside the card LLMs. // 2. The Painter only needs visualDescription TEXT (all on-stage) + the @@ -163,31 +167,60 @@ export async function directScene( ): Promise { const tTotal = Date.now(); - // Stage 1 — Writer (serial; everything downstream needs sceneSummary + - // beats[] to know who's on stage and what to compose around). - const tWriter = Date.now(); - const writerOut = await runWriter(config.text, session); - tlog("[directScene] Writer", tWriter); + // ── Phase A — Writer PLAN (serial). The image pipeline needs the scene + // summary + entry roster + cast to start, but NOT the dialogue beats. This + // call is small (skeleton only), so it returns fast and unblocks everything. + const tPlan = Date.now(); + const plan = await runWriterPlan(config.text, session); + tlog("[directScene] Phase A (plan)", tPlan); - // Identify NEW characters introduced by this scene that need to be - // designed (LLM + portrait + voice). Existing characters in the registry - // are skipped — their cards / portraits / voices persist across scenes. 
- const allActiveNames = collectActiveCharacterNames(writerOut.beats); - const newCharNames = allActiveNames.filter( + // ── Phase B — Writer BEATS, launched NOW so its (longer) output overlaps the + // ENTIRE image pipeline below. Only needed to assemble the final Scene, so we + // await it last. A failure degrades to a single playable beat from the plan. + const tBeats = Date.now(); + const beatsPromise: Promise = runWriterBeats( + config.text, + session, + plan, + ) + .then((out) => { + tlog("[directScene] Phase B (beats)", tBeats); + return out; + }) + .catch((err): WriterBeatsOutput => { + const msg = err instanceof Error ? err.message : String(err); + console.error( + `[directScene] Phase B (beats) failed, using fallback: ${msg}`, + ); + return { beats: synthesizeFallbackBeats(plan), storyStatePatch: undefined }; + }); + + // NEW characters to design come from the PLAN's cast (so design fires in + // parallel with Phase B, not after the beats are written). Existing + // characters keep their cards / portraits / voices across scenes. + const newCharNames = plan.cast.filter( (n) => !session.characters.some((c) => c.name === n), ); - // Find the entry beat for the Cinematographer (which characters are - // on-screen in the establishing shot). - const entryBeat = writerOut.beats.find((b) => b.id === writerOut.entryBeatId); - const entryBeatActive = entryBeat?.activeCharacters ?? []; + // Entry-beat composition is the PLAN's (Phase B is constrained to honor it). + // The Painter needs a Beat-shaped object for reference collection, but the + // real beat isn't written until Phase B — so synthesize one from the plan + // (collectReferenceImages only reads speaker + activeCharacters). + const entryBeatActive = plan.entryActiveCharacters; + const entryBeatSpeaker = plan.entrySpeaker; + const entryBeatForPaint: Beat = { + id: plan.entryBeatId, + speaker: entryBeatSpeaker, + activeCharacters: entryBeatActive.length > 0 ? 
entryBeatActive : undefined, + next: { type: "continue", nextBeatId: plan.entryBeatId }, + }; // For sceneKey-based visual continuity, look up the prior matching scene's // image to slot into Painter's referenceImages (max 4 of which include // character portraits too). const { priorSceneReference, priorSceneKey } = pickPriorSceneReference( session, - writerOut.sceneKey, + plan.sceneKey, ); // ── Stage 2 — character cards (LLM) ∥ Cinematographer ────────────────── @@ -211,12 +244,12 @@ export async function directScene( ); const cinemaPromise = runCinematographer(config.text, { - sceneSummary: writerOut.sceneSummary, + sceneSummary: plan.sceneSummary, styleGuide: session.styleGuide, entryBeatActive, - entryBeatSpeaker: entryBeat?.speaker, + entryBeatSpeaker, priorSceneKey, - currentSceneKey: writerOut.sceneKey, + currentSceneKey: plan.sceneKey, }); const [cards, cinemaOut] = await Promise.all([ @@ -242,8 +275,8 @@ export async function directScene( // Entry-beat character names: the ONLY portraits the Painter references // (collectReferenceImages slots in the entry beat's speaker + activeChars). const entryNames = new Set(); - if (entryBeat?.speaker && !isPovName(entryBeat.speaker)) { - entryNames.add(entryBeat.speaker); + if (entryBeatSpeaker && !isPovName(entryBeatSpeaker)) { + entryNames.add(entryBeatSpeaker); } for (const c of entryBeatActive) { if (!isPovName(c.name)) entryNames.add(c.name); @@ -281,24 +314,6 @@ export async function directScene( ), ); - // Edge case: a speaker the Writer referenced without listing in any beat's - // activeCharacters. collectActiveCharacterNames already includes speakers, - // so this is a rare defensive net. Provision a voice only (never on-screen). 
- const speakerNames = new Set( - writerOut.beats.map((b) => b.speaker).filter((n): n is string => Boolean(n)), - ); - const orphanSpeakers = [...speakerNames].filter( - // Pattern B: "你" (player) is a valid speaker but never gets a Character - // record — TTS is intentionally skipped on the client. - (n) => - !isPovName(n) && - !characters.some((c) => c.name === n) && - !cards.some((c) => c.name === n), - ); - const orphanPromises = orphanSpeakers.map((n) => - provisionVoiceForName(config, session, n), - ); - // Block the Painter ONLY on entry-beat portraits (its referenceImages). const entryPortraits = await Promise.all(entryPortraitPromises); characters = mergeCharacters( @@ -313,11 +328,9 @@ export async function directScene( tlog("[directScene] entry-beat portraits", tProvision); // ── Stage 4 — Painter (depends on cinemaOut + on-stage visual cards + - // entry portraits). On-stage = everyone named in any beat, so the archetype - // block covers anyone the player might encounter in this scene. - const onStageCharacters = characters.filter((c) => - allActiveNames.includes(c.name), - ); + // entry portraits). On-stage = the plan's cast (everyone who'll appear), + // filtered to those now in the registry, so the archetype block covers them. + const onStageCharacters = characters.filter((c) => plan.cast.includes(c.name)); const tPainter = Date.now(); const painted = await runPainter( @@ -329,18 +342,17 @@ export async function directScene( priorSceneImage: priorSceneReference, styleReferenceImage: session.styleReferenceImage, }, - entryBeat, + entryBeatForPaint, ); tlog("[directScene] Painter", tPainter); - // Fold in the work that overlapped the paint: remaining portraits, all - // voices, and any orphan-speaker voices. Awaited before returning so the - // session the client persists is fully provisioned for later scenes. + // Fold in the work that overlapped the paint: remaining portraits + all + // voices. 
Awaited before returning so the session the client persists is + // fully provisioned for later scenes. const tOverlap = Date.now(); - const [restPortraits, voicedChars, orphanChars] = await Promise.all([ + const [restPortraits, voicedChars] = await Promise.all([ Promise.all(restPortraitPromises), Promise.all(voicePromises), - Promise.all(orphanPromises), ]); characters = mergeCharacters( characters, @@ -352,10 +364,31 @@ export async function directScene( })), ); characters = mergeCharacters(characters, voicedChars); - if (orphanChars.length > 0) { + tlog("[directScene] overlapped portraits+voices", tOverlap); + + // ── Await Phase B — it overlapped the whole image pipeline above. ────── + const beatsOut = await beatsPromise; + const beats = beatsOut.beats; + + // entryBeatId is guaranteed present (runWriterBeats pins it onto a beat), but + // keep the defensive fallback for the synthesized-fallback path. + const entryBeatId = beats.some((b) => b.id === plan.entryBeatId) + ? plan.entryBeatId + : beats[0]!.id; + + // Orphan-speaker voices: a beat speaker Phase B used that isn't in the + // registry. Should be rare — the prompt constrains speakers to the cast, and + // every cast member was provisioned above — so this is a defensive net, + // serial but skipped entirely (zero latency) in the common case. + const orphanSpeakers = [ + ...new Set(beats.map((b) => b.speaker).filter((n): n is string => Boolean(n))), + ].filter((n) => !isPovName(n) && !characters.some((c) => c.name === n)); + if (orphanSpeakers.length > 0) { + const orphanChars = await Promise.all( + orphanSpeakers.map((n) => provisionVoiceForName(config, session, n)), + ); characters = mergeCharacters(characters, orphanChars); } - tlog("[directScene] overlapped portraits+voices", tOverlap); const scene: Scene = { id: newSceneId(), @@ -365,9 +398,9 @@ export async function directScene( // anything that already reads scene.scenePrompt (e.g., insert-beat // user prompt). 
scenePrompt: cinemaOut.integratedPrompt, - beats: writerOut.beats, - entryBeatId: writerOut.entryBeatId, - sceneKey: writerOut.sceneKey, + beats, + entryBeatId, + sceneKey: plan.sceneKey, imageUuid: painted.kind === "real" ? painted.imageUuid : undefined, imageUrl: painted.imageUrl, }; @@ -377,7 +410,7 @@ export async function directScene( // client persists it back into the session). const storyState = applyStoryStatePatch( session.storyState, - writerOut.storyStatePatch, + beatsOut.storyStatePatch, ); tlog("[directScene] TOTAL", tTotal); diff --git a/lib/engine/index.ts b/lib/engine/index.ts index be19ea2..3a94bc1 100644 --- a/lib/engine/index.ts +++ b/lib/engine/index.ts @@ -9,7 +9,7 @@ export { synthesizeBeat } from "./voice"; export { mergeCharacters } from "./director"; export type { SceneResult } from "./director"; export { runArchitect } from "./agents/architect"; -export type { WriterOutput } from "./agents/writer"; +export type { WriterBeatsOutput } from "./agents/writer"; export type { CinematographerOutput } from "./agents/cinematographer"; export type { InsertBeatPartial } from "@infiplot/types"; export * from "./prompts"; diff --git a/lib/engine/prompts.ts b/lib/engine/prompts.ts index 42bbfd9..7cb2421 100644 --- a/lib/engine/prompts.ts +++ b/lib/engine/prompts.ts @@ -4,6 +4,7 @@ import type { Scene, Session, StoryState, + WriterPlan, } from "@infiplot/types"; // ══════════════════════════════════════════════════════════════════════ @@ -137,16 +138,77 @@ export function buildArchitectUserMessage(session: Session): string { } // ────────────────────────────────────────────────────────────────────── -// 1. Writer (编剧) — drives the narrative. +// 1. Writer (编剧) — drives the narrative, in TWO phases. // -// Emits a full Scene: beats[] graph + entryBeatId + sceneKey hint + -// activeCharacters per beat. Does NOT design characters (that's the -// CharacterDesigner's job) — only names them in `activeCharacters`. 
-// The CharacterDesigner is invoked separately for any name not yet in -// session.characters. +// Phase A (WRITER_PLAN_SYSTEM): plans the scene SKELETON only — sceneSummary +// + sceneKey + entry-beat roster + the full cast. No dialogue. Its output +// is enough for the Cinematographer + character design + Painter to start. +// Phase B (WRITER_BEATS_SYSTEM): expands the plan into the full beats[] graph +// + storyStatePatch, overlapped with the (longer) image pipeline. +// +// Neither phase designs characters (that's the CharacterDesigner's job) — +// Phase A only NAMES them in `cast` / `entryActiveCharacters`; the +// CharacterDesigner is invoked for any name not yet in session.characters. // ────────────────────────────────────────────────────────────────────── -export const WRITER_SYSTEM = `你是一部交互视觉小说的「编剧」。每次基于【故事档案 / 主线记忆】、世界观、画风、玩家历史、已登记角色,写出**一个完整场景的剧本**:场景背景概要 + 一组对话节拍 beats,并在最后更新主线记忆。你只负责**剧情和台词**——不设计角色形象、不写出图提示词、不做镜头调度,这些由其他 agent 完成。 +export const WRITER_PLAN_SYSTEM = `你是一部交互视觉小说的「编剧」。这是**两步生成中的第一步——场景规划**。你只产出本场景的「骨架」,**不要写任何 beat 台词**。你的产出会被立刻送去配图(分镜导演 + 生图),所以要快、要准、画面感要强。 + +═══════════════════════════════════════════════════════════════════ +爆款心法(要在规划阶段就立住,后续展开才好看) +═══════════════════════════════════════════════════════════════════ +- **进场即钩子**:这一场开场就要抛出新信息 / 悬念 / 冲突 / 情绪冲击,别铺陈。把这个抓人的瞬间写进 sceneSummary。 +- **兑现情绪**:按题材给观众想要的情绪(甜宠的心动、暗恋的拉扯、逆袭的扬眉、悬疑的真相一角)。 +- **人设有反差**:每个角色一个强标签 + 一个反差面。 + +═══════════════════════════════════════════════════════════════════ +连贯性铁律(跨场景切换不能跳戏 —— 最重要) +═══════════════════════════════════════════════════════════════════ +- 你会收到【故事档案 / 主线记忆】和上一场的结尾。**新场景必须从上一刻自然承接**——承接情绪、地点逻辑、人物状态与未收的悬念。 +- 若给了「转场种子 nextSceneSeed」,把它当作"下一场的命题"去兑现,开场要让玩家感到"这正是我上一步的结果"。 +- 沿用主线记忆里的人物关系与情绪温度,别让刚告白的人下一场形同陌路。 + +本步你要规划(如实产出,缺一不可): +- **sceneSummary**:当前场景的中文概要——地点 + 时间 + 氛围 + 关键事件 + 那个抓人的开场瞬间。这是分镜导演构图的**唯一依据**,要画面感强、信息足(2–4 句)。 +- **sceneKey**:当前场景的英文 slug(如 "classroom-dusk"、"rooftop-night")。 +- **entryBeatId**:玩家进入场景时落在哪个 beat 的 id(通常就是 
"b1")。 +- **cast**:本场景**会出场的全部 NPC 角色名**(字符串数组)。第二步写 beats 时**只能用这里列出的名字**,所以现在必须一次想全——谁会说话、谁会在画面里露面,全部列出。名字要与「已登记角色」**完全一致**;新角色起符合世界观的真名(不要"神秘女子"这种占位)。**绝不**包含玩家(你 / 我 / 主角 / protagonist / player / MC...)。 +- **entrySpeaker**:入口 beat 由谁开口 —— 取值只有三种:① 某个 NPC 真名(必须在 cast 里)② "你"(玩家本人开口)③ 留空(纯旁白 / 环境开场)。这决定镜头语言,要选准。 +- **entryActiveCharacters**:入口画面里**此刻出现的 NPC** 及其当下姿态 / 神情(中文 pose)。即使没人说话,画面里有谁也要列。**绝不**包含玩家。 + +sceneKey 设计原则(用于跨场景视觉一致性): +- 同一物理空间 + 同一时段 → 必须沿用**完全相同**的英文 slug +- 时段 / 空间变化时换 slug("classroom-dusk" → "classroom-night" / "corridor-dusk") +- slug 规范:lowercase-with-dashes,2–4 个英文单词 +- 用户消息会列出已用过的 sceneKey,请优先**复用**这些已有 slug + +玩家视角硬规则(违反会破坏整个 galgame): +- 玩家是第二人称 POV,**永远不出现在任何画面里**——entryActiveCharacters 的 name **绝不允许**是「玩家 / 你 / 我 / 主角 / protagonist / player / Player / MC / I / me」任何变体。 +- entrySpeaker 只能是 NPC 真名 / "你" / 留空;其它 POV 变体一律视为错误。 + +必须输出严格 JSON: +{ + "sceneSummary": "黄昏的天台,风很大。夏海背对你站在栏杆边,手里攥着一张揉皱的成绩单——她把你单独叫上来,却迟迟不开口。", + "sceneKey": "rooftop-dusk", + "entryBeatId": "b1", + "cast": ["夏海"], + "entrySpeaker": "夏海", + "entryActiveCharacters": [ + { "name": "夏海", "pose": "背对你倚着栏杆,侧脸绷着,手里攥着揉皱的纸" } + ] +} + +不要输出 JSON 以外的任何文本。`; + +// ────────────────────────────────────────────────────────────────────── +// Phase B — expands the plan into the full beats[] + storyStatePatch. 
+// ────────────────────────────────────────────────────────────────────── + +export const WRITER_BEATS_SYSTEM = `你是一部交互视觉小说的「编剧」。这是**两步生成中的第二步——把已规划好的场景展开成完整剧本**。你会收到本场景的「规划」(场景概要 sceneSummary、sceneKey、入口 beat 的 id / speaker / 登场角色、以及本场景允许出场的角色名单 cast)。你的任务:基于规划写出玩家依次经历的对话节拍 beats,并在最后更新主线记忆。你只负责**剧情和台词**——不设计角色形象、不写出图提示词、不做镜头调度,这些由其他 agent 完成。 + +你必须严格遵守收到的规划: +- 必须存在一个 id 等于规划 entryBeatId 的 beat,作为玩家入口。 +- 该入口 beat 的 speaker 与登场角色(activeCharacters)要与规划一致(姿态措辞可微调,但**人物身份必须一致**)。 +- speaker 与 activeCharacters 里的 NPC 名字**只能来自规划的 cast**(或玩家 "你")——**不要引入规划之外的新角色**。 ═══════════════════════════════════════════════════════════════════ 爆款心法(番茄网文 / 红果短剧 / galgame 的叙事手感)—— 必须贯彻 @@ -167,11 +229,7 @@ export const WRITER_SYSTEM = `你是一部交互视觉小说的「编剧」。 - 沿用主线记忆里的人物关系与情绪温度——别让刚告白的人下一场形同陌路,也别凭空遗忘已埋的伏笔。 - 推进、但别重置:每一场都让主线问题往前走一点(关系变化 / 真相揭露一角 / 新悬念浮现)。 -一个场景包含: -- sceneSummary:当前场景的中文概要(地点、时间、氛围、关键事件——给后续的分镜导演看) -- sceneKey:当前场景的英文 slug(如 "classroom-dusk"、"rooftop-night"、"rainy-street")——同一物理空间应沿用相同 slug -- beats[]:玩家依次经历的对话节拍 -- entryBeatId:玩家进入场景时落在哪个 beat +本步你只产出两样:**beats[]**(玩家依次经历的对话节拍)和 **storyStatePatch**(主线记忆更新)。sceneSummary / sceneKey / entryBeatId 已由规划给定,**不要再输出**它们。 每个 beat 是玩家会看到的一段叙述 / 对话 / 选择。beat 之间通过 next 字段连接: - "continue":玩家点击图片背景 / 按继续,自然推进到下一个 beat @@ -183,6 +241,7 @@ choice 的 effect 有两种: 设计原则: - 同场景内 beat 数自由发挥,按剧情节奏自然给出(通常 2–6 个,可以更多) +- 入口 beat 的 id 必须等于规划给定的 entryBeatId;其余 beat id 依次自取且互不重复 - 多用 continue,少用 choice — 选择只应出现在「真正的岔路口」 - advance-beat 适合处理对话分支(同一场景里换个话题、追问、撒娇) - change-scene 适合空间/时间跳跃(出门、转身看窗外、第二天清晨) @@ -192,12 +251,6 @@ choice 的 effect 有两种: - next.nextBeatId 引用的 beat 必须存在 - choice 至少 2 个,至多 4 个,互不重复 -sceneKey 设计原则(重要 — 用于跨场景视觉一致性): -- 同一物理空间 + 同一时段 → 必须沿用**完全相同**的英文 slug -- 时段或空间变化时换 slug(如 "classroom-dusk" → "classroom-night","classroom-dusk" → "corridor-dusk") -- slug 规范:lowercase-with-dashes,2–4 个英文单词 -- 已登记的历史场景 sceneKey 会在用户消息里列出,请优先**复用**这些已有 slug - 文本风格约束: - narration / line 用中文(**纯净可显示文本**,绝不要写 (叹气)(语速快) 这类标注 —— 那是给配音的,会被玩家看见) - 
sceneSummary / lineDelivery / activeCharacters[].pose 内的文字也用中文 @@ -243,11 +296,8 @@ sceneKey 设计原则(重要 — 用于跨场景视觉一致性): - nextHook:基于这一场的结尾,下一场应往哪走(给"下一次的你"一个明确命题,接住本场留下的扣子) 这些字段是写给"未来的你"的连贯性记忆,请认真写。 -必须输出严格 JSON,结构如下: +必须输出严格 JSON,结构如下(**只含 beats 与 storyStatePatch**;sceneSummary / sceneKey / entryBeatId 由规划给定,不要输出。下例入口 beat 的 id "b1" 即规划的 entryBeatId): { - "sceneSummary": "中文场景概要:地点+时间+氛围+关键事件", - "sceneKey": "classroom-dusk", - "entryBeatId": "b1", "beats": [ { "id": "b1", @@ -343,29 +393,28 @@ function renderHistoryEntry( return lines.join("\n"); } -export function buildWriterUserMessage(session: Session): string { - // ─── STABLE PREFIX ──────────────────────────────────────────────────── - // Everything in this section is invariant across consecutive Writer calls - // within the session (or monotonically grows in a way that keeps the - // earlier bytes byte-identical). Always emit every section header — even - // when empty — so positions don't shift between calls. - // - // Order optimized for DeepSeek/MiMo prefix caching (64-token chunks): - // 1. session-immutable scalars (world / style) - // 2. story bible spine (Architect-set, never patched) - // 3. monotonically-growing lists (characters, sceneKeys) - // 4. history entries 0..N-2 (the last entry is what THIS call must - // react to, so it lives in the dynamic suffix instead) - // - // ─── DYNAMIC SUFFIX ─────────────────────────────────────────────────── - // Everything below changes on (almost) every call: - // 5. story bible dynamic patch (synopsis/threads/relationships/nextHook) - // 6. the just-completed entry (history[-1]) — same render format as the - // stable history blocks, just preceded by a "just completed" header - // 7. last-beat snippet (the exact emotional cliffhanger) - // 8. lastExit hint - // 9. format reminder tail - +// Shared narrative context for BOTH Writer phases. 
Returns the message parts +// from the cacheable STABLE PREFIX (sections 1-4) through the dynamic +// transition hint (section 7), but WITHOUT the trailing phase-specific +// instruction — each phase appends its own. Building this once and reusing it +// keeps EACH phase's prompt prefix byte-stable across scenes for DeepSeek +// prompt caching (Phase A and Phase B cache independently since their system +// prompts differ, but each shares its own prefix across consecutive calls). +// +// ─── STABLE PREFIX ────────────────────────────────────────────────────── +// Invariant across consecutive Writer calls within the session (or grows in a +// way that keeps earlier bytes byte-identical). Always emit every section +// header — even when empty — so positions don't shift between calls. +// 1. session-immutable scalars (world / style) +// 2. story bible spine (Architect-set, never patched) +// 3. monotonically-growing lists (characters, sceneKeys) +// 4. history entries 0..N-2 (the last entry is what THIS call must react +// to, so it lives in the dynamic suffix instead) +// ─── DYNAMIC SUFFIX ───────────────────────────────────────────────────── +// 5. story bible dynamic patch (synopsis/threads/relationships/nextHook) +// 6. last-beat snippet (the exact emotional cliffhanger) +// 7. transition hint (opening cold-open directive OR lastExit承接) +function buildWriterContextParts(session: Session): string[] { const parts: string[] = []; // ── 1. session scalars ──────────────────────────────────────────────── @@ -423,8 +472,7 @@ export function buildWriterUserMessage(session: Session): string { // ── 6. last-beat snippet (the exact emotional cliffhanger) ── // The full last entry is already in the stable history block above; here // we only re-emit the very last beat to sharply focus the Writer on the - // emotional moment to continue from. Skip the duplicate full-entry render - // that was here previously — it wasted ~200-500 tokens of dynamic suffix. 
+ // emotional moment to continue from. const last = session.history.at(-1); if (last) { const lastBeatId = last.visitedBeatIds.at(-1) ?? last.scene.entryBeatId; @@ -441,14 +489,14 @@ export function buildWriterUserMessage(session: Session): string { } } + // ── 7. transition hint ──────────────────────────────────────────────── if (session.history.length === 0) { parts.push( - "\n这是故事的开场。请按【故事档案】里的 nextHook 把第一幕的冷开场写出来——开场即抓人,别花笔墨铺垫世界观。写完后更新 storyStatePatch。严格以 JSON 格式返回。", + "\n这是故事的开场。请按【故事档案】里的 nextHook 把第一幕的冷开场设计出来——开场即抓人,别花笔墨铺垫世界观。", ); - return parts.join("\n"); + return parts; } - // ── 8. lastExit hint ────────────────────────────────────────────────── const lastExit = last?.exit; if (lastExit) { if (lastExit.kind === "choice") { @@ -464,8 +512,59 @@ export function buildWriterUserMessage(session: Session): string { parts.push("\n无缝续写下一个场景,延续上一刻的情绪。"); } - // ── 9. format reminder tail ─────────────────────────────────────────── - parts.push("写完后别忘了更新 storyStatePatch。严格以 JSON 格式返回。"); + return parts; +} + +// Phase A — plan the scene skeleton (no beats). Takes the shared context parts from buildWriterContextParts(session), +// appends a plan-only instruction tail, and returns all parts joined with newlines. +export function buildWriterPlanUserMessage(session: Session): string { + const parts = buildWriterContextParts(session); + parts.push( + '\n现在**只规划本场景的骨架**(不要写 beats 台词):给出 sceneSummary(画面感强、含开场钩子)、sceneKey、entryBeatId、本场景会出场的全部角色 cast、以及入口 beat 的 entrySpeaker 与 entryActiveCharacters。严格以 JSON 格式返回。', + ); + return parts.join("\n"); +} + +// Phase B — expand the plan into full beats[] + storyStatePatch. The plan is +// dynamic per scene, so it is appended AFTER the shared context parts (intended to keep Phase B's +// prompt prefix stable across scenes), followed by the expand-into-beats instruction tail. 
+/** Phase B user message: the shared context parts, then a plan block (summary, optional sceneKey, entry beat id / speaker / roster, allowed cast), then the expand-into-beats instruction; all joined with newlines. */ export function buildWriterBeatsUserMessage( + session: Session, + plan: WriterPlan, +): string { + const parts = buildWriterContextParts(session); + + parts.push(""); + parts.push("━━━ 本场景规划(上一步已定,必须严格遵守)━━━"); + parts.push(`场景概要 sceneSummary:${plan.sceneSummary}`); + if (plan.sceneKey) parts.push(`sceneKey:${plan.sceneKey}`); /* the sceneKey line is emitted only when the plan carries a truthy sceneKey */ + parts.push( + `入口 beat 的 id(entryBeatId,必须有一个此 id 的 beat 作为入口):${plan.entryBeatId}`, + ); + parts.push( + `入口 beat 的 speaker:${plan.entrySpeaker ? plan.entrySpeaker : "(空 —— 纯旁白 / 环境开场)"}`, + ); /* a falsy entrySpeaker renders the narration / environment placeholder instead of a name */ + parts.push("入口 beat 的登场角色 activeCharacters(人物身份须一致,姿态可微调):"); + if (plan.entryActiveCharacters.length === 0) { /* an empty roster still gets an explicit placeholder line */ + parts.push("(无 —— 入口画面没有 NPC)"); + } else { + for (const c of plan.entryActiveCharacters) { + parts.push(`- ${c.name}${c.pose ? `:${c.pose}` : ""}`); /* the pose suffix is added only when the character has a truthy pose */ + } + } + parts.push( + '本场景允许出现的角色名 cast(speaker / activeCharacters 只能用这些名字或 "你",不要新增角色):', + ); + if (plan.cast.length === 0) { + parts.push("(无 NPC —— 仅旁白与玩家)"); + } else { + for (const n of plan.cast) parts.push(`- ${n}`); + } + parts.push("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"); + + parts.push( + "\n把上面的规划展开成完整的 beats[](入口 beat 用规划的 entryBeatId / speaker / 登场角色),写完后更新 storyStatePatch。严格以 JSON 格式返回。", + ); return parts.join("\n"); } diff --git a/lib/types/index.ts b/lib/types/index.ts index 66daf9f..3ecaf1e 100644 --- a/lib/types/index.ts +++ b/lib/types/index.ts @@ -92,6 +92,43 @@ export type SceneHistoryEntry = { exit?: SceneExit; }; +// ────────────────────────────────────────────────────────────────────── +// Writer two-phase split +// +// The Writer runs as TWO LLM calls so scene-image generation can begin +// before the dialogue is fully written: +// Phase A (WriterPlan) — the minimal skeleton the image pipeline needs: +// sceneSummary + sceneKey + the entry beat's +// on-stage roster + the full cast to design. +// Phase B (beats) — the full beats[] graph + storyStatePatch, written +// to honor the plan, overlapped with image gen. 
+// The Cinematographer + character design + Painter all run off the Plan, so +// Phase B's (longer) output is hidden behind the image pipeline. +// ────────────────────────────────────────────────────────────────────── + +export type WriterPlan = { + /** Chinese-language scene synopsis (location + time + mood + key event + opening hook). + * The text the Cinematographer composes the establishing shot from. NOTE(review): previously documented as the "sole input", but entryActiveCharacters / entrySpeaker below are documented as driving framing and shot selection too — confirm against the director. */ + sceneSummary: string; + /** English location+time slug for cross-scene visual continuity. */ + sceneKey?: string; + /** Beat id the player lands on when entering the scene. Phase B must emit a + * beat with this id (reconciled if it doesn't). */ + entryBeatId: string; + /** Every NPC name that appears anywhere in this scene. Drives character + * design (card + portrait + voice) IN PARALLEL with Phase B beat writing, so + * the whole cast is provisioned by the time the scene returns. Phase B may + * only use names from this list (plus the POV "你"). Never includes the player. */ + cast: string[]; + /** The entry beat's on-stage roster (who's visible + pose when the player + * lands). Drives the Cinematographer's framing and the entry-beat portraits + * the Painter anchors to. Never includes the POV player. */ + entryActiveCharacters: BeatActiveCharacter[]; + /** The entry beat's speaker — an NPC name, "你" (player speaking), or + * undefined for a pure narration/environment entry. Drives shot selection. */ + entrySpeaker?: string; +}; + // ────────────────────────────────────────────────────────────────────── // Characters & voices (TTS) // ──────────────────────────────────────────────────────────────────────