Merge pull request #27 from zonghaoyuan/perf/writer-split

perf(engine): split Writer into Phase A (plan) + Phase B (beats)
This commit is contained in:
Zonghao Yuan
2026-06-04 16:53:21 +08:00
committed by GitHub
5 changed files with 452 additions and 174 deletions
+159 -50
View File
@@ -8,26 +8,30 @@ import type {
ProviderConfig,
Session,
StoryStatePatch,
WriterPlan,
} from "@infiplot/types";
import { parseJsonLoose } from "../jsonParser";
import { WRITER_SYSTEM, buildWriterUserMessage } from "../prompts";
import {
WRITER_BEATS_SYSTEM,
WRITER_PLAN_SYSTEM,
buildWriterBeatsUserMessage,
buildWriterPlanUserMessage,
} from "../prompts";
// ──────────────────────────────────────────────────────────────────────
// Writer agent — owns the narrative half of scene generation.
// Writer agent — owns the narrative half of scene generation, in TWO phases.
//
// Output: { sceneSummary, sceneKey, entryBeatId, beats[] }
// Each beat carries activeCharacters[] (names + poses) the
// Cinematographer reads when composing the establishing shot.
// Phase A — runWriterPlan: the scene skeleton (WriterPlan) the image pipeline
// needs (sceneSummary + sceneKey + entry roster + full cast). No dialogue,
// so it returns fast and unblocks the Cinematographer + character design.
// Phase B — runWriterBeats: the full beats[] graph + storyStatePatch, written
// to honor the plan and overlapped with the (longer) image pipeline.
//
// Character DESIGN (visual + voice) is NOT this agent's job —
// it only names characters; the CharacterDesigner picks up any
// unknown name from beats[].activeCharacters.
// Character DESIGN (visual + voice) is NOT this agent's job — it only NAMES
// characters (Phase A's cast); the CharacterDesigner picks up unknown names.
// ──────────────────────────────────────────────────────────────────────
export type WriterOutput = {
sceneSummary: string;
sceneKey?: string;
entryBeatId: string;
export type WriterBeatsOutput = {
beats: Beat[];
/** Rewritten volatile story memory — merged onto the carried StoryState by
* the director. Absent when the model omitted it (rare; bible just stales). */
@@ -69,10 +73,17 @@ type RawStoryStatePatch = {
relationships?: unknown;
nextHook?: unknown;
};
type RawScene = {
// Phase A raw shape (skeleton only — no beats).
//
// This is the type runWriterPlan asks parseJsonLoose to produce from the
// plan model's reply. Every field is optional because the model may omit
// anything; runWriterPlan supplies a fallback or a coercion for each one.
// NOTE(review): the `string` annotations describe the expected JSON, not a
// runtime guarantee — nothing visible here validates the parsed value, so a
// non-string would reach the `.trim()` calls in runWriterPlan. Confirm
// whether parseJsonLoose validates.
type RawPlan = {
// Scene synopsis; trimmed, with a placeholder used when blank or missing.
sceneSummary?: string;
// Location/time slug; passed through normalizeSceneKey.
sceneKey?: string;
// Id of the beat the player lands on; trimmed, defaulting to "b1".
entryBeatId?: string;
// Planned cast. Kept as `unknown` because coerceCast does the narrowing
// (array check, string check, trim, POV filter, dedupe).
cast?: unknown;
// Entry beat speaker; trimmed, then run through normalizeSpeakerName.
entrySpeaker?: string;
// Entry beat on-stage roster; coerced by coerceActiveCharacters.
entryActiveCharacters?: RawActiveCharacter[];
};
// Phase B raw shape (beats + memory only — plan fields come from runWriterPlan).
//
// This is the type runWriterBeats asks parseJsonLoose to produce from the
// beats model's reply. Both fields are optional because the model may omit them.
type RawBeats = {
// Raw beat list. runWriterBeats checks it with Array.isArray, throws when it
// is empty, and coerces each element with coerceBeat.
beats?: RawBeat[];
// Raw story-memory patch; narrowed by coerceStoryStatePatch.
storyStatePatch?: RawStoryStatePatch;
};
@@ -359,26 +370,119 @@ function coerceStoryStatePatch(
return Object.keys(patch).length > 0 ? patch : undefined;
}
export async function runWriter(
/**
 * Phase A helper — turns the model's raw `cast` value into a clean name list.
 *
 * Anything that is not an array yields an empty list. Within an array, an
 * entry is kept only if it is a string that, once trimmed, is non-empty, is
 * not a POV/player name (per `isPovName`), and has not been seen before.
 * The first occurrence wins, so the model's ordering is preserved.
 *
 * @param raw - The untyped `cast` field from the parsed plan JSON.
 * @returns The trimmed, de-duplicated NPC names in first-seen order.
 */
function coerceCast(raw: unknown): string[] {
  if (!Array.isArray(raw)) {
    return [];
  }

  // A Set iterates in insertion order, so it serves as both the dedupe index
  // and the ordered result.
  const accepted = new Set<string>();
  for (const entry of raw) {
    if (typeof entry !== "string") continue;
    const trimmed = entry.trim();
    if (trimmed === "" || isPovName(trimmed) || accepted.has(trimmed)) continue;
    accepted.add(trimmed);
  }
  return Array.from(accepted);
}
/**
 * Renames one beat id and repoints every in-scene reference to it, so the
 * beat graph stays connected.
 *
 * References that are rewritten:
 *  - `continue` transitions whose `nextBeatId` is `from`;
 *  - `choice` options whose effect is `advance-beat` targeting `from`.
 * Any other transition or effect is passed through untouched.
 *
 * The caller is expected to invoke this only when `to` is not already used
 * as a beat id; this function does not check for collisions itself.
 *
 * @param beats - The scene's beats; the array and its items are not mutated.
 * @param from - The id to replace.
 * @param to - The replacement id.
 * @returns The same array when `from === to`, otherwise a new array of new
 *          beat objects.
 */
function renameBeatId(beats: Beat[], from: string, to: string): Beat[] {
  if (from === to) {
    return beats;
  }

  // Produces the transition a beat should carry after the rename.
  const retarget = (next: Beat["next"]): Beat["next"] => {
    if (next.type === "choice") {
      const choices = next.choices.map((choice) => {
        const pointsAtOldId =
          choice.effect.kind === "advance-beat" &&
          choice.effect.targetBeatId === from;
        return pointsAtOldId
          ? {
              ...choice,
              effect: { kind: "advance-beat" as const, targetBeatId: to },
            }
          : choice;
      });
      return { type: "choice", choices };
    }
    if (next.type === "continue" && next.nextBeatId === from) {
      return { type: "continue", nextBeatId: to };
    }
    return next;
  };

  return beats.map(
    (beat): Beat => ({
      ...beat,
      id: beat.id === from ? to : beat.id,
      next: retarget(beat.next),
    }),
  );
}
// ── Phase A — plan the scene skeleton. Fast (small output): just enough for
// the Cinematographer + character design + Painter to start before the
// dialogue exists. The cast is unioned with the entry roster/speaker so a
// character named in the entry but omitted from `cast` still gets designed.
export async function runWriterPlan(
config: ProviderConfig,
session: Session,
): Promise<WriterOutput> {
): Promise<WriterPlan> {
const raw = await chat(
config,
[
{ role: "system", content: WRITER_SYSTEM },
{ role: "user", content: buildWriterUserMessage(session) },
{ role: "system", content: WRITER_PLAN_SYSTEM },
{ role: "user", content: buildWriterPlanUserMessage(session) },
],
{ temperature: 0.9, responseFormat: "json_object", tag: "writer" },
{ temperature: 0.9, responseFormat: "json_object", tag: "writer-plan" },
);
const parsed = parseJsonLoose<RawScene>(raw);
const rawBeats = Array.isArray(parsed.beats) ? parsed.beats : [];
if (rawBeats.length === 0) {
throw new Error("Writer returned no beats");
const parsed = parseJsonLoose<RawPlan>(raw);
const entryActiveCharacters =
coerceActiveCharacters(parsed.entryActiveCharacters) ?? [];
// Normalize POV variants → "你"; NPC names pass through. "你" is a valid entry
// speaker (Pattern B — player talking), but is never a designed cast member.
const rawEntrySpeaker = parsed.entrySpeaker?.trim() || undefined;
const entrySpeaker = rawEntrySpeaker
? normalizeSpeakerName(rawEntrySpeaker)
: undefined;
const cast = coerceCast(parsed.cast);
const castSet = new Set(cast);
const addToCast = (name: string): void => {
if (!isPovName(name) && !castSet.has(name)) {
castSet.add(name);
cast.push(name);
}
};
for (const c of entryActiveCharacters) addToCast(c.name);
if (entrySpeaker) addToCast(entrySpeaker);
return {
sceneSummary: parsed.sceneSummary?.trim() || "未指定场景概要",
sceneKey: normalizeSceneKey(parsed.sceneKey),
entryBeatId: parsed.entryBeatId?.trim() || "b1",
cast,
entryActiveCharacters,
entrySpeaker,
};
}
const beats = ensureUniqueChoiceIds(
// ── Phase B — expand the plan into the full beats[] graph + storyStatePatch.
// Overlapped with the image pipeline by the director. The plan's entry id is
// pinned onto a real beat so the already-painted entry frame resolves.
export async function runWriterBeats(
config: ProviderConfig,
session: Session,
plan: WriterPlan,
): Promise<WriterBeatsOutput> {
const raw = await chat(
config,
[
{ role: "system", content: WRITER_BEATS_SYSTEM },
{ role: "user", content: buildWriterBeatsUserMessage(session, plan) },
],
{ temperature: 0.9, responseFormat: "json_object", tag: "writer-beats" },
);
const parsed = parseJsonLoose<RawBeats>(raw);
const rawBeats = Array.isArray(parsed.beats) ? parsed.beats : [];
if (rawBeats.length === 0) {
throw new Error("Writer (beats) returned no beats");
}
let beats = ensureUniqueChoiceIds(
repairBeats(
ensureUniqueBeatIds(
rawBeats.map((b, i) => coerceBeat(b, i, rawBeats.length)),
@@ -386,40 +490,45 @@ export async function runWriter(
),
);
const declaredEntry = parsed.entryBeatId?.trim();
const entryBeatId =
declaredEntry && beats.some((b) => b.id === declaredEntry)
? declaredEntry
: beats[0]!.id;
// The Painter already composed the entry frame from plan.entryBeatId + its
// roster, so the scene's entry MUST resolve to that id. If Phase B ignored
// it, rename the first beat to it (no collision — id is absent by the guard).
if (!beats.some((b) => b.id === plan.entryBeatId)) {
beats = renameBeatId(beats, beats[0]!.id, plan.entryBeatId);
}
// 把入场 beat 的 roster 钉成 plan 的:画师合成进帧的正是
// plan.entryActiveCharacters,运行时入场 beat 必须显示同一批人(与上面钉
// id 同理)。speaker 故意不钉——它和 line/TTS 耦合,强行覆盖会错配台词。
const entryRoster =
plan.entryActiveCharacters.length > 0 ? plan.entryActiveCharacters : undefined;
beats = beats.map((b) =>
b.id === plan.entryBeatId ? { ...b, activeCharacters: entryRoster } : b,
);
return {
sceneSummary: parsed.sceneSummary?.trim() || "未指定场景概要",
sceneKey: normalizeSceneKey(parsed.sceneKey),
entryBeatId,
beats,
storyStatePatch: coerceStoryStatePatch(parsed.storyStatePatch),
};
}
// Surface the set of character names introduced by this scene's beats,
// so the orchestrator can decide which ones need the CharacterDesigner to
// fire. Pulls names from both `speaker` fields AND `activeCharacters`
// (a character can be on-screen without speaking).
//
// Excludes POV ("你" / 玩家 / 主角 / ...) entirely — the player is never
// designed (no portrait, no voice, no archetype).
export function collectActiveCharacterNames(beats: Beat[]): string[] {
const seen = new Set<string>();
for (const b of beats) {
if (b.speaker && !isPovName(b.speaker)) seen.add(b.speaker);
if (b.activeCharacters) {
for (const c of b.activeCharacters) {
if (!isPovName(c.name)) seen.add(c.name);
}
}
}
return Array.from(seen);
/**
 * Phase B fallback — builds a minimal playable scene from the plan alone,
 * for use when runWriterBeats fails entirely.
 *
 * The result is a single beat that:
 *  - takes the plan's entry id (or "b1" when that id is empty);
 *  - narrates the planned scene summary;
 *  - shows the plan's entry roster, or no roster when the plan has none;
 *  - ends in a one-option choice produced by `fallbackExitChoice`, so the
 *    player is not stuck (presumably a change-scene exit — that helper is
 *    defined elsewhere; confirm there).
 *
 * @param plan - The Phase A plan for the scene.
 * @returns A one-element beat list.
 */
export function synthesizeFallbackBeats(plan: WriterPlan): Beat[] {
  const entryId = plan.entryBeatId || "b1";
  const roster = plan.entryActiveCharacters;

  const soleBeat: Beat = {
    id: entryId,
    narration: plan.sceneSummary,
    activeCharacters: roster.length > 0 ? roster : undefined,
    next: { type: "choice", choices: [fallbackExitChoice(entryId)] },
  };

  return [soleBeat];
}
// Re-export POV constants for downstream filters (director's orphanSpeakers).
// Re-export POV constants for downstream filters (director's orphan voices).
export { POV_DISPLAY_NAME, POV_VARIANTS, isPovName, normalizeSpeakerName };
+106 -73
View File
@@ -1,5 +1,6 @@
import { chat } from "@infiplot/ai-client";
import type {
Beat,
Character,
EngineConfig,
InsertBeatPartial,
@@ -8,6 +9,7 @@ import type {
Session,
StoryState,
StoryStatePatch,
WriterPlan,
} from "@infiplot/types";
import type { CharacterCard } from "./agents/characterDesigner";
import {
@@ -18,12 +20,14 @@ import {
} from "./agents/characterDesigner";
import { runCinematographer } from "./agents/cinematographer";
import { runPainter } from "./agents/painter";
import type { WriterBeatsOutput } from "./agents/writer";
import {
collectActiveCharacterNames,
isPovName,
normalizeSpeakerName,
POV_DISPLAY_NAME,
runWriter,
runWriterBeats,
runWriterPlan,
synthesizeFallbackBeats,
} from "./agents/writer";
import { parseJsonLoose } from "./jsonParser";
import { INSERT_BEAT_SYSTEM, buildInsertBeatUserMessage } from "./prompts";
@@ -33,25 +37,25 @@ import { INSERT_BEAT_SYSTEM, buildInsertBeatUserMessage } from "./prompts";
//
// Critical path (per Scene call):
//
// Writer LLM (~3s, serial)
// Writer PHASE A — plan LLM (scene skeleton only, serial)
// │
// ├─ CharacterCard LLM × N (parallel per new char — TEXT only)
// ├─ Cinematographer LLM (parallel with the cards)
// │
// └─ wait for cards + cinema
// │
// ├─ entry-beat portraits ──┐ (block the Painter — its refs)
//
// Painter — generateImage │ (overlapped, NOT on the paint path):
// with referenceImages ├─ non-entry-beat portraits
// │ └─ ALL voice provisioning + orphan voices
// ├──────────────────────────┬───────────────────────────────────────┐
// ▼ ▼ │
// Writer PHASE B image pipeline (concurrent):
// beats LLM CharacterCard LLM × N ∥ Cinematographer │
// (full dialogue, → entry-beat portraits (block Painter)
// overlapped) → Painter (generateImage w/ refs) │
// → await overlapped: rest portraits+voices
// └──────────────────────────► await Phase B ◄────────────────────────┘
// ▼
// await the overlapped work, fold into the registry
// │
// ▼
// return { scene, sceneImageUrl, characters, storyState }
// assemble Scene → { scene, sceneImageUrl, characters, storyState }
//
// Two deliberate decouplings unlock the parallelism:
// Why split the Writer (the latency win): the image pipeline only needs the
// scene SUMMARY + entry roster + cast (Phase A) — NOT the dialogue (Phase B).
// Writing beats used to sit serially in FRONT of the image; now it overlaps
// it, so the floor is max(beats, image) instead of beats + image.
//
// The decouplings that unlock the rest of the parallelism:
// 1. The Cinematographer only POSITIONS named characters, so it needs no
// visualDescription and runs alongside the card LLMs.
// 2. The Painter only needs visualDescription TEXT (all on-stage) + the
@@ -163,31 +167,60 @@ export async function directScene(
): Promise<SceneResult> {
const tTotal = Date.now();
// Stage 1 — Writer (serial; everything downstream needs sceneSummary +
// beats[] to know who's on stage and what to compose around).
const tWriter = Date.now();
const writerOut = await runWriter(config.text, session);
tlog("[directScene] Writer", tWriter);
// ── Phase A — Writer PLAN (serial). The image pipeline needs the scene
// summary + entry roster + cast to start, but NOT the dialogue beats. This
// call is small (skeleton only), so it returns fast and unblocks everything.
const tPlan = Date.now();
const plan = await runWriterPlan(config.text, session);
tlog("[directScene] Phase A (plan)", tPlan);
// Identify NEW characters introduced by this scene that need to be
// designed (LLM + portrait + voice). Existing characters in the registry
// are skipped — their cards / portraits / voices persist across scenes.
const allActiveNames = collectActiveCharacterNames(writerOut.beats);
const newCharNames = allActiveNames.filter(
// ── Phase B — Writer BEATS, launched NOW so its (longer) output overlaps the
// ENTIRE image pipeline below. Only needed to assemble the final Scene, so we
// await it last. A failure degrades to a single playable beat from the plan.
const tBeats = Date.now();
const beatsPromise: Promise<WriterBeatsOutput> = runWriterBeats(
config.text,
session,
plan,
)
.then((out) => {
tlog("[directScene] Phase B (beats)", tBeats);
return out;
})
.catch((err): WriterBeatsOutput => {
const msg = err instanceof Error ? err.message : String(err);
console.error(
`[directScene] Phase B (beats) failed, using fallback: ${msg}`,
);
return { beats: synthesizeFallbackBeats(plan), storyStatePatch: undefined };
});
// NEW characters to design come from the PLAN's cast (so design fires in
// parallel with Phase B, not after the beats are written). Existing
// characters keep their cards / portraits / voices across scenes.
const newCharNames = plan.cast.filter(
(n) => !session.characters.some((c) => c.name === n),
);
// Find the entry beat for the Cinematographer (which characters are
// on-screen in the establishing shot).
const entryBeat = writerOut.beats.find((b) => b.id === writerOut.entryBeatId);
const entryBeatActive = entryBeat?.activeCharacters ?? [];
// Entry-beat composition is the PLAN's (Phase B is constrained to honor it).
// The Painter needs a Beat-shaped object for reference collection, but the
// real beat isn't written until Phase B — so synthesize one from the plan
// (collectReferenceImages only reads speaker + activeCharacters).
const entryBeatActive = plan.entryActiveCharacters;
const entryBeatSpeaker = plan.entrySpeaker;
const entryBeatForPaint: Beat = {
id: plan.entryBeatId,
speaker: entryBeatSpeaker,
activeCharacters: entryBeatActive.length > 0 ? entryBeatActive : undefined,
next: { type: "continue", nextBeatId: plan.entryBeatId },
};
// For sceneKey-based visual continuity, look up the prior matching scene's
// image to slot into Painter's referenceImages (max 4 of which include
// character portraits too).
const { priorSceneReference, priorSceneKey } = pickPriorSceneReference(
session,
writerOut.sceneKey,
plan.sceneKey,
);
// ── Stage 2 — character cards (LLM) ∥ Cinematographer ──────────────────
@@ -211,12 +244,12 @@ export async function directScene(
);
const cinemaPromise = runCinematographer(config.text, {
sceneSummary: writerOut.sceneSummary,
sceneSummary: plan.sceneSummary,
styleGuide: session.styleGuide,
entryBeatActive,
entryBeatSpeaker: entryBeat?.speaker,
entryBeatSpeaker,
priorSceneKey,
currentSceneKey: writerOut.sceneKey,
currentSceneKey: plan.sceneKey,
});
const [cards, cinemaOut] = await Promise.all([
@@ -242,8 +275,8 @@ export async function directScene(
// Entry-beat character names: the ONLY portraits the Painter references
// (collectReferenceImages slots in the entry beat's speaker + activeChars).
const entryNames = new Set<string>();
if (entryBeat?.speaker && !isPovName(entryBeat.speaker)) {
entryNames.add(entryBeat.speaker);
if (entryBeatSpeaker && !isPovName(entryBeatSpeaker)) {
entryNames.add(entryBeatSpeaker);
}
for (const c of entryBeatActive) {
if (!isPovName(c.name)) entryNames.add(c.name);
@@ -281,24 +314,6 @@ export async function directScene(
),
);
// Edge case: a speaker the Writer referenced without listing in any beat's
// activeCharacters. collectActiveCharacterNames already includes speakers,
// so this is a rare defensive net. Provision a voice only (never on-screen).
const speakerNames = new Set(
writerOut.beats.map((b) => b.speaker).filter((n): n is string => Boolean(n)),
);
const orphanSpeakers = [...speakerNames].filter(
// Pattern B: "你" (player) is a valid speaker but never gets a Character
// record — TTS is intentionally skipped on the client.
(n) =>
!isPovName(n) &&
!characters.some((c) => c.name === n) &&
!cards.some((c) => c.name === n),
);
const orphanPromises = orphanSpeakers.map((n) =>
provisionVoiceForName(config, session, n),
);
// Block the Painter ONLY on entry-beat portraits (its referenceImages).
const entryPortraits = await Promise.all(entryPortraitPromises);
characters = mergeCharacters(
@@ -313,11 +328,9 @@ export async function directScene(
tlog("[directScene] entry-beat portraits", tProvision);
// ── Stage 4 — Painter (depends on cinemaOut + on-stage visual cards +
// entry portraits). On-stage = everyone named in any beat, so the archetype
// block covers anyone the player might encounter in this scene.
const onStageCharacters = characters.filter((c) =>
allActiveNames.includes(c.name),
);
// entry portraits). On-stage = the plan's cast (everyone who'll appear),
// filtered to those now in the registry, so the archetype block covers them.
const onStageCharacters = characters.filter((c) => plan.cast.includes(c.name));
const tPainter = Date.now();
const painted = await runPainter(
@@ -329,18 +342,17 @@ export async function directScene(
priorSceneImage: priorSceneReference,
styleReferenceImage: session.styleReferenceImage,
},
entryBeat,
entryBeatForPaint,
);
tlog("[directScene] Painter", tPainter);
// Fold in the work that overlapped the paint: remaining portraits, all
// voices, and any orphan-speaker voices. Awaited before returning so the
// session the client persists is fully provisioned for later scenes.
// Fold in the work that overlapped the paint: remaining portraits + all
// voices. Awaited before returning so the session the client persists is
// fully provisioned for later scenes.
const tOverlap = Date.now();
const [restPortraits, voicedChars, orphanChars] = await Promise.all([
const [restPortraits, voicedChars] = await Promise.all([
Promise.all(restPortraitPromises),
Promise.all(voicePromises),
Promise.all(orphanPromises),
]);
characters = mergeCharacters(
characters,
@@ -352,10 +364,31 @@ export async function directScene(
})),
);
characters = mergeCharacters(characters, voicedChars);
if (orphanChars.length > 0) {
tlog("[directScene] overlapped portraits+voices", tOverlap);
// ── Await Phase B — it overlapped the whole image pipeline above. ──────
const beatsOut = await beatsPromise;
const beats = beatsOut.beats;
// entryBeatId is guaranteed present (runWriterBeats pins it onto a beat), but
// keep the defensive fallback for the synthesized-fallback path.
const entryBeatId = beats.some((b) => b.id === plan.entryBeatId)
? plan.entryBeatId
: beats[0]!.id;
// Orphan-speaker voices: a beat speaker Phase B used that isn't in the
// registry. Should be rare — the prompt constrains speakers to the cast, and
// every cast member was provisioned above — so this is a defensive net,
// serial but skipped entirely (zero latency) in the common case.
const orphanSpeakers = [
...new Set(beats.map((b) => b.speaker).filter((n): n is string => Boolean(n))),
].filter((n) => !isPovName(n) && !characters.some((c) => c.name === n));
if (orphanSpeakers.length > 0) {
const orphanChars = await Promise.all(
orphanSpeakers.map((n) => provisionVoiceForName(config, session, n)),
);
characters = mergeCharacters(characters, orphanChars);
}
tlog("[directScene] overlapped portraits+voices", tOverlap);
const scene: Scene = {
id: newSceneId(),
@@ -365,9 +398,9 @@ export async function directScene(
// anything that already reads scene.scenePrompt (e.g., insert-beat
// user prompt).
scenePrompt: cinemaOut.integratedPrompt,
beats: writerOut.beats,
entryBeatId: writerOut.entryBeatId,
sceneKey: writerOut.sceneKey,
beats,
entryBeatId,
sceneKey: plan.sceneKey,
imageUuid: painted.kind === "real" ? painted.imageUuid : undefined,
imageUrl: painted.imageUrl,
};
@@ -377,7 +410,7 @@ export async function directScene(
// client persists it back into the session).
const storyState = applyStoryStatePatch(
session.storyState,
writerOut.storyStatePatch,
beatsOut.storyStatePatch,
);
tlog("[directScene] TOTAL", tTotal);
+1 -1
View File
@@ -9,7 +9,7 @@ export { synthesizeBeat } from "./voice";
export { mergeCharacters } from "./director";
export type { SceneResult } from "./director";
export { runArchitect } from "./agents/architect";
export type { WriterOutput } from "./agents/writer";
export type { WriterBeatsOutput } from "./agents/writer";
export type { CinematographerOutput } from "./agents/cinematographer";
export type { InsertBeatPartial } from "@infiplot/types";
export * from "./prompts";
+146 -47
View File
@@ -4,6 +4,7 @@ import type {
Scene,
Session,
StoryState,
WriterPlan,
} from "@infiplot/types";
// ══════════════════════════════════════════════════════════════════════
@@ -137,16 +138,77 @@ export function buildArchitectUserMessage(session: Session): string {
}
// ──────────────────────────────────────────────────────────────────────
// 1. Writer (编剧) — drives the narrative.
// 1. Writer (编剧) — drives the narrative, in TWO phases.
//
// Emits a full Scene: beats[] graph + entryBeatId + sceneKey hint +
// activeCharacters per beat. Does NOT design characters (that's the
// CharacterDesigner's job) — only names them in `activeCharacters`.
// The CharacterDesigner is invoked separately for any name not yet in
// session.characters.
// Phase A (WRITER_PLAN_SYSTEM): plans the scene SKELETON only — sceneSummary
// + sceneKey + entry-beat roster + the full cast. No dialogue. Its output
// is enough for the Cinematographer + character design + Painter to start.
// Phase B (WRITER_BEATS_SYSTEM): expands the plan into the full beats[] graph
// + storyStatePatch, overlapped with the (longer) image pipeline.
//
// Neither phase designs characters (that's the CharacterDesigner's job) —
// Phase A only NAMES them in `cast` / `entryActiveCharacters`; the
// CharacterDesigner is invoked for any name not yet in session.characters.
// ──────────────────────────────────────────────────────────────────────
export const WRITER_SYSTEM = `你是一部交互视觉小说的「编剧」。每次基于【故事档案 / 主线记忆】、世界观、画风、玩家历史、已登记角色,写出**一个完整场景的剧本**:场景背景概要 + 一组对话节拍 beats,并在最后更新主线记忆。你只负责**剧情和台词**——不设计角色形象、不写出图提示词、不做镜头调度,这些由其他 agent 完成
/**
 * System prompt for Writer Phase A (scene planning).
 *
 * Instructs the model to return ONLY the scene skeleton as strict JSON:
 * sceneSummary, sceneKey, entryBeatId, cast, entrySpeaker and
 * entryActiveCharacters — no beat dialogue. The field names in the example
 * JSON must stay in sync with the raw plan shape that runWriterPlan parses.
 *
 * The text is sent to the model verbatim, so wording and punctuation are part
 * of the program's behavior. In particular the slug rule must read
 * "2–4 个英文单词" (two to four English words), matching the two-word
 * examples such as "classroom-dusk".
 */
export const WRITER_PLAN_SYSTEM = `你是一部交互视觉小说的「编剧」。这是**两步生成中的第一步——场景规划**。你只产出本场景的「骨架」,**不要写任何 beat 台词**。你的产出会被立刻送去配图(分镜导演 + 生图),所以要快、要准、画面感要强。
═══════════════════════════════════════════════════════════════════
爆款心法(要在规划阶段就立住,后续展开才好看)
═══════════════════════════════════════════════════════════════════
- **进场即钩子**:这一场开场就要抛出新信息 / 悬念 / 冲突 / 情绪冲击,别铺陈。把这个抓人的瞬间写进 sceneSummary。
- **兑现情绪**:按题材给观众想要的情绪(甜宠的心动、暗恋的拉扯、逆袭的扬眉、悬疑的真相一角)。
- **人设有反差**:每个角色一个强标签 + 一个反差面。
═══════════════════════════════════════════════════════════════════
连贯性铁律(跨场景切换不能跳戏 —— 最重要)
═══════════════════════════════════════════════════════════════════
- 你会收到【故事档案 / 主线记忆】和上一场的结尾。**新场景必须从上一刻自然承接**——承接情绪、地点逻辑、人物状态与未收的悬念。
- 若给了「转场种子 nextSceneSeed」,把它当作"下一场的命题"去兑现,开场要让玩家感到"这正是我上一步的结果"。
- 沿用主线记忆里的人物关系与情绪温度,别让刚告白的人下一场形同陌路。
本步你要规划(如实产出,缺一不可):
- **sceneSummary**:当前场景的中文概要——地点 + 时间 + 氛围 + 关键事件 + 那个抓人的开场瞬间。这是分镜导演构图的**唯一依据**,要画面感强、信息足(2–4 句)。
- **sceneKey**:当前场景的英文 slug(如 "classroom-dusk"、"rooftop-night")。
- **entryBeatId**:玩家进入场景时落在哪个 beat 的 id(通常就是 "b1")。
- **cast**:本场景**会出场的全部 NPC 角色名**(字符串数组)。第二步写 beats 时**只能用这里列出的名字**,所以现在必须一次想全——谁会说话、谁会在画面里露面,全部列出。名字要与「已登记角色」**完全一致**;新角色起符合世界观的真名(不要"神秘女子"这种占位)。**绝不**包含玩家(你 / 我 / 主角 / protagonist / player / MC...)。
- **entrySpeaker**:入口 beat 由谁开口 —— 取值只有三种:① 某个 NPC 真名(必须在 cast 里)② "你"(玩家本人开口)③ 留空(纯旁白 / 环境开场)。这决定镜头语言,要选准。
- **entryActiveCharacters**:入口画面里**此刻出现的 NPC** 及其当下姿态 / 神情(中文 pose)。即使没人说话,画面里有谁也要列。**绝不**包含玩家。
sceneKey 设计原则(用于跨场景视觉一致性):
- 同一物理空间 + 同一时段 → 必须沿用**完全相同**的英文 slug
- 时段 / 空间变化时换 slug(如 "classroom-dusk" → "classroom-night" / "corridor-dusk")
- slug 规范:lowercase-with-dashes,2–4 个英文单词
- 用户消息会列出已用过的 sceneKey,请优先**复用**这些已有 slug
玩家视角硬规则(违反会破坏整个 galgame):
- 玩家是第二人称 POV,**永远不出现在任何画面里**——entryActiveCharacters 的 name **绝不允许**是「玩家 / 你 / 我 / 主角 / protagonist / player / Player / MC / I / me」任何变体。
- entrySpeaker 只能是 NPC 真名 / "你" / 留空;其它 POV 变体一律视为错误。
必须输出严格 JSON:
{
"sceneSummary": "黄昏的天台,风很大。夏海背对你站在栏杆边,手里攥着一张揉皱的成绩单——她把你单独叫上来,却迟迟不开口。",
"sceneKey": "rooftop-dusk",
"entryBeatId": "b1",
"cast": ["夏海"],
"entrySpeaker": "夏海",
"entryActiveCharacters": [
{ "name": "夏海", "pose": "背对你倚着栏杆,侧脸绷着,手里攥着揉皱的纸" }
]
}
不要输出 JSON 以外的任何文本。`;
// ──────────────────────────────────────────────────────────────────────
// Phase B — expands the plan into the full beats[] + storyStatePatch.
// ──────────────────────────────────────────────────────────────────────
export const WRITER_BEATS_SYSTEM = `你是一部交互视觉小说的「编剧」。这是**两步生成中的第二步——把已规划好的场景展开成完整剧本**。你会收到本场景的「规划」(场景概要 sceneSummary、sceneKey、入口 beat 的 id / speaker / 登场角色、以及本场景允许出场的角色名单 cast)。你的任务:基于规划写出玩家依次经历的对话节拍 beats,并在最后更新主线记忆。你只负责**剧情和台词**——不设计角色形象、不写出图提示词、不做镜头调度,这些由其他 agent 完成。
你必须严格遵守收到的规划:
- 必须存在一个 id 等于规划 entryBeatId 的 beat,作为玩家入口。
- 该入口 beat 的 speaker 与登场角色(activeCharacters)要与规划一致(姿态措辞可微调,但**人物身份必须一致**)。
- speaker 与 activeCharacters 里的 NPC 名字**只能来自规划的 cast**(或玩家 "你")——**不要引入规划之外的新角色**。
═══════════════════════════════════════════════════════════════════
爆款心法(番茄网文 / 红果短剧 / galgame 的叙事手感)—— 必须贯彻
@@ -167,11 +229,7 @@ export const WRITER_SYSTEM = `你是一部交互视觉小说的「编剧」。
- 沿用主线记忆里的人物关系与情绪温度——别让刚告白的人下一场形同陌路,也别凭空遗忘已埋的伏笔。
- 推进、但别重置:每一场都让主线问题往前走一点(关系变化 / 真相揭露一角 / 新悬念浮现)。
一个场景包含:
- sceneSummary:当前场景的中文概要(地点、时间、氛围、关键事件——给后续的分镜导演看)
- sceneKey:当前场景的英文 slug(如 "classroom-dusk"、"rooftop-night"、"rainy-street")——同一物理空间应沿用相同 slug
- beats[]:玩家依次经历的对话节拍
- entryBeatId:玩家进入场景时落在哪个 beat
本步你只产出两样:**beats[]**(玩家依次经历的对话节拍)和 **storyStatePatch**(主线记忆更新)。sceneSummary / sceneKey / entryBeatId 已由规划给定,**不要再输出**它们。
每个 beat 是玩家会看到的一段叙述 / 对话 / 选择。beat 之间通过 next 字段连接:
- "continue":玩家点击图片背景 / 按继续,自然推进到下一个 beat
@@ -183,6 +241,7 @@ choice 的 effect 有两种:
设计原则:
- 同场景内 beat 数自由发挥,按剧情节奏自然给出(通常 2–6 个,可以更多)
- 入口 beat 的 id 必须等于规划给定的 entryBeatId;其余 beat id 依次自取且互不重复
- 多用 continue,少用 choice — 选择只应出现在「真正的岔路口」
- advance-beat 适合处理对话分支(同一场景里换个话题、追问、撒娇)
- change-scene 适合空间/时间跳跃(出门、转身看窗外、第二天清晨)
@@ -192,12 +251,6 @@ choice 的 effect 有两种:
- next.nextBeatId 引用的 beat 必须存在
- choice 至少 2 个,至多 4 个,互不重复
sceneKey 设计原则(重要 — 用于跨场景视觉一致性):
- 同一物理空间 + 同一时段 → 必须沿用**完全相同**的英文 slug
- 时段或空间变化时换 slug(如 "classroom-dusk" → "classroom-night""classroom-dusk" → "corridor-dusk"
- slug 规范:lowercase-with-dashes24 个英文单词
- 已登记的历史场景 sceneKey 会在用户消息里列出,请优先**复用**这些已有 slug
文本风格约束:
- narration / line 用中文(**纯净可显示文本**,绝不要写 (叹气)(语速快) 这类标注 —— 那是给配音的,会被玩家看见)
- sceneSummary / lineDelivery / activeCharacters[].pose 内的文字也用中文
@@ -243,11 +296,8 @@ sceneKey 设计原则(重要 — 用于跨场景视觉一致性):
- nextHook:基于这一场的结尾,下一场应往哪走(给"下一次的你"一个明确命题,接住本场留下的扣子)
这些字段是写给"未来的你"的连贯性记忆,请认真写。
必须输出严格 JSON,结构如下:
必须输出严格 JSON,结构如下**只含 beats 与 storyStatePatch**sceneSummary / sceneKey / entryBeatId 由规划给定,不要输出。下例入口 beat 的 id "b1" 即规划的 entryBeatId
{
"sceneSummary": "中文场景概要:地点+时间+氛围+关键事件",
"sceneKey": "classroom-dusk",
"entryBeatId": "b1",
"beats": [
{
"id": "b1",
@@ -343,29 +393,28 @@ function renderHistoryEntry(
return lines.join("\n");
}
export function buildWriterUserMessage(session: Session): string {
// ─── STABLE PREFIX ────────────────────────────────────────────────────
// Everything in this section is invariant across consecutive Writer calls
// within the session (or monotonically grows in a way that keeps the
// earlier bytes byte-identical). Always emit every section header — even
// when empty — so positions don't shift between calls.
// Shared narrative context for BOTH Writer phases. Returns the message parts
// from the cacheable STABLE PREFIX (sections 1-4) through the dynamic
// transition hint (section 7), but WITHOUT the trailing phase-specific
// instruction — each phase appends its own. Building this once and reusing it
// keeps EACH phase's prompt prefix byte-stable across scenes for DeepSeek
// prompt caching (Phase A and Phase B cache independently since their system
// prompts differ, but each shares its own prefix across consecutive calls).
//
// Order optimized for DeepSeek/MiMo prefix caching (64-token chunks):
// ─── STABLE PREFIX ──────────────────────────────────────────────────────
// Invariant across consecutive Writer calls within the session (or grows in a
// way that keeps earlier bytes byte-identical). Always emit every section
// header — even when empty — so positions don't shift between calls.
// 1. session-immutable scalars (world / style)
// 2. story bible spine (Architect-set, never patched)
// 3. monotonically-growing lists (characters, sceneKeys)
// 4. history entries 0..N-2 (the last entry is what THIS call must
// react to, so it lives in the dynamic suffix instead)
//
// ─── DYNAMIC SUFFIX ───────────────────────────────────────────────────
// Everything below changes on (almost) every call:
// 4. history entries 0..N-2 (the last entry is what THIS call must react
// to, so it lives in the dynamic suffix instead)
// ─── DYNAMIC SUFFIX ─────────────────────────────────────────────────────
// 5. story bible dynamic patch (synopsis/threads/relationships/nextHook)
// 6. the just-completed entry (history[-1]) — same render format as the
// stable history blocks, just preceded by a "just completed" header
// 7. last-beat snippet (the exact emotional cliffhanger)
// 8. lastExit hint
// 9. format reminder tail
// 6. last-beat snippet (the exact emotional cliffhanger)
// 7. transition hint (opening cold-open directive OR lastExit承接)
function buildWriterContextParts(session: Session): string[] {
const parts: string[] = [];
// ── 1. session scalars ────────────────────────────────────────────────
@@ -423,8 +472,7 @@ export function buildWriterUserMessage(session: Session): string {
// ── 6. last-beat snippet (the exact emotional cliffhanger) ──
// The full last entry is already in the stable history block above; here
// we only re-emit the very last beat to sharply focus the Writer on the
// emotional moment to continue from. Skip the duplicate full-entry render
// that was here previously — it wasted ~200-500 tokens of dynamic suffix.
// emotional moment to continue from.
const last = session.history.at(-1);
if (last) {
const lastBeatId = last.visitedBeatIds.at(-1) ?? last.scene.entryBeatId;
@@ -441,14 +489,14 @@ export function buildWriterUserMessage(session: Session): string {
}
}
// ── 7. transition hint ────────────────────────────────────────────────
if (session.history.length === 0) {
parts.push(
"\n这是故事的开场。请按【故事档案】里的 nextHook 把第一幕的冷开场出来——开场即抓人,别花笔墨铺垫世界观。写完后更新 storyStatePatch。严格以 JSON 格式返回。",
"\n这是故事的开场。请按【故事档案】里的 nextHook 把第一幕的冷开场设计出来——开场即抓人,别花笔墨铺垫世界观。",
);
return parts.join("\n");
return parts;
}
// ── 8. lastExit hint ──────────────────────────────────────────────────
const lastExit = last?.exit;
if (lastExit) {
if (lastExit.kind === "choice") {
@@ -464,8 +512,59 @@ export function buildWriterUserMessage(session: Session): string {
parts.push("\n无缝续写下一个场景,延续上一刻的情绪。");
}
// ── 9. format reminder tail ───────────────────────────────────────────
parts.push("写完后别忘了更新 storyStatePatch。严格以 JSON 格式返回。");
return parts;
}
/**
 * Builds the user message for Writer Phase A (scene planning).
 *
 * The message is the shared narrative context from `buildWriterContextParts`
 * followed by one plan-only instruction, joined with newlines. Keeping the
 * phase-specific instruction at the very end leaves the shared context as
 * the leading part of the message.
 *
 * @param session - The current session the context is rendered from.
 * @returns The complete user message text.
 */
export function buildWriterPlanUserMessage(session: Session): string {
  const planInstruction =
    '\n现在**只规划本场景的骨架**(不要写 beats 台词):给出 sceneSummary(画面感强、含开场钩子)、sceneKey、entryBeatId、本场景会出场的全部角色 cast、以及入口 beat 的 entrySpeaker 与 entryActiveCharacters。严格以 JSON 格式返回。';

  return [...buildWriterContextParts(session), planInstruction].join("\n");
}
/**
 * Builds the user message for Writer Phase B (beat expansion).
 *
 * Layout, in order:
 *  1. the shared narrative context from `buildWriterContextParts`;
 *  2. a delimited "scene plan" section restating the Phase A plan the model
 *     must honor (summary, optional sceneKey, entry beat id / speaker /
 *     roster, and the allowed cast);
 *  3. the Phase B instruction tail.
 * The plan differs per scene, so it is placed after the shared context rather
 * than inside it.
 *
 * Every label is separated from its value with a full-width colon, and every
 * parenthetical is closed, so the model never sees a label fused with the
 * value that follows it (e.g. "sceneKeyrooftop-dusk").
 *
 * @param session - The current session the context is rendered from.
 * @param plan - The Phase A plan to restate for the model.
 * @returns The complete user message text, newline-joined.
 */
export function buildWriterBeatsUserMessage(
  session: Session,
  plan: WriterPlan,
): string {
  const parts = buildWriterContextParts(session);

  parts.push("");
  parts.push("━━━ 本场景规划(上一步已定,必须严格遵守)━━━");
  parts.push(`场景概要 sceneSummary:${plan.sceneSummary}`);
  // sceneKey is optional on the plan; the line is omitted when it is absent.
  if (plan.sceneKey) parts.push(`sceneKey:${plan.sceneKey}`);
  parts.push(
    `入口 beat 的 id(entryBeatId,必须有一个此 id 的 beat 作为入口):${plan.entryBeatId}`,
  );
  // An absent speaker means a narration-only entry; say so explicitly rather
  // than emitting an empty value.
  parts.push(
    `入口 beat 的 speaker:${plan.entrySpeaker ? plan.entrySpeaker : "(空 —— 纯旁白 / 环境开场)"}`,
  );

  parts.push("入口 beat 的登场角色 activeCharacters(人物身份须一致,姿态可微调):");
  if (plan.entryActiveCharacters.length === 0) {
    parts.push("(无 —— 入口画面没有 NPC)");
  } else {
    for (const c of plan.entryActiveCharacters) {
      // The pose is optional; the separator is only emitted when it exists.
      parts.push(`- ${c.name}${c.pose ? `:${c.pose}` : ""}`);
    }
  }

  parts.push(
    '本场景允许出现的角色名 cast(speaker / activeCharacters 只能用这些名字或 "你",不要新增角色):',
  );
  if (plan.cast.length === 0) {
    parts.push("(无 NPC —— 仅旁白与玩家)");
  } else {
    for (const n of plan.cast) parts.push(`- ${n}`);
  }
  parts.push("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");

  parts.push(
    "\n把上面的规划展开成完整的 beats[](入口 beat 用规划的 entryBeatId / speaker / 登场角色),写完后更新 storyStatePatch。严格以 JSON 格式返回。",
  );
  return parts.join("\n");
}
+37
View File
@@ -92,6 +92,43 @@ export type SceneHistoryEntry = {
exit?: SceneExit;
};
// ──────────────────────────────────────────────────────────────────────
// Writer two-phase split
//
// The Writer runs as TWO LLM calls so scene-image generation can begin
// before the dialogue is fully written:
// Phase A (WriterPlan) — the minimal skeleton the image pipeline needs:
// sceneSummary + sceneKey + the entry beat's
// on-stage roster + the full cast to design.
// Phase B (beats) — the full beats[] graph + storyStatePatch, written
// to honor the plan, overlapped with image gen.
// The Cinematographer + character design + Painter all run off the Plan, so
// Phase B's (longer) output is hidden behind the image pipeline.
// ──────────────────────────────────────────────────────────────────────
/**
 * Output of Writer Phase A: the scene skeleton that the image pipeline and
 * Phase B both work from. It carries no dialogue.
 */
export type WriterPlan = {
/** Scene synopsis in Chinese (location + time + mood + key event + opening
 * hook). Passed to the Cinematographer as the scene description it composes
 * the establishing shot from, and used as the narration of the fallback
 * beat when Phase B fails. */
sceneSummary: string;
/** English location+time slug for cross-scene visual continuity. Used to
 * look up a prior scene image with the same key. Optional. */
sceneKey?: string;
/** Id of the beat the player lands on when entering the scene. Phase B is
 * asked to emit a beat with this id; if it does not, the first beat is
 * renamed to it so the entry still resolves. */
entryBeatId: string;
/** NPC names planned to appear in this scene. Names not yet in the session's
 * character registry are designed from this list in parallel with Phase B.
 * Phase B is instructed to use only these names (plus the POV "你").
 * Never includes the player. */
cast: string[];
/** The entry beat's on-stage roster (who is visible, and their pose, when
 * the player lands). Feeds the Cinematographer's framing and the entry-beat
 * portraits the Painter references, and is pinned onto the entry beat that
 * Phase B produces. Never includes the POV player. */
entryActiveCharacters: BeatActiveCharacter[];
/** The entry beat's speaker — an NPC name, "你" (player speaking), or
 * undefined for a pure narration/environment entry. Passed to the
 * Cinematographer alongside the roster. */
entrySpeaker?: string;
};
// ──────────────────────────────────────────────────────────────────────
// Characters & voices (TTS)
// ──────────────────────────────────────────────────────────────────────