Files
infiplot-web/lib/engine/prompts.ts
T
yuanzonghao 6f8125570a feat(play): always generate new scene for freeform text input + enhance insert-beat
User feedback: custom interactions rarely produce new story content because
the classifier heavily biased toward insert-beat (single reaction, no scene
change). Three changes to fix this:

1. Freeform text input now always triggers a full scene generation (skips
   the classify step entirely) — users who type expect the story to advance.

2. Vision (background click) classifier de-biased: prompt now favors
   change-scene when uncertain, and the code fallback flipped from
   insert-beat to change-scene. insert-beat narrowed to pure observation.

3. Insert-beat enhanced: generates 1-3 beats (was 1) with follow-up
   choices (was: loop back to original beat). Even when vision classifies
   as insert-beat, the player gets richer content and new options.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-06-24 18:36:35 +08:00

746 lines
43 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import type {
BeatActiveCharacter,
Character,
CharacterIntent,
Orientation,
Scene,
Session,
StoryState,
WriterPlan,
} from "@infiplot/types";
import { formatStepfunCatalogForPrompt } from "@infiplot/tts-client";
// ══════════════════════════════════════════════════════════════════════
// Output-language directive — appended to user messages so the AI's
// GENERATED dialogue, narration, and voice-design text follow the UI
// locale the player picked. Returns "" for zh-CN (the prompts' native
// language) so existing sessions behave byte-identically to before.
//
// We intentionally append this as a TRAILING one-liner rather than
// rewriting the system prompts in the target language — the prompts body
// is the cacheable / reviewed / future-edit-friendly asset, and a single
// trailing directive is enough for modern LLMs to switch their output
// language while still receiving Chinese instructions.
// ══════════════════════════════════════════════════════════════════════
/** Display names of the supported output languages, keyed by UI locale code. */
const LANG_LABELS: Record<string, string> = {
  "zh-CN": "简体中文",
  en: "English",
  ja: "日本語",
};

/**
 * Builds the trailing one-line directive that tells the LLM which language
 * to use for its free-form output (dialogue, narration, voice-design text).
 *
 * The directive itself is always written in Chinese because the system
 * prompts are Chinese; it only instructs the model to *output* in the
 * target language.
 *
 * @param language UI locale code of the session (e.g. "en", "ja").
 * @returns The directive, prefixed with a newline, or "" when no directive
 *   is needed: `language` is missing, is "zh-CN" (the prompts' native
 *   language), or is not a key of LANG_LABELS.
 */
export function buildLanguageDirective(language: string | undefined): string {
  if (!language || language === "zh-CN") return "";
  // Own-property check: a plain object literal also answers inherited keys
  // such as "constructor", "toString" or "__proto__" with truthy non-string
  // values, which would otherwise be interpolated into the prompt.
  if (!Object.prototype.hasOwnProperty.call(LANG_LABELS, language)) return "";
  const label = LANG_LABELS[language];
  if (!label) return "";
  return `\n【输出语言】你产出的所有自然语言内容(对白台词 line / 旁白 narration / sceneSummary / storyState 各字段 / voiceDescription / lineDelivery 等)必须使用「${label}」;JSON 字段名、sceneKey、英文 visualDescription / painting prompt 仍按各 agent 既有规则。`;
}
// ══════════════════════════════════════════════════════════════════════
// Multi-agent scene generation pipeline:
// Architect (总编剧) — ONE-TIME at session start: the story bible
// (protagonist / logline / genre / opening hook /
// planned cast) → seeds StoryState
// Writer (编剧) — narrative + beats[] + per-beat activeCharacters,
// reads StoryState and emits a StoryStatePatch
// CharacterDesigner — per-new-character visual + voice cards
// Cinematographer (分镜导演) — sceneKey + English compositional prompt
// Painter (画师) — FLUX rendering with character archetypes
//
// Each agent owns one system prompt + one user-message builder below.
// All agents see the same world / style guide, but each only reads the
// slice of session state it needs to make its decision.
// ══════════════════════════════════════════════════════════════════════
// ──────────────────────────────────────────────────────────────────────
// Shared — render the StoryState bible into a compact prompt block read
// by the Writer (and Architect, on revisions). Keeping one renderer means
// the bible looks identical to every agent that consumes it.
// ──────────────────────────────────────────────────────────────────────
// ── Story bible — split spine / dynamic for prefix-cache friendliness ──
//
// SPINE = Architect-set, never updated by Writer's storyStatePatch:
// logline / genreTags / protagonist / castNotes
// → goes in the STABLE PREFIX of every Writer user message
//
// DYNAMIC = patched every scene by the Writer:
// synopsis / relationships / openThreads / nextHook
// → goes in the DYNAMIC SUFFIX
//
// Keep both sections present even when empty (固定 section) so position is
// stable across calls — a missing section here would shift every byte after
// it and torch the cache.
/**
 * Renders the "spine" half of the story bible: logline, genre tags,
 * protagonist and cast notes.
 *
 * All four rows are always emitted; a missing value (or a missing state
 * object) is rendered with the "(未设定)" placeholder so the block has
 * the same number of lines on every call.
 *
 * @param s Current story state, or undefined when none exists yet.
 * @returns Header line plus four rows, joined with "\n".
 */
export function renderStoryStateSpine(s: StoryState | undefined): string {
  const unset = "(未设定)";
  return [
    "【故事档案 · 主轴(不变)】",
    `主线(中心钩子):${s?.logline ?? unset}`,
    `题材基调:${s?.genreTags ?? unset}`,
    `主角「你」:${s?.protagonist ?? unset}`,
    `核心配角:${s?.castNotes ?? unset}`,
  ].join("\n");
}
/**
 * Renders the "dynamic" half of the story bible: synopsis, relationships,
 * open threads and the next hook.
 *
 * Every row is always present. Scalar fields fall back to "(暂无)" when
 * null/undefined; list fields fall back to "(暂无)" when missing or empty,
 * otherwise they are rendered as a "- item" bullet list starting on the
 * line after the row label.
 *
 * @param s Current story state, or undefined when none exists yet.
 * @returns Header line plus four rows, joined with "\n".
 */
export function renderStoryStateDynamic(s: StoryState | undefined): string {
  const none = "(暂无)";
  // Bullet list on its own lines, or the placeholder when there is nothing to list.
  const asBullets = (items: readonly unknown[] | undefined): string => {
    if (!items?.length) return none;
    return "\n" + items.map((item) => `- ${item}`).join("\n");
  };

  return [
    "【故事档案 · 当前状态(每幕更新)】",
    `已发生(梗概):${s?.synopsis ?? none}`,
    `当前关系/情绪:${asBullets(s?.relationships)}`,
    `未收的悬念/伏笔:${asBullets(s?.openThreads)}`,
    `接下来要往哪走(下一个钩子方向):${s?.nextHook ?? none}`,
  ].join("\n");
}
// ──────────────────────────────────────────────────────────────────────
// Paradigm D — merged Writer (single-pass streaming with tagged output)
// ──────────────────────────────────────────────────────────────────────
// Writer prompt has been refactored to segment-driven builder.
// See lib/engine/prompts/segments/writer/ for individual prompt segments.
// See lib/engine/prompts/registry.ts for segment registration.
// See lib/engine/prompts/builder.ts for assembly logic.
export { buildWriterStreamMessages } from "./prompts/builder";
// Render one history entry as a stable, position-independent block. Used by
// the Writer to dump both "completed past" (stable prefix) and "the entry the
// player just finished" (dynamic suffix) — same format, so the model sees a
// uniform history surface.
/**
 * Renders one history entry as a self-contained text block.
 *
 * Output layout:
 *   - a "【场景 N】" header,
 *   - the sceneKey line (only when the scene has one),
 *   - one line per visited beat that carries narration and/or a spoken line,
 *   - the player's exit (chosen option or freeform action), when recorded.
 *
 * @param entry History entry to render.
 * @param index Scene number shown in the header.
 * @returns The block, lines joined with "\n".
 */
export function renderHistoryEntry(
  entry: Session["history"][number],
  index: number,
): string {
  const lines: string[] = [`【场景 ${index}】`];
  if (entry.scene.sceneKey) lines.push(` sceneKey: ${entry.scene.sceneKey}`);

  // When no visit was recorded, fall back to the scene's entry beat.
  const visited = entry.visitedBeatIds.length
    ? entry.visitedBeatIds
    : [entry.scene.entryBeatId];
  const beatById = new Map(entry.scene.beats.map((b) => [b.id, b]));

  for (const id of visited) {
    const beat = beatById.get(id);
    if (!beat) continue; // visited id that no longer resolves to a beat

    const fragments: string[] = [];
    if (beat.narration) fragments.push(`旁白:${beat.narration}`);
    // Separate speaker and line with a colon, mirroring the "旁白:" fragment;
    // without it the name and the dialogue run together.
    if (beat.line) fragments.push(`${beat.speaker ?? "?"}:${beat.line}`);
    if (fragments.length) lines.push(" " + fragments.join(" / "));
  }

  if (entry.exit) {
    if (entry.exit.kind === "choice") {
      // The "(去往:…" parenthesis is now closed.
      lines.push(
        ` 玩家最终选择:${entry.exit.label}(去往:${entry.exit.nextSceneSeed})`,
      );
    } else {
      lines.push(` 玩家自由动作:${entry.exit.action}`);
    }
  }
  return lines.join("\n");
}
// ──────────────────────────────────────────────────────────────────────
// 2. CharacterDesigner (角色设定师) — designs one new character.
//
// Receives a character NAME (extracted by the Writer's activeCharacters)
// and produces BOTH the English visual card AND the Chinese voice card
// in a single LLM call. Bundling these two is intentional: a single agent
// that "knows who this character is" produces internally-consistent
// appearance + vocal personality, whereas split agents tend to diverge
// (e.g., gentle-looking character with energetic voice).
// ──────────────────────────────────────────────────────────────────────
// CHARACTER_DESIGNER_SYSTEM is split into a provider-agnostic CORE (visual +
// voice-text rules) and a provider-specific TAIL (the JSON contract). When the
// server runs StepFun, the tail additionally asks the model to pick a preset
// voice id from the 32-entry catalog — so the SAME LLM call that designs the
// character also selects its voice, at zero extra latency. When StepFun is
// off (Xiaomi / no TTS), the tail is byte-identical to the historical prompt
// (Xiaomi path is cache- and behavior-preserving).
/**
 * Provider-agnostic core of the CharacterDesigner system prompt: role
 * statement, visual-card rules (6 required elements + differentiation rules)
 * and voice-card rules. A JSON-contract tail is appended to it by
 * buildCharacterDesignerSystem.
 *
 * Punctuation repaired relative to the previous text: every full-width
 * parenthesis is now balanced, the keyword count reads "2–3" (was "23"),
 * the voice-card length reads "50–80" (was "5080"), and the gender-prefix
 * rule has a colon before its examples.
 */
const CHARACTER_DESIGNER_SYSTEM_CORE = `你是视觉小说的「角色设定师」——下游的**媒体翻译官**。给你一个**新登场角色的名字**(通常还附带编剧给定的角色性格 / 情绪基调 / 说话基调),你的职责是把这份**已给定的角色意图**忠实翻译成两份媒体卡片:
1. **视觉设定卡(英文)**——给生图模型 FLUX 用,遵循 prompt engineering 风格
2. **音色设定卡(中文)**——给小米 MiMo 配音设计用
你**不发明**角色的性格——性格由编剧主导。你的工作是:**依据给定的性格 / 情绪 / 说话基调,产出最贴合的外貌与音色**。若没有给定性格信息(降级情况),再据角色名 + 世界观自行合理推断。
两份卡片要描绘**同一个人**,且都要贴合给定的角色基调——给定「傲娇腹黑」就别配天真烂漫的外貌与嗓音;给定「声音微颤、欲言又止」音色卡就要体现这份犹豫感。
视觉设定卡 visualDescription 规则:
- **必须完全用英文**
- 风格:用形容词 + 短语,**英文逗号分隔**,符合 FLUX/Stable Diffusion prompt 习惯
- **必须融入全局画风** styleGuide 的美术指向(比如 styleGuide 是「赛博朋克」时,服饰要赛博朋克化)
- **不要写瞬时姿势或表情**(这些由编剧/分镜每帧实时控制)
- 不要包含背景环境(这不是场景图,是角色立绘卡)
- 长度:100–180 个英文词为宜
**必须覆盖的 6 大要素 — 缺一项都会让角色撞脸:**
1. **HAIR(头发)** — 同时写明四点:
① 发色 hair color(具体到色相 + 明度,例 "platinum blonde" / "deep navy blue" / "warm chestnut brown",不要只写 "dark hair")
② 发型 hairstyle(具体款式:twin tails / side ponytail / hime cut / undercut / messy bob / long straight / wolf cut...)
③ 头发长度 hair length(chin-length / shoulder-length / waist-length / cropped 等明确量级)
④ 发饰或刘海特征(blunt bangs / curtain bangs / side-swept / hair ribbon / hairpin,可省但建议有一项)
2. **EYES(眼睛)** — 同时写明:
① 瞳色 eye color(具体色相,例 "amber" / "violet" / "icy blue",不要只写 "dark eyes")
② 眼型 eye shape(almond / round / sharp upturned / droopy / hooded)
③ 神情基调 default gaze tone(gentle / piercing / sleepy / mischievous,不写瞬时表情)
3. **FACE & BUILD(脸型 + 体格)** — 写 1–2 条标志性特征:
- 脸型轮廓(oval / heart-shaped / sharp jawline / soft round)
- 身高与体型相对感(tall and slim / petite / athletic build / broad shoulders)
- 一个独特识别点(small mole below left eye / faint freckles / round glasses / fang teeth / scar across brow),用来在画面里第一眼区分
4. **OUTFIT(服饰)** — 同时写明:
① 主体款式(school uniform / casual streetwear / formal suit / kimono / lab coat / military / cyberpunk jacket...)
② 配色(主色 + 强调色,例 "navy blazer with crimson tie",不要只写 "dark uniform")
③ 至少一个标志性细节(collar shape / asymmetric hem / layered scarf / fingerless gloves / chunky boots / accessory like a pendant or earring)
④ 必须与 styleGuide 美术指向一致
5. **PERSONALITY-DRIVEN VIBE(性格→气质映射)** — 一句话:
- 用 2–3 个性格关键词(gentle and reserved / sharp and aloof / cheerful and brash / cool and analytical / lazy and easygoing)
- 说明这个性格如何投射到整体气场与氛围(approachable warmth / intimidating presence / quiet confidence / carefree aura / scholarly composure),不要写具体姿势动作
6. **OVERALL SILHOUETTE & VIBE TAG(整体剪影 + 一句气质标签)** — 一句话总结这个角色"远远一看就能认出来"的剪影特征
**差异化硬规则 — 避免与已设定角色撞型:**
你会收到「已设定角色清单」,每个条目包含 name + visualDescription。在落笔前**先在心里扫一遍**清单,提取每个角色的 hair color / hair length / eye color / outfit style,然后为新角色挑选**明显对比**的属性组合:
- **发色不能撞**:已有黑发 → 新角色避免黑、深棕;已有金发 → 新角色避免银、浅栗;至少跨一个色系(黑/棕/金/红/橙/银/灰/蓝/紫/绿)
- **瞳色不能撞**:同发色规则,跨色系挑选
- **剪影不能撞**:已有长直发 → 新角色用短发 / 双马尾 / 卷发 / 扎发;用"发长 × 发型"两个维度造差异
- **服饰风格至少一处明显差异**:款式(制服 vs 便服 vs 正装)、主色(暖 vs 冷)、轮廓(紧身 vs 宽松 / 长 vs 短)三者中至少一项明显不同
- 若剧情强制视觉相似(如双胞胎),必须在配饰或配色上做一处显著识别点
落笔顺序建议:先决定 personality keywords → 由性格反推合适的发色 / 服饰倾向 → 再与已有角色对照确认差异 → 最后写成英文 tag 串。
音色设定卡 voiceDescription 规则:
- **必须以明确性别开头**:"女性,…" / "男性,…"
- 随后描述:年龄段(如「约17岁少女」「30 出头男性」)、音色质感、性格情绪基调、语速节奏、人设腔调、口音方言
- 用中文,整段连续描述,不分段
- 长度:50–80 个中文字为宜
- 例:"女性,约17岁少女,音色清亮带点稚嫩甜美,性格开朗外向但容易害羞,语速偏快,标准普通话"`;
// JSON-contract tail for the NON-stepfun path (Xiaomi voicedesign / no TTS).
// Byte-identical to the historical prompt so the Xiaomi path keeps its cache
// hit rate and voice quality unchanged.
//
// Appended directly after CHARACTER_DESIGNER_SYSTEM_CORE by
// buildCharacterDesignerSystem when `stepfun` is false. The literal starts
// with a newline so it lands on its own line after the core text. The
// contract asks for exactly two fields: visualDescription and
// voiceDescription.
const CHARACTER_DESIGNER_TAIL_DEFAULT = `
必须输出严格 JSON
{
"visualDescription": "English visual card, comma-separated tags...",
"voiceDescription": "中文音色卡,以性别开头..."
}
不要输出 JSON 以外的任何文本。`;
// JSON-contract tail for the StepFun path. Same core output, plus the model
// picks a preset voice id from the catalog. The id must match the SAME person
// the voiceDescription describes (gender / age / vibe) — designed together so
// appearance and voice stay coherent (the same invariant the CORE enforces).
//
// Appended directly after CHARACTER_DESIGNER_SYSTEM_CORE by
// buildCharacterDesignerSystem when `stepfun` is true. The catalog text comes
// from formatStepfunCatalogForPrompt(), which is interpolated once, when this
// module-level constant is initialised. The contract adds a third field,
// stepfunVoiceId, to the two fields of the default tail.
const CHARACTER_DESIGNER_TAIL_STEPFUN = `
**StepFun 预设音色选择(必做):**
除 voiceDescription 外,你还必须从下列 StepFun 预设音色清单中,为本角色挑选一个与 voiceDescription 描绘的「同一个人」(性别 / 年龄段 / 气质都要一致)最贴合的预设,并把它的 id 填入 stepfunVoiceId。清单:
${formatStepfunCatalogForPrompt()}
挑选原则:
- stepfunVoiceId 必须是上表里某个 id,原样复制(拼写、大小写、连字符都不能变)。
- 必须与 voiceDescription 的性别一致(男声选 male 行,女声选 female 行)。
- 年龄段尽量一致;拿不准时优先气质匹配(例如“冷艳御姐”选 lengyanyujie、“软萌萝莉”选 ruanmengnvsheng)。
- 不允许编造清单外的 id,也不允许留空。
必须输出严格 JSON
{
"visualDescription": "English visual card, comma-separated tags...",
"voiceDescription": "中文音色卡,以性别开头...",
"stepfunVoiceId": "清单内某个 id"
}
不要输出 JSON 以外的任何文本。`;
/**
 * Assembles the CharacterDesigner system prompt for the active TTS provider.
 *
 * The prompt is always the shared core followed by one JSON-contract tail:
 * - `stepfun: false` → the default tail (visualDescription + voiceDescription).
 * - `stepfun: true`  → the StepFun tail, which additionally asks the model
 *   to pick a preset voice id from the catalog.
 *
 * @param opts.stepfun Whether the StepFun tail should be used.
 * @returns The complete system prompt.
 */
export function buildCharacterDesignerSystem(opts: {
  stepfun: boolean;
}): string {
  const tail = opts.stepfun
    ? CHARACTER_DESIGNER_TAIL_STEPFUN
    : CHARACTER_DESIGNER_TAIL_DEFAULT;
  return CHARACTER_DESIGNER_SYSTEM_CORE + tail;
}
/**
 * Builds the user message for one CharacterDesigner call.
 *
 * Sections, in order:
 *   1. character name, world setting and global style guide;
 *   2. the writer-authored intent (mood / motivation / speaking tone) —
 *      only when at least one of the three is set;
 *   3. the list of already-designed characters (those with a
 *      visualDescription) plus the contrast instructions — only when that
 *      list is non-empty;
 *   4. the closing request for a JSON answer;
 *   5. the output-language directive, when buildLanguageDirective returns
 *      one for the session language.
 *
 * @param charName Name of the character to design.
 * @param session  Session supplying world setting, style guide, existing
 *                 characters and language.
 * @param intent   Optional writer intent for the character.
 * @returns The message, sections joined with "\n".
 */
export function buildCharacterDesignerUserMessage(
  charName: string,
  session: Session,
  intent?: CharacterIntent,
): string {
  const sections: string[] = [
    `角色名:${charName}`,
    `世界观:${session.worldSetting}`,
    `全局美术画风:${session.styleGuide}`,
  ];

  // Writer-authored intent: the header is emitted only if at least one
  // field is present; otherwise the designer falls back to inferring from
  // name + world setting.
  const intentLines: string[] = [];
  if (intent?.mood) intentLines.push(`- 情绪基调:${intent.mood}`);
  if (intent?.motivation) intentLines.push(`- 动机 / 目的:${intent.motivation}`);
  if (intent?.speakingTone) intentLines.push(`- 说话基调:${intent.speakingTone}`);
  if (intentLines.length > 0) {
    sections.push(
      "\n编剧给定的角色基调(请据此设计,不要另起炉灶):",
      ...intentLines,
    );
  }

  // Characters that already have a visual card — the new design must
  // contrast with each of them.
  const designedCast = session.characters.filter((c) => c.visualDescription);
  if (designedCast.length > 0) {
    sections.push(
      "\n已设定角色清单(**新角色的发色 / 瞳色 / 发型轮廓 / 服饰必须与下方每一位都形成明显视觉对比,不允许撞型**):",
      ...designedCast.map((c) => `- ${c.name}: ${c.visualDescription}`),
      "\n落笔前先逐个扫一遍上方角色的 hair color / hair length+style / eye color / outfit style,再为新角色挑选有明显跨色系或跨剪影对比的属性组合。",
    );
  }

  sections.push(
    "\n请为该角色同时设计 visualDescription(英文,必须覆盖 system 中的 6 大要素清单)和 voiceDescription(中文),严格以 JSON 格式返回。",
  );

  // For non-zh-CN sessions the trailing directive overrides the system
  // prompt's "中文" voiceDescription guidance, so the voice card is written
  // in the player's language.
  const directive = buildLanguageDirective(session.language);
  if (directive) sections.push(directive);

  return sections.join("\n");
}
// ──────────────────────────────────────────────────────────────────────
// 3. Cinematographer (分镜导演) — composes the visual frame.
//
// Reads the Writer's sceneSummary + active characters and produces the
// English compositional prompt fed to FLUX. Does NOT describe the
// characters themselves (those archetypes are appended at the Painter
// stage from session.characters.visualDescription). Only describes the
// ENVIRONMENT, lighting, camera framing, and how the characters are
// positioned within the frame.
// ──────────────────────────────────────────────────────────────────────
/**
 * System prompt for the Cinematographer agent. It instructs the model to
 * write an English-only environment + composition prompt (integratedPrompt),
 * never to depict the player, to pick the shot from entryBeatSpeaker, and
 * to answer with a JSON object holding shotType and integratedPrompt.
 *
 * Punctuation repaired relative to the previous text: the atmosphere bullet
 * closes its parenthesis, the two bold shot names are followed by a comma,
 * and the length rule reads "80–150" (was "80150"), matching the
 * "80-150 words" note in the JSON schema.
 */
export const CINEMATOGRAPHER_SYSTEM = `你是视觉小说的「分镜导演」。给你编剧的当前场景概要、活跃角色名单和他们在场景里的姿态描述,以及**入口 beat 的 speaker 信息**(用来决定镜头语言)。你的任务是**只用英文**写一段**纯环境+构图**的描述(integratedPrompt),交给画师作为出图主提示词。
你**不要**写角色的外貌细节——发色、服饰、脸型这些由其他 agent 提供,画师会把"角色档案卡"附加到你的 integratedPrompt 后面。你只关心:
- **环境**:地点、时间、天气、光线、空间细节(什么家具/植物/物件)
- **构图 / 镜头**:景别(wide shot / medium shot / close-up / over-the-shoulder)、机位、视角
- **人物在画面中的位置和姿态**(不写脸 / 不写穿什么——只写"哪个角色站在哪儿、在做什么")
- **氛围**:情绪基调、色调、影调(warm dusk / cold neon / soft morning light)
═══════════════════════════════════════════════════════════════════
玩家视角硬规则(与画面相关,必须严格遵守)
═══════════════════════════════════════════════════════════════════
- 玩家本人**永远不出现在画面里**——不画 player 的身体、手、肩膀、背影、剪影、脚、头发
- integratedPrompt 中**绝对禁止**出现下列英文(或中文等价):
"first-person view" · "POV of the protagonist" · "player's hand / arm / shoulder / back"
"protagonist visible" · "from the player's perspective" · "MC" · "player's silhouette"
- 镜头是一个"隐形的观察者位置"——可以位于玩家的视角附近(NPC 像在看玩家),但**绝不画出玩家本身**
═══════════════════════════════════════════════════════════════════
动态镜头策略(根据入口 beat 的 speaker 字段选择镜头)
═══════════════════════════════════════════════════════════════════
你会收到 entryBeatSpeaker 字段。按以下规则选镜头:
【entryBeatSpeaker = 某个 NPC 名字】 → NPC 正在对玩家说话
- 优先 **close-up 或 medium close-up**,NPC 看向画面外(= 看玩家)
- 关键英文:close-up / medium close-up, looking toward camera, eyes meeting the viewer,
direct gaze, lips parted mid-speech
- 制造"她正在对你说话"的代入感(galgame 经典直视镜头)
【entryBeatSpeaker = "你"】 → 玩家正在对 NPC 说话
- 优先 **medium shot**,NPC 居中,做"在听玩家说话"的姿态
- 关键英文:medium shot, attentively listening, facing the camera,
head slightly tilted, expression of attention
- ❌ 不要写 over-the-shoulder(因为这会暗示画出玩家肩膀,违反 POV 规则)
【entryBeatSpeaker 为空】 → 纯环境 / 旁白 beat
- 优先 **wide establishing shot**,展现环境氛围
- 关键英文:wide establishing shot, atmospheric mood, environmental detail
- 如果有 NPC 在场,他们可以处于远处 / 中景 / 自然状态(不必看镜头)
【entryBeatActive 有多个角色】 → 群像
- 使用 **medium group shot 或 medium wide shot**,多人在一个框内
- 关键英文:medium group shot, two-shot / three-shot, characters arranged in the frame
═══════════════════════════════════════════════════════════════════
输出 JSON 结构
═══════════════════════════════════════════════════════════════════
{
"shotType": "close-up / medium shot / wide establishing / medium group shot / ...",
"integratedPrompt": "English. Environment + composition + character positioning + camera language. No dialogue boxes, no UI. 80-150 words."
}
写作要求:
- integratedPrompt **必须英文**,遵循 FLUX prompt engineering 习惯(形容词 + 短语,英文逗号分隔,必要时短句)
- 提到具体角色时**只用其名字 + 动作**,例如 "Natsumi standing by the window, head slightly bowed"——绝不要写她长什么样
- 不描述任何 UI、字幕、对话框、边框
- 不描述图像之外的事情(不要写"this scene depicts..."这种 meta 句)
- 长度 80–150 英文词
不要输出 JSON 以外的任何文本。`;
// Stable hint block — invariant across every Cinematographer call in a
// session. Front-loading this (with the session-scoped styleGuide) gives the
// prefix cache something substantial to anchor on; without it, the per-scene
// `sceneSummary` would land in the first content chunk and force the whole
// user message to miss. Long enough to land beyond the 64-token chunk
// boundary that follows the system prompt.
/**
 * Constant instruction block placed right after the style guide in every
 * Cinematographer user message. It starts and ends with an empty line.
 */
const CINE_STABLE_HINT = [
  "",
  "以下为本次场景的输入。请基于这些信息:",
  "1. 选择最合适的 shotType(依据 system prompt 的动态镜头策略 + entryBeatSpeaker)。",
  "2. 写一段**只用英文**的 integratedPrompt——纯环境 + 构图 + 角色姿态/位置;服饰由画师另外通过 referenceImages 锁定,你只描述能看到的样貌与镜头。",
  "3. 若上一场与本场 sceneKey 相同,**强调连续性**(时段/情绪/构图微调),而不是重新设定空间。",
  "4. 严格按 system prompt 要求的 JSON schema 输出。",
  "",
].join("\n");

/**
 * Builds the user message for one Cinematographer call.
 *
 * Layout: a stable prefix (style guide + CINE_STABLE_HINT) followed by the
 * per-scene part — scene summary, the characters in the opening frame with
 * their poses, a camera hint derived from `entryBeatSpeaker`, an optional
 * continuity note, and the closing JSON request. Every section header is
 * emitted even when its content is empty.
 *
 * Compared with the previous text, the character rows now separate name and
 * pose with a colon, the `speaker = …` remarks are wrapped in a balanced
 * pair of parentheses, and a comma separates the shot name from the NPC name.
 *
 * @param sceneSummary     Writer's summary of the current scene.
 * @param styleGuide       Session-wide art style.
 * @param entryBeatActive  Characters present in the opening beat.
 * @param entryBeatSpeaker "你" when the player speaks, an NPC name when that
 *                         NPC speaks, empty/undefined for a narration beat.
 * @param priorSceneKey    sceneKey of the previous scene, if any.
 * @param currentSceneKey  sceneKey of this scene, if any.
 * @returns The message, parts joined with "\n".
 */
export function buildCinematographerUserMessage(
  sceneSummary: string,
  styleGuide: string,
  entryBeatActive: BeatActiveCharacter[],
  entryBeatSpeaker: string | undefined,
  priorSceneKey: string | undefined,
  currentSceneKey: string | undefined,
): string {
  const parts: string[] = [];

  // ─── STABLE PREFIX ──────────────────────────────────────────────────
  // Style guide first, then the constant hint block, so consecutive calls
  // in a session share the same leading text.
  parts.push(`全局美术画风:${styleGuide}`);
  parts.push(CINE_STABLE_HINT);

  // ─── DYNAMIC SUFFIX ─────────────────────────────────────────────────
  parts.push(`当前场景(来自编剧):${sceneSummary}`);
  parts.push("");
  parts.push("开场画面里的角色及其姿态:");
  if (entryBeatActive.length > 0) {
    for (const c of entryBeatActive) {
      // Colon between name and pose — without it the two run together.
      parts.push(`- ${c.name}:${c.pose ?? "(无具体姿态描述)"}`);
    }
  } else {
    parts.push("(无角色,纯环境)");
  }
  parts.push("");

  // entryBeatSpeaker selects the camera policy: "你" → the player speaks,
  // an NPC name → that NPC speaks, empty → narration / environment beat.
  if (entryBeatSpeaker === "你") {
    parts.push(
      '开场 beat 是**玩家说话**(speaker = "你")——按动态镜头策略:medium shot,NPC 居中、做听玩家说话的姿态、看向画面外。**绝不要画出玩家**。',
    );
  } else if (entryBeatSpeaker) {
    parts.push(
      `开场 beat 是 **${entryBeatSpeaker} 在对玩家说话**(speaker = "${entryBeatSpeaker}")——按动态镜头策略:close-up 或 medium close-up,${entryBeatSpeaker} 看向画面外(看玩家),眼神交流。`,
    );
  } else {
    parts.push(
      "开场 beat 没有 speaker(纯旁白/环境)——按动态镜头策略:wide establishing shot 展现环境氛围。",
    );
  }

  // Continuity note only when both keys are known and identical.
  if (priorSceneKey && currentSceneKey && priorSceneKey === currentSceneKey) {
    parts.push(
      `\n注意:上一场和本场 sceneKey 都是 "${currentSceneKey}"——画师会把上一张场景图作为 referenceImages 之一锚定同一空间。integratedPrompt 应强调连续性。`,
    );
  }

  parts.push("\n请输出 shotType + integratedPrompt,严格以 JSON 格式返回。");
  return parts.join("\n");
}
// ──────────────────────────────────────────────────────────────────────
// 4. Painter (画师) — final image prompt assembly.
//
// Not an LLM agent — a pure prompt-building function that combines the
// Cinematographer's integratedPrompt with character archetype blocks
// (visual cards) and the standard FLUX constraints.
// ──────────────────────────────────────────────────────────────────────
/**
 * Assembles the final image-generation prompt.
 *
 * Combines, in order: an orientation-specific header, the art style, the
 * cinematographer's integratedPrompt, an optional CHARACTER ARCHETYPES
 * section (one card per character that has a visualDescription), the fixed
 * "no UI / no text" rules with the matching orientation rule, and the
 * player-POV rules.
 *
 * @param integratedPrompt Composition prompt from the cinematographer.
 * @param styleGuide       Session-wide art style.
 * @param characters       Characters to anchor; entries without a
 *                         visualDescription are skipped.
 * @param orientation      "portrait" selects the 9:16 wording; any other
 *                         value selects the 16:9 landscape wording.
 * @returns The prompt, lines joined with "\n".
 */
export function buildPainterPrompt(
  integratedPrompt: string,
  styleGuide: string,
  characters: { name: string; visualDescription?: string }[],
  orientation: Orientation = "landscape",
): string {
  // One archetype card per character that actually has a visual card.
  const cards: string[] = [];
  for (const { name, visualDescription } of characters) {
    if (visualDescription) {
      cards.push(`[CHARACTER: ${name}]\n${visualDescription}`);
    }
  }
  let archetypeSection = "";
  if (cards.length > 0) {
    archetypeSection = `\n\nCHARACTER ARCHETYPES (anchor identity, outfit, and style across scenes — keep each character visually identical to their archetype):\n${cards.join("\n\n")}`;
  }

  let header: string;
  let orientationRule: string;
  if (orientation === "portrait") {
    header =
      "Generate a cinematic vertical (portrait) background illustration, 9:16 tall format (1024x1792).";
    orientationRule =
      "- 9:16 PORTRAIT orientation — taller than wide. No landscape or square output.";
  } else {
    header =
      "Generate a cinematic landscape background illustration, 16:9 widescreen (1792x1024).";
    orientationRule =
      "- 16:9 LANDSCAPE orientation — wider than tall. No portrait or square output.";
  }

  return [
    header,
    `ART STYLE: ${styleGuide}`,
    "SCENE COMPOSITION (from cinematographer — environment + camera framing + character positioning):",
    // The archetype section carries its own leading blank line.
    `${integratedPrompt}${archetypeSection}`,
    "STRICT RULES — NEVER violate these:",
    "- DO NOT draw any dialogue boxes, speech bubbles, text panels, or any rectangular overlay.",
    "- DO NOT draw any buttons, choice options, menu items, or interactive UI elements.",
    "- DO NOT render any Chinese or English text anywhere in the image.",
    "- DO NOT add any HUD, interface chrome, or game UI elements.",
    "- The image is a PURE BACKGROUND SCENE ONLY. All UI will be added as HTML on top.",
    orientationRule,
    "- Leave the bottom 35% of the frame relatively uncluttered (darker or softer) so overlaid UI panels remain readable.",
    "- Characters or key scene elements should be positioned in the upper 65% of the frame.",
    "- Maintain character identity exactly as specified in CHARACTER ARCHETYPES — same face, same hairstyle, same outfit across every scene.",
    "PLAYER POV RULES — the player / protagonist is the unseen viewer:",
    "- The player / protagonist is NEVER visible in the frame — no body parts, no hands, no shoulders, no back of head, no silhouette, no feet, no hair.",
    "- DO NOT use first-person POV that implies the player's body in frame.",
    "- When an NPC is speaking to the player, they SHOULD look toward the camera (toward the player's implied position) — this creates eye contact without showing the player.",
    "- The camera position represents the player's gaze; only NPCs, scenery, and objects are rendered.",
  ].join("\n");
}
// Character portrait prompt — for the per-character base image generated
// once when the CharacterDesigner introduces a new character. The portrait
// is used both as a client-side asset (立绘登场) and as a referenceImages
// entry when rendering later scenes for visual consistency.
/**
 * Builds the image prompt for a single character's reference portrait:
 * one character, neutral pose and expression, plain background, no text or
 * UI, 16:9 landscape.
 *
 * @param charName          Name shown in the CHARACTER header line.
 * @param visualDescription The character's visual card.
 * @param styleGuide        Session-wide art style.
 * @returns The prompt, lines joined with "\n".
 */
export function buildCharacterPortraitPrompt(
  charName: string,
  visualDescription: string,
  styleGuide: string,
): string {
  const intro =
    "Character concept portrait sheet, single character, full-body or upper-body composition, neutral standing pose, looking toward camera, neutral expression, plain neutral background (no environment, no scenery).";
  const rules = [
    "- ONE character only — no other people, no crowd, no background characters.",
    "- Plain neutral background (off-white or soft gradient). NO environment, NO furniture, NO props beyond what's worn.",
    "- Neutral, calm pose and expression — this is a reference sheet, not a dramatic shot.",
    "- NO text, NO UI, NO watermark, NO border.",
    "- The character should be clearly visible and centered, the pose natural and relaxed.",
    "- 16:9 landscape orientation.",
  ];
  return [
    intro,
    `ART STYLE: ${styleGuide}`,
    `CHARACTER (${charName}):`,
    visualDescription,
    "STRICT RULES:",
    ...rules,
  ].join("\n");
}
// ──────────────────────────────────────────────────────────────────────
// Insert-Beat — given a freeform action (background click or typed
// input) that stays *within* the current scene, generate 1–3 beats of
// meaningful character interaction followed by 2 follow-up choices.
// Single-agent path; no character design / no rendering involved.
// ──────────────────────────────────────────────────────────────────────
/**
 * System prompt for the insert-beat path: the player performed a freeform
 * action inside the current scene, and the model must answer with 1–3 beats
 * plus exactly 2 follow-up choices, as strict JSON with `beats` and
 * `choices` arrays.
 *
 * Punctuation repaired relative to the previous text: three parentheses
 * that were opened but never closed now close (the choice examples, the
 * "show, don't tell" aside, and the list of forbidden POV aliases), and the
 * two lineDelivery rules have a comma between condition and consequence.
 */
export const INSERT_BEAT_SYSTEM = `你是视觉小说编剧。玩家在当前场景内做了一个自由动作(可能是点击画面中的某个物件/角色,也可能是主动输入了一句话/动作)。请基于此动作,写出**1-3 个有实质内容的 beat**,并在最后给出 2 个后续选项供玩家选择。
核心原则——**玩家的动作必须得到回应**:
- 如果当前场景有 NPC 在场,NPC **必须对玩家的动作做出反应**(说话、表情变化、动作回应)。用 narration 描述玩家的动作,用 speaker + line 写 NPC 的回应。
- 如果场景中没有 NPC(纯环境),可以用 narration 描述玩家的观察/发现,给玩家一个新细节或情绪波动。
- 不要写"你想做什么但没做"这种无意义的犹豫——玩家已经做了,世界要有反馈。
beat 数量指引:
- 简单观察/短回应:1 个 beat 即可
- 有来有回的对话/有展开的互动:2-3 个 beat,让反应更有层次
- 每个 beat 的 narration + line ≤100 字
后续选项(choices)——每次**必须**给出 2 个选项:
- 选项应**承接刚才的互动**,给玩家自然的下一步
- 至少一个选项应能推动剧情前进(如"继续追问"、"走过去看看"、"做出某个决定")
- label:玩家看到的选项文字(≤15字)
- effect:描述选这个选项后会发生什么(供下一个编剧参考)
文本风格约束:
- narration / line 用中文,**纯净可显示文本**,不要写 (叹气)(语速快) 这类配音标注
- 不要打破当前场景的物理状态(玩家仍在原地)
- 内容要"有所得"——一个新细节、一丝潜台词、一次真实的交流(show, don't tell)
- 白描为主:聚焦可观察的五感与物理特征,以角色的动作/神态本身传递情绪,不要以作者角度解释或议论;不写角色眼神/语气里的情绪(这些从台词与动作中自行体会)
speaker 字段允许的取值**只有两种**(与主路径 Writer 一致 — Pattern B galgame 标准):
1. **已登记角色**里的 NPC 真名(**绝不允许引入新角色**)
2. **"你"** — 玩家本人开口说话(对白框显示,但不调 TTS)
其它任何 POV 变体(玩家 / 我 / 主角 / protagonist / player / MC / I / me)**一律错误**,请用 "你" 代替。
推荐模式(有 NPC 在场时):
narration = 描述玩家做了什么(动作/表情/心理)
speaker = NPC 真名
line = NPC 的回应台词
lineDelivery = 配音导演指令
- 如果有 line 且 speaker = NPC,**必须**给出 lineDelivery(配音导演指令)
- 如果有 line 且 speaker = "你",lineDelivery 可以留空(玩家对白不调 TTS)
必须输出严格 JSON
{
"beats": [
{ "narration": "...", "speaker": "...", "line": "...", "lineDelivery": "..." }
],
"choices": [
{ "label": "选项文字", "effect": "选此选项后的剧情走向" },
{ "label": "选项文字", "effect": "选此选项后的剧情走向" }
]
}
不要输出 JSON 以外的任何文本。`;
/**
 * Builds the user message for the insert-beat call.
 *
 * Sections, in order: world setting; player name (when set); the list of
 * registered character names (when any); for the latest history entry —
 * its scenePrompt, what happened in the last visited beat, and the NPCs
 * active in that beat; the player's freeform action; the closing JSON
 * request; and the output-language directive when one applies.
 *
 * Compared with the previous implementation, the last visited beat is
 * looked up once instead of twice, speaker and line are separated by a
 * colon, and the active-NPC row has a colon after its label.
 *
 * @param session        Session supplying world, characters, history and
 *                       language.
 * @param freeformAction One-sentence description of what the player did.
 * @returns The message, parts joined with "\n".
 */
export function buildInsertBeatUserMessage(
  session: Session,
  freeformAction: string,
): string {
  const parts: string[] = [];
  parts.push(`世界观:${session.worldSetting}`);
  if (session.playerName) {
    parts.push(
      `玩家名字:${session.playerName}(NPC 对话时用此名字称呼玩家;speaker 字段仍固定为 "你" 不变)`,
    );
  }
  if (session.characters.length > 0) {
    parts.push("\n已登记角色(speaker 只能用这些名字):");
    for (const c of session.characters) {
      parts.push(`- ${c.name}`);
    }
  }

  const current = session.history.at(-1);
  if (current) {
    const scene: Scene = current.scene;
    parts.push(`\n当前场景:${scene.scenePrompt}`);

    // Last beat the player saw; fall back to the entry beat when nothing
    // has been recorded as visited.
    const lastBeatId = current.visitedBeatIds.at(-1) ?? scene.entryBeatId;
    const lastBeat = scene.beats.find((b) => b.id === lastBeatId);
    if (lastBeat) {
      const recent: string[] = [];
      if (lastBeat.narration) recent.push(`旁白:${lastBeat.narration}`);
      if (lastBeat.line) {
        recent.push(`${lastBeat.speaker ?? "?"}:${lastBeat.line}`);
      }
      if (recent.length) parts.push(`刚才发生:${recent.join(" / ")}`);
    }

    // NPCs on screen in that same beat — they should be the ones reacting.
    const activeNpcs = lastBeat?.activeCharacters?.map((c) => c.name) ?? [];
    if (activeNpcs.length > 0) {
      parts.push(
        `当前画面中在场的 NPC:${activeNpcs.join("、")}(优先让在场 NPC 回应玩家)`,
      );
    }
  }

  parts.push(`\n玩家此刻的自由动作:${freeformAction}`);
  parts.push("\n请生成 beat(1-3 个)和 2 个后续选项,严格以 JSON 格式返回。");

  const langDirective = buildLanguageDirective(session.language);
  if (langDirective) parts.push(langDirective);
  return parts.join("\n");
}
// ──────────────────────────────────────────────────────────────────────
// Vision — interprets a background click and classifies the action.
// Unchanged from staging (UI choices live in HTML, vision only judges
// background clicks).
// ──────────────────────────────────────────────────────────────────────
/**
 * System prompt for the vision model that interprets a click on the
 * background image (marked by a red dot).
 *
 * The model is asked to identify what the dot points at, infer the player's
 * intent, and classify the action as "insert-beat" (pure observation only)
 * or "change-scene" (the default when unsure). The reply must be strict
 * JSON with three fields: freeformAction, classify and reasoning.
 */
export const VISION_SYSTEM_PROMPT = `你是视觉理解助手。玩家在视觉小说的背景图上点击了红色圆点位置(HTML 上的选项按钮不会走到你这里)。你的任务是:
1. 看清红点指向画面里的什么(物件、角色、空间、远处的方向)
2. 推断玩家想干什么
3. 判断这个动作是「场内探索」还是「场景切换」
判断准则:
- "change-scene"(场景切换):走向画面深处的门 / 走廊、转头看向新方向(视角变了)、点了远处的另一个空间、暗示时间跳跃的物件(如时钟)、调查某个物件/线索导致剧情发展、与角色进行有实质影响的互动
- "insert-beat"(场内探索):**仅限**纯粹的观察——看一眼某个无剧情意义的装饰、环顾四周
- 拿不准时偏向 "change-scene"——玩家主动点击画面说明想要推进剧情
必须输出严格 JSON
{
"freeformAction": "玩家想做什么的一句中文描述,例如「想拿起桌上的钥匙」",
"classify": "insert-beat" 或 "change-scene",
"reasoning": "一句话说明判断理由"
}
不要输出 JSON 以外的任何文本。`;
/**
 * Builds the text part of the vision request.
 *
 * @param scene Current scene, or null when none is available.
 * @returns With a scene: its scenePrompt on the first line followed by the
 *   red-dot instruction. Without one: a bare request to judge the player's
 *   intent.
 */
export function buildVisionUserPrompt(scene: Scene | null): string {
  if (scene) {
    return [
      `当前场景描述:${scene.scenePrompt}`,
      "红点位置即为玩家点击位置。请判断玩家意图与分类,以 JSON 格式返回。",
    ].join("\n");
  }
  return "请判断玩家意图,并以 JSON 格式返回。";
}
// ──────────────────────────────────────────────────────────────────────
// Freeform Classify — classifies a player's freeform text input at a
// choice node into one of two paths: insert a beat in-scene
// ("insert-beat") or trigger a scene change ("change-scene").
// ──────────────────────────────────────────────────────────────────────
/**
 * System prompt for classifying a player's typed freeform input at a choice
 * node as either "insert-beat" (stay in the scene) or "change-scene"
 * (advance to a new scene; the default when unsure). The reply must be
 * strict JSON with two fields: classify and freeformAction.
 *
 * The "insert-beat" criterion now carries the same colon after its label as
 * the sibling "change-scene" criterion.
 */
export const FREEFORM_CLASSIFY_SYSTEM = `你是交互视觉小说的意图分类助手。玩家在一个选择节点输入了自由文本(而非点击已有选项)。你要判断这个输入最适合走哪条路径:
1. "insert-beat":玩家想在当前场景内与角色互动(问一句话、做一个动作、表达情绪、调查某个东西)→ NPC 会对玩家的动作做出回应,但不切换场景
2. "change-scene":玩家想去别的地方、做出重大决定、推动剧情到新阶段 → 切换到全新场景
判断准则:
- "change-scene":大多数主动输入——问问题、说一句话、做一个动作、对角色做出反应、想去别的地方、做出决定、推动剧情 → 玩家花精力打字说明想让故事前进
- "insert-beat":**仅限**纯粹的环境观察或无实际影响的自言自语
- 拿不准时偏向 "change-scene"——玩家主动输入说明想要推进剧情
必须输出严格 JSON
{
"classify": "insert-beat" 或 "change-scene",
"freeformAction": "玩家想做什么的一句中文描述(用于后续编剧参考)"
}
不要输出 JSON 以外的任何文本。`;
/**
 * Builds the user message for the freeform-classify call.
 *
 * @param freeformText What the player typed; wrapped in 「」 in the message.
 * @param scenePrompt  Description of the current scene; the scene line is
 *                     omitted when this is undefined or empty.
 * @returns Optional scene line, the player input and the closing JSON
 *   request, joined with "\n" (the latter two each carry a leading newline
 *   of their own).
 */
export function buildFreeformClassifyUserMessage(
  freeformText: string,
  scenePrompt: string | undefined,
): string {
  const sceneLine = scenePrompt ? [`当前场景:${scenePrompt}`] : [];
  return [
    ...sceneLine,
    `\n玩家输入:「${freeformText}」`,
    "\n请判断分类,以 JSON 格式返回。",
  ].join("\n");
}
/**
 * The two Character fields a painter prompt needs: `name` and
 * `visualDescription`.
 *
 * NOTE(review): buildPainterPrompt declares its own inline
 * `{ name; visualDescription? }` element type rather than using this alias —
 * confirm whether the two are meant to stay in sync.
 */
export type PainterCharacterInput = Pick<Character, "name" | "visualDescription">;