// infiplot-web/lib/engine/prompts.ts

import type {
  BeatActiveCharacter,
  Character,
  CharacterIntent,
  Orientation,
  Scene,
  Session,
  StoryState,
  WriterPlan,
} from "@infiplot/types";
import { formatStepfunCatalogForPrompt } from "@infiplot/tts-client";

// ══════════════════════════════════════════════════════════════════════
//  Output-language directive — appended to user messages so the AI's
//  GENERATED dialogue, narration, and voice-design text follow the UI
//  locale the player picked. Returns "" for zh-CN (the prompts' native
//  language) so existing sessions behave byte-identically to before.
//
//  We intentionally append this as a TRAILING one-liner rather than
//  rewriting the system prompts in the target language — the prompts body
//  is the cacheable / reviewed / future-edit-friendly asset, and a single
//  trailing directive is enough for modern LLMs to switch their output
//  language while still receiving Chinese instructions.
// ══════════════════════════════════════════════════════════════════════
// Display name (written in its own language) for every output language the
// directive knows how to request. Keys are UI locale codes.
const LANG_LABELS: Record<string, string> = {
  "zh-CN": "简体中文",
  en: "English",
  ja: "日本語",
};

/**
 * Returns a one-line Chinese instruction telling the LLM to produce its
 * free-form output (dialogue, narration, voice-design text) in the player's
 * selected UI language.
 *
 * Always phrased in Chinese regardless of `language` because the system
 * prompts are Chinese; the directive instructs the model to *output* in the
 * target language, not to read prompts in it.
 *
 * @param language UI locale code (e.g. "en", "ja"); may be undefined.
 * @returns The directive, prefixed with a newline so it can be appended to a
 *   user message, or "" when no directive is needed: missing language, zh-CN
 *   (the prompts' native language), or a locale not listed in LANG_LABELS.
 */
export function buildLanguageDirective(language: string | undefined): string {
  if (!language || language === "zh-CN") return "";
  // Own-property lookup only: a plain object literal also answers for
  // inherited keys such as "constructor" / "toString" / "__proto__", which
  // would otherwise be interpolated into the prompt as a stringified
  // function or object instead of being treated as an unknown locale.
  const label = Object.prototype.hasOwnProperty.call(LANG_LABELS, language)
    ? LANG_LABELS[language]
    : undefined;
  if (!label) return "";
  return `\n【输出语言】你产出的所有自然语言内容（对白台词 line / 旁白 narration / sceneSummary / storyState 各字段 / voiceDescription / lineDelivery 等）必须使用「${label}」；JSON 字段名、sceneKey、英文 visualDescription / painting prompt 仍按各 agent 既有规则。`;
}

// ══════════════════════════════════════════════════════════════════════
//  Multi-agent scene generation pipeline:
//    Architect (总编剧)    — ONE-TIME at session start: the story bible
//                           (protagonist / logline / genre / opening hook /
//                            planned cast) → seeds StoryState
//    Writer (编剧)         — narrative + beats[] + per-beat activeCharacters,
//                           reads StoryState and emits a StoryStatePatch
//    CharacterDesigner    — per-new-character visual + voice cards
//    Cinematographer (分镜导演) — sceneKey + English compositional prompt
//    Painter (画师)        — FLUX rendering with character archetypes
//
//  Each agent owns one system prompt + one user-message builder below.
//  All agents see the same world / style guide, but each only reads the
//  slice of session state it needs to make its decision.
// ══════════════════════════════════════════════════════════════════════

// ──────────────────────────────────────────────────────────────────────
//  Shared — render the StoryState bible into compact prompt blocks read
//  by the Writer (and Architect, on revisions). Keeping the renderers in one
//  place means the bible looks identical to every agent that consumes it.
// ──────────────────────────────────────────────────────────────────────

// ── Story bible — split spine / dynamic for prefix-cache friendliness ──
//
// SPINE = Architect-set, never updated by Writer's storyStatePatch:
//   logline / genreTags / protagonist / castNotes
//   → goes in the STABLE PREFIX of every Writer user message
//
// DYNAMIC = patched every scene by the Writer:
//   synopsis / relationships / openThreads / nextHook
//   → goes in the DYNAMIC SUFFIX
//
// Keep both sections present even when empty (固定 section) so position is
// stable across calls — a missing section here would shift every byte after
// it and torch the cache.

/**
 * Renders the spine half of the story bible: a fixed header followed by the
 * logline, genre tags, protagonist and cast notes, one per line.
 *
 * Every line is always emitted; a null/undefined field (or a missing state
 * object) is shown as the "（未设定）" placeholder so the block keeps the same
 * layout on every call.
 */
export function renderStoryStateSpine(s: StoryState | undefined): string {
  const unset = "（未设定）";
  return [
    "【故事档案 · 主轴（不变）】",
    `主线（中心钩子）：${s?.logline ?? unset}`,
    `题材基调：${s?.genreTags ?? unset}`,
    `主角「你」：${s?.protagonist ?? unset}`,
    `核心配角：${s?.castNotes ?? unset}`,
  ].join("\n");
}

/**
 * Renders the dynamic half of the story bible: a fixed header followed by
 * synopsis, relationships, open threads and the next hook.
 *
 * Scalar fields fall back to "（暂无）" when null/undefined. The two list
 * fields render as a newline-led "- item" bullet list when non-empty and as
 * the same "（暂无）" placeholder when missing or empty, so all four section
 * labels are present on every call.
 */
export function renderStoryStateDynamic(s: StoryState | undefined): string {
  const none = "（暂无）";
  // Bullet list starting on its own line, or the placeholder for no items.
  const bullets = (items: readonly string[] | null | undefined): string =>
    items?.length ? "\n" + items.map((item) => `- ${item}`).join("\n") : none;

  return [
    "【故事档案 · 当前状态（每幕更新）】",
    `已发生（梗概）：${s?.synopsis ?? none}`,
    `当前关系/情绪：${bullets(s?.relationships)}`,
    `未收的悬念/伏笔：${bullets(s?.openThreads)}`,
    `接下来要往哪走（下一个钩子方向）：${s?.nextHook ?? none}`,
  ].join("\n");
}

// ──────────────────────────────────────────────────────────────────────
//  Paradigm D — merged Writer (single-pass streaming with tagged output)
// ──────────────────────────────────────────────────────────────────────

// The Writer prompt has been refactored into a segment-driven builder.
// See lib/engine/prompts/segments/writer/ for individual prompt segments.
// See lib/engine/prompts/registry.ts for segment registration.
// See lib/engine/prompts/builder.ts for assembly logic.

export { buildWriterStreamMessages } from "./prompts/builder";

/**
 * Formats a single history entry as a self-contained text block whose layout
 * does not depend on where it lands in the prompt.
 *
 * Layout: a "【场景 N】" header, an optional sceneKey line, one indented line
 * per visited beat (narration and/or "speaker：line", joined by " / "), and a
 * final line describing how the player left the scene, if they did.
 *
 * @param entry One element of `session.history`.
 * @param index Number printed in the header; used verbatim.
 */
export function renderHistoryEntry(
  entry: Session["history"][number],
  index: number,
): string {
  const { scene, visitedBeatIds, exit } = entry;
  type Beat = (typeof scene.beats)[number];

  const out: string[] = [`【场景 ${index}】`];
  if (scene.sceneKey) out.push(`  sceneKey: ${scene.sceneKey}`);

  // Index beats by id; when ids repeat, the later beat wins.
  const beatById = new Map<Beat["id"], Beat>();
  for (const beat of scene.beats) beatById.set(beat.id, beat);

  // With no recorded visits, fall back to the scene's entry beat.
  const beatIds = visitedBeatIds.length ? visitedBeatIds : [scene.entryBeatId];
  for (const beatId of beatIds) {
    const beat = beatById.get(beatId);
    if (!beat) continue; // ids that match no beat are silently skipped

    const pieces = [
      beat.narration ? `旁白：${beat.narration}` : "",
      beat.line ? `${beat.speaker ?? "?"}：${beat.line}` : "",
    ].filter((piece) => piece !== "");
    if (pieces.length > 0) out.push("  " + pieces.join(" / "));
  }

  if (exit?.kind === "choice") {
    out.push(`  玩家最终选择：${exit.label}（去往：${exit.nextSceneSeed}）`);
  } else if (exit) {
    out.push(`  玩家自由动作：${exit.action}`);
  }
  return out.join("\n");
}


// ──────────────────────────────────────────────────────────────────────
//  2. CharacterDesigner (角色设定师) — designs one new character.
//
//  Receives a character NAME (extracted by the Writer's activeCharacters)
//  and produces BOTH the English visual card AND the Chinese voice card
//  in a single LLM call. Bundling these two is intentional: a single agent
//  that "knows who this character is" produces internally-consistent
//  appearance + vocal personality, whereas split agents tend to diverge
//  (e.g., gentle-looking character with energetic voice).
// ──────────────────────────────────────────────────────────────────────

// CHARACTER_DESIGNER_SYSTEM is split into a provider-agnostic CORE (visual +
// voice-text rules) and a provider-specific TAIL (the JSON contract). When the
// server runs StepFun, the tail additionally asks the model to pick a preset
// voice id from the 32-entry catalog — so the SAME LLM call that designs the
// character also selects its voice, at zero extra latency. When StepFun is
// off (Xiaomi / no TTS), the tail is byte-identical to the historical prompt
// (Xiaomi path is cache- and behavior-preserving).
//
// The CORE deliberately carries NO output contract; buildCharacterDesignerSystem()
// appends one of the two tails below. Content of the CORE text, in order:
//   1. Role statement — translate the Writer-given character intent into two
//      media cards; do not invent personality unless no intent was given.
//   2. visualDescription rules — English only, comma-separated tags, no
//      transient pose/expression, no background, 100–180 words, six mandatory
//      elements (hair / eyes / face & build / outfit / vibe / silhouette).
//   3. Differentiation rules — contrast hair colour, eye colour, silhouette
//      and outfit against the "already designed" cast list in the user message.
//   4. voiceDescription rules — gender first, one continuous paragraph,
//      50–80 Chinese characters.
// NOTE(review): the CORE text names Xiaomi MiMo as the voice-card consumer even
// though the same CORE is also used on the StepFun path — confirm this is
// intentional.
const CHARACTER_DESIGNER_SYSTEM_CORE = `你是视觉小说的「角色设定师」——下游的**媒体翻译官**。给你一个**新登场角色的名字**（通常还附带编剧给定的角色性格 / 情绪基调 / 说话基调），你的职责是把这份**已给定的角色意图**忠实翻译成两份媒体卡片：
1. **视觉设定卡（英文）**——给生图模型 FLUX 用，遵循 prompt engineering 风格
2. **音色设定卡（中文）**——给小米 MiMo 配音设计用

你**不发明**角色的性格——性格由编剧主导。你的工作是：**依据给定的性格 / 情绪 / 说话基调，产出最贴合的外貌与音色**。若没有给定性格信息（降级情况），再据角色名 + 世界观自行合理推断。

两份卡片要描绘**同一个人**，且都要贴合给定的角色基调——给定「傲娇腹黑」就别配天真烂漫的外貌与嗓音；给定「声音微颤、欲言又止」音色卡就要体现这份犹豫感。

视觉设定卡 visualDescription 规则：
- **必须完全用英文**
- 风格：用形容词 + 短语，**英文逗号分隔**，符合 FLUX/Stable Diffusion prompt 习惯
- **必须融入全局画风** styleGuide 的美术指向（比如 styleGuide 是「赛博朋克」时，服饰要赛博朋克化）
- **不要写瞬时姿势或表情**（这些由编剧/分镜每帧实时控制）
- 不要包含背景环境（这不是场景图，是角色立绘卡）
- 长度：100–180 个英文词为宜

**必须覆盖的 6 大要素 — 缺一项都会让角色撞脸：**
1. **HAIR（头发）** — 同时写明四点：
   ① 发色 hair color（具体到色相 + 明度，例 "platinum blonde" / "deep navy blue" / "warm chestnut brown"，不要只写 "dark hair"）
   ② 发型 hairstyle（具体款式：twin tails / side ponytail / hime cut / undercut / messy bob / long straight / wolf cut...）
   ③ 头发长度 hair length（chin-length / shoulder-length / waist-length / cropped 等明确量级）
   ④ 发饰或刘海特征（blunt bangs / curtain bangs / side-swept / hair ribbon / hairpin，可省但建议有一项）
2. **EYES（眼睛）** — 同时写明：
   ① 瞳色 eye color（具体色相，例 "amber" / "violet" / "icy blue"，不要只写 "dark eyes"）
   ② 眼型 eye shape（almond / round / sharp upturned / droopy / hooded）
   ③ 神情基调 default gaze tone（gentle / piercing / sleepy / mischievous，不写瞬时表情）
3. **FACE & BUILD（脸型 + 体格）** — 写 1–2 条标志性特征：
   - 脸型轮廓（oval / heart-shaped / sharp jawline / soft round）
   - 身高与体型相对感（tall and slim / petite / athletic build / broad shoulders）
   - 一个独特识别点（small mole below left eye / faint freckles / round glasses / fang teeth / scar across brow），用来在画面里第一眼区分
4. **OUTFIT（服饰）** — 同时写明：
   ① 主体款式（school uniform / casual streetwear / formal suit / kimono / lab coat / military / cyberpunk jacket...）
   ② 配色（主色 + 强调色，例 "navy blazer with crimson tie"，不要只写 "dark uniform"）
   ③ 至少一个标志性细节（collar shape / asymmetric hem / layered scarf / fingerless gloves / chunky boots / accessory like a pendant or earring）
   ④ 必须与 styleGuide 美术指向一致
5. **PERSONALITY-DRIVEN VIBE（性格→气质映射）** — 一句话：
   - 用 2–3 个性格关键词（gentle and reserved / sharp and aloof / cheerful and brash / cool and analytical / lazy and easygoing）
   - 说明这个性格如何投射到整体气场与氛围（approachable warmth / intimidating presence / quiet confidence / carefree aura / scholarly composure），不要写具体姿势动作
6. **OVERALL SILHOUETTE & VIBE TAG（整体剪影 + 一句气质标签）** — 一句话总结这个角色"远远一看就能认出来"的剪影特征

**差异化硬规则 — 避免与已设定角色撞型：**
你会收到「已设定角色清单」，每个条目包含 name + visualDescription。在落笔前**先在心里扫一遍**清单，提取每个角色的 hair color / hair length / eye color / outfit style，然后为新角色挑选**明显对比**的属性组合：
- **发色不能撞**：已有黑发 → 新角色避免黑、深棕；已有金发 → 新角色避免银、浅栗；至少跨一个色系（黑/棕/金/红/橙/银/灰/蓝/紫/绿）
- **瞳色不能撞**：同发色规则，跨色系挑选
- **剪影不能撞**：已有长直发 → 新角色用短发 / 双马尾 / 卷发 / 扎发；用"发长 × 发型"两个维度造差异
- **服饰风格至少一处明显差异**：款式（制服 vs 便服 vs 正装）、主色（暖 vs 冷）、轮廓（紧身 vs 宽松 / 长 vs 短）三者中至少一项明显不同
- 若剧情强制视觉相似（如双胞胎），必须在配饰或配色上做一处显著识别点

落笔顺序建议：先决定 personality keywords → 由性格反推合适的发色 / 服饰倾向 → 再与已有角色对照确认差异 → 最后写成英文 tag 串。

音色设定卡 voiceDescription 规则：
- **必须以明确性别开头**："女性，…" / "男性，…"
- 随后描述：年龄段（如「约17岁少女」「30 出头男性」）、音色质感、性格情绪基调、语速节奏、人设腔调、口音方言
- 用中文，整段连续描述，不分段
- 长度：50–80 个中文字为宜
- 例："女性，约17岁少女，音色清亮带点稚嫩甜美，性格开朗外向但容易害羞，语速偏快，标准普通话"`;

// JSON-contract tail for the NON-stepfun path (Xiaomi voicedesign / no TTS).
// Byte-identical to the historical prompt so the Xiaomi path keeps its cache
// hit rate and voice quality unchanged.
//
// Requests a strict JSON object with exactly two keys: visualDescription and
// voiceDescription. The text starts with a blank line so it can be
// concatenated directly after CHARACTER_DESIGNER_SYSTEM_CORE.
const CHARACTER_DESIGNER_TAIL_DEFAULT = `

必须输出严格 JSON：
{
  "visualDescription": "English visual card, comma-separated tags...",
  "voiceDescription": "中文音色卡，以性别开头..."
}

不要输出 JSON 以外的任何文本。`;

// JSON-contract tail for the StepFun path. Same core output, plus the model
// picks a preset voice id from the catalog. The id must match the SAME person
// the voiceDescription describes (gender / age / vibe) — designed together so
// appearance and voice stay coherent (the same invariant the CORE enforces).
//
// Output contract: the two keys of the default tail plus stepfunVoiceId.
// formatStepfunCatalogForPrompt() is interpolated once, when this module-level
// template literal is evaluated, so the catalog text is fixed from then on.
// NOTE(review): the example ids in the selection rules (lengyanyujie,
// ruanmengnvsheng) presumably exist in the catalog — verify against
// @infiplot/tts-client when the catalog changes.
const CHARACTER_DESIGNER_TAIL_STEPFUN = `

**StepFun 预设音色选择（必做）：**
除 voiceDescription 外，你还必须从下列 StepFun 预设音色清单中，为本角色挑选一个与 voiceDescription 描绘的「同一个人」（性别 / 年龄段 / 气质都要一致）最贴合的预设，并把它的 id 填入 stepfunVoiceId。清单：
${formatStepfunCatalogForPrompt()}

挑选原则：
- stepfunVoiceId 必须是上表里某个 id，原样复制（拼写、大小写、连字符都不能变）。
- 必须与 voiceDescription 的性别一致（男声选 male 行，女声选 female 行）。
- 年龄段尽量一致；拿不准时优先气质匹配（例如“冷艳御姐”选 lengyanyujie、“软萌萝莉”选 ruanmengnvsheng）。
- 不允许编造清单外的 id，也不允许留空。

必须输出严格 JSON：
{
  "visualDescription": "English visual card, comma-separated tags...",
  "voiceDescription": "中文音色卡，以性别开头...",
  "stepfunVoiceId": "清单内某个 id"
}

不要输出 JSON 以外的任何文本。`;

/**
 * Assembles the CharacterDesigner system prompt for the active TTS provider:
 * the shared CORE text followed by exactly one JSON-contract tail.
 *
 * @param opts.stepfun true → append the StepFun tail, which additionally asks
 *   for a `stepfunVoiceId` chosen from the catalog rendered by
 *   formatStepfunCatalogForPrompt; false → append the default tail
 *   (the historical Xiaomi / no-TTS prompt).
 */
export function buildCharacterDesignerSystem(opts: {
  stepfun: boolean;
}): string {
  const tail = opts.stepfun
    ? CHARACTER_DESIGNER_TAIL_STEPFUN
    : CHARACTER_DESIGNER_TAIL_DEFAULT;
  return CHARACTER_DESIGNER_SYSTEM_CORE + tail;
}

/**
 * Builds the CharacterDesigner user message for one new character.
 *
 * Sections, in order: character name, world setting and global art style; the
 * Writer-authored intent (only when at least one of mood / motivation /
 * speakingTone is set); the list of characters that already have a
 * visualDescription, framed as a "must contrast with" list; the closing
 * request; and, last, the output-language directive when one applies.
 *
 * @param charName Name of the character to design.
 * @param session  Supplies worldSetting, styleGuide, characters and language.
 * @param intent   Optional Writer intent; when absent or empty the designer
 *   is left to infer from name + world setting.
 */
export function buildCharacterDesignerUserMessage(
  charName: string,
  session: Session,
  intent?: CharacterIntent,
): string {
  const lines: string[] = [
    `角色名：${charName}`,
    `世界观：${session.worldSetting}`,
    `全局美术画风：${session.styleGuide}`,
  ];

  // Writer-authored intent: the designer translates it rather than inventing
  // a personality of its own.
  const mood = intent?.mood;
  const motivation = intent?.motivation;
  const speakingTone = intent?.speakingTone;
  if (mood || motivation || speakingTone) {
    lines.push("\n编剧给定的角色基调（请据此设计，不要另起炉灶）：");
    if (mood) lines.push(`- 情绪基调：${mood}`);
    if (motivation) lines.push(`- 动机 / 目的：${motivation}`);
    if (speakingTone) lines.push(`- 说话基调：${speakingTone}`);
  }

  // Only characters that already carry a visual card can be contrasted against.
  const designed = session.characters.filter((c) => c.visualDescription);
  if (designed.length > 0) {
    lines.push(
      "\n已设定角色清单（**新角色的发色 / 瞳色 / 发型轮廓 / 服饰必须与下方每一位都形成明显视觉对比，不允许撞型**）：",
      ...designed.map((c) => `- ${c.name}: ${c.visualDescription}`),
      "\n落笔前先逐个扫一遍上方角色的 hair color / hair length+style / eye color / outfit style，再为新角色挑选有明显跨色系或跨剪影对比的属性组合。",
    );
  }

  lines.push(
    "\n请为该角色同时设计 visualDescription（英文，必须覆盖 system 中的 6 大要素清单）和 voiceDescription（中文），严格以 JSON 格式返回。",
  );

  // For a non-zh-CN session the trailing directive overrides the "中文"
  // voiceDescription guidance above, so the voice card is written in the
  // player's language as well.
  const directive = buildLanguageDirective(session.language);
  if (directive) lines.push(directive);
  return lines.join("\n");
}

// ──────────────────────────────────────────────────────────────────────
//  3. Cinematographer (分镜导演) — composes the visual frame.
//
//  Reads the Writer's sceneSummary + active characters and produces the
//  English compositional prompt fed to FLUX. Does NOT describe the
//  characters themselves (those archetypes are appended at the Painter
//  stage from session.characters.visualDescription). Only describes the
//  ENVIRONMENT, lighting, camera framing, and how the characters are
//  positioned within the frame.
// ──────────────────────────────────────────────────────────────────────

/**
 * System prompt for the Cinematographer agent.
 *
 * What the text instructs the model to do:
 * - write an English-only `integratedPrompt` covering environment, framing,
 *   character placement/pose and mood — never character appearance;
 * - keep the player entirely out of frame (with an explicit list of banned
 *   first-person phrases);
 * - choose the shot from `entryBeatSpeaker`: NPC name → close-up / medium
 *   close-up, "你" → medium shot, empty → wide establishing shot, and a
 *   group shot when several characters are active;
 * - reply with strict JSON `{ shotType, integratedPrompt }`, 80–150 words.
 */
export const CINEMATOGRAPHER_SYSTEM = `你是视觉小说的「分镜导演」。给你编剧的当前场景概要、活跃角色名单和他们在场景里的姿态描述，以及**入口 beat 的 speaker 信息**（用来决定镜头语言）。你的任务是**只用英文**写一段**纯环境+构图**的描述（integratedPrompt），交给画师作为出图主提示词。

你**不要**写角色的外貌细节——发色、服饰、脸型这些由其他 agent 提供，画师会把"角色档案卡"附加到你的 integratedPrompt 后面。你只关心：
- **环境**：地点、时间、天气、光线、空间细节（什么家具/植物/物件）
- **构图 / 镜头**：景别（wide shot / medium shot / close-up / over-the-shoulder）、机位、视角
- **人物在画面中的位置和姿态**（不写脸 / 不写穿什么——只写"哪个角色站在哪儿、在做什么"）
- **氛围**：情绪基调、色调、影调（warm dusk / cold neon / soft morning light）

═══════════════════════════════════════════════════════════════════
玩家视角硬规则（与画面相关，必须严格遵守）
═══════════════════════════════════════════════════════════════════
- 玩家本人**永远不出现在画面里**——不画 player 的身体、手、肩膀、背影、剪影、脚、头发
- integratedPrompt 中**绝对禁止**出现下列英文（或中文等价）：
    "first-person view" · "POV of the protagonist" · "player's hand / arm / shoulder / back"
    "protagonist visible" · "from the player's perspective" · "MC" · "player's silhouette"
- 镜头是一个"隐形的观察者位置"——可以位于玩家的视角附近（NPC 像在看玩家），但**绝不画出玩家本身**

═══════════════════════════════════════════════════════════════════
动态镜头策略（根据入口 beat 的 speaker 字段选择镜头）
═══════════════════════════════════════════════════════════════════
你会收到 entryBeatSpeaker 字段。按以下规则选镜头：

【entryBeatSpeaker = 某个 NPC 名字】 → NPC 正在对玩家说话
- 优先 **close-up 或 medium close-up**，NPC 看向画面外（= 看玩家）
- 关键英文：close-up / medium close-up, looking toward camera, eyes meeting the viewer,
  direct gaze, lips parted mid-speech
- 制造"她正在对你说话"的代入感（galgame 经典直视镜头）

【entryBeatSpeaker = "你"】 → 玩家正在对 NPC 说话
- 优先 **medium shot**，NPC 居中，做"在听玩家说话"的姿态
- 关键英文：medium shot, attentively listening, facing the camera,
  head slightly tilted, expression of attention
- ❌ 不要写 over-the-shoulder（因为这会暗示画出玩家肩膀，违反 POV 规则）

【entryBeatSpeaker 为空】 → 纯环境 / 旁白 beat
- 优先 **wide establishing shot**，展现环境氛围
- 关键英文：wide establishing shot, atmospheric mood, environmental detail
- 如果有 NPC 在场，他们可以处于远处 / 中景 / 自然状态（不必看镜头）

【entryBeatActive 有多个角色】 → 群像
- 使用 **medium group shot 或 medium wide shot**，多人在一个框内
- 关键英文：medium group shot, two-shot / three-shot, characters arranged in the frame

═══════════════════════════════════════════════════════════════════
输出 JSON 结构
═══════════════════════════════════════════════════════════════════
{
  "shotType": "close-up / medium shot / wide establishing / medium group shot / ...",
  "integratedPrompt": "English. Environment + composition + character positioning + camera language. No dialogue boxes, no UI. 80-150 words."
}

写作要求：
- integratedPrompt **必须英文**，遵循 FLUX prompt engineering 习惯（形容词 + 短语，英文逗号分隔，必要时短句）
- 提到具体角色时**只用其名字 + 动作**，例如 "Natsumi standing by the window, head slightly bowed"——绝不要写她长什么样
- 不描述任何 UI、字幕、对话框、边框
- 不描述图像之外的事情（不要写"this scene depicts..."这种 meta 句）
- 长度 80–150 英文词

不要输出 JSON 以外的任何文本。`;

// Constant instruction block placed right after the styleGuide line of every
// Cinematographer user message. Per the original design note, front-loading
// session-invariant text gives the prefix cache something substantial to
// anchor on; otherwise the per-scene `sceneSummary` would land in the first
// content chunk and the whole user message would miss. It is meant to be long
// enough to reach past the 64-token chunk boundary that follows the system
// prompt.
// The text begins and ends with a line break (blank line on either side once
// the message parts are joined).
// NOTE(review): item 2 tells the model to describe the visible "样貌", which
// reads as conflicting with CINEMATOGRAPHER_SYSTEM's ban on appearance
// details — confirm the intended wording.
const CINE_STABLE_HINT = `
以下为本次场景的输入。请基于这些信息：
1. 选择最合适的 shotType（依据 system prompt 的动态镜头策略 + entryBeatSpeaker）。
2. 写一段**只用英文**的 integratedPrompt——纯环境 + 构图 + 角色姿态/位置；服饰由画师另外通过 referenceImages 锁定，你只描述能看到的样貌与镜头。
3. 若上一场与本场 sceneKey 相同，**强调连续性**（时段/情绪/构图微调），而不是重新设定空间。
4. 严格按 system prompt 要求的 JSON schema 输出。
`;

/**
 * Builds the Cinematographer user message.
 *
 * Layout: a stable prefix (styleGuide + CINE_STABLE_HINT) followed by the
 * per-scene sections — scene summary, entry-beat cast with poses, the camera
 * instruction derived from `entryBeatSpeaker`, an optional continuity note,
 * and the closing request. The cast section is always present; with no active
 * characters it holds a single "no characters" line.
 *
 * @param sceneSummary     Writer's summary of the current scene.
 * @param styleGuide       Global art style for the session.
 * @param entryBeatActive  Characters on screen in the entry beat.
 * @param entryBeatSpeaker "你" = the player speaks, any other non-empty
 *   value = that NPC speaks, empty/undefined = narration or pure environment.
 * @param priorSceneKey    sceneKey of the previous scene, if any.
 * @param currentSceneKey  sceneKey of this scene, if any.
 */
export function buildCinematographerUserMessage(
  sceneSummary: string,
  styleGuide: string,
  entryBeatActive: BeatActiveCharacter[],
  entryBeatSpeaker: string | undefined,
  priorSceneKey: string | undefined,
  currentSceneKey: string | undefined,
): string {
  const castLines =
    entryBeatActive.length > 0
      ? entryBeatActive.map(
          (c) => `- ${c.name}：${c.pose ?? "（无具体姿态描述）"}`,
        )
      : ["（无角色，纯环境）"];

  // Camera instruction mirrors the dynamic camera policy in the system prompt.
  let cameraNote: string;
  if (entryBeatSpeaker === "你") {
    cameraNote =
      '开场 beat 是**玩家说话**（speaker = "你"）——按动态镜头策略：medium shot，NPC 居中、做听玩家说话的姿态、看向画面外。**绝不要画出玩家**。';
  } else if (entryBeatSpeaker) {
    cameraNote = `开场 beat 是 **${entryBeatSpeaker} 在对玩家说话**（speaker = "${entryBeatSpeaker}"）——按动态镜头策略：close-up 或 medium close-up，${entryBeatSpeaker} 看向画面外（看玩家），眼神交流。`;
  } else {
    cameraNote =
      "开场 beat 没有 speaker（纯旁白/环境）——按动态镜头策略：wide establishing shot 展现环境氛围。";
  }

  const sections: string[] = [
    // ─── stable prefix ───
    `全局美术画风：${styleGuide}`,
    CINE_STABLE_HINT,
    // ─── dynamic suffix ───
    `当前场景（来自编剧）：${sceneSummary}`,
    "",
    "开场画面里的角色及其姿态：",
    ...castLines,
    "",
    cameraNote,
  ];

  // Same non-empty sceneKey as the previous scene → ask for continuity.
  if (currentSceneKey && priorSceneKey === currentSceneKey) {
    sections.push(
      `\n注意：上一场和本场 sceneKey 都是 "${currentSceneKey}"——画师会把上一张场景图作为 referenceImages 之一锚定同一空间。integratedPrompt 应强调连续性。`,
    );
  }

  sections.push("\n请输出 shotType + integratedPrompt，严格以 JSON 格式返回。");
  return sections.join("\n");
}

// ──────────────────────────────────────────────────────────────────────
//  4. Painter (画师) — final image prompt assembly.
//
//  Not an LLM agent — a pure prompt-building function that combines the
//  Cinematographer's integratedPrompt with character archetype blocks
//  (visual cards) and the standard FLUX constraints.
// ──────────────────────────────────────────────────────────────────────

/**
 * Assembles the final image-generation prompt for a scene.
 *
 * Combines, in order: an orientation-specific header, the art style, the
 * Cinematographer's `integratedPrompt`, an optional CHARACTER ARCHETYPES
 * section (one card per character that has a `visualDescription`), and the
 * fixed rule lists (no UI/text, orientation, keep the lower part of the
 * frame uncluttered, player never visible).
 *
 * @param integratedPrompt Environment + composition text from the Cinematographer.
 * @param styleGuide       Global art style.
 * @param characters       Characters in frame; entries without a
 *   `visualDescription` are skipped.
 * @param orientation      "portrait" selects the 9:16 wording; any other
 *   value (default "landscape") selects the 16:9 wording.
 */
export function buildPainterPrompt(
  integratedPrompt: string,
  styleGuide: string,
  characters: { name: string; visualDescription?: string }[],
  orientation: Orientation = "landscape",
): string {
  // One archetype card per character that actually has a visual description.
  const cards: string[] = [];
  for (const { name, visualDescription } of characters) {
    if (visualDescription) {
      cards.push(`[CHARACTER: ${name}]\n${visualDescription}`);
    }
  }

  let archetypeSection = "";
  if (cards.length > 0) {
    archetypeSection = `\n\nCHARACTER ARCHETYPES (anchor identity, outfit, and style across scenes — keep each character visually identical to their archetype):\n${cards.join("\n\n")}`;
  }

  let header: string;
  let orientationRule: string;
  if (orientation === "portrait") {
    header =
      "Generate a cinematic vertical (portrait) background illustration, 9:16 tall format (1024x1792).";
    orientationRule =
      "- 9:16 PORTRAIT orientation — taller than wide. No landscape or square output.";
  } else {
    header =
      "Generate a cinematic landscape background illustration, 16:9 widescreen (1792x1024).";
    orientationRule =
      "- 16:9 LANDSCAPE orientation — wider than tall. No portrait or square output.";
  }

  return `${header}

ART STYLE: ${styleGuide}

SCENE COMPOSITION (from cinematographer — environment + camera framing + character positioning):
${integratedPrompt}${archetypeSection}

STRICT RULES — NEVER violate these:
- DO NOT draw any dialogue boxes, speech bubbles, text panels, or any rectangular overlay.
- DO NOT draw any buttons, choice options, menu items, or interactive UI elements.
- DO NOT render any Chinese or English text anywhere in the image.
- DO NOT add any HUD, interface chrome, or game UI elements.
- The image is a PURE BACKGROUND SCENE ONLY. All UI will be added as HTML on top.
${orientationRule}
- Leave the bottom 35% of the frame relatively uncluttered (darker or softer) so overlaid UI panels remain readable.
- Characters or key scene elements should be positioned in the upper 65% of the frame.
- Maintain character identity exactly as specified in CHARACTER ARCHETYPES — same face, same hairstyle, same outfit across every scene.

PLAYER POV RULES — the player / protagonist is the unseen viewer:
- The player / protagonist is NEVER visible in the frame — no body parts, no hands, no shoulders, no back of head, no silhouette, no feet, no hair.
- DO NOT use first-person POV that implies the player's body in frame.
- When an NPC is speaking to the player, they SHOULD look toward the camera (toward the player's implied position) — this creates eye contact without showing the player.
- The camera position represents the player's gaze; only NPCs, scenery, and objects are rendered.`;
}

/**
 * Builds the image prompt for a character's base portrait — the
 * per-character image generated once when the CharacterDesigner introduces a
 * new character. Per the original design note, the portrait serves both as a
 * client-side asset and as a referenceImages entry for later scenes.
 *
 * The prompt asks for a single character in a neutral pose on a plain
 * background, followed by the art style, the character's visual card and a
 * fixed rule list. The rule list always requests 16:9 landscape output.
 *
 * @param charName          Name shown in the CHARACTER heading.
 * @param visualDescription The character's visual card, inserted verbatim.
 * @param styleGuide        Global art style.
 */
export function buildCharacterPortraitPrompt(
  charName: string,
  visualDescription: string,
  styleGuide: string,
): string {
  const strictRules = [
    "- ONE character only — no other people, no crowd, no background characters.",
    "- Plain neutral background (off-white or soft gradient). NO environment, NO furniture, NO props beyond what's worn.",
    "- Neutral, calm pose and expression — this is a reference sheet, not a dramatic shot.",
    "- NO text, NO UI, NO watermark, NO border.",
    "- The character should be clearly visible and centered, the pose natural and relaxed.",
    "- 16:9 landscape orientation.",
  ];

  return [
    "Character concept portrait sheet, single character, full-body or upper-body composition, neutral standing pose, looking toward camera, neutral expression, plain neutral background (no environment, no scenery).",
    "",
    `ART STYLE: ${styleGuide}`,
    "",
    `CHARACTER (${charName}):`,
    visualDescription,
    "",
    "STRICT RULES:",
    ...strictRules,
  ].join("\n");
}

// ──────────────────────────────────────────────────────────────────────
//  Insert-Beat — given a freeform action (background click or typed
//  input) that stays *within* the current scene, generate 1–3 beats of
//  meaningful character interaction followed by exactly 2 follow-up
//  choices.
//  Single-agent path; no character design / no rendering involved.
//
//  Contract stated by the prompt text below:
//    - NPCs present in the scene must react to the player's action.
//    - speaker is either a registered NPC name or "你" (the player); no new
//      characters and no other POV aliases.
//    - lineDelivery is required when an NPC speaks, optional for "你".
//    - Output is strict JSON: { beats: [...], choices: [{ label, effect } x2] }.
// ──────────────────────────────────────────────────────────────────────

export const INSERT_BEAT_SYSTEM = `你是视觉小说编剧。玩家在当前场景内做了一个自由动作（可能是点击画面中的某个物件/角色，也可能是主动输入了一句话/动作）。请基于此动作，写出**1-3 个有实质内容的 beat**，并在最后给出 2 个后续选项供玩家选择。

核心原则——**玩家的动作必须得到回应**：
- 如果当前场景有 NPC 在场，NPC **必须对玩家的动作做出反应**（说话、表情变化、动作回应）。用 narration 描述玩家的动作，用 speaker + line 写 NPC 的回应。
- 如果场景中没有 NPC（纯环境），可以用 narration 描述玩家的观察/发现，给玩家一个新细节或情绪波动。
- 不要写"你想做什么但没做"这种无意义的犹豫——玩家已经做了，世界要有反馈。

beat 数量指引：
- 简单观察/短回应：1 个 beat 即可
- 有来有回的对话/有展开的互动：2-3 个 beat，让反应更有层次
- 每个 beat 的 narration + line ≤100 字

后续选项（choices）——每次**必须**给出 2 个选项：
- 选项应**承接刚才的互动**，给玩家自然的下一步
- 至少一个选项应能推动剧情前进（如"继续追问"、"走过去看看"、"做出某个决定"）
- label：玩家看到的选项文字（≤15字）
- effect：描述选这个选项后会发生什么（供下一个编剧参考）

文本风格约束：
- narration / line 用中文，**纯净可显示文本**，不要写 (叹气)(语速快) 这类配音标注
- 不要打破当前场景的物理状态（玩家仍在原地）
- 内容要"有所得"——一个新细节、一丝潜台词、一次真实的交流（show, don't tell）
- 白描为主：聚焦可观察的五感与物理特征，以角色的动作/神态本身传递情绪，不要以作者角度解释或议论；不写角色眼神/语气里的情绪（这些从台词与动作中自行体会）

speaker 字段允许的取值**只有两种**（与主路径 Writer 一致 — Pattern B galgame 标准）：
1. **已登记角色**里的 NPC 真名（**绝不允许引入新角色**）
2. **"你"** — 玩家本人开口说话（对白框显示，但不调 TTS）

其它任何 POV 变体（玩家 / 我 / 主角 / protagonist / player / MC / I / me）**一律错误**，请用 "你" 代替。

推荐模式（有 NPC 在场时）：
  narration = 描述玩家做了什么（动作/表情/心理）
  speaker = NPC 真名
  line = NPC 的回应台词
  lineDelivery = 配音导演指令

- 如果有 line 且 speaker = NPC，**必须**给出 lineDelivery（配音导演指令）
- 如果有 line 且 speaker = "你"，lineDelivery 可以留空（玩家对白不调 TTS）

必须输出严格 JSON：
{
  "beats": [
    { "narration": "...", "speaker": "...", "line": "...", "lineDelivery": "..." }
  ],
  "choices": [
    { "label": "选项文字", "effect": "选此选项后的剧情走向" },
    { "label": "选项文字", "effect": "选此选项后的剧情走向" }
  ]
}

不要输出 JSON 以外的任何文本。`;

/**
 * Builds the Insert-Beat user message for a freeform action taken inside the
 * current scene.
 *
 * Sections, in order: world setting; player name (if set); registered
 * character names (if any); for the latest history entry — its scene prompt,
 * a recap of the last visited beat and the NPCs active in that beat; the
 * player's action; the closing request; and, last, the output-language
 * directive when one applies.
 *
 * @param session        Supplies world, player name, characters, history and
 *   language.
 * @param freeformAction Description of what the player just did.
 */
export function buildInsertBeatUserMessage(
  session: Session,
  freeformAction: string,
): string {
  const out: string[] = [`世界观：${session.worldSetting}`];

  if (session.playerName) {
    out.push(
      `玩家名字：${session.playerName}（NPC 对话时用此名字称呼玩家；speaker 字段仍固定为 "你" 不变）`,
    );
  }

  if (session.characters.length > 0) {
    out.push(
      "\n已登记角色（speaker 只能用这些名字）：",
      ...session.characters.map((c) => `- ${c.name}`),
    );
  }

  const current = session.history.at(-1);
  if (current) {
    const { scene, visitedBeatIds } = current;
    out.push(`\n当前场景：${scene.scenePrompt}`);

    // The last beat the player saw, or the entry beat if nothing was recorded.
    const lastBeatId = visitedBeatIds.at(-1) ?? scene.entryBeatId;
    const lastBeat = scene.beats.find((b) => b.id === lastBeatId);

    if (lastBeat) {
      const recap: string[] = [];
      if (lastBeat.narration) recap.push(`旁白：${lastBeat.narration}`);
      if (lastBeat.line) {
        recap.push(`${lastBeat.speaker ?? "?"}：${lastBeat.line}`);
      }
      if (recap.length > 0) out.push(`刚才发生：${recap.join(" / ")}`);
    }

    // NPCs on screen in that same beat should be the ones to respond.
    const activeNpcs = lastBeat?.activeCharacters?.map((c) => c.name) ?? [];
    if (activeNpcs.length > 0) {
      out.push(
        `当前画面中在场的 NPC：${activeNpcs.join("、")}（优先让在场 NPC 回应玩家）`,
      );
    }
  }

  out.push(`\n玩家此刻的自由动作：${freeformAction}`);
  out.push("\n请生成 beat（1-3 个）和 2 个后续选项，严格以 JSON 格式返回。");

  const directive = buildLanguageDirective(session.language);
  if (directive) out.push(directive);
  return out.join("\n");
}

// ──────────────────────────────────────────────────────────────────────
//  Vision — interprets a background click and classifies the action.
//  Unchanged from staging (UI choices live in HTML, vision only judges
//  background clicks).
// ──────────────────────────────────────────────────────────────────────

/**
 * System prompt for the vision model that interprets a click on the
 * background image (the click position is marked by a red dot).
 *
 * The text asks the model to identify what the dot points at, infer the
 * player's intent, and classify it as "insert-beat" (pure observation only)
 * or "change-scene" (everything else; also the tie-breaker when unsure).
 * Expected reply: strict JSON `{ freeformAction, classify, reasoning }`.
 */
export const VISION_SYSTEM_PROMPT = `你是视觉理解助手。玩家在视觉小说的背景图上点击了红色圆点位置（HTML 上的选项按钮不会走到你这里）。你的任务是：
1. 看清红点指向画面里的什么（物件、角色、空间、远处的方向）
2. 推断玩家想干什么
3. 判断这个动作是「场内探索」还是「场景切换」

判断准则：
- "change-scene"（场景切换）：走向画面深处的门 / 走廊、转头看向新方向（视角变了）、点了远处的另一个空间、暗示时间跳跃的物件（如时钟）、调查某个物件/线索导致剧情发展、与角色进行有实质影响的互动
- "insert-beat"（场内探索）：**仅限**纯粹的观察——看一眼某个无剧情意义的装饰、环顾四周
- 拿不准时偏向 "change-scene"——玩家主动点击画面说明想要推进剧情

必须输出严格 JSON：
{
  "freeformAction": "玩家想做什么的一句中文描述，例如「想拿起桌上的钥匙」",
  "classify": "insert-beat" 或 "change-scene",
  "reasoning": "一句话说明判断理由"
}

不要输出 JSON 以外的任何文本。`;

/**
 * Builds the user prompt that accompanies the clicked screenshot.
 *
 * @param scene Current scene, or null when none is available.
 * @returns With a scene: its `scenePrompt` as context plus the
 *   classification request. Without one: a bare request to judge the
 *   player's intent.
 */
export function buildVisionUserPrompt(scene: Scene | null): string {
  return scene
    ? `当前场景描述：${scene.scenePrompt}\n\n红点位置即为玩家点击位置。请判断玩家意图与分类，以 JSON 格式返回。`
    : "请判断玩家意图，并以 JSON 格式返回。";
}

// ──────────────────────────────────────────────────────────────────────
//  Freeform Classify — classifies a player's freeform text input at a
//  choice node into one of two paths: insert a beat in-scene
//  ("insert-beat") or trigger a scene change ("change-scene"). The prompt
//  below defines no "match an existing choice" outcome.
//
//  Expected reply: strict JSON { classify, freeformAction }.
//
//  NOTE(review): the numbered path descriptions list asking a question or
//  doing an action under "insert-beat", while the criteria list right after
//  assigns the same inputs to "change-scene" and limits "insert-beat" to
//  pure observation / idle self-talk. Confirm which reading is intended.
// ──────────────────────────────────────────────────────────────────────

export const FREEFORM_CLASSIFY_SYSTEM = `你是交互视觉小说的意图分类助手。玩家在一个选择节点输入了自由文本（而非点击已有选项）。你要判断这个输入最适合走哪条路径：

1. "insert-beat"：玩家想在当前场景内与角色互动（问一句话、做一个动作、表达情绪、调查某个东西）→ NPC 会对玩家的动作做出回应，但不切换场景
2. "change-scene"：玩家想去别的地方、做出重大决定、推动剧情到新阶段 → 切换到全新场景

判断准则：
- "change-scene"：大多数主动输入——问问题、说一句话、做一个动作、对角色做出反应、想去别的地方、做出决定、推动剧情 → 玩家花精力打字说明想让故事前进
- "insert-beat"：**仅限**纯粹的环境观察或无实际影响的自言自语
- 拿不准时偏向 "change-scene"——玩家主动输入说明想要推进剧情

必须输出严格 JSON：
{
  "classify": "insert-beat" 或 "change-scene",
  "freeformAction": "玩家想做什么的一句中文描述（用于后续编剧参考）"
}

不要输出 JSON 以外的任何文本。`;

/**
 * Builds the user message for the freeform-input classifier.
 *
 * @param freeformText The text the player typed, quoted verbatim in 「」.
 * @param scenePrompt  Description of the current scene; the scene line is
 *   omitted when this is undefined or empty.
 */
export function buildFreeformClassifyUserMessage(
  freeformText: string,
  scenePrompt: string | undefined,
): string {
  const sceneLine = scenePrompt ? [`当前场景：${scenePrompt}`] : [];
  return [
    ...sceneLine,
    `\n玩家输入：「${freeformText}」`,
    "\n请判断分类，以 JSON 格式返回。",
  ].join("\n");
}

/**
 * Minimal character shape: only `name` and `visualDescription`, picked from
 * `Character`.
 * NOTE(review): not referenced in the visible part of this file —
 * buildPainterPrompt spells out an inline `{ name; visualDescription? }`
 * type instead; presumably this alias is for callers that assemble the
 * Painter's character list. Confirm before removing or reusing.
 */
export type PainterCharacterInput = Pick<Character, "name" | "visualDescription">;