refactor: flatten monorepo to single web package (#12)

Flatten the pnpm monorepo (apps/web + packages/*) into a single web package at the repo root. - Move app/lib/components/scripts/public to root; drop apps/web and packages/* wrappers - Rewrite tsconfig paths (@infiplot/*) to ./lib/*; turbopack.root = __dirname - Update Vercel (no root-directory) and Cloudflare (pnpm build:cf at root) deploy paths - Regenerate pnpm-lock.yaml to drop stale workspace importers - Bump engines.node to >=22 to match wrangler Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-03 00:55:45 +08:00
parent 9543c3dba1
commit dc5ecd60f6
221 changed files with 241 additions and 379 deletions
@@ -0,0 +1,90 @@
+import { chat } from "@infiplot/ai-client";
+import type { ProviderConfig, Session, StoryState } from "@infiplot/types";
+import { parseJsonLoose } from "../jsonParser";
+import { ARCHITECT_SYSTEM, buildArchitectUserMessage } from "../prompts";
+
+// ──────────────────────────────────────────────────────────────────────
+//  Architect agent — ONE LLM call at session start.
+//
+//  Expands the user's (often terse) world + style prompt into a real story
+//  bible: a second-person protagonist with a want and a flaw, a single
+//  central dramatic question (logline), a genre frame that anchors the
+//  爽点 rhythm, an engineered cold-open for scene 1 (nextHook), and a small
+//  intentional cast. Seeds the StoryState that the Writer reads and updates
+//  every scene — so the story has a spine from beat one instead of being
+//  improvised cold.
+//
+//  Everything is best-effort coerced with fallbacks: a malformed LLM
+//  response can never abort session start — worst case the Writer just gets
+//  a thinner bible and improvises more.
+// ──────────────────────────────────────────────────────────────────────
+
+/**
+ * Untrusted shape of the Architect's JSON reply. Every field is optional and
+ * typed `unknown`: nothing here is relied on until `runArchitect` has coerced
+ * it with `str` / `strArray`.
+ */
+type RawStoryState = Partial<
+  Record<
+    | "logline"
+    | "genreTags"
+    | "protagonist"
+    | "castNotes"
+    | "synopsis"
+    | "openThreads"
+    | "relationships"
+    | "nextHook",
+    unknown
+  >
+>;
+
+/** Returns the trimmed text when `raw` is a string; any other value yields "". */
+function str(raw: unknown): string {
+  if (typeof raw !== "string") return "";
+  return raw.trim();
+}
+
+/**
+ * Keeps the non-blank string entries of `raw`, trimmed.
+ *
+ * @returns the cleaned list, or `undefined` when `raw` is not an array or no
+ *   usable entry remains.
+ */
+function strArray(raw: unknown): string[] | undefined {
+  if (!Array.isArray(raw)) return undefined;
+  const kept: string[] = [];
+  for (const item of raw) {
+    if (typeof item !== "string") continue;
+    const text = item.trim();
+    if (text.length > 0) kept.push(text);
+  }
+  return kept.length === 0 ? undefined : kept;
+}
+
+/**
+ * Asks the LLM once for a story bible and coerces the reply into a
+ * `StoryState`.
+ *
+ * Best-effort by design: any throw from `chat()` / `parseJsonLoose()` (or from
+ * reading the parsed reply) is logged and replaced with a minimal bible seeded
+ * from `session.worldSetting`, so this function does not reject for those
+ * failures.
+ *
+ * @param config  provider settings forwarded to `chat`
+ * @param session session whose prompt is expanded; `worldSetting` doubles as
+ *   the logline fallback
+ * @returns the coerced bible, or the minimal fallback bible
+ */
+export async function runArchitect(
+  config: ProviderConfig,
+  session: Session,
+): Promise<StoryState> {
+  // Fallback texts shared by the "field missing" and "call failed" paths.
+  const defaultProtagonist =
+    "你是这个故事的主角（第二人称视角，永不出现在画面里）。";
+  const defaultSynopsis = "故事即将开始。";
+
+  try {
+    const reply = await chat(
+      config,
+      [
+        { role: "system", content: ARCHITECT_SYSTEM },
+        { role: "user", content: buildArchitectUserMessage(session) },
+      ],
+      { temperature: 0.85, responseFormat: "json_object" },
+    );
+    const bible = parseJsonLoose<RawStoryState>(reply);
+
+    return {
+      // Stable spine: never left empty, even when the model returns garbage.
+      logline: str(bible.logline) || session.worldSetting,
+      genreTags: str(bible.genreTags),
+      protagonist: str(bible.protagonist) || defaultProtagonist,
+      castNotes: str(bible.castNotes) || undefined,
+      // Volatile seeds: the opening Writer rewrites these through its patch.
+      synopsis: str(bible.synopsis) || defaultSynopsis,
+      openThreads: strArray(bible.openThreads),
+      relationships: strArray(bible.relationships),
+      nextHook: str(bible.nextHook) || undefined,
+    };
+  } catch (err) {
+    // Network failure or unrepairable JSON: degrade instead of aborting
+    // session start, and let the Writer improvise from a thin bible.
+    const msg = err instanceof Error ? err.message : String(err);
+    console.error(`[architect] failed, using minimal bible: ${msg}`);
+    return {
+      logline: session.worldSetting,
+      genreTags: "",
+      protagonist: defaultProtagonist,
+      synopsis: defaultSynopsis,
+    };
+  }
+}
@@ -0,0 +1,155 @@
+import { chat, generateImage } from "@infiplot/ai-client";
+import { provisionVoice } from "@infiplot/tts-client";
+import type {
+  Character,
+  CharacterVoice,
+  EngineConfig,
+  Session,
+} from "@infiplot/types";
+import { parseJsonLoose } from "../jsonParser";
+import { mockImageDataUri } from "../mockImage";
+import {
+  CHARACTER_DESIGNER_SYSTEM,
+  buildCharacterDesignerUserMessage,
+  buildCharacterPortraitPrompt,
+} from "../prompts";
+
+// ──────────────────────────────────────────────────────────────────────
+//  CharacterDesigner agent — designs ONE new character.
+//
+//  Exposed as three GRANULAR stages so the director can schedule the slow
+//  parts around the Painter (a voice is never needed to paint a scene, and
+//  only entry-beat characters' portraits are referenced by the Painter):
+//
+//    1. designCharacterCard      — ONE LLM call → visual + voice TEXT cards
+//       (intentional bundling: the same agent thinks about who this character
+//        IS, keeping appearance and vocal personality coherent)
+//    2. renderCharacterPortrait  — base portrait image (Runware URL + UUID)
+//    3. provisionCharacterVoice  — Xiaomi MiMo voicedesign → reference audio
+//
+//  Each step degrades gracefully — if image gen fails the character just has
+//  no portrait; if voice gen fails it has no voice. The game keeps running.
+// ──────────────────────────────────────────────────────────────────────
+
+/**
+ * Shape requested from the design LLM: a visual text card and a voice text
+ * card, either of which the model may omit.
+ */
+type CharacterDesignOutput = Partial<
+  Record<"visualDescription" | "voiceDescription", string>
+>;
+
+/**
+ * TEMP latency probe (same convention as the orchestrator's tlog): prints how
+ * many milliseconds have passed since `t0`. Remove once real-world numbers
+ * have been collected.
+ *
+ * @param label prefix identifying the measured phase
+ * @param t0    `Date.now()` timestamp taken when the phase started
+ */
+function tlog(label: string, t0: number): void {
+  const elapsedMs = Date.now() - t0;
+  console.log(`${label}: ${elapsedMs}ms`);
+}
+
+/**
+ * Single LLM round-trip that produces the raw design card for `charName`.
+ * Errors from `chat` / `parseJsonLoose` are not caught here.
+ */
+async function runDesignLLM(
+  config: EngineConfig,
+  session: Session,
+  charName: string,
+): Promise<CharacterDesignOutput> {
+  const reply = await chat(
+    config.text,
+    [
+      { role: "system", content: CHARACTER_DESIGNER_SYSTEM },
+      {
+        role: "user",
+        content: buildCharacterDesignerUserMessage(charName, session),
+      },
+    ],
+    { temperature: 0.7, responseFormat: "json_object" },
+  );
+  const design = parseJsonLoose<CharacterDesignOutput>(reply);
+  return design;
+}
+
+/**
+ * Renders the base portrait for one character: a "concept sheet" (single
+ * character, neutral pose, plain background) intended as a reference anchor
+ * for later scene images.
+ *
+ * Never rejects: any failure is logged and yields `{}` so the character
+ * simply has no portrait.
+ *
+ * @param config            engine settings; `mockImage` short-circuits to a
+ *   placeholder data URI
+ * @param charName          character name, used in the prompt and in logs
+ * @param visualDescription text card describing the character's look
+ * @param styleGuide        style text passed to the portrait prompt builder
+ * @returns `basePortraitUrl` plus `basePortraitUuid` from the image call; in
+ *   mock mode only `basePortraitUrl` (the data URI) is set; `{}` on failure
+ */
+export async function renderCharacterPortrait(
+  config: EngineConfig,
+  charName: string,
+  visualDescription: string,
+  styleGuide: string,
+): Promise<{ basePortraitUrl?: string; basePortraitUuid?: string }> {
+  try {
+    if (!config.mockImage) {
+      const portraitPrompt = buildCharacterPortraitPrompt(
+        charName,
+        visualDescription,
+        styleGuide,
+      );
+      // URL and UUID arrive together from the one image call.
+      const generated = await generateImage(config.image, portraitPrompt);
+      return {
+        basePortraitUrl: generated.imageUrl,
+        basePortraitUuid: generated.imageUuid,
+      };
+    }
+    // Mock mode: no UUID is available, only the placeholder data URI.
+    const placeholder = await mockImageDataUri();
+    return { basePortraitUrl: placeholder };
+  } catch (err) {
+    const reason = err instanceof Error ? err.message : String(err);
+    console.error(`[characterDesigner] portrait gen failed for ${charName}: ${reason}`);
+    return {}; // no portrait at all — degrade gracefully
+  }
+}
+
+/**
+ * Provisions a TTS voice from a textual voice description.
+ *
+ * @param config           engine settings; without `tts` no voice is created
+ * @param voiceDescription text card handed to `provisionVoice`
+ * @param charName         only used to label the error log
+ * @returns the provisioned voice, or `undefined` when TTS is not configured
+ *   or provisioning failed (the failure is logged, never rethrown)
+ */
+export async function provisionCharacterVoice(
+  config: EngineConfig,
+  voiceDescription: string,
+  charName: string,
+): Promise<CharacterVoice | undefined> {
+  if (!config.tts) {
+    return undefined;
+  }
+  let voice: CharacterVoice | undefined;
+  try {
+    voice = await provisionVoice(config.tts, voiceDescription);
+  } catch (err) {
+    const reason = err instanceof Error ? err.message : String(err);
+    console.error(`[characterDesigner] voice provision failed for ${charName}: ${reason}`);
+    voice = undefined;
+  }
+  return voice;
+}
+
+/**
+ * Result of the cheap first design stage (`designCharacterCard`): the TEXT
+ * cards produced by one LLM call. The director schedules
+ * `renderCharacterPortrait` / `provisionCharacterVoice` around the Painter,
+ * and runs this stage in parallel for multiple new characters in one scene.
+ */
+export type CharacterCard = {
+  /** Character name as given by the caller. */
+  name: string;
+  /** Voice text card; always present (a generic prompt is the fallback). */
+  voiceDescription: string;
+  /** Visual text card; absent when the model returned none. */
+  visualDescription?: string;
+};
+
+/**
+ * First design stage: one LLM call that yields the visual + voice TEXT cards
+ * for `charName`.
+ *
+ * The LLM reply is untrusted, so each field is accepted only when it is a
+ * non-blank string. Optional chaining alone would let a number or object
+ * through and crash on `.trim()`.
+ *
+ * @param config   engine settings forwarded to the design LLM call
+ * @param session  session providing context; `worldSetting` feeds the
+ *   fallback voice prompt
+ * @param charName name of the character being designed
+ * @returns a card whose `voiceDescription` is always set (generic prompt when
+ *   the model gave none) and whose `visualDescription` may be `undefined`
+ * @throws whatever `runDesignLLM` throws (LLM / JSON failures are not caught)
+ */
+export async function designCharacterCard(
+  config: EngineConfig,
+  session: Session,
+  charName: string,
+): Promise<CharacterCard> {
+  const text = (value: unknown): string =>
+    typeof value === "string" ? value.trim() : "";
+
+  const tDesign = Date.now();
+  const design = await runDesignLLM(config, session, charName);
+  tlog(`[charDesigner ${charName}] design LLM`, tDesign);
+
+  // A reply such as `null` or a bare string carries no usable fields.
+  const fields: CharacterDesignOutput =
+    design && typeof design === "object" ? design : {};
+
+  return {
+    name: charName,
+    visualDescription: text(fields.visualDescription) || undefined,
+    voiceDescription:
+      text(fields.voiceDescription) ||
+      `请根据角色名「${charName}」推断其性别、年龄与气质，生成最贴合的音色。所属世界观：${session.worldSetting}`,
+  };
+}
+
+/**
+ * Voice-only provisioning for a character the LLM mentioned but that was never
+ * designed (e.g. the Writer used a name as a speaker that was not in
+ * `activeCharacters`). Per the original notes it is used by the
+ * directInsertBeat path and as a safety net in directScene. No portrait is
+ * generated: the result carries a name and a voice only.
+ *
+ * @returns a `Character` whose `voice` is `undefined` when provisioning is
+ *   unavailable or failed
+ */
+export async function provisionVoiceForName(
+  config: EngineConfig,
+  session: Session,
+  charName: string,
+): Promise<Character> {
+  // Generic prompt: nothing is known about the character except the name.
+  const genericVoicePrompt = `请根据角色名「${charName}」推断其性别、年龄与气质，生成最贴合的音色。所属世界观：${session.worldSetting}`;
+  return {
+    name: charName,
+    voiceDescription: genericVoicePrompt,
+    voice: await provisionCharacterVoice(config, genericVoicePrompt, charName),
+  };
+}
@@ -0,0 +1,86 @@
+import { chat } from "@infiplot/ai-client";
+import type { BeatActiveCharacter, ProviderConfig } from "@infiplot/types";
+import { parseJsonLoose } from "../jsonParser";
+import {
+  CINEMATOGRAPHER_SYSTEM,
+  buildCinematographerUserMessage,
+} from "../prompts";
+
+// ──────────────────────────────────────────────────────────────────────
+//  Cinematographer agent — translates the Writer's narrative scene
+//  summary into an English compositional prompt for FLUX.
+//
+//  Reads: sceneSummary + entry beat's activeCharacters (poses)
+//         + prior sceneKey (for continuity hints)
+//  Writes: { shotType, integratedPrompt }
+//
+//  Does NOT describe character APPEARANCE — that's appended at the
+//  Painter stage from session.characters[].visualDescription. The
+//  Cinematographer only positions named characters in the frame and
+//  describes the environment + lighting + camera framing.
+//
+//  This separation lets the Cinematographer run IN PARALLEL with the
+//  CharacterDesigner — neither needs the other's output. They both
+//  feed independently into the Painter prompt.
+// ──────────────────────────────────────────────────────────────────────
+
+/** Validated Cinematographer result; both fields are always filled in. */
+export type CinematographerOutput = {
+  /** Camera framing label. */
+  shotType: string;
+  /** Compositional prompt handed on to the Painter stage. */
+  integratedPrompt: string;
+};
+
+/** Unvalidated LLM reply: the same keys as the output, each possibly missing. */
+type RawCinematographerOutput = Partial<
+  Record<"shotType" | "integratedPrompt", string>
+>;
+
+/** Everything the Cinematographer reads to compose one establishing shot. */
+export type CinematographerInput = {
+  /** The Writer's narrative summary of the scene. */
+  sceneSummary: string;
+  /** Style text forwarded to the prompt builder. */
+  styleGuide: string;
+  /** Characters (with poses) active on the entry beat. */
+  entryBeatActive: BeatActiveCharacter[];
+  /**
+   * Speaker of the entry beat; selects the camera policy:
+   *   - an NPC name → NPC looks toward camera (close-up)
+   *   - "你"        → medium shot, NPC listens
+   *   - undefined   → wide establishing shot
+   */
+  entryBeatSpeaker?: string;
+  /** sceneKey of the previous scene, used for continuity hints. */
+  priorSceneKey?: string;
+  /** sceneKey of the scene being composed. */
+  currentSceneKey?: string;
+};
+
+/**
+ * Turns the Writer's scene summary into a compositional prompt via one LLM
+ * call.
+ *
+ * The reply is untrusted: a field is used only when it is a non-blank string,
+ * and a reply that is not an object at all is treated as empty. In both cases
+ * the fallbacks below apply, so a malformed reply cannot throw here.
+ *
+ * @param config provider settings forwarded to `chat`
+ * @param input  scene summary, style guide, entry-beat cast/speaker and scene
+ *   keys
+ * @returns `shotType` (default "medium shot") and `integratedPrompt` (default
+ *   synthesized from `input.sceneSummary`)
+ * @throws whatever `chat` / `parseJsonLoose` throw
+ */
+export async function runCinematographer(
+  config: ProviderConfig,
+  input: CinematographerInput,
+): Promise<CinematographerOutput> {
+  const raw = await chat(
+    config,
+    [
+      { role: "system", content: CINEMATOGRAPHER_SYSTEM },
+      {
+        role: "user",
+        content: buildCinematographerUserMessage(
+          input.sceneSummary,
+          input.styleGuide,
+          input.entryBeatActive,
+          input.entryBeatSpeaker,
+          input.priorSceneKey,
+          input.currentSceneKey,
+        ),
+      },
+    ],
+    { temperature: 0.6, responseFormat: "json_object" },
+  );
+
+  const parsed = parseJsonLoose<RawCinematographerOutput>(raw);
+  // `null`, a bare string or a number carries no usable fields.
+  const fields: RawCinematographerOutput =
+    parsed && typeof parsed === "object" ? parsed : {};
+  // `?.trim()` alone would crash on a non-string value such as a number.
+  const text = (value: unknown): string =>
+    typeof value === "string" ? value.trim() : "";
+
+  // Fallback: if the LLM produced nothing usable, synthesize a minimal
+  // integratedPrompt from the Writer's sceneSummary so the Painter has
+  // SOMETHING to work with rather than blowing up the whole pipeline.
+  const integratedPrompt =
+    text(fields.integratedPrompt) ||
+    `A cinematic illustration depicting: ${input.sceneSummary}. Wide establishing shot, natural lighting, atmospheric mood.`;
+
+  return {
+    shotType: text(fields.shotType) || "medium shot",
+    integratedPrompt,
+  };
+}
@@ -0,0 +1,163 @@
+import { generateImage } from "@infiplot/ai-client";
+import type { GenerateImageOptions, GenerateImageResult } from "@infiplot/ai-client";
+import type {
+  Beat,
+  Character,
+  EngineConfig,
+  ProviderConfig,
+} from "@infiplot/types";
+import { mockImageDataUri } from "../mockImage";
+import { buildPainterPrompt } from "../prompts";
+
+// ──────────────────────────────────────────────────────────────────────
+//  Painter — final image generation with multi-reference anchoring.
+//
+//  FLUX.2 [klein] 9B KV does NOT support seedImage (img2img). Instead,
+//  visual continuity comes entirely from `referenceImages` (capped at 4),
+//  which the KV-optimized variant accelerates ~2.5× via key-value caching
+//  of reference latents.
+//
+//  References are slotted in priority order (max 4):
+//    1. Prior scene image — when sceneKey matched a previous scene, this
+//       anchors the same physical space (lighting/layout/style continuity)
+//    2. Entry beat's speaker portrait — the NPC the player is talking with
+//       (most visually prominent)
+//    3. Other on-stage NPCs' portraits — secondary characters in the frame
+//
+//  References are sent as URLs (preferred — collectReferenceImages notes that
+//  a UUID is not always recognized by the referenceImages pipeline) or as
+//  UUIDs (backstop when no URL is stored). Base64 fallback was removed when
+//  generateImage switched to outputType=URL, which always returns both a UUID
+//  and a URL so we never lack a cheap reference handle.
+//
+//  Failure handling — two-tier degradation:
+//    A. referenceImages call           (preferred — full visual anchoring)
+//    B. pure text-to-image fallback    (last resort if Runware refs API errors)
+// ──────────────────────────────────────────────────────────────────────
+
+/** Upper bound on `referenceImages` entries sent with one Painter request. */
+const MAX_REFERENCE_IMAGES = 4;
+
+/** Inputs the Painter needs to build its prompt and reference list. */
+export type PainterInput = {
+  /** Compositional prompt produced by the Cinematographer. */
+  integratedPrompt: string;
+  /** Style text forwarded to the prompt builder. */
+  styleGuide: string;
+  /** Character records used for prompt text and portrait references. */
+  onStageCharacters: Character[];
+  /**
+   * Runware UUID or URL of the prior scene's image. Present when the sceneKey
+   * matched an earlier scene; it then takes referenceImages[0] for spatial
+   * continuity. It shares the 4-entry budget with the character portraits, so
+   * it displaces ONE portrait.
+   */
+  priorSceneImage?: string;
+};
+
+/**
+ * Chooses the `referenceImages` list for one Painter call, in priority order:
+ *
+ *   slot 0  — `priorSceneImage`, when given (same-sceneKey continuity)
+ *   slot 1  — portrait of the entry beat's speaker
+ *   slot 2+ — portraits of the other characters active on the entry beat
+ *
+ * A portrait is referenced by URL when available, otherwise by UUID: the
+ * original notes report that UUIDs are not always recognized by the
+ * referenceImages pipeline (`failedToTransferImage`), while the URL can always
+ * be fetched. Characters without any portrait handle are skipped, and no
+ * character name is used twice.
+ *
+ * @param characters      records searched by name for portrait handles
+ * @param entryBeat       entry beat supplying the speaker and active cast
+ * @param priorSceneImage UUID or URL of the matching prior scene, if any
+ * @returns at most `MAX_REFERENCE_IMAGES` entries, ready to send
+ */
+export function collectReferenceImages(
+  characters: Character[],
+  entryBeat: Beat | undefined,
+  priorSceneImage: string | undefined,
+): string[] {
+  const picked: string[] = [];
+  const usedNames = new Set<string>();
+
+  const hasRoom = (): boolean => picked.length < MAX_REFERENCE_IMAGES;
+  const portraitOf = (name: string): string | undefined => {
+    const match = characters.find((candidate) => candidate.name === name);
+    return match?.basePortraitUrl ?? match?.basePortraitUuid;
+  };
+
+  // Backdrop first: drift of the physical space is the most jarring
+  // discontinuity between scenes that share a sceneKey.
+  if (priorSceneImage) {
+    picked.push(priorSceneImage);
+  }
+
+  // Then the speaker, who is the most prominent figure in the frame.
+  const speakerName = entryBeat?.speaker;
+  if (speakerName) {
+    const portrait = portraitOf(speakerName);
+    if (portrait && hasRoom()) {
+      picked.push(portrait);
+      usedNames.add(speakerName);
+    }
+  }
+
+  // Finally the remaining on-stage characters, until the budget is spent.
+  for (const onStage of entryBeat?.activeCharacters ?? []) {
+    if (!hasRoom()) break;
+    if (usedNames.has(onStage.name)) continue;
+    const portrait = portraitOf(onStage.name);
+    if (!portrait) continue;
+    picked.push(portrait);
+    usedNames.add(onStage.name);
+  }
+
+  return picked.slice(0, MAX_REFERENCE_IMAGES);
+}
+
+/**
+ * Runs one `generateImage` attempt and converts a failure into `null`.
+ *
+ * @param label short description of the attempt, used in the warning log
+ * @returns the generation result, or `null` after logging the failure
+ */
+async function tryGenerate(
+  config: ProviderConfig,
+  prompt: string,
+  options: GenerateImageOptions,
+  label: string,
+): Promise<GenerateImageResult | null> {
+  let outcome: GenerateImageResult | null = null;
+  try {
+    outcome = await generateImage(config, prompt, options);
+  } catch (err) {
+    const reason = err instanceof Error ? err.message : String(err);
+    console.warn(`[painter] ${label} failed: ${reason}`);
+  }
+  return outcome;
+}
+
+/**
+ * Painter outcome, tagged by `kind`:
+ *   - "mock": placeholder image, no UUID
+ *   - "real": generated image with both URL and UUID
+ */
+export type PainterResult =
+  | { kind: "mock"; imageUrl: string }
+  | { kind: "real"; imageUrl: string; imageUuid: string };
+
+/**
+ * Produces the scene image, degrading in two tiers:
+ *
+ *   A. generation with `referenceImages` (prior scene + character portraits)
+ *   B. plain text-to-image, used when tier A failed or there is nothing to
+ *      reference (e.g. a first scene with no characters yet)
+ *
+ * In mock mode no generation happens and a placeholder data URI is returned.
+ *
+ * @param config    engine settings (`mockImage`, `image` provider)
+ * @param input     prompt parts, on-stage characters and optional prior image
+ * @param entryBeat entry beat used to pick portrait references
+ * @returns a "mock" or "real" result
+ * @throws when the tier-B call fails — that error propagates to the caller
+ */
+export async function runPainter(
+  config: EngineConfig,
+  input: PainterInput,
+  entryBeat: Beat | undefined,
+): Promise<PainterResult> {
+  if (config.mockImage) {
+    const placeholder = await mockImageDataUri();
+    return { kind: "mock", imageUrl: placeholder };
+  }
+
+  const prompt = buildPainterPrompt(
+    input.integratedPrompt,
+    input.styleGuide,
+    input.onStageCharacters,
+  );
+  const refs = collectReferenceImages(
+    input.onStageCharacters,
+    entryBeat,
+    input.priorSceneImage,
+  );
+
+  const asReal = (image: GenerateImageResult): PainterResult => ({
+    kind: "real",
+    imageUrl: image.imageUrl,
+    imageUuid: image.imageUuid,
+  });
+
+  // Tier A — anchored generation; a failure is logged by tryGenerate and
+  // falls through to tier B.
+  if (refs.length > 0) {
+    const anchored = await tryGenerate(
+      config.image,
+      prompt,
+      { referenceImages: refs },
+      `referenceImages (${refs.length})`,
+    );
+    if (anchored) return asReal(anchored);
+  }
+
+  // Tier B — last resort; not wrapped, so errors reach the caller.
+  const plain = await generateImage(config.image, prompt);
+  return asReal(plain);
+}
@@ -0,0 +1,425 @@
+import { chat } from "@infiplot/ai-client";
+import type {
+  Beat,
+  BeatActiveCharacter,
+  BeatChoice,
+  BeatChoiceEffect,
+  BeatNext,
+  ProviderConfig,
+  Session,
+  StoryStatePatch,
+} from "@infiplot/types";
+import { parseJsonLoose } from "../jsonParser";
+import { WRITER_SYSTEM, buildWriterUserMessage } from "../prompts";
+
+// ──────────────────────────────────────────────────────────────────────
+//  Writer agent — owns the narrative half of scene generation.
+//
+//  Output: { sceneSummary, sceneKey, entryBeatId, beats[] }
+//  Each beat carries activeCharacters[] (names + poses) the
+//  Cinematographer reads when composing the establishing shot.
+//
+//  Character DESIGN (visual + voice) is NOT this agent's job —
+//  it only names characters; the CharacterDesigner picks up any
+//  unknown name from beats[].activeCharacters.
+// ──────────────────────────────────────────────────────────────────────
+
+/** Validated result of one Writer call. */
+export type WriterOutput = {
+  /** Narrative summary of the scene. */
+  sceneSummary: string;
+  /** Normalized slug for the scene, when the model supplied a usable one. */
+  sceneKey?: string;
+  /** Id of the beat the scene starts on; always an id present in `beats`. */
+  entryBeatId: string;
+  /** Coerced and repaired beats of the scene. */
+  beats: Beat[];
+  /**
+   * Rewritten volatile story memory, merged onto the carried StoryState by the
+   * director. Absent when the model omitted it (rare; the bible just stales).
+   */
+  storyStatePatch?: StoryStatePatch;
+};
+
+// Raw shapes — the Writer's JSON before validation / coercion. Every field is
+// optional because the model may omit anything; the declared field types are
+// what the prompt asks for, not a guarantee about what actually arrives.
+
+/** One on-stage character entry of a beat. */
+type RawActiveCharacter = Partial<Record<"name" | "pose", string>>;
+
+/** What picking a choice does: jump to a beat, or leave the scene. */
+type RawEffect = Partial<
+  Record<"kind" | "targetBeatId" | "nextSceneSeed", string>
+>;
+
+/** One selectable option of a choice beat. */
+type RawChoice = {
+  id?: string;
+  label?: string;
+  effect?: RawEffect;
+};
+
+/** Outgoing edge of a beat: a plain continue or a list of choices. */
+type RawNext = {
+  type?: string;
+  nextBeatId?: string;
+  choices?: RawChoice[];
+};
+
+/** One beat as emitted by the model. */
+type RawBeat = {
+  id?: string;
+  narration?: string;
+  speaker?: string;
+  line?: string;
+  lineDelivery?: string;
+  activeCharacters?: RawActiveCharacter[];
+  next?: RawNext;
+};
+
+/** Volatile story-memory rewrite; values stay `unknown` until coerced. */
+type RawStoryStatePatch = Partial<
+  Record<"synopsis" | "openThreads" | "relationships" | "nextHook", unknown>
+>;
+
+/** Top-level scene object. */
+type RawScene = {
+  sceneSummary?: string;
+  sceneKey?: string;
+  entryBeatId?: string;
+  beats?: RawBeat[];
+  storyStatePatch?: RawStoryStatePatch;
+};
+
+// ──────────────────────────────────────────────────────────────────────
+//  POV (player viewpoint) handling — Pattern B (galgame standard):
+//    - speaker = "你"      → ALLOWED (renders as dialog box, never TTS'd)
+//    - any other POV term  → normalized to "你" (LLM slip-up safety net)
+//    - activeCharacters    → POV is NEVER allowed (player has no body in-scene)
+//    - CharacterDesigner   → never invoked for "你" or POV variants
+// ──────────────────────────────────────────────────────────────────────
+
+/** Canonical display name of the player viewpoint. */
+const POV_DISPLAY_NAME = "你";
+
+/**
+ * Terms the model sometimes uses for the player instead of "你". The exported
+ * set keeps its original members; matching itself is case-insensitive (see
+ * POV_VARIANTS_FOLDED), so spellings such as "PROTAGONIST" are caught too.
+ */
+const POV_VARIANTS = new Set([
+  "玩家",
+  "我",
+  "主角",
+  "protagonist",
+  "Protagonist",
+  "player",
+  "Player",
+  "PLAYER",
+  "MC",
+  "mc",
+  "Mc",
+  "I",
+  "i",
+  "me",
+  "Me",
+  "ME",
+]);
+
+// Lower-cased view of POV_VARIANTS used for the actual lookups. The hand-
+// enumerated case variants above cannot cover every casing the model may emit.
+const POV_VARIANTS_FOLDED = new Set(
+  Array.from(POV_VARIANTS, (variant) => variant.toLowerCase()),
+);
+
+/** True when `name` is one of the POV variants, ignoring letter case. */
+function isPovVariant(name: string): boolean {
+  return POV_VARIANTS_FOLDED.has(name.toLowerCase());
+}
+
+/**
+ * True when `name` denotes the player: either the canonical "你" or any POV
+ * variant in any letter case. Expects already-trimmed input.
+ */
+function isPovName(name: string): boolean {
+  return name === POV_DISPLAY_NAME || isPovVariant(name);
+}
+
+/**
+ * Normalizes a speaker name: any POV variant (in any letter case) collapses to
+ * "你"; every other name passes through unchanged. Caller passes
+ * already-trimmed input.
+ */
+function normalizeSpeakerName(name: string): string {
+  return isPovVariant(name) ? POV_DISPLAY_NAME : name;
+}
+
+/**
+ * Coerces a raw choice effect into a valid `BeatChoiceEffect`.
+ *
+ * Only `kind: "advance-beat"` with a non-blank string `targetBeatId` produces
+ * a beat jump; everything else becomes a scene change, seeded with
+ * `nextSceneSeed` or the placeholder "未指定". Non-string field values (the
+ * model occasionally emits numbers or objects) are treated as absent instead
+ * of crashing on `.trim()`.
+ *
+ * @param raw effect object from the model, possibly missing or malformed
+ */
+function coerceEffect(raw: RawEffect | undefined): BeatChoiceEffect {
+  const text = (value: unknown): string =>
+    typeof value === "string" ? value.trim() : "";
+  const fields: RawEffect = raw && typeof raw === "object" ? raw : {};
+
+  const targetBeatId = text(fields.targetBeatId);
+  if (fields.kind === "advance-beat" && targetBeatId) {
+    return { kind: "advance-beat", targetBeatId };
+  }
+  return {
+    kind: "change-scene",
+    nextSceneSeed: text(fields.nextSceneSeed) || "未指定",
+  };
+}
+
+/**
+ * Coerces one raw choice, filling gaps with positional defaults.
+ *
+ * A missing entry (`null`, non-object) or a non-string `id` / `label` is
+ * treated as absent rather than crashing: the id falls back to `c<idx+1>` and
+ * the label to `选项 <idx+1>`.
+ *
+ * @param raw choice object from the model
+ * @param idx zero-based position of the choice within its beat
+ */
+function coerceChoice(raw: RawChoice, idx: number): BeatChoice {
+  const text = (value: unknown): string =>
+    typeof value === "string" ? value.trim() : "";
+  const fields: RawChoice = raw && typeof raw === "object" ? raw : {};
+
+  return {
+    id: text(fields.id) || `c${idx + 1}`,
+    label: text(fields.label) || `选项 ${idx + 1}`,
+    effect: coerceEffect(fields.effect),
+  };
+}
+
+/**
+ * Coerces a beat's outgoing edge.
+ *
+ * `type: "choice"` with at least one usable (object) choice entry yields a
+ * choice edge; junk entries such as `null` are dropped first. Anything else
+ * yields a `continue` edge to the declared `nextBeatId`, or to
+ * `fallbackBeatId` when the declared id is missing, blank or not a string.
+ *
+ * @param raw            edge object from the model, possibly missing
+ * @param fallbackBeatId target used when no usable `nextBeatId` is declared
+ */
+function coerceNext(raw: RawNext | undefined, fallbackBeatId: string): BeatNext {
+  if (raw?.type === "choice" && Array.isArray(raw.choices)) {
+    const usable = raw.choices.filter(
+      (entry) => Boolean(entry) && typeof entry === "object",
+    );
+    if (usable.length > 0) {
+      return {
+        type: "choice",
+        choices: usable.map((entry, position) => coerceChoice(entry, position)),
+      };
+    }
+  }
+  // `?.trim()` alone would crash on a numeric nextBeatId.
+  const declared =
+    typeof raw?.nextBeatId === "string" ? raw.nextBeatId.trim() : "";
+  return {
+    type: "continue",
+    nextBeatId: declared || fallbackBeatId,
+  };
+}
+
+/**
+ * Coerces a beat's on-stage character list.
+ *
+ * Entries are dropped when they are not objects, have no usable string name,
+ * or name the player: POV is never IN the picture, and stripping it here keeps
+ * the CharacterDesigner from generating a portrait for the player. A blank or
+ * non-string pose is omitted.
+ *
+ * @returns the cleaned list, or `undefined` when `raw` is not an array or
+ *   nothing usable remains
+ */
+function coerceActiveCharacters(
+  raw: RawActiveCharacter[] | undefined,
+): BeatActiveCharacter[] | undefined {
+  if (!Array.isArray(raw)) return undefined;
+  const text = (value: unknown): string =>
+    typeof value === "string" ? value.trim() : "";
+
+  const out: BeatActiveCharacter[] = [];
+  for (const entry of raw) {
+    // The model may emit null or bare strings inside the array.
+    if (!entry || typeof entry !== "object") continue;
+    const name = text(entry.name);
+    if (!name || isPovName(name)) continue;
+    const pose = text(entry.pose);
+    out.push(pose ? { name, pose } : { name });
+  }
+  return out.length > 0 ? out : undefined;
+}
+
+/**
+ * Coerces one raw beat into a `Beat`.
+ *
+ * Text fields are used only when they are non-blank strings; any other value
+ * (number, object, null) is treated as absent instead of crashing on
+ * `.trim()`. A missing id defaults to `b<idx+1>`.
+ *
+ * @param raw        beat object from the model
+ * @param idx        zero-based position of the beat in the scene
+ * @param totalBeats number of beats in the scene, used to pick the default
+ *   `continue` target
+ */
+function coerceBeat(raw: RawBeat, idx: number, totalBeats: number): Beat {
+  const text = (value: unknown): string | undefined => {
+    if (typeof value !== "string") return undefined;
+    const trimmed = value.trim();
+    return trimmed || undefined;
+  };
+  const fields: RawBeat = raw && typeof raw === "object" ? raw : {};
+
+  const id = text(fields.id) ?? `b${idx + 1}`;
+  // Non-last beats default their `continue` target to the following beat.
+  // The last beat gets an empty fallback on purpose: repairBeats() turns a
+  // last/dangling continue into a real scene-change exit so the player can
+  // never get stuck self-looping on it.
+  const fallback = idx + 1 < totalBeats ? `b${idx + 2}` : "";
+
+  // Any POV variant (玩家/我/主角/protagonist/...) collapses to "你"; NPC names
+  // pass through unchanged. TTS is skipped for "你" because no Character
+  // record exists for it.
+  const rawSpeaker = text(fields.speaker);
+  const speaker = rawSpeaker ? normalizeSpeakerName(rawSpeaker) : undefined;
+
+  const line = text(fields.line);
+  return {
+    id,
+    narration: text(fields.narration),
+    speaker,
+    line,
+    // lineDelivery only matters for spoken NPC lines (TTS); it is dropped for
+    // the POV speaker and for beats without a line.
+    lineDelivery:
+      line && speaker !== POV_DISPLAY_NAME
+        ? text(fields.lineDelivery)
+        : undefined,
+    activeCharacters: coerceActiveCharacters(fields.activeCharacters),
+    next: coerceNext(fields.next, fallback),
+  };
+}
+
+/** Scene seed used by synthesized exits, where the model provided none. */
+const FALLBACK_SEED = "故事继续推进";
+
+/**
+ * Builds a synthetic "继续" choice that leaves the scene.
+ *
+ * @param beatId id of the beat receiving the exit; the choice id is derived
+ *   from it as `<beatId>__exit`
+ */
+function fallbackExitChoice(beatId: string): BeatChoice {
+  const exitId = `${beatId}__exit`;
+  const exit: BeatChoice = {
+    id: exitId,
+    label: "继续",
+    effect: { kind: "change-scene", nextSceneSeed: FALLBACK_SEED },
+  };
+  return exit;
+}
+
+/**
+ * Makes beat ids unique within a scene.
+ *
+ * Beat ids are graph keys (the front-end's `beats.find(b => b.id === ...)`,
+ * the session's `visitedBeatIds`, and `continue` / `advance-beat` targets). If
+ * the model reuses an id, the second occurrence becomes silently unreachable.
+ * Later duplicates are therefore renamed to `<id>_<n>` and their OWN
+ * self-references rewritten; external references keep pointing at the first
+ * occurrence.
+ *
+ * Every id the model emitted is reserved before any renaming, so a generated
+ * name can never take an id that a later beat legitimately owns. Otherwise
+ * that later beat would be renamed in turn and references to it would land on
+ * the wrong beat.
+ *
+ * @param beats coerced beats in scene order; not mutated
+ * @returns a new array; untouched beats are returned as the same objects
+ */
+function ensureUniqueBeatIds(beats: Beat[]): Beat[] {
+  // Ids that may not be generated: all original ids, plus generated ones.
+  const reserved = new Set(beats.map((b) => b.id));
+  // Ids already claimed by an earlier beat in scene order.
+  const claimed = new Set<string>();
+
+  return beats.map((b): Beat => {
+    if (!claimed.has(b.id)) {
+      claimed.add(b.id);
+      return b;
+    }
+    const oldId = b.id;
+    let n = 2;
+    while (reserved.has(`${oldId}_${n}`)) n += 1;
+    const newId = `${oldId}_${n}`;
+    reserved.add(newId);
+    claimed.add(newId);
+
+    // Only the renamed beat's own self-references follow the rename.
+    let next = b.next;
+    if (next.type === "continue" && next.nextBeatId === oldId) {
+      next = { type: "continue", nextBeatId: newId };
+    } else if (next.type === "choice") {
+      next = {
+        type: "choice",
+        choices: next.choices.map((c) =>
+          c.effect.kind === "advance-beat" && c.effect.targetBeatId === oldId
+            ? {
+                ...c,
+                effect: { kind: "advance-beat" as const, targetBeatId: newId },
+              }
+            : c,
+        ),
+      };
+    }
+    return { ...b, id: newId, next };
+  });
+}
+
+/**
+ * Repairs referential integrity AND guarantees the scene is escapable:
+ *
+ * - a `continue` whose target is missing or is the beat itself is repointed to
+ *   the next beat in order; when there is no next beat it becomes a choice
+ *   holding a single scene-change exit
+ * - an `advance-beat` choice whose target id does not exist is downgraded to a
+ *   scene change
+ * - if no change-scene choice exists anywhere afterwards, one is appended to
+ *   the last beat
+ *
+ * @param beats beats with unique ids, in scene order; not mutated
+ * @returns a new array of repaired beats
+ */
+function repairBeats(beats: Beat[]): Beat[] {
+  const knownIds = new Set(beats.map((beat) => beat.id));
+
+  const repaired: Beat[] = beats.map((beat, position): Beat => {
+    const edge = beat.next;
+
+    if (edge.type !== "continue") {
+      // Choice beat: dangling jumps turn into scene changes.
+      const choices = edge.choices.map((choice) => {
+        const effect = choice.effect;
+        if (effect.kind !== "advance-beat" || knownIds.has(effect.targetBeatId)) {
+          return choice;
+        }
+        return {
+          ...choice,
+          effect: {
+            kind: "change-scene" as const,
+            nextSceneSeed: "未指定（导演引用不存在的 beat，已降级为换场）",
+          },
+        };
+      });
+      return { ...beat, next: { type: "choice", choices } };
+    }
+
+    // Continue beat: keep a valid, non-self target as is.
+    const target = edge.nextBeatId;
+    const targetIsUsable = knownIds.has(target) && target !== beat.id;
+    if (targetIsUsable) return beat;
+
+    const following = beats[position + 1]?.id;
+    if (following) {
+      return { ...beat, next: { type: "continue", nextBeatId: following } };
+    }
+    // Nowhere left to go: exit the scene instead of self-looping.
+    return {
+      ...beat,
+      next: { type: "choice", choices: [fallbackExitChoice(beat.id)] },
+    };
+  });
+
+  const escapable = repaired.some(
+    (beat) =>
+      beat.next.type === "choice" &&
+      beat.next.choices.some((choice) => choice.effect.kind === "change-scene"),
+  );
+  if (escapable || repaired.length === 0) {
+    return repaired;
+  }
+
+  // No exit anywhere: add one to the last beat, keeping its existing choices.
+  const lastIdx = repaired.length - 1;
+  const last = repaired[lastIdx]!;
+  const kept = last.next.type === "choice" ? last.next.choices : [];
+  repaired[lastIdx] = {
+    ...last,
+    next: { type: "choice", choices: [...kept, fallbackExitChoice(last.id)] },
+  };
+  return repaired;
+}
+
+/**
+ * Makes every choice id unique within the scene.
+ *
+ * Choice ids are keys the front-end uses to cache + consume prefetched scenes.
+ * Two beats both defaulting to c1/c2 would make a transition reuse the WRONG
+ * prefetched scene, so a repeated id is renamed to `<id>_<n>` with the
+ * smallest free `n >= 2`.
+ *
+ * Like the other repair passes, this returns fresh objects for anything it
+ * changes instead of mutating the caller's beats and choices in place.
+ *
+ * @param beats beats in scene order; not mutated
+ * @returns a new array; beats without choices are returned as the same objects
+ */
+function ensureUniqueChoiceIds(beats: Beat[]): Beat[] {
+  const seen = new Set<string>();
+  return beats.map((beat): Beat => {
+    if (beat.next.type !== "choice") return beat;
+
+    const choices = beat.next.choices.map((choice) => {
+      let id = choice.id;
+      if (seen.has(id)) {
+        let n = 2;
+        while (seen.has(`${choice.id}_${n}`)) n += 1;
+        id = `${choice.id}_${n}`;
+      }
+      seen.add(id);
+      return id === choice.id ? choice : { ...choice, id };
+    });
+    return { ...beat, next: { type: "choice", choices } };
+  });
+}
+
+/**
+ * Normalizes a model-supplied sceneKey to a lowercase-with-dashes ASCII slug.
+ *
+ * Anything outside `[a-z0-9-]` (中文, spaces, punctuation) collapses into a
+ * single dash, and leading/trailing dashes are removed. When nothing usable
+ * remains, or the value is not a string at all, the result is `undefined` and
+ * the scene simply carries no key for matching against earlier scenes.
+ *
+ * @param raw sceneKey as emitted by the model
+ * @returns the slug, or `undefined`
+ */
+function normalizeSceneKey(raw: string | undefined): string | undefined {
+  // The declared type is only what the prompt asks for; a number or object
+  // from the model would otherwise crash on `.trim()`.
+  if (typeof raw !== "string") return undefined;
+  const slug = raw
+    .trim()
+    .toLowerCase()
+    .replace(/[^a-z0-9-]+/g, "-")
+    .replace(/-+/g, "-")
+    .replace(/^-|-$/g, "");
+  return slug.length > 0 ? slug : undefined;
+}
+
+/**
+ * Reduces an untrusted value to its non-blank, trimmed string entries.
+ *
+ * @returns the cleaned list, or `undefined` when `raw` is not an array or has
+ *   no usable entry
+ */
+function coerceStringArray(raw: unknown): string[] | undefined {
+  if (!Array.isArray(raw)) return undefined;
+  const cleaned = raw.flatMap((entry): string[] => {
+    if (typeof entry !== "string") return [];
+    const trimmed = entry.trim();
+    return trimmed.length > 0 ? [trimmed] : [];
+  });
+  if (cleaned.length === 0) return undefined;
+  return cleaned;
+}
+
+/**
+ * Extracts the volatile story-memory rewrite from the Writer's JSON.
+ *
+ * Only non-empty fields are copied: `synopsis` / `nextHook` must be non-blank
+ * strings, `openThreads` / `relationships` must survive `coerceStringArray`.
+ *
+ * @returns the patch, or `undefined` when `raw` is absent, not an object, or
+ *   contributes nothing, so the director leaves the carried StoryState
+ *   untouched
+ */
+function coerceStoryStatePatch(
+  raw: RawStoryStatePatch | undefined,
+): StoryStatePatch | undefined {
+  if (!raw || typeof raw !== "object") return undefined;
+
+  const readText = (value: unknown): string =>
+    typeof value === "string" ? value.trim() : "";
+  const patch: StoryStatePatch = {};
+
+  const synopsis = readText(raw.synopsis);
+  if (synopsis) patch.synopsis = synopsis;
+
+  const openThreads = coerceStringArray(raw.openThreads);
+  if (openThreads) patch.openThreads = openThreads;
+
+  const relationships = coerceStringArray(raw.relationships);
+  if (relationships) patch.relationships = relationships;
+
+  const nextHook = readText(raw.nextHook);
+  if (nextHook) patch.nextHook = nextHook;
+
+  return Object.keys(patch).length === 0 ? undefined : patch;
+}
+
+/**
+ * Generates the narrative half of one scene: a single LLM call whose JSON is
+ * coerced, de-duplicated and repaired into a playable beat graph.
+ *
+ * The reply is untrusted. A reply that is not an object, beat entries that are
+ * not objects, and non-string `entryBeatId` / `sceneSummary` / `sceneKey`
+ * values are all treated as absent, so the only validation failure surfaced
+ * from here is the explicit "no beats" error below rather than an opaque
+ * TypeError.
+ *
+ * @param config  provider settings forwarded to `chat`
+ * @param session session used to build the Writer prompt
+ * @returns the validated scene; `entryBeatId` always names an existing beat
+ *   (the first beat when the declared one is missing or unknown)
+ * @throws Error("Writer returned no beats") when no usable beat was produced;
+ *   also whatever `chat` / `parseJsonLoose` throw
+ */
+export async function runWriter(
+  config: ProviderConfig,
+  session: Session,
+): Promise<WriterOutput> {
+  const raw = await chat(
+    config,
+    [
+      { role: "system", content: WRITER_SYSTEM },
+      { role: "user", content: buildWriterUserMessage(session) },
+    ],
+    { temperature: 0.9, responseFormat: "json_object" },
+  );
+
+  const parsed = parseJsonLoose<RawScene>(raw);
+  // `null`, a bare string or a number carries no usable fields.
+  const scene: RawScene = parsed && typeof parsed === "object" ? parsed : {};
+  const text = (value: unknown): string =>
+    typeof value === "string" ? value.trim() : "";
+
+  // Drop junk entries (null, strings, numbers) before coercion.
+  const rawBeats = Array.isArray(scene.beats)
+    ? scene.beats.filter((b) => Boolean(b) && typeof b === "object")
+    : [];
+  if (rawBeats.length === 0) {
+    throw new Error("Writer returned no beats");
+  }
+
+  // Order matters: ids are made unique before references are repaired, and
+  // choice ids are de-duplicated last so synthesized exits are covered too.
+  const beats = ensureUniqueChoiceIds(
+    repairBeats(
+      ensureUniqueBeatIds(
+        rawBeats.map((b, i) => coerceBeat(b, i, rawBeats.length)),
+      ),
+    ),
+  );
+
+  const declaredEntry = text(scene.entryBeatId);
+  const entryBeatId =
+    declaredEntry && beats.some((b) => b.id === declaredEntry)
+      ? declaredEntry
+      : beats[0]!.id;
+
+  return {
+    sceneSummary: text(scene.sceneSummary) || "未指定场景概要",
+    sceneKey: normalizeSceneKey(
+      typeof scene.sceneKey === "string" ? scene.sceneKey : undefined,
+    ),
+    entryBeatId,
+    beats,
+    storyStatePatch: coerceStoryStatePatch(scene.storyStatePatch),
+  };
+}
+
+/**
+ * Lists the distinct character names appearing in a scene's beats, in first-
+ * seen order, so the orchestrator can decide which ones need the
+ * CharacterDesigner. Names come from both `speaker` and `activeCharacters`
+ * (a character can be on-screen without speaking).
+ *
+ * POV names ("你" / 玩家 / 主角 / ...) are excluded entirely: the player is
+ * never designed (no portrait, no voice, no archetype).
+ */
+export function collectActiveCharacterNames(beats: Beat[]): string[] {
+  const names = new Set<string>();
+  for (const beat of beats) {
+    const speaker = beat.speaker;
+    if (speaker && !isPovName(speaker)) {
+      names.add(speaker);
+    }
+    for (const onStage of beat.activeCharacters ?? []) {
+      if (isPovName(onStage.name)) continue;
+      names.add(onStage.name);
+    }
+  }
+  return [...names];
+}
+
+// Re-export POV constants for downstream filters (director's orphanSpeakers).
+export { POV_DISPLAY_NAME, POV_VARIANTS, isPovName, normalizeSpeakerName };
@@ -0,0 +1,429 @@
+import { chat } from "@infiplot/ai-client";
+import type {
+  Character,
+  EngineConfig,
+  InsertBeatPartial,
+  ProviderConfig,
+  Scene,
+  Session,
+  StoryState,
+  StoryStatePatch,
+} from "@infiplot/types";
+import type { CharacterCard } from "./agents/characterDesigner";
+import {
+  designCharacterCard,
+  provisionCharacterVoice,
+  provisionVoiceForName,
+  renderCharacterPortrait,
+} from "./agents/characterDesigner";
+import { runCinematographer } from "./agents/cinematographer";
+import { runPainter } from "./agents/painter";
+import {
+  collectActiveCharacterNames,
+  isPovName,
+  normalizeSpeakerName,
+  POV_DISPLAY_NAME,
+  runWriter,
+} from "./agents/writer";
+import { parseJsonLoose } from "./jsonParser";
+import { INSERT_BEAT_SYSTEM, buildInsertBeatUserMessage } from "./prompts";
+
+// ══════════════════════════════════════════════════════════════════════
+//  director.ts — multi-agent orchestrator for one full Scene generation.
+//
+//  Critical path (per Scene call):
+//
+//    Writer LLM (~3s, serial)
+//      │
+//      ├─ CharacterCard LLM × N        (parallel per new char — TEXT only)
+//      ├─ Cinematographer LLM          (parallel with the cards)
+//      │
+//      └─ wait for cards + cinema
+//      │
+//      ├─ entry-beat portraits   ──┐  (block the Painter — its refs)
+//      ▼                           │
+//    Painter — generateImage       │  (overlapped, NOT on the paint path):
+//      with referenceImages        ├─ non-entry-beat portraits
+//      │                           └─ ALL voice provisioning + orphan voices
+//      ▼
+//    await the overlapped work, fold into the registry
+//      │
+//      ▼
+//    return { scene, sceneImageUrl, characters, storyState }
+//
+//  Two deliberate decouplings unlock the parallelism:
+//   1. The Cinematographer only POSITIONS named characters, so it needs no
+//      visualDescription and runs alongside the card LLMs.
+//   2. The Painter only needs visualDescription TEXT (all on-stage) + the
+//      entry-beat characters' PORTRAITS (its referenceImages). Voices are
+//      never needed to paint, and non-entry portraits are never referenced —
+//      so both overlap the (longest) paint call instead of blocking it.
+// ══════════════════════════════════════════════════════════════════════
+
+/**
+ * Builds a scene id of the form `scene_<epoch-ms>_<suffix>`, where the suffix
+ * is up to four base-36 characters taken from `Math.random()`.
+ */
+function newSceneId(): string {
+  const stamp = Date.now();
+  const suffix = Math.random().toString(36).slice(2, 6);
+  return ["scene", stamp, suffix].join("_");
+}
+
+/**
+ * Logs `<label>: <elapsed>ms`, where elapsed is measured from `t0` (an
+ * epoch-millisecond timestamp previously taken with `Date.now()`).
+ */
+function tlog(label: string, t0: number): void {
+  const elapsedMs = Date.now() - t0;
+  console.log(`${label}: ${elapsedMs}ms`);
+}
+
+/**
+ * Folds `updates` into the `existing` character registry, matching by name.
+ *
+ * An update for an unknown name is appended as-is. An update for a known name
+ * replaces the entry, but any voice / visual description / portrait URL /
+ * portrait UUID the update left unset (null or undefined) is carried over
+ * from the previous entry, and an empty `voiceDescription` falls back to the
+ * previous one. This way re-designing a known character can't silently drop
+ * an already-provisioned voice or portrait.
+ *
+ * @param existing - the current registry
+ * @param updates - new or partial character records to fold in
+ * @returns `existing` itself when `updates` is empty, otherwise a new array
+ *   that keeps the registry order and appends unknown names at the end
+ */
+export function mergeCharacters(
+  existing: Character[],
+  updates: Character[],
+): Character[] {
+  if (updates.length === 0) return existing;
+
+  const registry = new Map<string, Character>();
+  for (const known of existing) registry.set(known.name, known);
+
+  for (const incoming of updates) {
+    const previous = registry.get(incoming.name);
+    const merged: Character = previous
+      ? {
+          ...incoming,
+          // Keep any provisioned resource the new design left out.
+          voice: incoming.voice ?? previous.voice,
+          visualDescription:
+            incoming.visualDescription ?? previous.visualDescription,
+          basePortraitUrl: incoming.basePortraitUrl ?? previous.basePortraitUrl,
+          basePortraitUuid:
+            incoming.basePortraitUuid ?? previous.basePortraitUuid,
+          // `||` on purpose: callers pass "" to mean "keep the old text".
+          voiceDescription:
+            incoming.voiceDescription || previous.voiceDescription,
+        }
+      : incoming;
+    registry.set(incoming.name, merged);
+  }
+
+  return [...registry.values()];
+}
+
+/**
+ * Finds the most recent prior scene that shares `currentSceneKey` and returns
+ * a reference to its image, for the Painter to use as one of its
+ * `referenceImages` (NOT as a seedImage, because FLUX.2 [klein] 9B KV does
+ * not support seedImage).
+ *
+ * The URL is preferred over the UUID for the same reason
+ * painter.collectReferenceImages does: the UUID returned by `imageInference`
+ * isn't always recognized by Runware's `referenceImages` pipeline, surfacing
+ * as `failedToTransferImage`, whereas the URL is Runware's own CDN link. The
+ * UUID is kept as a backstop. Matching scenes that carry neither are skipped
+ * and the search continues further back.
+ *
+ * @param session - the session whose history is searched, newest first
+ * @param currentSceneKey - the sceneKey of the scene being generated
+ * @returns the reference and its sceneKey, or `{}` when there is no key or no
+ *   usable prior scene
+ */
+function pickPriorSceneReference(
+  session: Session,
+  currentSceneKey: string | undefined,
+): { priorSceneReference?: string; priorSceneKey?: string } {
+  if (!currentSceneKey) return {};
+
+  let idx = session.history.length;
+  while (idx-- > 0) {
+    const candidate = session.history[idx]!.scene;
+    if (candidate.sceneKey !== currentSceneKey) continue;
+
+    const reference = candidate.imageUrl ?? candidate.imageUuid;
+    if (reference) {
+      return {
+        priorSceneReference: reference,
+        priorSceneKey: candidate.sceneKey,
+      };
+    }
+  }
+
+  return {};
+}
+
+/**
+ * Applies the Writer's volatile story-memory patch on top of the carried
+ * StoryState.
+ *
+ * Only `synopsis`, `openThreads`, `relationships` and `nextHook` can be
+ * overwritten, and only when the patch actually supplies them; the stable
+ * spine (logline / genreTags / protagonist / castNotes) is always preserved.
+ * A missing carried state (legacy session from before the Architect existed)
+ * degrades to an empty spine rather than throwing.
+ *
+ * @param base - the StoryState carried on the session, if any
+ * @param patch - the Writer's patch, if any
+ * @returns the carried (or empty) state when there is no patch, otherwise a
+ *   new merged state
+ */
+function applyStoryStatePatch(
+  base: StoryState | undefined,
+  patch: StoryStatePatch | undefined,
+): StoryState {
+  const carried: StoryState = base ?? {
+    logline: "",
+    genreTags: "",
+    protagonist: "",
+    synopsis: "",
+  };
+  if (!patch) return carried;
+
+  const { synopsis, openThreads, relationships, nextHook } = patch;
+  return {
+    ...carried,
+    synopsis: synopsis ?? carried.synopsis,
+    openThreads: openThreads ?? carried.openThreads,
+    relationships: relationships ?? carried.relationships,
+    nextHook: nextHook ?? carried.nextHook,
+  };
+}
+
+/** Everything `directScene` hands back for one generated scene. */
+export type SceneResult = {
+  /** The assembled scene (id, prompt, beats, entry beat, sceneKey, image refs). */
+  scene: Scene;
+  /** The Painter's image URL for this scene (same value as `scene.imageUrl`). */
+  sceneImageUrl: string;
+  /** The character registry after this scene's new characters were merged in. */
+  characters: Character[];
+  /** The carried StoryState with the Writer's patch applied. */
+  storyState: StoryState;
+};
+
+// ──────────────────────────────────────────────────────────────────────
+//  directScene — the multi-agent pipeline. Used by orchestrator's
+//  startSession and requestScene.
+// ──────────────────────────────────────────────────────────────────────
+
+/**
+ * Generates one full Scene: Writer → (character cards ∥ Cinematographer) →
+ * entry-beat portraits → Painter, with the remaining portraits and all voice
+ * provisioning overlapping the paint call.
+ *
+ * @param config - engine configuration; `config.text` drives the Writer and
+ *   Cinematographer, the whole config is passed to the character and paint
+ *   helpers
+ * @param session - the current session (world, style guide, history,
+ *   character registry, carried story state); it is read, not mutated
+ * @returns the new scene, its image URL, the merged character registry and
+ *   the patched story state
+ * @throws whatever the Writer, Cinematographer, portrait/voice provisioning
+ *   or Painter calls reject with — only `designCharacterCard` failures are
+ *   caught here and replaced by a fallback card
+ */
+export async function directScene(
+  config: EngineConfig,
+  session: Session,
+): Promise<SceneResult> {
+  const tTotal = Date.now();
+
+  // Stage 1 — Writer (serial; everything downstream needs sceneSummary +
+  // beats[] to know who's on stage and what to compose around).
+  const tWriter = Date.now();
+  const writerOut = await runWriter(config.text, session);
+  tlog("[directScene] Writer", tWriter);
+
+  // Identify NEW characters introduced by this scene that need to be
+  // designed (LLM + portrait + voice). Existing characters in the registry
+  // are skipped — their cards / portraits / voices persist across scenes.
+  const allActiveNames = collectActiveCharacterNames(writerOut.beats);
+  const newCharNames = allActiveNames.filter(
+    (n) => !session.characters.some((c) => c.name === n),
+  );
+
+  // Find the entry beat for the Cinematographer (which characters are
+  // on-screen in the establishing shot).
+  const entryBeat = writerOut.beats.find((b) => b.id === writerOut.entryBeatId);
+  const entryBeatActive = entryBeat?.activeCharacters ?? [];
+
+  // For sceneKey-based visual continuity, look up the prior matching scene's
+  // image to slot into Painter's referenceImages (max 4 of which include
+  // character portraits too).
+  const { priorSceneReference, priorSceneKey } = pickPriorSceneReference(
+    session,
+    writerOut.sceneKey,
+  );
+
+  // ── Stage 2 — character cards (LLM) ∥ Cinematographer ──────────────────
+  // Both are cheap LLM calls and neither needs the other's output, so they
+  // run concurrently. The cards give us each new character's visualDescription
+  // TEXT; portraits + voices are deferred to Stage 3 so they can overlap the
+  // paint instead of blocking it.
+  const tParallel = Date.now();
+
+  const cardPromises = newCharNames.map((name) =>
+    designCharacterCard(config, session, name).catch((err): CharacterCard => {
+      const msg = err instanceof Error ? err.message : String(err);
+      console.error(`[directScene] designCharacterCard(${name}) failed: ${msg}`);
+      // Last-resort fallback: a name + generic voice card so the speaker isn't
+      // unknown. No visualDescription → no portrait is attempted for them.
+      return {
+        name,
+        voiceDescription: `请根据角色名「${name}」推断其性别、年龄与气质。所属世界观：${session.worldSetting}`,
+      };
+    }),
+  );
+
+  const cinemaPromise = runCinematographer(config.text, {
+    sceneSummary: writerOut.sceneSummary,
+    styleGuide: session.styleGuide,
+    entryBeatActive,
+    entryBeatSpeaker: entryBeat?.speaker,
+    priorSceneKey,
+    currentSceneKey: writerOut.sceneKey,
+  });
+
+  const [cards, cinemaOut] = await Promise.all([
+    Promise.all(cardPromises),
+    cinemaPromise,
+  ]);
+  tlog("[directScene] CharacterCards+Cinematographer parallel", tParallel);
+
+  // Working registry: existing characters + new cards. visualDescription text
+  // is present now; portraits + voices fill in over the next two phases.
+  let characters = mergeCharacters(
+    session.characters,
+    cards.map((c) => ({
+      name: c.name,
+      voiceDescription: c.voiceDescription,
+      visualDescription: c.visualDescription,
+    })),
+  );
+
+  // ── Stage 3 — portraits + voices, scheduled around the Painter ─────────
+  const tProvision = Date.now();
+
+  // Entry-beat character names: the ONLY portraits the Painter references
+  // (collectReferenceImages slots in the entry beat's speaker + activeChars).
+  const entryNames = new Set<string>();
+  if (entryBeat?.speaker && !isPovName(entryBeat.speaker)) {
+    entryNames.add(entryBeat.speaker);
+  }
+  for (const c of entryBeatActive) {
+    if (!isPovName(c.name)) entryNames.add(c.name);
+  }
+
+  type NamedPortrait = {
+    name: string;
+    basePortraitUrl?: string;
+    basePortraitUuid?: string;
+  };
+  // Kick off portrait gen for every NEW char that has a visualDescription.
+  // Entry-beat portraits block the Painter; the rest overlap it.
+  const entryPortraitPromises: Promise<NamedPortrait>[] = [];
+  const restPortraitPromises: Promise<NamedPortrait>[] = [];
+  for (const card of cards) {
+    const vd = card.visualDescription;
+    if (!vd) continue;
+    const p = renderCharacterPortrait(
+      config,
+      card.name,
+      vd,
+      session.styleGuide,
+    ).then((res): NamedPortrait => ({ name: card.name, ...res }));
+    (entryNames.has(card.name) ? entryPortraitPromises : restPortraitPromises).push(p);
+  }
+
+  // Kick off voice provisioning for every NEW char (never on the paint path).
+  const voicePromises = cards.map((card) =>
+    provisionCharacterVoice(config, card.voiceDescription, card.name).then(
+      (voice): Character => ({
+        name: card.name,
+        voiceDescription: card.voiceDescription,
+        voice,
+      }),
+    ),
+  );
+
+  // Edge case: a speaker the Writer referenced without listing in any beat's
+  // activeCharacters. collectActiveCharacterNames already includes speakers,
+  // so this is a rare defensive net. Provision a voice only (never on-screen).
+  const speakerNames = new Set(
+    writerOut.beats.map((b) => b.speaker).filter((n): n is string => Boolean(n)),
+  );
+  const orphanSpeakers = [...speakerNames].filter(
+    // Pattern B: "你" (player) is a valid speaker but never gets a Character
+    // record — TTS is intentionally skipped on the client.
+    (n) =>
+      !isPovName(n) &&
+      !characters.some((c) => c.name === n) &&
+      !cards.some((c) => c.name === n),
+  );
+  const orphanPromises = orphanSpeakers.map((n) =>
+    provisionVoiceForName(config, session, n),
+  );
+
+  // Bundle everything that overlaps the paint into ONE promise right away.
+  // These promises stay un-awaited for the whole entry-portrait + Painter
+  // stretch; with no handler attached, a rejection in any of them during that
+  // window (or after the Painter itself has thrown) would surface as an
+  // unhandledRejection — fatal under Node's default mode — instead of as an
+  // ordinary error from this function. The no-op catch only marks the bundle
+  // as handled; `await overlappedWork` below still rethrows the failure.
+  const overlappedWork = Promise.all([
+    Promise.all(restPortraitPromises),
+    Promise.all(voicePromises),
+    Promise.all(orphanPromises),
+  ]);
+  overlappedWork.catch(() => {
+    // Intentionally empty — the rejection is observed at the await below.
+  });
+
+  // Block the Painter ONLY on entry-beat portraits (its referenceImages).
+  const entryPortraits = await Promise.all(entryPortraitPromises);
+  characters = mergeCharacters(
+    characters,
+    entryPortraits.map((p) => ({
+      name: p.name,
+      voiceDescription: "", // preserved from the card by mergeCharacters
+      basePortraitUrl: p.basePortraitUrl,
+      basePortraitUuid: p.basePortraitUuid,
+    })),
+  );
+  tlog("[directScene] entry-beat portraits", tProvision);
+
+  // ── Stage 4 — Painter (depends on cinemaOut + on-stage visual cards +
+  // entry portraits). On-stage = everyone named in any beat, so the archetype
+  // block covers anyone the player might encounter in this scene.
+  const onStageCharacters = characters.filter((c) =>
+    allActiveNames.includes(c.name),
+  );
+
+  const tPainter = Date.now();
+  const painted = await runPainter(
+    config,
+    {
+      integratedPrompt: cinemaOut.integratedPrompt,
+      styleGuide: session.styleGuide,
+      onStageCharacters,
+      priorSceneImage: priorSceneReference,
+    },
+    entryBeat,
+  );
+  tlog("[directScene] Painter", tPainter);
+
+  // Fold in the work that overlapped the paint: remaining portraits, all
+  // voices, and any orphan-speaker voices. Awaited before returning so the
+  // session the client persists is fully provisioned for later scenes.
+  const tOverlap = Date.now();
+  const [restPortraits, voicedChars, orphanChars] = await overlappedWork;
+  characters = mergeCharacters(
+    characters,
+    restPortraits.map((p) => ({
+      name: p.name,
+      voiceDescription: "", // preserved from the card by mergeCharacters
+      basePortraitUrl: p.basePortraitUrl,
+      basePortraitUuid: p.basePortraitUuid,
+    })),
+  );
+  characters = mergeCharacters(characters, voicedChars);
+  if (orphanChars.length > 0) {
+    characters = mergeCharacters(characters, orphanChars);
+  }
+  tlog("[directScene] overlapped portraits+voices", tOverlap);
+
+  const scene: Scene = {
+    id: newSceneId(),
+    // scenePrompt is the cinematographer's English compositional output;
+    // the Writer's sceneSummary stays in the session log via beats[]/
+    // history. Keeping the original field name preserves compat with
+    // anything that already reads scene.scenePrompt (e.g., insert-beat
+    // user prompt).
+    scenePrompt: cinemaOut.integratedPrompt,
+    beats: writerOut.beats,
+    entryBeatId: writerOut.entryBeatId,
+    sceneKey: writerOut.sceneKey,
+    imageUuid: painted.kind === "real" ? painted.imageUuid : undefined,
+    imageUrl: painted.imageUrl,
+  };
+
+  // Merge the Writer's volatile memory rewrite onto the carried bible so the
+  // throughline survives the next scene cut (orchestrator returns it; the
+  // client persists it back into the session).
+  const storyState = applyStoryStatePatch(
+    session.storyState,
+    writerOut.storyStatePatch,
+  );
+
+  tlog("[directScene] TOTAL", tTotal);
+
+  return { scene, sceneImageUrl: painted.imageUrl, characters, storyState };
+}
+
+// ──────────────────────────────────────────────────────────────────────
+//  directInsertBeat — single-agent path for vision-driven in-scene
+//  exploration. Generates ONE transient beat with NO new image, NO new
+//  characters. Multi-agent pipeline doesn't apply here (no rendering, no
+//  character introduction allowed by the prompt).
+// ──────────────────────────────────────────────────────────────────────
+
+/**
+ * Asks the text model for one transient beat in response to a free-form
+ * player action and coerces the reply into an `InsertBeatPartial`.
+ *
+ * The model's JSON is treated as untrusted: a non-object root or a non-string
+ * field is read as "field absent" instead of throwing, so a malformed reply
+ * lands on the fallback narration rather than a TypeError.
+ *
+ * @param config - provider configuration for the text model
+ * @param session - the current session, used to build the user message
+ * @param freeformAction - the player's free-form action text
+ * @returns the trimmed narration / speaker / line / lineDelivery, or a
+ *   fallback narration when the reply carries none of the first three
+ * @throws whatever `chat` rejects with, or `parseJsonLoose` throws when the
+ *   reply cannot be parsed as JSON at all
+ */
+export async function directInsertBeat(
+  config: ProviderConfig,
+  session: Session,
+  freeformAction: string,
+): Promise<InsertBeatPartial> {
+  const raw = await chat(
+    config,
+    [
+      { role: "system", content: INSERT_BEAT_SYSTEM },
+      {
+        role: "user",
+        content: buildInsertBeatUserMessage(session, freeformAction),
+      },
+    ],
+    { temperature: 0.9, responseFormat: "json_object" },
+  );
+
+  // LLM output is untyped at runtime: the root may be null / an array / a
+  // scalar, and any field may be a number or object. Narrow before use.
+  const parsed = parseJsonLoose<unknown>(raw);
+  const fields: Record<string, unknown> =
+    typeof parsed === "object" && parsed !== null
+      ? (parsed as Record<string, unknown>)
+      : {};
+  const textField = (
+    key: "narration" | "speaker" | "line" | "lineDelivery",
+  ): string | undefined => {
+    const value = fields[key];
+    return typeof value === "string" ? value.trim() || undefined : undefined;
+  };
+
+  const narration = textField("narration");
+  const rawSpeaker = textField("speaker");
+  // Pattern B (mirrors Writer): normalize POV variants → "你"; NPCs pass through.
+  const speaker = rawSpeaker ? normalizeSpeakerName(rawSpeaker) : undefined;
+  const line = textField("line");
+  // lineDelivery is only meaningful for NPC speakers (TTS). For POV ("你")
+  // TTS is intentionally skipped on the client, so lineDelivery is dropped.
+  const lineDelivery =
+    line && speaker !== POV_DISPLAY_NAME ? textField("lineDelivery") : undefined;
+
+  if (!narration && !speaker && !line) {
+    return { narration: "（你停下脚步，环视片刻。）" };
+  }
+  return { narration, speaker, line, lineDelivery };
+}
@@ -0,0 +1,15 @@
+export {
+  startSession,
+  requestScene,
+  visionDecide,
+  requestInsertBeat,
+  requestBeatAudio,
+} from "./orchestrator";
+export { synthesizeBeat } from "./voice";
+export { mergeCharacters } from "./director";
+export type { SceneResult } from "./director";
+export { runArchitect } from "./agents/architect";
+export type { WriterOutput } from "./agents/writer";
+export type { CinematographerOutput } from "./agents/cinematographer";
+export type { InsertBeatPartial } from "@infiplot/types";
+export * from "./prompts";
@@ -0,0 +1,95 @@
+import { jsonrepair, JSONRepairError } from "jsonrepair";
+
+// Strict-then-forgiving JSON parser for LLM output. Tries in order:
+//   1. Direct JSON.parse on the trimmed text.
+//   2. Extract from ```json``` fenced block.
+//   3. Slice between first { and last } and parse.
+//   4. Apply targeted regex pre-repairs (see preRepair) and try jsonrepair.
+//
+// On final failure, logs the first 800 chars of the raw model output so we
+// can diagnose the actual syntax error without flooding logs or leaking
+// sensitive content.
+//
+// jsonrepair (npm package josdejong/jsonrepair — 2.3k+ stars) handles the
+// broad LLM-output failure modes: truncated JSON, missing commas/brackets,
+// single quotes, Python None/True/False, JS comments. We layer a small set
+// of targeted pre-repairs in front of it for failure modes jsonrepair can't
+// disambiguate on its own (see preRepair).
+
+// ──────────────────────────────────────────────────────────────────────
+//  preRepair — fix specific LLM error patterns before handing to jsonrepair.
+//
+//  Pattern 1: missing closing quote on a key.
+//     Broken:  "lineDelivery: "语速稍快...",
+//     Correct: "lineDelivery": "语速稍快...",
+//
+//  jsonrepair fails on this because it's ambiguous — "lineDelivery: " could
+//  be a complete string value, leaving "语速稍快..." as a syntax error. But
+//  if we see  "<key-like>:<whitespace>"  we know structurally it should be
+//  a key-colon-value triplet.
+//
+//  Match constraints:
+//    - The key match excludes  "  \n  :  so we can't overrun into adjacent
+//      fields or absorb the colon as part of the key name.
+//    - The key match also excludes  ,  {  }  [  ]  . Without this, a match
+//      can START at the closing quote of the previous value and swallow
+//      `, foo` from  "v", foo: "bar"  (or `, 'b'` from  "v", 'b': "w"),
+//      inserting a stray quote into an unquoted / single-quoted key that
+//      jsonrepair would otherwise have fixed on its own.
+//    - The colon must be followed by whitespace and another  "  (the value
+//      string's opening quote). This is what disambiguates from a value
+//      string that happens to contain a colon.
+// ──────────────────────────────────────────────────────────────────────
+
+/**
+ * Re-inserts the missing closing quote on keys of the form `"key: "value"`.
+ *
+ * @param s - candidate JSON text that already failed `JSON.parse`
+ * @returns `s` with every such key rewritten to `"key": "value"`; text that
+ *   does not contain the pattern is returned unchanged
+ */
+function preRepair(s: string): string {
+  return s.replace(/"([^"\n:,{}\[\]]+):(\s+)"/g, '"$1":$2"');
+}
+
+/**
+ * Parses LLM output as JSON, trying progressively more forgiving strategies:
+ *   1. `JSON.parse` on the trimmed text;
+ *   2. `JSON.parse` on the body of the first ``` / ```json fenced block;
+ *   3. `JSON.parse` on the span from the first `{` to the last `}` (or the
+ *      trimmed text again when no such span exists);
+ *   4. `preRepair` on that span, then `JSON.parse`, then `jsonrepair`.
+ *
+ * The result is cast to `T` without validation — callers must treat the shape
+ * as untrusted.
+ *
+ * @param raw - the raw model output
+ * @returns the first successfully parsed value, cast to `T`
+ * @throws the error from `jsonrepair` / the final `JSON.parse` when every
+ *   strategy fails; the first 800 chars of `raw` are logged first
+ */
+export function parseJsonLoose<T>(raw: string): T {
+  const text = raw.trim();
+
+  // 1. Strict parse.
+  try {
+    return JSON.parse(text) as T;
+  } catch {
+    // not plain JSON — keep going
+  }
+
+  // 2. Fenced code block.
+  const fencedBody = text.match(/```(?:json)?\s*([\s\S]*?)\s*```/)?.[1];
+  if (fencedBody) {
+    try {
+      return JSON.parse(fencedBody) as T;
+    } catch {
+      // fence content isn't valid JSON — keep going
+    }
+  }
+
+  // 3. Outermost brace span (falls back to the whole text without braces).
+  const open = text.indexOf("{");
+  const close = text.lastIndexOf("}");
+  const hasBraceSpan = open !== -1 && close > open;
+  const candidate = hasBraceSpan ? text.slice(open, close + 1) : text;
+  try {
+    return JSON.parse(candidate) as T;
+  } catch {
+    // still broken — move on to the repair passes
+  }
+
+  // 4a. Targeted pre-repair; retry the cheap parse only if it changed anything.
+  const patched = preRepair(candidate);
+  if (patched !== candidate) {
+    try {
+      return JSON.parse(patched) as T;
+    } catch {
+      // fall through to jsonrepair
+    }
+  }
+
+  // 4b. General-purpose repair as the last resort.
+  try {
+    return JSON.parse(jsonrepair(patched)) as T;
+  } catch (err) {
+    const isRepairErr = err instanceof JSONRepairError;
+    console.error(
+      `[parseJsonLoose] jsonrepair ${isRepairErr ? "could not repair" : "succeeded but JSON.parse rejected its output"}. Raw output (first 800 chars):\n${raw.slice(0, 800)}`,
+    );
+    throw err;
+  }
+}
@@ -0,0 +1,25 @@
+// Static SVG placeholder used when MOCK_IMAGE=true, so we can exercise the
+// TTS path without paying for image generation. Returned as a data URI so the
+// rest of the pipeline can treat it as an `imageUrl` interchangeably with
+// real Runware URLs (the client's <img src> accepts both, and we never feed
+// a mock image to Runware's referenceImages because mockImage mode
+// short-circuits the Painter entirely).
+//
+// Previously rendered to PNG via sharp; switched to a self-describing SVG
+// data URI so the engine has zero Node-native dependencies and runs on
+// Cloudflare Workers. SVG also stays crisp at any display size.
+
+// Pixel dimensions declared on the placeholder <svg> element.
+const W = 1792;
+const H = 1024;
+// Placeholder artwork: dark background, dashed inset border, and two centered
+// captions. Built once at module load.
+const SVG = `<svg xmlns="http://www.w3.org/2000/svg" width="${W}" height="${H}">
+  <rect width="${W}" height="${H}" fill="#161109"/>
+  <rect x="2" y="2" width="${W - 4}" height="${H - 4}" fill="none" stroke="#5a4628" stroke-width="3" stroke-dasharray="14 10"/>
+  <text x="50%" y="45%" fill="#b88f4a" font-family="Georgia, serif" font-size="72" letter-spacing="6" text-anchor="middle">MOCK IMAGE</text>
+  <text x="50%" y="53%" fill="#6e5430" font-family="Georgia, serif" font-size="30" letter-spacing="3" text-anchor="middle">TTS TEST — image generation skipped</text>
+</svg>`;
+
+// Percent-encoded (not base64) data URI of the SVG above, computed once.
+const DATA_URI = `data:image/svg+xml;charset=utf-8,${encodeURIComponent(SVG)}`;
+
+/**
+ * Returns the static placeholder image as a data URI.
+ *
+ * @returns a promise resolving to the same precomputed `data:image/svg+xml`
+ *   string on every call
+ */
+export async function mockImageDataUri(): Promise<string> {
+  return DATA_URI;
+}
@@ -0,0 +1,180 @@
+import type {
+  BeatAudioRequest,
+  BeatAudioResponse,
+  EngineConfig,
+  InsertBeatRequest,
+  InsertBeatResponse,
+  Session,
+  SceneRequest,
+  SceneResponse,
+  StartRequest,
+  StartResponse,
+  VisionRequest,
+  VisionResponse,
+} from "@infiplot/types";
+import { runArchitect } from "./agents/architect";
+import { directInsertBeat, directScene } from "./director";
+import { synthesizeBeat } from "./voice";
+import { interpret } from "./vision";
+
+/**
+ * Builds a session id of the form `s_<epoch-ms>_<suffix>`, where the suffix
+ * is up to six base-36 characters taken from `Math.random()`.
+ */
+function newSessionId(): string {
+  const stamp = Date.now();
+  const suffix = Math.random().toString(36).slice(2, 8);
+  return ["s", stamp, suffix].join("_");
+}
+
+/**
+ * Logs `<label>: <elapsed>ms`, where elapsed is measured from `t0` (an
+ * epoch-millisecond timestamp previously taken with `Date.now()`).
+ */
+function tlog(label: string, t0: number): void {
+  const elapsedMs = Date.now() - t0;
+  console.log(`${label}: ${elapsedMs}ms`);
+}
+
+// ──────────────────────────────────────────────────────────────────────
+//  startSession — creates a fresh session, seeds its story bible with the
+//  Architect, then generates the initial Scene through directScene.
+//
+//  Character cards, portraits, voices, the Cinematographer and the Painter
+//  are all coordinated by directScene, so nothing here schedules them.
+// ──────────────────────────────────────────────────────────────────────
+
+/**
+ * Starts a new session and produces its opening scene.
+ *
+ * @param config - engine configuration; `config.text` drives the Architect
+ * @param req - the start request carrying the world setting and style guide
+ *   (both are trimmed before use)
+ * @returns the new session id plus the opening scene, its image URL, the
+ *   character registry and the story state returned by `directScene`
+ */
+export async function startSession(
+  config: EngineConfig,
+  req: StartRequest,
+): Promise<StartResponse> {
+  const startedAt = Date.now();
+
+  const session: Session = {
+    id: newSessionId(),
+    createdAt: Date.now(),
+    worldSetting: req.worldSetting.trim(),
+    styleGuide: req.styleGuide.trim(),
+    history: [],
+    characters: [],
+  };
+
+  // Stage 0 — Architect. It has to finish before the first scene because the
+  // opening Writer reads session.storyState; the bible it writes gives the
+  // story a spine from the very first beat.
+  const architectStartedAt = Date.now();
+  const bible = await runArchitect(config.text, session);
+  session.storyState = bible;
+  tlog("[start] Architect", architectStartedAt);
+
+  const opening = await directScene(config, session);
+
+  tlog("[start] TOTAL", startedAt);
+
+  return {
+    sessionId: session.id,
+    scene: opening.scene,
+    imageUrl: opening.sceneImageUrl,
+    characters: opening.characters,
+    storyState: opening.storyState,
+  };
+}
+
+// ──────────────────────────────────────────────────────────────────────
+//  requestScene — generates the next Scene for an existing session.
+// ──────────────────────────────────────────────────────────────────────
+
+/**
+ * Runs `directScene` on the session carried in the request and reshapes the
+ * result into a `SceneResponse`.
+ *
+ * @param config - engine configuration, passed through to `directScene`
+ * @param req - the scene request; only `req.session` is read here
+ * @returns the new scene, its image URL, the character registry and the
+ *   story state returned by `directScene`
+ */
+export async function requestScene(
+  config: EngineConfig,
+  req: SceneRequest,
+): Promise<SceneResponse> {
+  const startedAt = Date.now();
+
+  const next = await directScene(config, req.session);
+
+  tlog("[scene] TOTAL", startedAt);
+
+  return {
+    scene: next.scene,
+    imageUrl: next.sceneImageUrl,
+    characters: next.characters,
+    storyState: next.storyState,
+  };
+}
+
+// ──────────────────────────────────────────────────────────────────────
+//  visionDecide — hands a background click (annotated screenshot) to the
+//  vision interpreter together with the session's latest scene. Vision
+//  lives outside the scene-generation pipeline.
+// ──────────────────────────────────────────────────────────────────────
+
+/**
+ * Interprets an annotated image against the most recent scene in history.
+ *
+ * @param config - engine configuration; `config.vision` is passed to
+ *   `interpret`
+ * @param req - the vision request (session + base64 annotated image)
+ * @returns whatever `interpret` resolves to; the scene argument is `null`
+ *   when the session history is empty
+ */
+export async function visionDecide(
+  config: EngineConfig,
+  req: VisionRequest,
+): Promise<VisionResponse> {
+  const { history } = req.session;
+  const latestEntry =
+    history.length > 0 ? history[history.length - 1] : undefined;
+  const currentScene = latestEntry?.scene ?? null;
+  return interpret(config.vision, req.annotatedImageBase64, currentScene);
+}
+
+// ──────────────────────────────────────────────────────────────────────
+//  requestInsertBeat — single-agent transient beat (no image, no new
+//  characters). Stays single-LLM by design — the INSERT_BEAT prompt
+//  forbids new characters and there's nothing to render.
+// ──────────────────────────────────────────────────────────────────────
+
+/**
+ * Generates one transient beat for a free-form player action and enforces
+ * the "no new NPCs" rule on the result.
+ *
+ * @param config - engine configuration; `config.text` drives the beat LLM
+ * @param req - the insert-beat request (session + free-form action)
+ * @returns the beat plus the session's unchanged character registry. When
+ *   the beat names a speaker that is neither "你" nor a registered character,
+ *   the speaker is dropped and its line is folded into the narration.
+ */
+export async function requestInsertBeat(
+  config: EngineConfig,
+  req: InsertBeatRequest,
+): Promise<InsertBeatResponse> {
+  const startedAt = Date.now();
+  const { characters } = req.session;
+
+  const partial = await directInsertBeat(
+    config.text,
+    req.session,
+    req.freeformAction,
+  );
+
+  // A speaker is acceptable when there is none, when it is the player, or
+  // when it is already registered.
+  //
+  // Pattern B: speaker = "你" is the player speaking. No Character record
+  // exists for "你" (intentional — TTS is skipped), so it must NOT be demoted;
+  // the client renders the dialog box correctly. directInsertBeat has already
+  // normalized POV variants to "你", so a literal "你" here is always player
+  // dialog.
+  const { speaker } = partial;
+  const speakerAllowed =
+    !speaker ||
+    speaker === "你" ||
+    characters.some((c) => c.name === speaker);
+
+  if (speakerAllowed) {
+    tlog("[insert-beat] TOTAL", startedAt);
+    return { partial, characters };
+  }
+
+  // INSERT_BEAT prompt forbids new NPCs — promote the disallowed speaker's
+  // line to narration so the player still sees the text (the client only
+  // renders `line` when there is a `speaker`).
+  console.warn(
+    `[insert-beat] unregistered speaker "${speaker}" ignored`,
+  );
+  const visibleText = [partial.narration, partial.line].filter(Boolean);
+  const promotedNarration = visibleText.join("\n") || undefined;
+  tlog("[insert-beat] TOTAL", startedAt);
+  return {
+    partial: {
+      narration: promotedNarration,
+      speaker: undefined,
+      line: undefined,
+      lineDelivery: undefined,
+    },
+    characters,
+  };
+}
+
+// ──────────────────────────────────────────────────────────────────────
+//  requestBeatAudio — lazy per-beat synth. Returns audio:null when TTS is
+//  not configured, so the client just plays silent.
+//  NOTE(review): null-on-timeout/failure is presumably handled inside
+//  synthesizeBeat — nothing here catches a rejection; confirm in ./voice.
+// ──────────────────────────────────────────────────────────────────────
+
+/**
+ * Synthesizes audio for a single beat on demand.
+ *
+ * @param config - engine configuration; `config.tts` may be absent
+ * @param req - the audio request carrying the voice and the beat
+ * @returns `{ audio: null }` when `config.tts` is falsy, otherwise
+ *   `{ audio }` with whatever `synthesizeBeat` resolves to
+ */
+export async function requestBeatAudio(
+  config: EngineConfig,
+  req: BeatAudioRequest,
+): Promise<BeatAudioResponse> {
+  const { tts } = config;
+  if (!tts) {
+    return { audio: null };
+  }
+  return { audio: await synthesizeBeat(tts, req.voice, req.beat) };
+}
@@ -0,0 +1,739 @@
+import type {
+  BeatActiveCharacter,
+  Character,
+  Scene,
+  Session,
+  StoryState,
+} from "@infiplot/types";
+
+// ══════════════════════════════════════════════════════════════════════
+//  Multi-agent scene generation pipeline:
+//    Architect (总编剧)    — ONE-TIME at session start: the story bible
+//                           (protagonist / logline / genre / opening hook /
+//                            planned cast) → seeds StoryState
+//    Writer (编剧)         — narrative + beats[] + per-beat activeCharacters,
+//                           reads StoryState and emits a StoryStatePatch
+//    CharacterDesigner    — per-new-character visual + voice cards
+//    Cinematographer (分镜导演) — sceneKey + English compositional prompt
+//    Painter (画师)        — FLUX rendering with character archetypes
+//
+//  Each agent owns one system prompt + one user-message builder below.
+//  All agents see the same world / style guide, but each only reads the
+//  slice of session state it needs to make its decision.
+// ══════════════════════════════════════════════════════════════════════
+
+// ──────────────────────────────────────────────────────────────────────
+//  Shared — render the StoryState bible into a compact prompt block read
+//  by the Writer (and Architect, on revisions). Keeping one renderer means
+//  the bible looks identical to every agent that consumes it.
+// ──────────────────────────────────────────────────────────────────────
+
+/**
+ * Renders the story bible into the compact text block embedded in agent
+ * prompts.
+ *
+ * Only populated fields are emitted, each on its own labelled line;
+ * `relationships` and `openThreads` are rendered as "- item" bullet lists.
+ *
+ * @param s - Current story state, or `undefined` when none exists yet.
+ * @returns The rendered block, or `""` when `s` is missing or carries no
+ *   populated field. A header with nothing beneath it would only add noise
+ *   to the prompt, and an empty string lets callers omit the section.
+ */
+export function renderStoryState(s: StoryState | undefined): string {
+  if (!s) return "";
+
+  const bulletList = (items: readonly string[]): string =>
+    items.map((item) => `- ${item}`).join("\n");
+
+  const body: string[] = [];
+  if (s.logline) body.push(`主线（中心钩子）：${s.logline}`);
+  if (s.genreTags) body.push(`题材基调：${s.genreTags}`);
+  if (s.protagonist) body.push(`主角「你」：${s.protagonist}`);
+  if (s.castNotes) body.push(`核心配角：\n${s.castNotes}`);
+  if (s.synopsis) body.push(`已发生（梗概）：${s.synopsis}`);
+  if (s.relationships?.length) {
+    body.push(`当前关系/情绪：\n${bulletList(s.relationships)}`);
+  }
+  if (s.openThreads?.length) {
+    body.push(`未收的悬念/伏笔：\n${bulletList(s.openThreads)}`);
+  }
+  if (s.nextHook) body.push(`接下来要往哪走（下一个钩子方向）：${s.nextHook}`);
+
+  // Nothing to report: do not emit a dangling header.
+  if (body.length === 0) return "";
+  return ["【故事档案 / 主线记忆】", ...body].join("\n");
+}
+
+// ──────────────────────────────────────────────────────────────────────
+//  0. Architect (总编剧) — ONE LLM call at session start.
+//
+//  Turns the (often terse) user world + style prompt into a real story
+//  bible: a second-person protagonist with a want and a flaw, a single
+//  central dramatic question, a genre frame that anchors the 爽点 rhythm,
+//  an engineered opening hook (前3秒冷开场), and a small intentional cast.
+//  Everything downstream — Writer, CharacterDesigner — reads this so the
+//  story has a spine from beat one instead of being improvised cold.
+// ──────────────────────────────────────────────────────────────────────
+
+/**
+ * System prompt for the Architect agent. It instructs the model to expand the
+ * player's world + style input into a story bible and to answer with strict
+ * JSON containing logline, genreTags, protagonist, castNotes, synopsis,
+ * openThreads and nextHook.
+ */
+export const ARCHITECT_SYSTEM = `你是一部交互视觉小说的「总编剧 / 故事架构师」。玩家只给了你一句到几句的世界观和画风，你要在开拍前把它扩写成一份**故事档案（story bible）**，为后续每一幕定下脊梁。你不写具体台词、不写分镜、不设计立绘——你只搭骨架。
+
+你深谙网文（番茄）、短剧（红果）与视觉小说（galgame）的爆款心法：
+- **开篇即钩子**：黄金三章 / 前3秒法则。开场不铺垫世界观，直接抛出冲突、悬念或一个反常的瞬间。
+- **代入感**：主角是第二人称「你」，是玩家的化身——要让玩家一进场就清楚"我是谁、我此刻卡在什么处境里、我想要什么"。
+- **题材锚定爽点**：先选定一个清晰的题材框架（如 甜宠 / 校园暗恋 / 悬疑追凶 / 复仇逆袭 / 救赎治愈），它决定了情绪回报的节奏与类型。
+- **戏剧问题**：整部故事由一个悬而未决的中心问题驱动（她到底是谁？你能否在记忆消失前查明真相？这场暗恋会走向哪里？）。
+- **人设要鲜明且有反差**：每个核心角色一个强标签 + 一个反差面（外冷内热 / 傲娇 / 看似柔弱实则腹黑）。
+
+你要产出（全部用中文，不需要英文）：
+- logline：一句话主线 / 中心戏剧问题，必须带钩子，让人想看下去
+- genreTags：题材+基调标签，斜杠分隔，如 "甜宠 / 校园 / 慢热治愈带点伤感"
+- protagonist：第二人称主角卡。包含：你是谁、你此刻正卡在什么具体处境里（要有即时张力）、你想要什么、一个软肋或秘密。50–120 字。
+- castNotes：2–3 个核心配角，每行一个「名字：一句话人设（强标签+反差）+ 与你的关系/张力」。给真实好记的中文名字（不要"神秘女子"这种占位）。
+- synopsis：开场此刻的情境梗概（故事尚未展开，就写"故事从……开始"），1–3 句。
+- openThreads：开场就埋下的 1–3 个悬念/问题（数组）。
+- nextHook：**第一幕**应当如何冷开场——具体描述开场那个抓人的瞬间/冲突（这会直接指导编剧写开场）。要画面感强、有张力。
+
+设计硬规则：
+- 主角「你」永不出现在画面里（第二人称 POV），所以 castNotes 里**不要**把"你/主角"当成一个角色。
+- 配角名字要符合世界观（年代、地域、文化）。
+- 一切服从玩家给的世界观与画风，不要擅自跑题；玩家信息少时，做最贴合、最有戏的合理扩写。
+
+必须输出严格 JSON：
+{
+  "logline": "...",
+  "genreTags": "...",
+  "protagonist": "...",
+  "castNotes": "夏海：表面开朗的天台诗人，实则在用诗逃避家里的变故；与你是同班转学的邻座，对你有种说不清的在意。\\n班主任老周：…",
+  "synopsis": "...",
+  "openThreads": ["...", "..."],
+  "nextHook": "第一幕冷开场：……"
+}
+
+不要输出 JSON 以外的任何文本。`;
+
+/**
+ * Builds the Architect's user message from the session's world setting and
+ * style guide, followed by the instruction to return the story bible as JSON.
+ *
+ * @param session - Only `worldSetting` and `styleGuide` are read.
+ * @returns The newline-joined message text.
+ */
+export function buildArchitectUserMessage(session: Session): string {
+  const { worldSetting, styleGuide } = session;
+  return [
+    `世界观：${worldSetting}`,
+    `画风：${styleGuide}`,
+    "\n请据此产出这部交互剧的故事档案（story bible），严格以 JSON 格式返回。",
+  ].join("\n");
+}
+
+// ──────────────────────────────────────────────────────────────────────
+//  1. Writer (编剧) — drives the narrative.
+//
+//  Emits a full Scene: beats[] graph + entryBeatId + sceneKey hint +
+//  activeCharacters per beat. Does NOT design characters (that's the
+//  CharacterDesigner's job) — only names them in `activeCharacters`.
+//  The CharacterDesigner is invoked separately for any name not yet in
+//  session.characters.
+// ──────────────────────────────────────────────────────────────────────
+
+export const WRITER_SYSTEM = `你是一部交互视觉小说的「编剧」。每次基于【故事档案 / 主线记忆】、世界观、画风、玩家历史、已登记角色，写出**一个完整场景的剧本**：场景背景概要 + 一组对话节拍 beats，并在最后更新主线记忆。你只负责**剧情和台词**——不设计角色形象、不写出图提示词、不做镜头调度，这些由其他 agent 完成。
+
+═══════════════════════════════════════════════════════════════════
+爆款心法（番茄网文 / 红果短剧 / galgame 的叙事手感）—— 必须贯彻
+═══════════════════════════════════════════════════════════════════
+- **每个场景都要有钩子**：开头 1–2 个 beat 内就抛出新信息、悬念、冲突或情绪冲击，绝不平铺直叙地交代背景；结尾 beat 留一个让玩家"想知道接下来"的扣子。
+- **兑现爽点 / 情绪回报**：按题材给观众想要的情绪（甜宠的心动、暗恋的暧昧拉扯、逆袭的扬眉吐气、悬疑的真相一角）。让玩家这一场"有所得"。
+- **反转与反差**：适时打破预期——以为是 A 结果是 B、角色露出与第一印象相反的一面；但反转要可信、要扣主线。
+- **快节奏、入戏快**：进场即冲突，少铺陈，删掉一切"为完整而存在"却不推进情绪的对话。
+- **show, don't tell**：用动作、神态、潜台词、环境细节传递情绪，别直接旁白"她很难过"——让玩家自己读出来。
+- **人设鲜明有反差**：每个角色一个强标签 + 一个反差面，台词紧贴其腔调（傲娇嘴硬心软、外冷内热、看似柔弱实则强势）。
+- **选择要有分量**：choice 只出现在真正的岔路口，每个选项都要让玩家感到"通向不同的东西"（情绪指向不同 / 关系走向不同），别给等价的废选项。
+
+═══════════════════════════════════════════════════════════════════
+连贯性铁律（跨场景切换不能跳戏 —— 最重要）
+═══════════════════════════════════════════════════════════════════
+- 你会收到【故事档案 / 主线记忆】和上一场的结尾。**新场景必须从上一刻自然承接**——承接上一场的情绪、地点逻辑、人物状态与未收的悬念。
+- 若给了「转场种子 nextSceneSeed」，把它当作"下一场的命题"去兑现，而不是另起炉灶；开场要让玩家感到"这正是我上一个动作 / 选择导致的结果"。
+- 沿用主线记忆里的人物关系与情绪温度——别让刚告白的人下一场形同陌路，也别凭空遗忘已埋的伏笔。
+- 推进、但别重置：每一场都让主线问题往前走一点（关系变化 / 真相揭露一角 / 新悬念浮现）。
+
+一个场景包含：
+- sceneSummary：当前场景的中文概要（地点、时间、氛围、关键事件——给后续的分镜导演看）
+- sceneKey：当前场景的英文 slug（如 "classroom-dusk"、"rooftop-night"、"rainy-street"）——同一物理空间应沿用相同 slug
+- beats[]：玩家依次经历的对话节拍
+- entryBeatId：玩家进入场景时落在哪个 beat
+
+每个 beat 是玩家会看到的一段叙述 / 对话 / 选择。beat 之间通过 next 字段连接：
+- "continue"：玩家点击图片背景 / 按继续，自然推进到下一个 beat
+- "choice"：在此让玩家做选择，按所选 choice 的 effect 走向
+
+choice 的 effect 有两种：
+- "advance-beat"：玩家选了之后跳到**同场景内**的另一个 beat（不换背景图，速度极快）
+- "change-scene"：玩家选了之后切换到**新场景**（视角变了 / 走到新地方 / 时间跳了）
+
+设计原则：
+- 同场景内 beat 数自由发挥，按剧情节奏自然给出（通常 2–6 个，可以更多）
+- 多用 continue，少用 choice — 选择只应出现在「真正的岔路口」
+- advance-beat 适合处理对话分支（同一场景里换个话题、追问、撒娇）
+- change-scene 适合空间/时间跳跃（出门、转身看窗外、第二天清晨）
+- 一个场景至少要有一个 change-scene 出口（除非真到结局）
+- 每个 change-scene 必须带 nextSceneSeed —— 一句中文简述「下一场是哪里、谁在、要发生什么」
+- 同一场景的 beat id 互不重复
+- next.nextBeatId 引用的 beat 必须存在
+- choice 至少 2 个，至多 4 个，互不重复
+
+sceneKey 设计原则（重要 — 用于跨场景视觉一致性）：
+- 同一物理空间 + 同一时段 → 必须沿用**完全相同**的英文 slug
+- 时段或空间变化时换 slug（如 "classroom-dusk" → "classroom-night"，"classroom-dusk" → "corridor-dusk"）
+- slug 规范：lowercase-with-dashes，2–4 个英文单词
+- 已登记的历史场景 sceneKey 会在用户消息里列出，请优先**复用**这些已有 slug
+
+文本风格约束：
+- narration / line 用中文（**纯净可显示文本**，绝不要写 (叹气)(语速快) 这类标注 —— 那是给配音的，会被玩家看见）
+- sceneSummary / lineDelivery / activeCharacters[].pose 内的文字也用中文
+- sceneKey 用英文 slug
+- 单个 beat 的 narration 与 line 加起来 ≤80 字
+- 单个 choice label ≤15 字
+
+配音相关字段：
+- 每个有 line 的 beat **必须**给出 lineDelivery —— 自由中文的「配音导演指令」，描述该句台词怎么念（情绪 / 语气 / 语速 / 气息 / 停顿 / 重音 / 音色起伏）。例："鼓起勇气又害羞，声音发颤、偏小，句尾带一丝气声，语速偏慢"。平淡场合写"平静自然、语速适中"即可，但要贴当下情境。
+
+角色与台词的硬性规则：
+- 任何 beat 的 speaker 字段一旦填了名字，**该名字必须**：① 是 "你"（玩家本人，见下方"玩家视角硬规则"），或 ② 在「已登记角色」列表中存在，或 ③ 出现在本场景的某个 beat 的 activeCharacters 里。
+- speaker 名字必须与登记名**完全一致**，不要加「（回忆）」「学姐」之类后缀或别名。
+- 每个 beat 的 activeCharacters 列出**此时此刻画面里出现的 NPC 角色**及其当下姿态/神情（中文）。即使没人说话，画面里有谁在也要列出。
+
+玩家视角硬规则（重要 — 违反这条会破坏整个 galgame）：
+
+【画面规则 — 严格禁止】
+- 玩家是第二人称 POV，**永远不出现在任何 Scene 画面里**
+- activeCharacters[].name 数组**绝不允许**包含任何下列名字（任何大小写、中英文变体）：
+  「玩家」「你」「我」「主角」「protagonist」「player」「Player」「MC」「I」「me」
+- 玩家不会被设计立绘、不会被设计音色
+
+【对白规则 — galgame 标准做法（Pattern B）】
+- 玩家**可以正常说话**——当主角对 NPC 开口时：
+    speaker = "你"（**固定用这两个字，不要用其他变体**）
+    line = 实际说的话（如「学姐，下雨了」）
+    lineDelivery 可以留空（玩家对白不会被 TTS 合成）
+- speaker 字段允许的取值**只有两种**：① NPC 真名（必须在 activeCharacters 里）② "你"
+- 其它 POV 变体（玩家 / 我 / 主角 / protagonist / player / MC / I / me）**一律视为错误**
+
+【内心 vs 外显的区分】
+- 主角在心里想 / 在做某个动作 / 在观察 / 自己的体感 → 用 narration（speaker 留空）
+  例："你的心跳得很快，几乎听不见外面的雨声。"
+- 主角真的开口对 NPC 说出来 → 用 speaker="你" + line
+  例：speaker="你" line="学姐，这把伞你拿着。"
+- 同一个 beat 可以同时有 narration（心理活动 / 动作）和 speaker="你" + line（说出口的话）
+
+更新主线记忆（storyStatePatch）—— 写完这一场后必做：
+- synopsis：把这一场并入后的整体梗概，**压缩**到 3–5 句（别越写越长，旧细节该丢就丢）
+- relationships：每个核心角色此刻与「你」的关系 / 情绪温度，每条一句（如 "夏海：暗恋升温，刚向你说了一半的告白被打断"）
+- openThreads：仍未收的悬念 / 伏笔——已收束的可移除、新埋的加入（但至少保留一条正在推进的主线，别把列表清空）
+- nextHook：基于这一场的结尾，下一场应往哪走（给"下一次的你"一个明确命题，接住本场留下的扣子）
+这些字段是写给"未来的你"的连贯性记忆，请认真写。
+
+必须输出严格 JSON，结构如下：
+{
+  "sceneSummary": "中文场景概要：地点+时间+氛围+关键事件",
+  "sceneKey": "classroom-dusk",
+  "entryBeatId": "b1",
+  "beats": [
+    {
+      "id": "b1",
+      "narration": "可空（纯净文本）",
+      "speaker": "可空",
+      "line": "可空（纯净文本）",
+      "lineDelivery": "line 非空时必填：配音导演指令",
+      "activeCharacters": [
+        { "name": "夏海", "pose": "脸红害羞地绞着衣角，双眼躲闪" }
+      ],
+      "next": { "type": "continue", "nextBeatId": "b2" }
+    },
+    {
+      "id": "b2",
+      "speaker": "夏海",
+      "line": "学长，我有话想对你说。",
+      "lineDelivery": "鼓起勇气，但又有点害羞，语速偏慢，句尾微微上扬",
+      "activeCharacters": [
+        { "name": "夏海", "pose": "鼓起勇气直视对方，双手紧握" }
+      ],
+      "next": { "type": "continue", "nextBeatId": "b3" }
+    },
+    {
+      "id": "b3",
+      "narration": "你下意识攥紧了书包带，喉咙有点干。",
+      "speaker": "你",
+      "line": "……你说。",
+      "activeCharacters": [
+        { "name": "夏海", "pose": "鼓起勇气直视对方，双手紧握" }
+      ],
+      "next": {
+        "type": "choice",
+        "choices": [
+          {
+            "id": "c1",
+            "label": "继续追问",
+            "effect": { "kind": "advance-beat", "targetBeatId": "b4" }
+          },
+          {
+            "id": "c2",
+            "label": "起身离开教室",
+            "effect": { "kind": "change-scene", "nextSceneSeed": "雨后湿漉漉的走廊，她追了出来" }
+          }
+        ]
+      }
+    }
+  ],
+  "storyStatePatch": {
+    "synopsis": "把这一场并入后的滚动梗概，压缩到 3–5 句",
+    "relationships": ["夏海：暗恋升温，刚向你说了一半的告白被打断"],
+    "openThreads": ["夏海没说完的那句话到底是什么", "她书包里掉出的那张旧照片"],
+    "nextHook": "下一场：放学后的天台，她把你单独叫上去，要把话说完"
+  }
+}
+
+不要输出 JSON 以外的任何文本。`;
+
+/**
+ * Builds the Writer's user message for the next scene.
+ *
+ * Layout, in order: rendered story bible (when non-empty), world setting and
+ * style guide, registered character names, previously used sceneKeys, then
+ * either the opening-scene instruction (empty history) or the full scene
+ * history followed by the last visited beat and a continuation instruction
+ * derived from how the previous scene was exited.
+ *
+ * @param session - Session whose story state, characters and history are read.
+ * @returns The newline-joined message text.
+ */
+export function buildWriterUserMessage(session: Session): string {
+  // Displayable fragments of one beat: narration first, then the spoken line.
+  const beatFragments = (
+    narration: string | null | undefined,
+    speaker: string | null | undefined,
+    line: string | null | undefined,
+  ): string[] => {
+    const out: string[] = [];
+    if (narration) out.push(`旁白：${narration}`);
+    if (line) out.push(`${speaker ?? "?"}：${line}`);
+    return out;
+  };
+
+  const message: string[] = [];
+
+  const storyBible = renderStoryState(session.storyState);
+  if (storyBible) message.push(storyBible, "");
+
+  message.push(`世界观：${session.worldSetting}`, `画风：${session.styleGuide}`);
+
+  if (session.characters.length > 0) {
+    message.push(
+      "\n已登记角色（speaker 必须用这些名字之一，或本场景新引入）：",
+      ...session.characters.map((c) => `- ${c.name}`),
+    );
+  }
+
+  const knownSceneKeys = collectPriorSceneKeys(session);
+  if (knownSceneKeys.length > 0) {
+    message.push(
+      "\n已使用的 sceneKey（同一物理空间请沿用，不要新造）：",
+      ...knownSceneKeys.map((key) => `- ${key}`),
+    );
+  }
+
+  // No history yet: this is the cold open, nothing to recap.
+  if (session.history.length === 0) {
+    message.push(
+      "\n这是故事的开场。请按【故事档案】里的 nextHook 把第一幕的冷开场写出来——开场即抓人，别花笔墨铺垫世界观。写完后更新 storyStatePatch。严格以 JSON 格式返回。",
+    );
+    return message.join("\n");
+  }
+
+  message.push("\n场景历史（按时间顺序）：");
+  for (const [index, entry] of session.history.entries()) {
+    const { scene, exit } = entry;
+    const block: string[] = [`【场景 ${index + 1}】`];
+    if (scene.sceneKey) block.push(`  sceneKey: ${scene.sceneKey}`);
+
+    // Recap only the beats the player actually saw; fall back to the entry
+    // beat when no visit was recorded.
+    const beatsById = new Map(scene.beats.map((b) => [b.id, b]));
+    const visitedIds =
+      entry.visitedBeatIds.length > 0 ? entry.visitedBeatIds : [scene.entryBeatId];
+    for (const beatId of visitedIds) {
+      const beat = beatsById.get(beatId);
+      if (!beat) continue;
+      const fragments = beatFragments(beat.narration, beat.speaker, beat.line);
+      if (fragments.length > 0) block.push("  " + fragments.join(" / "));
+    }
+
+    if (exit) {
+      if (exit.kind === "choice") {
+        block.push(`  玩家最终选择：${exit.label}（去往：${exit.nextSceneSeed}）`);
+      } else {
+        block.push(`  玩家自由动作：${exit.action}`);
+      }
+    }
+    message.push(block.join("\n"));
+  }
+
+  const latest = session.history.at(-1);
+
+  // Quote the exact beat the player stopped on so the next scene continues
+  // from that emotional moment instead of resetting to neutral.
+  if (latest) {
+    const stoppedAtId = latest.visitedBeatIds.at(-1) ?? latest.scene.entryBeatId;
+    const stoppedAt = latest.scene.beats.find((b) => b.id === stoppedAtId);
+    const fragments = stoppedAt
+      ? beatFragments(stoppedAt.narration, stoppedAt.speaker, stoppedAt.line)
+      : [];
+    if (fragments.length > 0) {
+      message.push(
+        `\n上一刻（玩家停留的最后一个画面，新场景要从这里的情绪无缝承接）：\n  ${fragments.join(" / ")}`,
+      );
+    }
+  }
+
+  const latestExit = latest?.exit;
+  if (!latestExit) {
+    message.push("\n无缝续写下一个场景，延续上一刻的情绪。");
+  } else if (latestExit.kind === "choice") {
+    message.push(
+      `\n承接「玩家在上一场选择了：${latestExit.label}」无缝续写下一个场景（转场命题：${latestExit.nextSceneSeed}）。开场要让玩家感到这正是上一步的结果，并延续此刻的情绪。`,
+    );
+  } else {
+    message.push(
+      `\n承接「玩家自由动作：${latestExit.action}」无缝续写下一个场景，延续此刻的情绪与处境。`,
+    );
+  }
+
+  message.push("写完后别忘了更新 storyStatePatch。严格以 JSON 格式返回。");
+  return message.join("\n");
+}
+
+/**
+ * Lists the distinct, non-empty sceneKeys found in the session history, in
+ * order of first appearance.
+ */
+function collectPriorSceneKeys(session: Session): string[] {
+  const keys = session.history
+    .map((entry) => entry.scene.sceneKey)
+    .filter((key): key is string => Boolean(key));
+  // A Set iterates in insertion order, so first-seen order is preserved.
+  return [...new Set(keys)];
+}
+
+// ──────────────────────────────────────────────────────────────────────
+//  2. CharacterDesigner (角色设定师) — designs one new character.
+//
+//  Receives a character NAME (taken from the Writer's activeCharacters)
+//  and produces BOTH the English visual card AND the Chinese voice card
+//  in a single LLM call. Bundling these two is intentional: a single agent
+//  that "knows who this character is" produces internally-consistent
+//  appearance + vocal personality, whereas split agents tend to diverge
+//  (e.g., gentle-looking character with energetic voice).
+// ──────────────────────────────────────────────────────────────────────
+
+export const CHARACTER_DESIGNER_SYSTEM = `你是视觉小说的「角色设定师」。给你一个**新登场角色的名字**，你要为这个角色同时设计两份卡片：
+1. **视觉设定卡（英文）**——给生图模型 FLUX 用，遵循 prompt engineering 风格
+2. **音色设定卡（中文）**——给小米 MiMo 配音设计用
+
+两份卡片要描绘**同一个人**——外貌温柔的人不该被配上张扬聒噪的嗓音；冷酷干练的人不该用甜软糯的童声。先在心里想清楚这个人的整体气质，再分两面落笔。
+
+视觉设定卡 visualDescription 规则：
+- **必须完全用英文**
+- 风格：用形容词 + 短语，**英文逗号分隔**，符合 FLUX/Stable Diffusion prompt 习惯
+- 包含：年龄段、发型发色、眼睛 / 神情基调、面部特征、标志性服饰（款式 + 配色 + 花纹）、整体气质
+- **不要写瞬时姿势或表情**（这些由编剧/分镜每帧实时控制）
+- **必须融入全局画风** styleGuide 的美术指向（比如 styleGuide 是「赛博朋克」时，服饰要赛博朋克化）
+- 长度：80–150 个英文词为宜
+- 不要包含背景环境（这不是场景图，是角色立绘卡）
+
+音色设定卡 voiceDescription 规则：
+- **必须以明确性别开头**："女性，…" / "男性，…"
+- 随后描述：年龄段（如「约17岁少女」「30 出头男性」）、音色质感、性格情绪基调、语速节奏、人设腔调、口音方言
+- 用中文，整段连续描述，不分段
+- 长度：50–80 个中文字为宜
+- 例："女性，约17岁少女，音色清亮带点稚嫩甜美，性格开朗外向但容易害羞，语速偏快，标准普通话"
+
+必须输出严格 JSON：
+{
+  "visualDescription": "English visual card, comma-separated tags...",
+  "voiceDescription": "中文音色卡，以性别开头..."
+}
+
+不要输出 JSON 以外的任何文本。`;
+
+/**
+ * Builds the CharacterDesigner's user message for one new character.
+ *
+ * Lists the character name, world setting and style guide, then every
+ * registered character that already has a `visualDescription` (so the new
+ * look can be told apart from them), and ends with the JSON instruction.
+ *
+ * @param charName - Name of the character to design.
+ * @param session - Source of `worldSetting`, `styleGuide` and `characters`.
+ * @returns The newline-joined message text.
+ */
+export function buildCharacterDesignerUserMessage(
+  charName: string,
+  session: Session,
+): string {
+  const designedCards = session.characters
+    .filter((c) => c.visualDescription)
+    .map((c) => `- ${c.name}: ${c.visualDescription}`);
+
+  const roster =
+    designedCards.length > 0
+      ? ["\n已设定角色（外貌应与他们有区分）：", ...designedCards]
+      : [];
+
+  return [
+    `角色名：${charName}`,
+    `世界观：${session.worldSetting}`,
+    `全局美术画风：${session.styleGuide}`,
+    ...roster,
+    "\n请为该角色同时设计 visualDescription（英文）和 voiceDescription（中文），严格以 JSON 格式返回。",
+  ].join("\n");
+}
+
+// ──────────────────────────────────────────────────────────────────────
+//  3. Cinematographer (分镜导演) — composes the visual frame.
+//
+//  Reads the Writer's sceneSummary + active characters and produces the
+//  English compositional prompt fed to FLUX. Does NOT describe the
+//  characters themselves (those archetypes are appended at the Painter
+//  stage from session.characters.visualDescription). Only describes the
+//  ENVIRONMENT, lighting, camera framing, and how the characters are
+//  positioned within the frame.
+// ──────────────────────────────────────────────────────────────────────
+
+export const CINEMATOGRAPHER_SYSTEM = `你是视觉小说的「分镜导演」。给你编剧的当前场景概要、活跃角色名单和他们在场景里的姿态描述，以及**入口 beat 的 speaker 信息**（用来决定镜头语言）。你的任务是**只用英文**写一段**纯环境+构图**的描述（integratedPrompt），交给画师作为出图主提示词。
+
+你**不要**写角色的外貌细节——发色、服饰、脸型这些由其他 agent 提供，画师会把"角色档案卡"附加到你的 integratedPrompt 后面。你只关心：
+- **环境**：地点、时间、天气、光线、空间细节（什么家具/植物/物件）
+- **构图 / 镜头**：景别（wide shot / medium shot / close-up / over-the-shoulder）、机位、视角
+- **人物在画面中的位置和姿态**（不写脸 / 不写穿什么——只写"哪个角色站在哪儿、在做什么"）
+- **氛围**：情绪基调、色调、影调（warm dusk / cold neon / soft morning light）
+
+═══════════════════════════════════════════════════════════════════
+玩家视角硬规则（与画面相关，必须严格遵守）
+═══════════════════════════════════════════════════════════════════
+- 玩家本人**永远不出现在画面里**——不画 player 的身体、手、肩膀、背影、剪影、脚、头发
+- integratedPrompt 中**绝对禁止**出现下列英文（或中文等价）：
+    "first-person view" · "POV of the protagonist" · "player's hand / arm / shoulder / back"
+    "protagonist visible" · "from the player's perspective" · "MC" · "player's silhouette"
+- 镜头是一个"隐形的观察者位置"——可以位于玩家的视角附近（NPC 像在看玩家），但**绝不画出玩家本身**
+
+═══════════════════════════════════════════════════════════════════
+动态镜头策略（根据入口 beat 的 speaker 字段选择镜头）
+═══════════════════════════════════════════════════════════════════
+你会收到 entryBeatSpeaker 字段。按以下规则选镜头：
+
+【entryBeatSpeaker = 某个 NPC 名字】 → NPC 正在对玩家说话
+- 优先 **close-up 或 medium close-up**，NPC 看向画面外（= 看玩家）
+- 关键英文：close-up / medium close-up, looking toward camera, eyes meeting the viewer,
+  direct gaze, lips parted mid-speech
+- 制造"她正在对你说话"的代入感（galgame 经典直视镜头）
+
+【entryBeatSpeaker = "你"】 → 玩家正在对 NPC 说话
+- 优先 **medium shot**，NPC 居中，做"在听玩家说话"的姿态
+- 关键英文：medium shot, attentively listening, facing the camera,
+  head slightly tilted, expression of attention
+- ❌ 不要写 over-the-shoulder（因为这会暗示画出玩家肩膀，违反 POV 规则）
+
+【entryBeatSpeaker 为空】 → 纯环境 / 旁白 beat
+- 优先 **wide establishing shot**，展现环境氛围
+- 关键英文：wide establishing shot, atmospheric mood, environmental detail
+- 如果有 NPC 在场，他们可以处于远处 / 中景 / 自然状态（不必看镜头）
+
+【entryBeatActive 有多个角色】 → 群像
+- 使用 **medium group shot 或 medium wide shot**，多人在一个框内
+- 关键英文：medium group shot, two-shot / three-shot, characters arranged in the frame
+
+═══════════════════════════════════════════════════════════════════
+输出 JSON 结构
+═══════════════════════════════════════════════════════════════════
+{
+  "shotType": "close-up / medium shot / wide establishing / medium group shot / ...",
+  "integratedPrompt": "English. Environment + composition + character positioning + camera language. No dialogue boxes, no UI. 80-150 words."
+}
+
+写作要求：
+- integratedPrompt **必须英文**，遵循 FLUX prompt engineering 习惯（形容词 + 短语，英文逗号分隔，必要时短句）
+- 提到具体角色时**只用其名字 + 动作**，例如 "Natsumi standing by the window, head slightly bowed"——绝不要写她长什么样
+- 不描述任何 UI、字幕、对话框、边框
+- 不描述图像之外的事情（不要写"this scene depicts..."这种 meta 句）
+- 长度 80–150 英文词
+
+不要输出 JSON 以外的任何文本。`;
+
+/**
+ * Builds the Cinematographer's user message for a scene's entry frame.
+ *
+ * @param sceneSummary - Scene summary text, quoted as coming from the Writer.
+ * @param styleGuide - Global art-style text.
+ * @param entryBeatActive - Characters in the opening frame with optional poses.
+ * @param entryBeatSpeaker - `"你"` when the player speaks, any other non-empty
+ *   value when that named character speaks, empty/undefined for no dialog.
+ *   Selects which camera hint is appended.
+ * @param priorSceneKey - sceneKey of the previous scene, if any.
+ * @param currentSceneKey - sceneKey of this scene, if any. A continuity note
+ *   is appended only when it is non-empty and equals `priorSceneKey`.
+ * @returns The newline-joined message text.
+ */
+export function buildCinematographerUserMessage(
+  sceneSummary: string,
+  styleGuide: string,
+  entryBeatActive: BeatActiveCharacter[],
+  entryBeatSpeaker: string | undefined,
+  priorSceneKey: string | undefined,
+  currentSceneKey: string | undefined,
+): string {
+  const castLines =
+    entryBeatActive.length > 0
+      ? [
+          "\n开场画面里的角色及其姿态：",
+          ...entryBeatActive.map(
+            (c) => `- ${c.name}：${c.pose ?? "（无具体姿态描述）"}`,
+          ),
+        ]
+      : ["\n开场画面里没有角色（纯环境）。"];
+
+  // Camera hint mirrors the dynamic camera policy in CINEMATOGRAPHER_SYSTEM.
+  let cameraHint: string;
+  if (!entryBeatSpeaker) {
+    cameraHint =
+      "\n开场 beat 没有 speaker（纯旁白/环境）——按动态镜头策略：wide establishing shot 展现环境氛围。";
+  } else if (entryBeatSpeaker === "你") {
+    cameraHint =
+      '\n开场 beat 是**玩家说话**（speaker = "你"）——按动态镜头策略：medium shot，NPC 居中、做听玩家说话的姿态、看向画面外。**绝不要画出玩家**。';
+  } else {
+    cameraHint = `\n开场 beat 是 **${entryBeatSpeaker} 在对玩家说话**（speaker = "${entryBeatSpeaker}"）——按动态镜头策略：close-up 或 medium close-up，${entryBeatSpeaker} 看向画面外（看玩家），眼神交流。`;
+  }
+
+  const message: string[] = [
+    `全局美术画风：${styleGuide}`,
+    `\n当前场景（来自编剧）：${sceneSummary}`,
+    ...castLines,
+    cameraHint,
+  ];
+
+  // Same non-empty key on both sides: ask for continuity, not a redesign.
+  if (currentSceneKey && priorSceneKey === currentSceneKey) {
+    message.push(
+      `\n注意：上一场和本场 sceneKey 都是 "${currentSceneKey}"——画师会把上一张场景图作为 referenceImages 之一锚定同一空间。你的 integratedPrompt 应该**强调连续性**，描述时段/情绪/构图的细微变化，而不是完全重新设定空间。`,
+    );
+  }
+
+  message.push("\n请输出 shotType + integratedPrompt，严格以 JSON 格式返回。");
+  return message.join("\n");
+}
+
+// ──────────────────────────────────────────────────────────────────────
+//  4. Painter (画师) — final image prompt assembly.
+//
+//  Not an LLM agent — a pure prompt-building function that combines the
+//  Cinematographer's integratedPrompt with character archetype blocks
+//  (visual cards) and the standard FLUX constraints.
+// ──────────────────────────────────────────────────────────────────────
+
+/**
+ * Assembles the final scene-image prompt.
+ *
+ * Sections, separated by blank lines: opening instruction, art style, the
+ * cinematographer's composition text, an optional CHARACTER ARCHETYPES
+ * section (one card per character that has a `visualDescription`), the
+ * strict no-UI rules, and the player-POV rules.
+ *
+ * @param integratedPrompt - Composition text placed under SCENE COMPOSITION.
+ * @param styleGuide - Art-style text placed after "ART STYLE:".
+ * @param characters - Candidates for archetype cards; entries without a
+ *   `visualDescription` are ignored.
+ * @returns The full prompt string.
+ */
+export function buildPainterPrompt(
+  integratedPrompt: string,
+  styleGuide: string,
+  characters: { name: string; visualDescription?: string }[],
+): string {
+  const cards: string[] = [];
+  for (const { name, visualDescription } of characters) {
+    if (visualDescription) cards.push(`[CHARACTER: ${name}]\n${visualDescription}`);
+  }
+
+  const strictRules = [
+    "STRICT RULES — NEVER violate these:",
+    "- DO NOT draw any dialogue boxes, speech bubbles, text panels, or any rectangular overlay.",
+    "- DO NOT draw any buttons, choice options, menu items, or interactive UI elements.",
+    "- DO NOT render any Chinese or English text anywhere in the image.",
+    "- DO NOT add any HUD, interface chrome, or game UI elements.",
+    "- The image is a PURE BACKGROUND SCENE ONLY. All UI will be added as HTML on top.",
+    "- 16:9 LANDSCAPE orientation — wider than tall. No portrait or square output.",
+    "- Leave the bottom 35% of the frame relatively uncluttered (darker or softer) so overlaid UI panels remain readable.",
+    "- Characters or key scene elements should be positioned in the upper 65% of the frame.",
+    "- Maintain character identity exactly as specified in CHARACTER ARCHETYPES — same face, same hairstyle, same outfit across every scene.",
+  ].join("\n");
+
+  const povRules = [
+    "PLAYER POV RULES — the player / protagonist is the unseen viewer:",
+    "- The player / protagonist is NEVER visible in the frame — no body parts, no hands, no shoulders, no back of head, no silhouette, no feet, no hair.",
+    "- DO NOT use first-person POV that implies the player's body in frame.",
+    "- When an NPC is speaking to the player, they SHOULD look toward the camera (toward the player's implied position) — this creates eye contact without showing the player.",
+    "- The camera position represents the player's gaze; only NPCs, scenery, and objects are rendered.",
+  ].join("\n");
+
+  const sections: string[] = [
+    "Generate a cinematic landscape background illustration, 16:9 widescreen (1792x1024).",
+    `ART STYLE: ${styleGuide}`,
+    `SCENE COMPOSITION (from cinematographer — environment + camera framing + character positioning):\n${integratedPrompt}`,
+  ];
+  // The archetype section exists only when at least one card was produced.
+  if (cards.length > 0) {
+    sections.push(
+      `CHARACTER ARCHETYPES (anchor identity, outfit, and style across scenes — keep each character visually identical to their archetype):\n${cards.join("\n\n")}`,
+    );
+  }
+  sections.push(strictRules, povRules);
+
+  return sections.join("\n\n");
+}
+
+// Character portrait prompt — for the per-character base image generated
+// once when the CharacterDesigner introduces a new character. The portrait
+// is used both as a client-side asset (立绘登场) and as a referenceImages
+// entry when rendering later scenes for visual consistency.
+/**
+ * Builds the image prompt for a single character's reference portrait.
+ *
+ * @param charName - Name shown in the "CHARACTER (…):" heading.
+ * @param visualDescription - Visual card text placed under that heading.
+ * @param styleGuide - Art-style text placed after "ART STYLE:".
+ * @returns Four blank-line-separated sections: framing instruction, art
+ *   style, character card, strict rules.
+ */
+export function buildCharacterPortraitPrompt(
+  charName: string,
+  visualDescription: string,
+  styleGuide: string,
+): string {
+  const framing =
+    "Character concept portrait sheet, single character, full-body or upper-body composition, neutral standing pose, looking toward camera, neutral expression, plain neutral background (no environment, no scenery).";
+
+  const strictRules = [
+    "STRICT RULES:",
+    "- ONE character only — no other people, no crowd, no background characters.",
+    "- Plain neutral background (off-white or soft gradient). NO environment, NO furniture, NO props beyond what's worn.",
+    "- Neutral, calm pose and expression — this is a reference sheet, not a dramatic shot.",
+    "- NO text, NO UI, NO watermark, NO border.",
+    "- The character should be clearly visible and centered, the pose natural and relaxed.",
+    "- 16:9 landscape orientation.",
+  ].join("\n");
+
+  return [
+    framing,
+    `ART STYLE: ${styleGuide}`,
+    `CHARACTER (${charName}):\n${visualDescription}`,
+    strictRules,
+  ].join("\n\n");
+}
+
+// ──────────────────────────────────────────────────────────────────────
+//  Insert-Beat — given a freeform vision action that is judged to stay
+//  *within* the current scene, generate one transient beat.
+//  Single-agent path; no character design / no rendering involved.
+// ──────────────────────────────────────────────────────────────────────
+
+export const INSERT_BEAT_SYSTEM = `你是视觉小说编剧。玩家在当前场景内做了一个**不会换场景的自由动作**（比如看一眼桌上的相框、想了想刚才那句话）。请基于此动作，写出一个**单独的、过渡性的 beat**：可以是旁白、角色台词、或两者结合。
+
+文本风格约束：
+- narration / line 用中文，**纯净可显示文本**，不要写 (叹气) 这类配音标注
+- narration 与 line 加起来 ≤80 字
+- 不要打破当前场景的物理状态（玩家仍在原地、对面仍是同一个角色）
+- 不要生成选项或下一步指引 —— 玩家点击会自然回到原 beat
+- 这个 beat 也要"有所得"——给玩家一个新细节、一丝潜台词或情绪波动（show, don't tell），别写成无意义的空台词
+
+speaker 字段允许的取值**只有两种**（与主路径 Writer 一致 — Pattern B galgame 标准）：
+1. **已登记角色**里的 NPC 真名（**绝不允许引入新角色**）
+2. **"你"** — 玩家本人在自言自语 / 说一句过渡性的话（对白框显示，但不调 TTS）
+
+其它任何 POV 变体（玩家 / 我 / 主角 / protagonist / player / MC / I / me）**一律错误**，请用 "你" 代替。
+
+- 如果有 line 且 speaker = NPC，**必须**给出 lineDelivery（配音导演指令）
+- 如果有 line 且 speaker = "你"，lineDelivery 可以留空（玩家对白不调 TTS）
+
+必须输出严格 JSON：
+{
+  "narration": "...",
+  "speaker": "...",
+  "line": "...",
+  "lineDelivery": "..."
+}
+
+narration/speaker/line/lineDelivery 都可为空字符串。不要输出 JSON 以外的任何文本。`;
+
+/**
+ * Builds the user message for the insert-beat agent.
+ *
+ * Contains the world setting, the registered character names, the latest
+ * scene's `scenePrompt` with a recap of the last visited beat (falling back
+ * to the entry beat), the player's free-form action, and the JSON instruction.
+ *
+ * @param session - Source of world setting, characters and history; only the
+ *   last history entry is consulted.
+ * @param freeformAction - The player's action text, inserted verbatim.
+ * @returns The newline-joined message text.
+ */
+export function buildInsertBeatUserMessage(
+  session: Session,
+  freeformAction: string,
+): string {
+  const message: string[] = [`世界观：${session.worldSetting}`];
+
+  if (session.characters.length > 0) {
+    message.push(
+      "\n已登记角色（speaker 只能用这些名字）：",
+      ...session.characters.map((c) => `- ${c.name}`),
+    );
+  }
+
+  const latest = session.history.at(-1);
+  if (latest) {
+    const { scene, visitedBeatIds } = latest;
+    message.push(`\n当前场景：${scene.scenePrompt}`);
+
+    // Recap the beat the player is currently on, if it can be found.
+    const currentBeatId = visitedBeatIds.at(-1) ?? scene.entryBeatId;
+    const currentBeat = scene.beats.find((b) => b.id === currentBeatId);
+    if (currentBeat) {
+      const recap = [
+        currentBeat.narration ? `旁白：${currentBeat.narration}` : "",
+        currentBeat.line ? `${currentBeat.speaker ?? "?"}：${currentBeat.line}` : "",
+      ].filter((fragment) => fragment !== "");
+      if (recap.length > 0) message.push(`刚才发生：${recap.join(" / ")}`);
+    }
+  }
+
+  message.push(
+    `\n玩家此刻的自由动作：${freeformAction}`,
+    "\n请生成一个过渡性 beat，严格以 JSON 格式返回。",
+  );
+  return message.join("\n");
+}
+
+// ──────────────────────────────────────────────────────────────────────
+//  Vision — interprets a background click and classifies the action.
+//  Unchanged from staging (UI choices live in HTML, vision only judges
+//  background clicks).
+// ──────────────────────────────────────────────────────────────────────
+
+export const VISION_SYSTEM_PROMPT = `你是视觉理解助手。玩家在视觉小说的背景图上点击了红色圆点位置（HTML 上的选项按钮不会走到你这里）。你的任务是：
+1. 看清红点指向画面里的什么（物件、角色、空间、远处的方向）
+2. 推断玩家想干什么
+3. 判断这个动作是「场内探索」（不该换图）还是「场景切换」（要换图）
+
+判断准则：
+- "insert-beat"（场内探索）：观察画面里某个细节、自言自语、和当前角色继续互动、看一眼某个物件
+- "change-scene"（场景切换）：走向画面深处的门 / 走廊、转头看向新方向（视角变了）、点了远处的另一个空间、暗示时间跳跃的物件（如时钟）
+
+必须输出严格 JSON：
+{
+  "freeformAction": "玩家想做什么的一句中文描述，例如「想拿起桌上的钥匙」",
+  "classify": "insert-beat" 或 "change-scene",
+  "reasoning": "一句话说明判断理由"
+}
+
+不要输出 JSON 以外的任何文本。`;
+
+/**
+ * Builds the user prompt for the vision click interpreter.
+ *
+ * @param scene - Current scene, or `null` when none is available.
+ * @returns A short generic instruction when `scene` is `null`; otherwise the
+ *   scene's `scenePrompt` followed by the red-dot classification instruction.
+ */
+export function buildVisionUserPrompt(scene: Scene | null): string {
+  if (scene) {
+    return [
+      `当前场景描述：${scene.scenePrompt}`,
+      "",
+      "红点位置即为玩家点击位置。请判断玩家意图与分类，以 JSON 格式返回。",
+    ].join("\n");
+  }
+  return "请判断玩家意图，并以 JSON 格式返回。";
+}
+
+/**
+ * The `name` + `visualDescription` slice of a `Character`.
+ * NOTE(review): presumably the element shape handed to `buildPainterPrompt`,
+ * which declares an equivalent inline type — confirm against callers.
+ */
+export type PainterCharacterInput = Pick<Character, "name" | "visualDescription">;
@@ -0,0 +1,39 @@
+import { interpretClick } from "@infiplot/ai-client";
+import type {
+  ClickIntent,
+  ProviderConfig,
+  Scene,
+  VisionClassify,
+} from "@infiplot/types";
+import { parseJsonLoose } from "./jsonParser";
+import { VISION_SYSTEM_PROMPT, buildVisionUserPrompt } from "./prompts";
+
+/** Result of `interpret`: the inferred click intent plus its routing class. */
+export type VisionInterpretation = {
+  // What the player appears to want: free-form action text and the model's reasoning.
+  intent: ClickIntent;
+  // Whether the action stays in the scene ("insert-beat") or leaves it ("change-scene").
+  classify: VisionClassify;
+};
+
+/**
+ * Asks the vision model what a background click means and normalizes its
+ * JSON answer.
+ *
+ * The system prompt is prepended to the user prompt because `interpretClick`
+ * is given a single prompt string.
+ *
+ * The model's answer is untrusted: any field that is missing or is not a
+ * string falls back to a default instead of throwing, so a malformed answer
+ * degrades to an "insert-beat" with a generic action.
+ *
+ * @param config - Provider configuration forwarded to `interpretClick`.
+ * @param annotatedImageBase64 - Screenshot with the click marker, base64.
+ * @param scene - Current scene for context, or `null`.
+ * @returns The trimmed intent and a classification that is `"change-scene"`
+ *   only when the model said exactly that, otherwise `"insert-beat"`.
+ */
+export async function interpret(
+  config: ProviderConfig,
+  annotatedImageBase64: string,
+  scene: Scene | null,
+): Promise<VisionInterpretation> {
+  const userPrompt = `${VISION_SYSTEM_PROMPT}\n\n${buildVisionUserPrompt(scene)}`;
+  const raw = await interpretClick(config, annotatedImageBase64, userPrompt);
+  const parsed = parseJsonLoose<{
+    freeformAction?: unknown;
+    classify?: unknown;
+    reasoning?: unknown;
+  }>(raw);
+
+  // NOTE(review): parseJsonLoose's failure behavior is not visible here; guard
+  // against a null / non-object payload (e.g. the model answered `null`).
+  const fields = typeof parsed === "object" && parsed !== null ? parsed : {};
+
+  // Models occasionally emit numbers/objects where text was requested; only
+  // real strings are trimmed, everything else counts as absent.
+  const asText = (value: unknown): string =>
+    typeof value === "string" ? value.trim() : "";
+
+  const classify: VisionClassify =
+    fields.classify === "change-scene" ? "change-scene" : "insert-beat";
+
+  return {
+    intent: {
+      freeformAction: asText(fields.freeformAction) || "玩家点了画面，但意图不明",
+      reasoning: asText(fields.reasoning),
+    },
+    classify,
+  };
+}
@@ -0,0 +1,68 @@
+import { synthesize } from "@infiplot/tts-client";
+import type { BeatAudio, CharacterVoice, TtsConfig } from "@infiplot/types";
+
+// Per-beat synth budget, in milliseconds (passed as the `ms` argument of
+// withTimeout). MiMo's median synth is 3–7s; the tail can spike to 30–70s
+// under concurrent load. Capping here means a single bad beat degrades to
+// silent after at most ~15s instead of blocking the whole UI flow.
+const SYNTH_TIMEOUT_MS = 15000;
+
+// Resolves or rejects with `p`, unless `ms` milliseconds pass first — in which
+// case `ctrl` is aborted and the returned promise rejects with
+// `"<label> timed out after <ms>ms"`.
+//
+// Why the timer is always cleared: without it, every successful call would
+// leave a pending reject closure in Node's timer heap until the deadline.
+// Why the controller is aborted on timeout: it cancels the underlying HTTP
+// request — otherwise MiMo's 30-70s tail keeps the socket open and the quota
+// burning long after the caller has given up on the audio.
+async function withTimeout<T>(
+  p: Promise<T>,
+  ms: number,
+  label: string,
+  ctrl: AbortController,
+): Promise<T> {
+  let pendingTimer: ReturnType<typeof setTimeout> | undefined;
+
+  // Never resolves; only rejects once the deadline fires.
+  const deadline = new Promise<never>((_resolve, reject) => {
+    pendingTimer = setTimeout(() => {
+      ctrl.abort();
+      reject(new Error(`${label} timed out after ${ms}ms`));
+    }, ms);
+  });
+
+  try {
+    return await Promise.race([p, deadline]);
+  } finally {
+    // Runs on success, on failure of `p`, and on timeout alike.
+    if (pendingTimer) clearTimeout(pendingTimer);
+  }
+}
+
+// Produces the audio for a single beat with an already-provisioned voice.
+//
+// The caller is expected to have resolved the speaker's voice beforehand
+// (from session.characters in the client); taking it as an argument keeps the
+// /api/beat-audio payload small and keeps this function independent of
+// session state.
+//
+// The synth call is bounded by SYNTH_TIMEOUT_MS and wired to an
+// AbortController so a timeout also cancels the request. Any error or timeout
+// is logged and turned into `null`, which the caller treats as "play silent."
+//
+// (Voice PROVISIONING — designing a voice for a new character from a
+// voiceDescription — lives in agents/characterDesigner.ts. This file only
+// handles per-beat SYNTHESIS using an already-provisioned voice.)
+export async function synthesizeBeat(
+  cfg: TtsConfig,
+  voice: CharacterVoice,
+  beat: { id: string; line: string; lineDelivery?: string },
+): Promise<BeatAudio | null> {
+  const startedAt = Date.now();
+  const elapsedMs = (): number => Date.now() - startedAt;
+  const abortCtrl = new AbortController();
+
+  try {
+    const pending = synthesize(
+      cfg,
+      voice,
+      beat.line,
+      beat.lineDelivery,
+      abortCtrl.signal,
+    );
+    const result = await withTimeout(
+      pending,
+      SYNTH_TIMEOUT_MS,
+      `synth ${beat.id}`,
+      abortCtrl,
+    );
+    console.log(`  [voice ${beat.id}] synth=${elapsedMs()}ms`);
+    return { base64: result.audioBase64, mime: result.mimeType };
+  } catch (err) {
+    // Degrade to silence rather than failing the beat.
+    const reason = err instanceof Error ? err.message : String(err);
+    console.error(
+      `[voice] synth degraded for ${beat.id} (after ${elapsedMs()}ms): ${reason}`,
+    );
+    return null;
+  }
+}