From 6f8125570a52dba21ba268714db3051a003273c6 Mon Sep 17 00:00:00 2001
From: yuanzonghao <yuanzonghao123@gmail.com>
Date: Wed, 24 Jun 2026 18:36:35 +0800
Subject: [PATCH] feat(play): always generate new scene for freeform text input
 + enhance insert-beat
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

User feedback: custom interactions rarely produce new story content because
the classifier was heavily biased toward insert-beat (single reaction,
no scene change). Three changes to fix this:

1. Freeform text input now always triggers a full scene generation (skips
   the classify step entirely) — users who type expect the story to advance.

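2. Vision (background click) classifier de-biased: prompt now favors
   change-scene when uncertain, and the code fallback flipped from
   insert-beat to change-scene. insert-beat narrowed to pure observation.
   The freeform classifier prompt is de-biased the same way, although
   the play page no longer calls that classifier.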

3. Insert-beat enhanced: generates 1-3 beats (was 1) with follow-up
   choices (was: loop back to original beat). Even when vision classifies
   as insert-beat, the player gets richer content and new options.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 app/[locale]/play/page.tsx | 140 +++++++++++++++----------------------
 lib/engine/director.ts     |  58 +++++++++------
 lib/engine/orchestrator.ts |  56 +++++++--------
 lib/engine/prompts.ts      |  43 ++++++++----
 lib/engine/vision.ts       |   2 +-
 lib/types/index.ts         |  11 +++
 6 files changed, 160 insertions(+), 150 deletions(-)

diff --git a/app/[locale]/play/page.tsx b/app/[locale]/play/page.tsx
index 2400db6..02b11f6 100644
--- a/app/[locale]/play/page.tsx
+++ b/app/[locale]/play/page.tsx
@@ -34,7 +34,6 @@ import {
   startSession,
   requestScene,
   visionDecide,
-  classifyFreeform,
   requestInsertBeat,
   getTtsProvider,
   AuthRequiredError,
@@ -2248,68 +2247,12 @@ function PlayInner() {
     setPhase("vision-thinking");
 
     try {
-      const decision = await classifyFreeform({
-        session,
-        freeformText: text,
-      });
-
-      if (decision.classify === "insert-beat") {
-        // Interactive beat: NPC responds to the player's action, scene stays
-        setPhase("inserting-beat");
-        const { partial, characters: insertChars } = await requestInsertBeat({
-          session,
-          freeformAction: decision.freeformAction,
-          clientTts: !!byoTtsRef.current,
-        });
-
-        const fromBeatId =
-          currentBeatRef.current?.id ?? currentScene.entryBeatId;
-        const newBeatId = `b_ins_${Date.now()}_${Math.random()
-          .toString(36)
-          .slice(2, 6)}`;
-        const newBeat: Beat = {
-          id: newBeatId,
-          narration: partial.narration,
-          speaker: partial.speaker,
-          line: partial.line,
-          lineDelivery: partial.lineDelivery,
-          next: { type: "continue", nextBeatId: fromBeatId },
-        };
-
-        const patched: Scene = {
-          ...currentScene,
-          beats: [...currentScene.beats, newBeat],
-        };
-        const nextVisited = [...visitedBeatsRef.current, newBeatId];
-        visitedBeatsRef.current = nextVisited;
-        const nextSession: Session = {
-          ...session,
-          history: session.history.map((h, i, arr) =>
-            i === arr.length - 1 ? { ...h, scene: patched, visitedBeatIds: nextVisited } : h,
-          ),
-          characters: insertChars,
-        };
-        setSession(nextSession);
-        setCurrentScene(patched);
-        setCurrentBeatId(newBeatId);
-        if (newBeat.speaker && newBeat.line) {
-          void fetchBeatAudio(nextSession, {
-            id: newBeatId,
-            speaker: newBeat.speaker,
-            line: newBeat.line,
-            lineDelivery: newBeat.lineDelivery,
-          });
-        }
-        setLastExitLabel(decision.freeformAction);
-        setPhase("ready");
-        return;
-      }
-
-      // change-scene path
+      // Always generate a new scene for freeform text input — the player
+      // typed something, so they expect the story to move forward.
       const visited = [...visitedBeatsRef.current];
       const exit: SceneExit = {
         kind: "freeform",
-        action: decision.freeformAction,
+        action: text,
       };
       clearPool(poolRef.current);
 
@@ -2335,7 +2278,7 @@ function PlayInner() {
         promise,
         exit,
         visited,
-        decision.freeformAction,
+        text,
         () => onFreeformInput(text),
         { kind: "freeform", text },
       );
@@ -2365,7 +2308,7 @@ function PlayInner() {
 
       if (decision.classify === "insert-beat") {
         setPhase("inserting-beat");
-        const { partial, characters: insertChars } = await requestInsertBeat({
+        const { partial, extraBeats, followUpChoices, characters: insertChars } = await requestInsertBeat({
           session,
           freeformAction: decision.intent.freeformAction,
           clientTts: !!byoTtsRef.current,
@@ -2373,42 +2316,69 @@ function PlayInner() {
 
         const fromBeatId =
           currentBeatRef.current?.id ?? currentScene.entryBeatId;
-        const newBeatId = `b_ins_${Date.now()}_${Math.random()
-          .toString(36)
-          .slice(2, 6)}`;
-        const newBeat: Beat = {
-          id: newBeatId,
-          narration: partial.narration,
-          speaker: partial.speaker,
-          line: partial.line,
-          lineDelivery: partial.lineDelivery,
-          next: { type: "continue", nextBeatId: fromBeatId },
-        };
+        const allPartials = [partial, ...(extraBeats ?? [])];
+        const newBeats: Beat[] = [];
+        const newBeatIds: string[] = [];
+
+        for (const [i, p] of allPartials.entries()) {
+          const id = `b_ins_${Date.now()}_${Math.random().toString(36).slice(2, 6)}_${i}`;
+          newBeatIds.push(id);
+          newBeats.push({
+            id,
+            narration: p.narration,
+            speaker: p.speaker,
+            line: p.line,
+            lineDelivery: p.lineDelivery,
+            next: { type: "continue", nextBeatId: "" },
+          });
+        }
+
+        // Chain beats: each points to the next; last one gets choices or falls back to original beat
+        for (let i = 0; i < newBeats.length - 1; i++) {
+          newBeats[i]!.next = { type: "continue", nextBeatId: newBeatIds[i + 1]! };
+        }
+
+        const lastInsertedBeat = newBeats[newBeats.length - 1]!;
+        if (followUpChoices && followUpChoices.length > 0) {
+          lastInsertedBeat.next = {
+            type: "choice",
+            choices: followUpChoices.map((c, ci) => ({
+              id: `c_ins_${Date.now()}_${Math.random().toString(36).slice(2, 6)}_${ci}`,
+              label: c.label,
+              effect: { kind: "change-scene" as const, nextSceneSeed: c.effect },
+            })),
+          };
+        } else {
+          lastInsertedBeat.next = { type: "continue", nextBeatId: fromBeatId };
+        }
 
         const patched: Scene = {
           ...currentScene,
-          beats: [...currentScene.beats, newBeat],
+          beats: [...currentScene.beats, ...newBeats],
         };
+        const nextVisited = [...visitedBeatsRef.current, ...newBeatIds];
+        visitedBeatsRef.current = nextVisited;
 
         const nextSession: Session = {
           ...session,
           history: session.history.map((h, i, arr) =>
-            i === arr.length - 1 ? { ...h, scene: patched } : h,
+            i === arr.length - 1 ? { ...h, scene: patched, visitedBeatIds: nextVisited } : h,
           ),
           characters: insertChars,
         };
         setSession(nextSession);
         setCurrentScene(patched);
-        setCurrentBeatId(newBeatId);
-        // Insert-beat doesn't change scene.id, so the scene effect won't
-        // re-fire — manually kick off the audio fetch for the new beat.
-        if (newBeat.speaker && newBeat.line) {
-          void fetchBeatAudio(nextSession, {
-            id: newBeatId,
-            speaker: newBeat.speaker,
-            line: newBeat.line,
-            lineDelivery: newBeat.lineDelivery,
-          });
+        setCurrentBeatId(newBeatIds[0]!);
+
+        for (const nb of newBeats) {
+          if (nb.speaker && nb.line) {
+            void fetchBeatAudio(nextSession, {
+              id: nb.id,
+              speaker: nb.speaker,
+              line: nb.line,
+              lineDelivery: nb.lineDelivery,
+            });
+          }
         }
         setLastExitLabel(decision.intent.freeformAction);
         setPhase("ready");
diff --git a/lib/engine/director.ts b/lib/engine/director.ts
index 8a2b029..cf7b591 100644
--- a/lib/engine/director.ts
+++ b/lib/engine/director.ts
@@ -6,6 +6,7 @@ import type {
   Character,
   CharacterIntent,
   EngineConfig,
+  InsertBeatMulti,
   InsertBeatPartial,
   ProviderConfig,
   Scene,
@@ -562,17 +563,29 @@ export async function directScene(
 }
 
 // ──────────────────────────────────────────────────────────────────────
-//  directInsertBeat — single-agent path for vision-driven in-scene
-//  exploration. Generates ONE transient beat with NO new image, NO new
-//  characters. Multi-agent pipeline doesn't apply here (no rendering, no
-//  character introduction allowed by the prompt).
+//  directInsertBeat — single-agent path for in-scene exploration.
+//  Generates 1-3 beats with NO new image, NO new characters, plus
+//  follow-up choices so the player isn't dumped back to the old options.
 // ──────────────────────────────────────────────────────────────────────
 
+function coerceBeatPartial(raw: Record<string, unknown>): InsertBeatPartial | null {
+  const narration = (typeof raw.narration === "string" ? raw.narration.trim() : undefined) || undefined;
+  const rawSpeaker = (typeof raw.speaker === "string" ? raw.speaker.trim() : undefined) || undefined;
+  const speaker = rawSpeaker ? normalizeSpeakerName(rawSpeaker) : undefined;
+  const line = (typeof raw.line === "string" ? raw.line.trim() : undefined) || undefined;
+  const lineDelivery =
+    line && speaker !== POV_DISPLAY_NAME
+      ? ((typeof raw.lineDelivery === "string" ? raw.lineDelivery.trim() : undefined) || undefined)
+      : undefined;
+  if (!narration && !speaker && !line) return null;
+  return { narration, speaker, line, lineDelivery };
+}
+
 export async function directInsertBeat(
   config: ProviderConfig,
   session: Session,
   freeformAction: string,
-): Promise<InsertBeatPartial> {
+): Promise<{ beats: InsertBeatPartial[]; choices?: { label: string; effect: string }[] }> {
   const raw = await chat(
     config,
     [
@@ -585,22 +598,27 @@ export async function directInsertBeat(
     { temperature: 0.9, tag: "insert-beat" },
   );
 
-  const parsed = parseJsonLoose<InsertBeatPartial>(raw);
+  const parsed = parseJsonLoose<InsertBeatMulti & InsertBeatPartial>(raw);
 
-  const narration = parsed.narration?.trim() || undefined;
-  const rawSpeaker = parsed.speaker?.trim() || undefined;
-  // Pattern B (mirrors Writer): normalize POV variants → "你"; NPCs pass through.
-  const speaker = rawSpeaker ? normalizeSpeakerName(rawSpeaker) : undefined;
-  const line = parsed.line?.trim() || undefined;
-  // lineDelivery is only meaningful for NPC speakers (TTS). For POV ("你")
-  // TTS is intentionally skipped on the client, so lineDelivery is dropped.
-  const lineDelivery =
-    line && speaker !== POV_DISPLAY_NAME
-      ? parsed.lineDelivery?.trim() || undefined
+  // New multi-beat format: { beats: [...], choices: [...] }
+  if (Array.isArray(parsed.beats) && parsed.beats.length > 0) {
+    const beats = parsed.beats
+      .slice(0, 3)
+      .map((b) => coerceBeatPartial(b as Record<string, unknown>))
+      .filter((b): b is InsertBeatPartial => b !== null);
+    if (beats.length === 0) {
+      beats.push({ narration: "（你停下脚步，环视片刻。）" });
+    }
+    const choices = Array.isArray(parsed.choices)
+      ? parsed.choices
+          .slice(0, 2)
+          .filter((c) => c && typeof c.label === "string" && c.label.trim() && typeof c.effect === "string" && c.effect.trim())
+          .map((c) => ({ label: c.label.trim(), effect: c.effect.trim() }))
       : undefined;
-
-  if (!narration && !speaker && !line) {
-    return { narration: "（你停下脚步，环视片刻。）" };
+    return { beats, choices: choices?.length ? choices : undefined };
   }
-  return { narration, speaker, line, lineDelivery };
+
+  // Legacy single-beat fallback
+  const single = coerceBeatPartial(parsed as Record<string, unknown>);
+  return { beats: [single ?? { narration: "（你停下脚步，环视片刻。）" }] };
 }
diff --git a/lib/engine/orchestrator.ts b/lib/engine/orchestrator.ts
index ea6ba7f..364a590 100644
--- a/lib/engine/orchestrator.ts
+++ b/lib/engine/orchestrator.ts
@@ -196,45 +196,43 @@ export async function requestInsertBeat(
 ): Promise<InsertBeatResponse> {
   const tTotal = Date.now();
 
-  const partial = await directInsertBeat(
+  const result = await directInsertBeat(
     config.text,
     req.session,
     req.freeformAction,
   );
 
-  // INSERT_BEAT prompt forbids new NPCs — promote disallowed-speaker lines
-  // to narration so the player still sees the text (the client only renders
-  // `line` when there is a `speaker`).
-  //
-  // Exception (Pattern B): speaker = "你" is the player speaking. No
-  // Character record exists for "你" (intentional — TTS is skipped), so we
-  // must NOT demote it; the client renders the dialog box correctly.
-  // directInsertBeat already normalized POV variants to "你" before this
-  // guard, so a literal "你" here is always Pattern B player dialog.
-  if (
-    partial.speaker &&
-    partial.speaker !== "你" &&
-    !req.session.characters.some((c) => c.name === partial.speaker)
-  ) {
-    console.warn(
-      `[insert-beat] unregistered speaker "${partial.speaker}" ignored`,
-    );
-    const promotedNarration =
-      [partial.narration, partial.line].filter(Boolean).join("\n") || undefined;
-    tlog("[insert-beat] TOTAL", tTotal);
-    return {
-      partial: {
-        narration: promotedNarration,
+  // Guard every beat: promote unregistered speakers to narration.
+  const guardedBeats = result.beats.map((partial) => {
+    if (
+      partial.speaker &&
+      partial.speaker !== "你" &&
+      !req.session.characters.some((c) => c.name === partial.speaker)
+    ) {
+      console.warn(
+        `[insert-beat] unregistered speaker "${partial.speaker}" ignored`,
+      );
+      return {
+        narration:
+          [partial.narration, partial.line].filter(Boolean).join("\n") || undefined,
         speaker: undefined,
         line: undefined,
         lineDelivery: undefined,
-      },
-      characters: req.session.characters,
-    };
-  }
+      };
+    }
+    return partial;
+  });
+
+  const first = guardedBeats[0] ?? { narration: "（你停下脚步，环视片刻。）" };
+  const extra = guardedBeats.slice(1);
 
   tlog("[insert-beat] TOTAL", tTotal);
-  return { partial, characters: req.session.characters };
+  return {
+    partial: first,
+    extraBeats: extra.length > 0 ? extra : undefined,
+    followUpChoices: result.choices,
+    characters: req.session.characters,
+  };
 }
 
 // ──────────────────────────────────────────────────────────────────────
diff --git a/lib/engine/prompts.ts b/lib/engine/prompts.ts
index c18bf7a..23c9e7e 100644
--- a/lib/engine/prompts.ts
+++ b/lib/engine/prompts.ts
@@ -572,18 +572,27 @@ STRICT RULES:
 //  Single-agent path; no character design / no rendering involved.
 // ──────────────────────────────────────────────────────────────────────
 
-export const INSERT_BEAT_SYSTEM = `你是视觉小说编剧。玩家在当前场景内做了一个自由动作（可能是点击画面中的某个物件/角色，也可能是主动输入了一句话/动作）。请基于此动作，写出**一个有实质内容的 beat**。
+export const INSERT_BEAT_SYSTEM = `你是视觉小说编剧。玩家在当前场景内做了一个自由动作（可能是点击画面中的某个物件/角色，也可能是主动输入了一句话/动作）。请基于此动作，写出**1-3 个有实质内容的 beat**，并在最后给出 2 个后续选项供玩家选择。
 
 核心原则——**玩家的动作必须得到回应**：
 - 如果当前场景有 NPC 在场，NPC **必须对玩家的动作做出反应**（说话、表情变化、动作回应）。用 narration 描述玩家的动作，用 speaker + line 写 NPC 的回应。
 - 如果场景中没有 NPC（纯环境），可以用 narration 描述玩家的观察/发现，给玩家一个新细节或情绪波动。
 - 不要写"你想做什么但没做"这种无意义的犹豫——玩家已经做了，世界要有反馈。
 
+beat 数量指引：
+- 简单观察/短回应：1 个 beat 即可
+- 有来有回的对话/有展开的互动：2-3 个 beat，让反应更有层次
+- 每个 beat 的 narration + line ≤100 字
+
+后续选项（choices）——每次**必须**给出 2 个选项：
+- 选项应**承接刚才的互动**，给玩家自然的下一步
+- 至少一个选项应能推动剧情前进（如"继续追问"、"走过去看看"、"做出某个决定"）
+- label：玩家看到的选项文字（≤15字）
+- effect：描述选这个选项后会发生什么（供下一个编剧参考）
+
 文本风格约束：
 - narration / line 用中文，**纯净可显示文本**，不要写 (叹气)(语速快) 这类配音标注
-- narration 与 line 加起来 ≤100 字
 - 不要打破当前场景的物理状态（玩家仍在原地）
-- 不要生成选项或下一步指引 —— 玩家点击会自然回到原 beat
 - 内容要"有所得"——一个新细节、一丝潜台词、一次真实的交流（show, don't tell）
 - 白描为主：聚焦可观察的五感与物理特征，以角色的动作/神态本身传递情绪，不要以作者角度解释或议论；不写角色眼神/语气里的情绪（这些从台词与动作中自行体会）
 
@@ -604,13 +613,16 @@ speaker 字段允许的取值**只有两种**（与主路径 Writer 一致 — P
 
 必须输出严格 JSON：
 {
-  "narration": "...",
-  "speaker": "...",
-  "line": "...",
-  "lineDelivery": "..."
+  "beats": [
+    { "narration": "...", "speaker": "...", "line": "...", "lineDelivery": "..." }
+  ],
+  "choices": [
+    { "label": "选项文字", "effect": "选此选项后的剧情走向" },
+    { "label": "选项文字", "effect": "选此选项后的剧情走向" }
+  ]
 }
 
-narration/speaker/line/lineDelivery 都可为空字符串。不要输出 JSON 以外的任何文本。`;
+不要输出 JSON 以外的任何文本。`;
 
 export function buildInsertBeatUserMessage(
   session: Session,
@@ -655,7 +667,7 @@ export function buildInsertBeatUserMessage(
   }
 
   parts.push(`\n玩家此刻的自由动作：${freeformAction}`);
-  parts.push("\n请生成一个有实质回应的 beat，严格以 JSON 格式返回。");
+  parts.push("\n请生成 beat（1-3 个）和 2 个后续选项，严格以 JSON 格式返回。");
   const langDirective = buildLanguageDirective(session.language);
   if (langDirective) parts.push(langDirective);
   return parts.join("\n");
@@ -670,11 +682,12 @@ export function buildInsertBeatUserMessage(
 export const VISION_SYSTEM_PROMPT = `你是视觉理解助手。玩家在视觉小说的背景图上点击了红色圆点位置（HTML 上的选项按钮不会走到你这里）。你的任务是：
 1. 看清红点指向画面里的什么（物件、角色、空间、远处的方向）
 2. 推断玩家想干什么
-3. 判断这个动作是「场内探索」（不该换图）还是「场景切换」（要换图）
+3. 判断这个动作是「场内探索」还是「场景切换」
 
 判断准则：
-- "insert-beat"（场内探索）：观察画面里某个细节、自言自语、和当前角色继续互动、看一眼某个物件
-- "change-scene"（场景切换）：走向画面深处的门 / 走廊、转头看向新方向（视角变了）、点了远处的另一个空间、暗示时间跳跃的物件（如时钟）
+- "change-scene"（场景切换）：走向画面深处的门 / 走廊、转头看向新方向（视角变了）、点了远处的另一个空间、暗示时间跳跃的物件（如时钟）、调查某个物件/线索导致剧情发展、与角色进行有实质影响的互动
+- "insert-beat"（场内探索）：**仅限**纯粹的观察——看一眼某个无剧情意义的装饰、环顾四周
+- 拿不准时偏向 "change-scene"——玩家主动点击画面说明想要推进剧情
 
 必须输出严格 JSON：
 {
@@ -704,9 +717,9 @@ export const FREEFORM_CLASSIFY_SYSTEM = `你是交互视觉小说的意图分类
 2. "change-scene"：玩家想去别的地方、做出重大决定、推动剧情到新阶段 → 切换到全新场景
 
 判断准则：
-- 大多数对话类输入（问问题、说一句话、对角色做出反应）→ "insert-beat"
-- 明确要离开当前场景、去别的地方、跳过时间、做出改变人物关系的重大决定 → "change-scene"
-- 拿不准时偏向 "insert-beat"（场内互动成本低，体验更流畅）
+- "change-scene"：大多数主动输入——问问题、说一句话、做一个动作、对角色做出反应、想去别的地方、做出决定、推动剧情 → 玩家花精力打字说明想让故事前进
+- "insert-beat"：**仅限**纯粹的环境观察或无实际影响的自言自语
+- 拿不准时偏向 "change-scene"——玩家主动输入说明想要推进剧情
 
 必须输出严格 JSON：
 {
diff --git a/lib/engine/vision.ts b/lib/engine/vision.ts
index 45cbde0..08fdbd8 100644
--- a/lib/engine/vision.ts
+++ b/lib/engine/vision.ts
@@ -27,7 +27,7 @@ export async function interpret(
   }>(raw);
 
   const classify: VisionClassify =
-    parsed.classify === "change-scene" ? "change-scene" : "insert-beat";
+    parsed.classify === "insert-beat" ? "insert-beat" : "change-scene";
 
   return {
     intent: {
diff --git a/lib/types/index.ts b/lib/types/index.ts
index 4d97cd1..6ea0eaa 100644
--- a/lib/types/index.ts
+++ b/lib/types/index.ts
@@ -695,8 +695,19 @@ export type InsertBeatPartial = {
   lineDelivery?: string;
 };
 
+/** Multi-beat response: 1-3 beats + optional follow-up choices. */
+export type InsertBeatMulti = {
+  beats: InsertBeatPartial[];
+  /** Follow-up choices shown after the last beat (max 2). */
+  choices?: { label: string; effect: string }[];
+};
+
 export type InsertBeatResponse = {
   partial: InsertBeatPartial;
+  /** Additional beats beyond the first (for richer insert-beat interactions). */
+  extraBeats?: InsertBeatPartial[];
+  /** Follow-up choices shown after the last inserted beat. */
+  followUpChoices?: { label: string; effect: string }[];
   characters: Character[];
 };