From 6f8125570a52dba21ba268714db3051a003273c6 Mon Sep 17 00:00:00 2001 From: yuanzonghao Date: Wed, 24 Jun 2026 18:36:35 +0800 Subject: [PATCH] feat(play): always generate new scene for freeform text input + enhance insert-beat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit User feedback: custom interactions rarely produce new story content because the classifier was heavily biased toward insert-beat (single reaction, no scene change). Three changes to fix this: 1. Freeform text input now always triggers a full scene generation (skips the classify step entirely) — users who type expect the story to advance. 2. Vision (background click) classifier de-biased: prompt now favors change-scene when uncertain, and the code fallback flipped from insert-beat to change-scene. insert-beat narrowed to pure observation. 3. Insert-beat enhanced: generates 1-3 beats (was 1) with follow-up choices (was: loop back to original beat). Even when vision classifies as insert-beat, the player gets richer content and new options. 
Co-Authored-By: Claude Opus 4.6 --- app/[locale]/play/page.tsx | 140 +++++++++++++++---------------------- lib/engine/director.ts | 58 +++++++++------ lib/engine/orchestrator.ts | 56 +++++++-------- lib/engine/prompts.ts | 43 ++++++++---- lib/engine/vision.ts | 2 +- lib/types/index.ts | 11 +++ 6 files changed, 160 insertions(+), 150 deletions(-) diff --git a/app/[locale]/play/page.tsx b/app/[locale]/play/page.tsx index 2400db6..02b11f6 100644 --- a/app/[locale]/play/page.tsx +++ b/app/[locale]/play/page.tsx @@ -34,7 +34,6 @@ import { startSession, requestScene, visionDecide, - classifyFreeform, requestInsertBeat, getTtsProvider, AuthRequiredError, @@ -2248,68 +2247,12 @@ function PlayInner() { setPhase("vision-thinking"); try { - const decision = await classifyFreeform({ - session, - freeformText: text, - }); - - if (decision.classify === "insert-beat") { - // Interactive beat: NPC responds to the player's action, scene stays - setPhase("inserting-beat"); - const { partial, characters: insertChars } = await requestInsertBeat({ - session, - freeformAction: decision.freeformAction, - clientTts: !!byoTtsRef.current, - }); - - const fromBeatId = - currentBeatRef.current?.id ?? currentScene.entryBeatId; - const newBeatId = `b_ins_${Date.now()}_${Math.random() - .toString(36) - .slice(2, 6)}`; - const newBeat: Beat = { - id: newBeatId, - narration: partial.narration, - speaker: partial.speaker, - line: partial.line, - lineDelivery: partial.lineDelivery, - next: { type: "continue", nextBeatId: fromBeatId }, - }; - - const patched: Scene = { - ...currentScene, - beats: [...currentScene.beats, newBeat], - }; - const nextVisited = [...visitedBeatsRef.current, newBeatId]; - visitedBeatsRef.current = nextVisited; - const nextSession: Session = { - ...session, - history: session.history.map((h, i, arr) => - i === arr.length - 1 ? 
{ ...h, scene: patched, visitedBeatIds: nextVisited } : h, - ), - characters: insertChars, - }; - setSession(nextSession); - setCurrentScene(patched); - setCurrentBeatId(newBeatId); - if (newBeat.speaker && newBeat.line) { - void fetchBeatAudio(nextSession, { - id: newBeatId, - speaker: newBeat.speaker, - line: newBeat.line, - lineDelivery: newBeat.lineDelivery, - }); - } - setLastExitLabel(decision.freeformAction); - setPhase("ready"); - return; - } - - // change-scene path + // Always generate a new scene for freeform text input — the player + // typed something, so they expect the story to move forward. const visited = [...visitedBeatsRef.current]; const exit: SceneExit = { kind: "freeform", - action: decision.freeformAction, + action: text, }; clearPool(poolRef.current); @@ -2335,7 +2278,7 @@ function PlayInner() { promise, exit, visited, - decision.freeformAction, + text, () => onFreeformInput(text), { kind: "freeform", text }, ); @@ -2365,7 +2308,7 @@ function PlayInner() { if (decision.classify === "insert-beat") { setPhase("inserting-beat"); - const { partial, characters: insertChars } = await requestInsertBeat({ + const { partial, extraBeats, followUpChoices, characters: insertChars } = await requestInsertBeat({ session, freeformAction: decision.intent.freeformAction, clientTts: !!byoTtsRef.current, @@ -2373,42 +2316,69 @@ function PlayInner() { const fromBeatId = currentBeatRef.current?.id ?? currentScene.entryBeatId; - const newBeatId = `b_ins_${Date.now()}_${Math.random() - .toString(36) - .slice(2, 6)}`; - const newBeat: Beat = { - id: newBeatId, - narration: partial.narration, - speaker: partial.speaker, - line: partial.line, - lineDelivery: partial.lineDelivery, - next: { type: "continue", nextBeatId: fromBeatId }, - }; + const allPartials = [partial, ...(extraBeats ?? 
[])]; + const newBeats: Beat[] = []; + const newBeatIds: string[] = []; + + for (const [i, p] of allPartials.entries()) { + const id = `b_ins_${Date.now()}_${Math.random().toString(36).slice(2, 6)}_${i}`; + newBeatIds.push(id); + newBeats.push({ + id, + narration: p.narration, + speaker: p.speaker, + line: p.line, + lineDelivery: p.lineDelivery, + next: { type: "continue", nextBeatId: "" }, + }); + } + + // Chain beats: each points to the next; last one gets choices or falls back to original beat + for (let i = 0; i < newBeats.length - 1; i++) { + newBeats[i]!.next = { type: "continue", nextBeatId: newBeatIds[i + 1]! }; + } + + const lastInsertedBeat = newBeats[newBeats.length - 1]!; + if (followUpChoices && followUpChoices.length > 0) { + lastInsertedBeat.next = { + type: "choice", + choices: followUpChoices.map((c, ci) => ({ + id: `c_ins_${Date.now()}_${Math.random().toString(36).slice(2, 6)}_${ci}`, + label: c.label, + effect: { kind: "change-scene" as const, nextSceneSeed: c.effect }, + })), + }; + } else { + lastInsertedBeat.next = { type: "continue", nextBeatId: fromBeatId }; + } const patched: Scene = { ...currentScene, - beats: [...currentScene.beats, newBeat], + beats: [...currentScene.beats, ...newBeats], }; + const nextVisited = [...visitedBeatsRef.current, ...newBeatIds]; + visitedBeatsRef.current = nextVisited; const nextSession: Session = { ...session, history: session.history.map((h, i, arr) => - i === arr.length - 1 ? { ...h, scene: patched } : h, + i === arr.length - 1 ? { ...h, scene: patched, visitedBeatIds: nextVisited } : h, ), characters: insertChars, }; setSession(nextSession); setCurrentScene(patched); - setCurrentBeatId(newBeatId); - // Insert-beat doesn't change scene.id, so the scene effect won't - // re-fire — manually kick off the audio fetch for the new beat. 
- if (newBeat.speaker && newBeat.line) { - void fetchBeatAudio(nextSession, { - id: newBeatId, - speaker: newBeat.speaker, - line: newBeat.line, - lineDelivery: newBeat.lineDelivery, - }); + setCurrentBeatId(newBeatIds[0]!); + + for (const nb of newBeats) { + if (nb.speaker && nb.line) { + void fetchBeatAudio(nextSession, { + id: nb.id, + speaker: nb.speaker, + line: nb.line, + lineDelivery: nb.lineDelivery, + }); + } } setLastExitLabel(decision.intent.freeformAction); setPhase("ready"); diff --git a/lib/engine/director.ts b/lib/engine/director.ts index 8a2b029..cf7b591 100644 --- a/lib/engine/director.ts +++ b/lib/engine/director.ts @@ -6,6 +6,7 @@ import type { Character, CharacterIntent, EngineConfig, + InsertBeatMulti, InsertBeatPartial, ProviderConfig, Scene, @@ -562,17 +563,29 @@ export async function directScene( } // ────────────────────────────────────────────────────────────────────── -// directInsertBeat — single-agent path for vision-driven in-scene -// exploration. Generates ONE transient beat with NO new image, NO new -// characters. Multi-agent pipeline doesn't apply here (no rendering, no -// character introduction allowed by the prompt). +// directInsertBeat — single-agent path for in-scene exploration. +// Generates 1-3 beats with NO new image, NO new characters, plus +// follow-up choices so the player isn't dumped back to the old options. // ────────────────────────────────────────────────────────────────────── +function coerceBeatPartial(raw: Record): InsertBeatPartial | null { + const narration = (typeof raw.narration === "string" ? raw.narration.trim() : undefined) || undefined; + const rawSpeaker = (typeof raw.speaker === "string" ? raw.speaker.trim() : undefined) || undefined; + const speaker = rawSpeaker ? normalizeSpeakerName(rawSpeaker) : undefined; + const line = (typeof raw.line === "string" ? raw.line.trim() : undefined) || undefined; + const lineDelivery = + line && speaker !== POV_DISPLAY_NAME + ? 
((typeof raw.lineDelivery === "string" ? raw.lineDelivery.trim() : undefined) || undefined) + : undefined; + if (!narration && !speaker && !line) return null; + return { narration, speaker, line, lineDelivery }; +} + export async function directInsertBeat( config: ProviderConfig, session: Session, freeformAction: string, -): Promise { +): Promise<{ beats: InsertBeatPartial[]; choices?: { label: string; effect: string }[] }> { const raw = await chat( config, [ @@ -585,22 +598,27 @@ export async function directInsertBeat( { temperature: 0.9, tag: "insert-beat" }, ); - const parsed = parseJsonLoose(raw); + const parsed = parseJsonLoose(raw); - const narration = parsed.narration?.trim() || undefined; - const rawSpeaker = parsed.speaker?.trim() || undefined; - // Pattern B (mirrors Writer): normalize POV variants → "你"; NPCs pass through. - const speaker = rawSpeaker ? normalizeSpeakerName(rawSpeaker) : undefined; - const line = parsed.line?.trim() || undefined; - // lineDelivery is only meaningful for NPC speakers (TTS). For POV ("你") - // TTS is intentionally skipped on the client, so lineDelivery is dropped. - const lineDelivery = - line && speaker !== POV_DISPLAY_NAME - ? parsed.lineDelivery?.trim() || undefined + // New multi-beat format: { beats: [...], choices: [...] } + if (Array.isArray(parsed.beats) && parsed.beats.length > 0) { + const beats = parsed.beats + .slice(0, 3) + .map((b) => coerceBeatPartial(b as Record)) + .filter((b): b is InsertBeatPartial => b !== null); + if (beats.length === 0) { + beats.push({ narration: "(你停下脚步,环视片刻。)" }); + } + const choices = Array.isArray(parsed.choices) + ? parsed.choices + .slice(0, 2) + .filter((c) => c && typeof c.label === "string" && c.label.trim() && typeof c.effect === "string" && c.effect.trim()) + .map((c) => ({ label: c.label.trim(), effect: c.effect.trim() })) : undefined; - - if (!narration && !speaker && !line) { - return { narration: "(你停下脚步,环视片刻。)" }; + return { beats, choices: choices?.length ? 
choices : undefined }; } - return { narration, speaker, line, lineDelivery }; + + // Legacy single-beat fallback + const single = coerceBeatPartial(parsed as Record); + return { beats: [single ?? { narration: "(你停下脚步,环视片刻。)" }] }; } diff --git a/lib/engine/orchestrator.ts b/lib/engine/orchestrator.ts index ea6ba7f..364a590 100644 --- a/lib/engine/orchestrator.ts +++ b/lib/engine/orchestrator.ts @@ -196,45 +196,43 @@ export async function requestInsertBeat( ): Promise { const tTotal = Date.now(); - const partial = await directInsertBeat( + const result = await directInsertBeat( config.text, req.session, req.freeformAction, ); - // INSERT_BEAT prompt forbids new NPCs — promote disallowed-speaker lines - // to narration so the player still sees the text (the client only renders - // `line` when there is a `speaker`). - // - // Exception (Pattern B): speaker = "你" is the player speaking. No - // Character record exists for "你" (intentional — TTS is skipped), so we - // must NOT demote it; the client renders the dialog box correctly. - // directInsertBeat already normalized POV variants to "你" before this - // guard, so a literal "你" here is always Pattern B player dialog. - if ( - partial.speaker && - partial.speaker !== "你" && - !req.session.characters.some((c) => c.name === partial.speaker) - ) { - console.warn( - `[insert-beat] unregistered speaker "${partial.speaker}" ignored`, - ); - const promotedNarration = - [partial.narration, partial.line].filter(Boolean).join("\n") || undefined; - tlog("[insert-beat] TOTAL", tTotal); - return { - partial: { - narration: promotedNarration, + // Guard every beat: promote unregistered speakers to narration. 
+ const guardedBeats = result.beats.map((partial) => { + if ( + partial.speaker && + partial.speaker !== "你" && + !req.session.characters.some((c) => c.name === partial.speaker) + ) { + console.warn( + `[insert-beat] unregistered speaker "${partial.speaker}" ignored`, + ); + return { + narration: + [partial.narration, partial.line].filter(Boolean).join("\n") || undefined, speaker: undefined, line: undefined, lineDelivery: undefined, - }, - characters: req.session.characters, - }; - } + }; + } + return partial; + }); + + const first = guardedBeats[0] ?? { narration: "(你停下脚步,环视片刻。)" }; + const extra = guardedBeats.slice(1); tlog("[insert-beat] TOTAL", tTotal); - return { partial, characters: req.session.characters }; + return { + partial: first, + extraBeats: extra.length > 0 ? extra : undefined, + followUpChoices: result.choices, + characters: req.session.characters, + }; } // ────────────────────────────────────────────────────────────────────── diff --git a/lib/engine/prompts.ts b/lib/engine/prompts.ts index c18bf7a..23c9e7e 100644 --- a/lib/engine/prompts.ts +++ b/lib/engine/prompts.ts @@ -572,18 +572,27 @@ STRICT RULES: // Single-agent path; no character design / no rendering involved. 
// ────────────────────────────────────────────────────────────────────── -export const INSERT_BEAT_SYSTEM = `你是视觉小说编剧。玩家在当前场景内做了一个自由动作(可能是点击画面中的某个物件/角色,也可能是主动输入了一句话/动作)。请基于此动作,写出**一个有实质内容的 beat**。 +export const INSERT_BEAT_SYSTEM = `你是视觉小说编剧。玩家在当前场景内做了一个自由动作(可能是点击画面中的某个物件/角色,也可能是主动输入了一句话/动作)。请基于此动作,写出**1-3 个有实质内容的 beat**,并在最后给出 2 个后续选项供玩家选择。 核心原则——**玩家的动作必须得到回应**: - 如果当前场景有 NPC 在场,NPC **必须对玩家的动作做出反应**(说话、表情变化、动作回应)。用 narration 描述玩家的动作,用 speaker + line 写 NPC 的回应。 - 如果场景中没有 NPC(纯环境),可以用 narration 描述玩家的观察/发现,给玩家一个新细节或情绪波动。 - 不要写"你想做什么但没做"这种无意义的犹豫——玩家已经做了,世界要有反馈。 +beat 数量指引: +- 简单观察/短回应:1 个 beat 即可 +- 有来有回的对话/有展开的互动:2-3 个 beat,让反应更有层次 +- 每个 beat 的 narration + line ≤100 字 + +后续选项(choices)——每次**必须**给出 2 个选项: +- 选项应**承接刚才的互动**,给玩家自然的下一步 +- 至少一个选项应能推动剧情前进(如"继续追问"、"走过去看看"、"做出某个决定") +- label:玩家看到的选项文字(≤15字) +- effect:描述选这个选项后会发生什么(供下一个编剧参考) + 文本风格约束: - narration / line 用中文,**纯净可显示文本**,不要写 (叹气)(语速快) 这类配音标注 -- narration 与 line 加起来 ≤100 字 - 不要打破当前场景的物理状态(玩家仍在原地) -- 不要生成选项或下一步指引 —— 玩家点击会自然回到原 beat - 内容要"有所得"——一个新细节、一丝潜台词、一次真实的交流(show, don't tell) - 白描为主:聚焦可观察的五感与物理特征,以角色的动作/神态本身传递情绪,不要以作者角度解释或议论;不写角色眼神/语气里的情绪(这些从台词与动作中自行体会) @@ -604,13 +613,16 @@ speaker 字段允许的取值**只有两种**(与主路径 Writer 一致 — P 必须输出严格 JSON: { - "narration": "...", - "speaker": "...", - "line": "...", - "lineDelivery": "..." + "beats": [ + { "narration": "...", "speaker": "...", "line": "...", "lineDelivery": "..." 
} + ], + "choices": [ + { "label": "选项文字", "effect": "选此选项后的剧情走向" }, + { "label": "选项文字", "effect": "选此选项后的剧情走向" } + ] } -narration/speaker/line/lineDelivery 都可为空字符串。不要输出 JSON 以外的任何文本。`; +不要输出 JSON 以外的任何文本。`; export function buildInsertBeatUserMessage( session: Session, @@ -655,7 +667,7 @@ export function buildInsertBeatUserMessage( } parts.push(`\n玩家此刻的自由动作:${freeformAction}`); - parts.push("\n请生成一个有实质回应的 beat,严格以 JSON 格式返回。"); + parts.push("\n请生成 beat(1-3 个)和 2 个后续选项,严格以 JSON 格式返回。"); const langDirective = buildLanguageDirective(session.language); if (langDirective) parts.push(langDirective); return parts.join("\n"); @@ -670,11 +682,12 @@ export function buildInsertBeatUserMessage( export const VISION_SYSTEM_PROMPT = `你是视觉理解助手。玩家在视觉小说的背景图上点击了红色圆点位置(HTML 上的选项按钮不会走到你这里)。你的任务是: 1. 看清红点指向画面里的什么(物件、角色、空间、远处的方向) 2. 推断玩家想干什么 -3. 判断这个动作是「场内探索」(不该换图)还是「场景切换」(要换图) +3. 判断这个动作是「场内探索」还是「场景切换」 判断准则: -- "insert-beat"(场内探索):观察画面里某个细节、自言自语、和当前角色继续互动、看一眼某个物件 -- "change-scene"(场景切换):走向画面深处的门 / 走廊、转头看向新方向(视角变了)、点了远处的另一个空间、暗示时间跳跃的物件(如时钟) +- "change-scene"(场景切换):走向画面深处的门 / 走廊、转头看向新方向(视角变了)、点了远处的另一个空间、暗示时间跳跃的物件(如时钟)、调查某个物件/线索导致剧情发展、与角色进行有实质影响的互动 +- "insert-beat"(场内探索):**仅限**纯粹的观察——看一眼某个无剧情意义的装饰、环顾四周 +- 拿不准时偏向 "change-scene"——玩家主动点击画面说明想要推进剧情 必须输出严格 JSON: { @@ -704,9 +717,9 @@ export const FREEFORM_CLASSIFY_SYSTEM = `你是交互视觉小说的意图分类 2. "change-scene":玩家想去别的地方、做出重大决定、推动剧情到新阶段 → 切换到全新场景 判断准则: -- 大多数对话类输入(问问题、说一句话、对角色做出反应)→ "insert-beat" -- 明确要离开当前场景、去别的地方、跳过时间、做出改变人物关系的重大决定 → "change-scene" -- 拿不准时偏向 "insert-beat"(场内互动成本低,体验更流畅) +- "change-scene":大多数主动输入——问问题、说一句话、做一个动作、对角色做出反应、想去别的地方、做出决定、推动剧情 → 玩家花精力打字说明想让故事前进 +- "insert-beat":**仅限**纯粹的环境观察或无实际影响的自言自语 +- 拿不准时偏向 "change-scene"——玩家主动输入说明想要推进剧情 必须输出严格 JSON: { diff --git a/lib/engine/vision.ts b/lib/engine/vision.ts index 45cbde0..08fdbd8 100644 --- a/lib/engine/vision.ts +++ b/lib/engine/vision.ts @@ -27,7 +27,7 @@ export async function interpret( }>(raw); const classify: VisionClassify = - parsed.classify === "change-scene" ? 
"change-scene" : "insert-beat"; + parsed.classify === "insert-beat" ? "insert-beat" : "change-scene"; return { intent: { diff --git a/lib/types/index.ts b/lib/types/index.ts index 4d97cd1..6ea0eaa 100644 --- a/lib/types/index.ts +++ b/lib/types/index.ts @@ -695,8 +695,19 @@ export type InsertBeatPartial = { lineDelivery?: string; }; +/** Multi-beat response: 1-3 beats + optional follow-up choices. */ +export type InsertBeatMulti = { + beats: InsertBeatPartial[]; + /** Follow-up choices shown after the last beat (max 2). */ + choices?: { label: string; effect: string }[]; +}; + export type InsertBeatResponse = { partial: InsertBeatPartial; + /** Additional beats beyond the first (for richer insert-beat interactions). */ + extraBeats?: InsertBeatPartial[]; + /** Follow-up choices shown after the last inserted beat. */ + followUpChoices?: { label: string; effect: string }[]; characters: Character[]; };