Merge pull request #106 from zonghaoyuan/feat/freeform-always-new-scene

feat(play): freeform input always generates new scene + enhanced insert-beat
This commit is contained in:
Zonghao Yuan
2026-06-24 19:40:04 +08:00
committed by GitHub
6 changed files with 135 additions and 151 deletions
+42 -85
View File
@@ -34,7 +34,6 @@ import {
startSession, startSession,
requestScene, requestScene,
visionDecide, visionDecide,
classifyFreeform,
requestInsertBeat, requestInsertBeat,
getTtsProvider, getTtsProvider,
AuthRequiredError, AuthRequiredError,
@@ -2250,68 +2249,12 @@ function PlayInner() {
setPhase("vision-thinking"); setPhase("vision-thinking");
try { try {
const decision = await classifyFreeform({ // Always generate a new scene for freeform text input — the player
session, // typed something, so they expect the story to move forward.
freeformText: text,
});
if (decision.classify === "insert-beat") {
// Interactive beat: NPC responds to the player's action, scene stays
setPhase("inserting-beat");
const { partial, characters: insertChars } = await requestInsertBeat({
session,
freeformAction: decision.freeformAction,
clientTts: !!byoTtsRef.current,
});
const fromBeatId =
currentBeatRef.current?.id ?? currentScene.entryBeatId;
const newBeatId = `b_ins_${Date.now()}_${Math.random()
.toString(36)
.slice(2, 6)}`;
const newBeat: Beat = {
id: newBeatId,
narration: partial.narration,
speaker: partial.speaker,
line: partial.line,
lineDelivery: partial.lineDelivery,
next: { type: "continue", nextBeatId: fromBeatId },
};
const patched: Scene = {
...currentScene,
beats: [...currentScene.beats, newBeat],
};
const nextVisited = [...visitedBeatsRef.current, newBeatId];
visitedBeatsRef.current = nextVisited;
const nextSession: Session = {
...session,
history: session.history.map((h, i, arr) =>
i === arr.length - 1 ? { ...h, scene: patched, visitedBeatIds: nextVisited } : h,
),
characters: insertChars,
};
setSession(nextSession);
setCurrentScene(patched);
setCurrentBeatId(newBeatId);
if (newBeat.speaker && newBeat.line) {
void fetchBeatAudio(nextSession, {
id: newBeatId,
speaker: newBeat.speaker,
line: newBeat.line,
lineDelivery: newBeat.lineDelivery,
});
}
setLastExitLabel(decision.freeformAction);
setPhase("ready");
return;
}
// change-scene path
const visited = [...visitedBeatsRef.current]; const visited = [...visitedBeatsRef.current];
const exit: SceneExit = { const exit: SceneExit = {
kind: "freeform", kind: "freeform",
action: decision.freeformAction, action: text,
}; };
clearPool(poolRef.current); clearPool(poolRef.current);
@@ -2337,7 +2280,7 @@ function PlayInner() {
promise, promise,
exit, exit,
visited, visited,
decision.freeformAction, text,
() => onFreeformInput(text), () => onFreeformInput(text),
{ kind: "freeform", text }, { kind: "freeform", text },
); );
@@ -2367,7 +2310,7 @@ function PlayInner() {
if (decision.classify === "insert-beat") { if (decision.classify === "insert-beat") {
setPhase("inserting-beat"); setPhase("inserting-beat");
const { partial, characters: insertChars } = await requestInsertBeat({ const { partial, extraBeats, characters: insertChars } = await requestInsertBeat({
session, session,
freeformAction: decision.intent.freeformAction, freeformAction: decision.intent.freeformAction,
clientTts: !!byoTtsRef.current, clientTts: !!byoTtsRef.current,
@@ -2375,42 +2318,56 @@ function PlayInner() {
const fromBeatId = const fromBeatId =
currentBeatRef.current?.id ?? currentScene.entryBeatId; currentBeatRef.current?.id ?? currentScene.entryBeatId;
const newBeatId = `b_ins_${Date.now()}_${Math.random() const allPartials = [partial, ...(extraBeats ?? [])];
.toString(36) const newBeats: Beat[] = [];
.slice(2, 6)}`; const newBeatIds: string[] = [];
const newBeat: Beat = {
id: newBeatId, for (const [i, p] of allPartials.entries()) {
narration: partial.narration, const id = `b_ins_${Date.now()}_${Math.random().toString(36).slice(2, 6)}_${i}`;
speaker: partial.speaker, newBeatIds.push(id);
line: partial.line, newBeats.push({
lineDelivery: partial.lineDelivery, id,
next: { type: "continue", nextBeatId: fromBeatId }, narration: p.narration,
}; speaker: p.speaker,
line: p.line,
lineDelivery: p.lineDelivery,
next: { type: "continue", nextBeatId: "" },
});
}
// Chain beats: each points to the next; last one loops back to original beat
for (let i = 0; i < newBeats.length - 1; i++) {
newBeats[i]!.next = { type: "continue", nextBeatId: newBeatIds[i + 1]! };
}
newBeats[newBeats.length - 1]!.next = { type: "continue", nextBeatId: fromBeatId };
const patched: Scene = { const patched: Scene = {
...currentScene, ...currentScene,
beats: [...currentScene.beats, newBeat], beats: [...currentScene.beats, ...newBeats],
}; };
const nextVisited = [...visitedBeatsRef.current, ...newBeatIds];
visitedBeatsRef.current = nextVisited;
const nextSession: Session = { const nextSession: Session = {
...session, ...session,
history: session.history.map((h, i, arr) => history: session.history.map((h, i, arr) =>
i === arr.length - 1 ? { ...h, scene: patched } : h, i === arr.length - 1 ? { ...h, scene: patched, visitedBeatIds: nextVisited } : h,
), ),
characters: insertChars, characters: insertChars,
}; };
setSession(nextSession); setSession(nextSession);
setCurrentScene(patched); setCurrentScene(patched);
setCurrentBeatId(newBeatId); setCurrentBeatId(newBeatIds[0]!);
// Insert-beat doesn't change scene.id, so the scene effect won't
// re-fire — manually kick off the audio fetch for the new beat. for (const nb of newBeats) {
if (newBeat.speaker && newBeat.line) { if (nb.speaker && nb.line) {
void fetchBeatAudio(nextSession, { void fetchBeatAudio(nextSession, {
id: newBeatId, id: nb.id,
speaker: newBeat.speaker, speaker: nb.speaker,
line: newBeat.line, line: nb.line,
lineDelivery: newBeat.lineDelivery, lineDelivery: nb.lineDelivery,
}); });
}
} }
setLastExitLabel(decision.intent.freeformAction); setLastExitLabel(decision.intent.freeformAction);
setPhase("ready"); setPhase("ready");
+40 -21
View File
@@ -6,6 +6,7 @@ import type {
Character, Character,
CharacterIntent, CharacterIntent,
EngineConfig, EngineConfig,
InsertBeatMulti,
InsertBeatPartial, InsertBeatPartial,
ProviderConfig, ProviderConfig,
Scene, Scene,
@@ -562,17 +563,32 @@ export async function directScene(
} }
// ────────────────────────────────────────────────────────────────────── // ──────────────────────────────────────────────────────────────────────
// directInsertBeat — single-agent path for vision-driven in-scene // directInsertBeat — single-agent path for in-scene exploration.
// exploration. Generates ONE transient beat with NO new image, NO new // Generates 1-3 beats with NO new image, NO new characters, plus
// characters. Multi-agent pipeline doesn't apply here (no rendering, no // follow-up choices so the player isn't dumped back to the old options.
// character introduction allowed by the prompt).
// ────────────────────────────────────────────────────────────────────── // ──────────────────────────────────────────────────────────────────────
/**
 * Coerces one loosely-typed beat object (as parsed from model JSON) into an
 * `InsertBeatPartial`.
 *
 * Rules:
 * - Fields that are not strings, or are empty after trimming, count as absent.
 * - A beat must carry displayable text (narration or line). A beat that has
 *   only a speaker name has nothing to show, so it is rejected just like an
 *   entirely empty beat; this lets the caller substitute its own fallback.
 * - A line without a speaker is folded into the narration (joined with "\n")
 *   so the text is not lost.
 * - `lineDelivery` is kept only when there is a line and the normalized
 *   speaker is not `POV_DISPLAY_NAME`.
 *
 * @param raw - Untrusted key/value object for a single beat.
 * @returns The sanitized beat, or `null` when there is no narration and no line.
 */
function coerceBeatPartial(raw: Record<string, unknown>): InsertBeatPartial | null {
  // Trimmed, non-empty string or undefined; anything non-string is ignored.
  const text = (value: unknown): string | undefined =>
    typeof value === "string" ? value.trim() || undefined : undefined;

  const narration = text(raw.narration);
  const line = text(raw.line);

  // Nothing displayable: a bare speaker name must not become an empty beat.
  if (!narration && !line) return null;

  const rawSpeaker = text(raw.speaker);
  const speaker = rawSpeaker ? normalizeSpeakerName(rawSpeaker) : undefined;

  // Orphaned line (no usable speaker): promote it into the narration.
  if (line && !speaker) {
    return { narration: [narration, line].filter(Boolean).join("\n") || undefined };
  }

  // Delivery hints only apply to a spoken line from a non-POV speaker.
  const lineDelivery =
    line && speaker !== POV_DISPLAY_NAME ? text(raw.lineDelivery) : undefined;

  return { narration, speaker, line, lineDelivery };
}
export async function directInsertBeat( export async function directInsertBeat(
config: ProviderConfig, config: ProviderConfig,
session: Session, session: Session,
freeformAction: string, freeformAction: string,
): Promise<InsertBeatPartial> { ): Promise<InsertBeatPartial[]> {
const raw = await chat( const raw = await chat(
config, config,
[ [
@@ -585,22 +601,25 @@ export async function directInsertBeat(
{ temperature: 0.9, tag: "insert-beat" }, { temperature: 0.9, tag: "insert-beat" },
); );
const parsed = parseJsonLoose<InsertBeatPartial>(raw); const parsed = parseJsonLoose<InsertBeatMulti & InsertBeatPartial>(raw);
const narration = parsed.narration?.trim() || undefined; // Multi-beat format: { beats: [...] }
const rawSpeaker = parsed.speaker?.trim() || undefined; if (Array.isArray(parsed.beats) && parsed.beats.length > 0) {
// Pattern B (mirrors Writer): normalize POV variants → "你"; NPCs pass through. const beats = parsed.beats
const speaker = rawSpeaker ? normalizeSpeakerName(rawSpeaker) : undefined; .slice(0, 3)
const line = parsed.line?.trim() || undefined; .map((b) =>
// lineDelivery is only meaningful for NPC speakers (TTS). For POV ("你") b && typeof b === "object"
// TTS is intentionally skipped on the client, so lineDelivery is dropped. ? coerceBeatPartial(b as Record<string, unknown>)
const lineDelivery = : null,
line && speaker !== POV_DISPLAY_NAME )
? parsed.lineDelivery?.trim() || undefined .filter((b): b is InsertBeatPartial => b !== null);
: undefined; if (beats.length === 0) {
beats.push({ narration: "(你停下脚步,环视片刻。)" });
if (!narration && !speaker && !line) { }
return { narration: "(你停下脚步,环视片刻。)" }; return beats;
} }
return { narration, speaker, line, lineDelivery };
// Legacy single-beat fallback
const single = coerceBeatPartial(parsed as Record<string, unknown>);
return [single ?? { narration: "(你停下脚步,环视片刻。)" }];
} }
+26 -29
View File
@@ -196,45 +196,42 @@ export async function requestInsertBeat(
): Promise<InsertBeatResponse> { ): Promise<InsertBeatResponse> {
const tTotal = Date.now(); const tTotal = Date.now();
const partial = await directInsertBeat( const result = await directInsertBeat(
config.text, config.text,
req.session, req.session,
req.freeformAction, req.freeformAction,
); );
// INSERT_BEAT prompt forbids new NPCs — promote disallowed-speaker lines // Guard every beat: promote unregistered speakers to narration.
// to narration so the player still sees the text (the client only renders const guardedBeats = result.map((partial) => {
// `line` when there is a `speaker`). if (
// partial.speaker &&
// Exception (Pattern B): speaker = "你" is the player speaking. No partial.speaker !== "你" &&
// Character record exists for "你" (intentional — TTS is skipped), so we !req.session.characters.some((c) => c.name === partial.speaker)
// must NOT demote it; the client renders the dialog box correctly. ) {
// directInsertBeat already normalized POV variants to "你" before this console.warn(
// guard, so a literal "你" here is always Pattern B player dialog. `[insert-beat] unregistered speaker "${partial.speaker}" ignored`,
if ( );
partial.speaker && return {
partial.speaker !== "你" && narration:
!req.session.characters.some((c) => c.name === partial.speaker) [partial.narration, partial.line].filter(Boolean).join("\n") || undefined,
) {
console.warn(
`[insert-beat] unregistered speaker "${partial.speaker}" ignored`,
);
const promotedNarration =
[partial.narration, partial.line].filter(Boolean).join("\n") || undefined;
tlog("[insert-beat] TOTAL", tTotal);
return {
partial: {
narration: promotedNarration,
speaker: undefined, speaker: undefined,
line: undefined, line: undefined,
lineDelivery: undefined, lineDelivery: undefined,
}, };
characters: req.session.characters, }
}; return partial;
} });
const first = guardedBeats[0] ?? { narration: "(你停下脚步,环视片刻。)" };
const extra = guardedBeats.slice(1);
tlog("[insert-beat] TOTAL", tTotal); tlog("[insert-beat] TOTAL", tTotal);
return { partial, characters: req.session.characters }; return {
partial: first,
extraBeats: extra.length > 0 ? extra : undefined,
characters: req.session.characters,
};
} }
// ────────────────────────────────────────────────────────────────────── // ──────────────────────────────────────────────────────────────────────
+19 -15
View File
@@ -572,18 +572,22 @@ STRICT RULES:
// Single-agent path; no character design / no rendering involved. // Single-agent path; no character design / no rendering involved.
// ────────────────────────────────────────────────────────────────────── // ──────────────────────────────────────────────────────────────────────
export const INSERT_BEAT_SYSTEM = `你是视觉小说编剧。玩家在当前场景内做了一个自由动作(可能是点击画面中的某个物件/角色,也可能是主动输入了一句话/动作)。请基于此动作,写出**个有实质内容的 beat**。 export const INSERT_BEAT_SYSTEM = `你是视觉小说编剧。玩家在当前场景内做了一个自由动作(可能是点击画面中的某个物件/角色,也可能是主动输入了一句话/动作)。请基于此动作,写出**1-3 个有实质内容的 beat**。
核心原则——**玩家的动作必须得到回应**: 核心原则——**玩家的动作必须得到回应**:
- 如果当前场景有 NPC 在场,NPC **必须对玩家的动作做出反应**(说话、表情变化、动作回应)。用 narration 描述玩家的动作,用 speaker + line 写 NPC 的回应。 - 如果当前场景有 NPC 在场,NPC **必须对玩家的动作做出反应**(说话、表情变化、动作回应)。用 narration 描述玩家的动作,用 speaker + line 写 NPC 的回应。
- 如果场景中没有 NPC(纯环境),可以用 narration 描述玩家的观察/发现,给玩家一个新细节或情绪波动。 - 如果场景中没有 NPC(纯环境),可以用 narration 描述玩家的观察/发现,给玩家一个新细节或情绪波动。
- 不要写"你想做什么但没做"这种无意义的犹豫——玩家已经做了,世界要有反馈。 - 不要写"你想做什么但没做"这种无意义的犹豫——玩家已经做了,世界要有反馈。
beat 数量指引:
- 简单观察/短回应:1 个 beat 即可
- 有来有回的对话/有展开的互动:2-3 个 beat,让反应更有层次
- 每个 beat 的 narration + line ≤100 字
文本风格约束: 文本风格约束:
- narration / line 用中文,**纯净可显示文本**,不要写 (叹气)(语速快) 这类配音标注 - narration / line 用中文,**纯净可显示文本**,不要写 (叹气)(语速快) 这类配音标注
- narration 与 line 加起来 ≤100 字
- 不要打破当前场景的物理状态(玩家仍在原地) - 不要打破当前场景的物理状态(玩家仍在原地)
- 不要生成选项或下一步指引 —— 玩家点击会自然回到原 beat - 不要生成选项或下一步指引——播完后玩家会自然回到原来的选项
- 内容要"有所得"——一个新细节、一丝潜台词、一次真实的交流(show, don't tell - 内容要"有所得"——一个新细节、一丝潜台词、一次真实的交流(show, don't tell
- 白描为主:聚焦可观察的五感与物理特征,以角色的动作/神态本身传递情绪,不要以作者角度解释或议论;不写角色眼神/语气里的情绪(这些从台词与动作中自行体会) - 白描为主:聚焦可观察的五感与物理特征,以角色的动作/神态本身传递情绪,不要以作者角度解释或议论;不写角色眼神/语气里的情绪(这些从台词与动作中自行体会)
@@ -604,13 +608,12 @@ speaker 字段允许的取值**只有两种**(与主路径 Writer 一致 — P
必须输出严格 JSON 必须输出严格 JSON
{ {
"narration": "...", "beats": [
"speaker": "...", { "narration": "...", "speaker": "...", "line": "...", "lineDelivery": "..." }
"line": "...", ]
"lineDelivery": "..."
} }
narration/speaker/line/lineDelivery 都可为空字符串。不要输出 JSON 以外的任何文本。`; 不要输出 JSON 以外的任何文本。`;
export function buildInsertBeatUserMessage( export function buildInsertBeatUserMessage(
session: Session, session: Session,
@@ -655,7 +658,7 @@ export function buildInsertBeatUserMessage(
} }
parts.push(`\n玩家此刻的自由动作:${freeformAction}`); parts.push(`\n玩家此刻的自由动作:${freeformAction}`);
parts.push("\n请生成一个有实质回应的 beat,严格以 JSON 格式返回。"); parts.push("\n请生成 1-3 个 beat,严格以 JSON 格式返回。");
const langDirective = buildLanguageDirective(session.language); const langDirective = buildLanguageDirective(session.language);
if (langDirective) parts.push(langDirective); if (langDirective) parts.push(langDirective);
return parts.join("\n"); return parts.join("\n");
@@ -670,11 +673,12 @@ export function buildInsertBeatUserMessage(
export const VISION_SYSTEM_PROMPT = `你是视觉理解助手。玩家在视觉小说的背景图上点击了红色圆点位置(HTML 上的选项按钮不会走到你这里)。你的任务是: export const VISION_SYSTEM_PROMPT = `你是视觉理解助手。玩家在视觉小说的背景图上点击了红色圆点位置(HTML 上的选项按钮不会走到你这里)。你的任务是:
1. 看清红点指向画面里的什么(物件、角色、空间、远处的方向) 1. 看清红点指向画面里的什么(物件、角色、空间、远处的方向)
2. 推断玩家想干什么 2. 推断玩家想干什么
3. 判断这个动作是「场内探索」(不该换图)还是「场景切换」(要换图) 3. 判断这个动作是「场内探索」还是「场景切换」
判断准则: 判断准则:
- "insert-beat"(场内探索):观察画面里某个细节、自言自语、和当前角色继续互动、看一眼某个物件 - "change-scene"(场景切换):走向画面深处的门 / 走廊、转头看向新方向(视角变了)、点了远处的另一个空间、暗示时间跳跃的物件(如时钟)、调查某个物件/线索导致剧情发展、与角色进行有实质影响的互动
- "change-scene"(场景切换):走向画面深处的门 / 走廊、转头看向新方向(视角变了)、点了远处的另一个空间、暗示时间跳跃的物件(如时钟) - "insert-beat"(场内探索):**仅限**纯粹的观察——看一眼某个无剧情意义的装饰、环顾四周
- 拿不准时偏向 "change-scene"——玩家主动点击画面说明想要推进剧情
必须输出严格 JSON 必须输出严格 JSON
{ {
@@ -704,9 +708,9 @@ export const FREEFORM_CLASSIFY_SYSTEM = `你是交互视觉小说的意图分类
2. "change-scene":玩家想去别的地方、做出重大决定、推动剧情到新阶段 → 切换到全新场景 2. "change-scene":玩家想去别的地方、做出重大决定、推动剧情到新阶段 → 切换到全新场景
判断准则: 判断准则:
- 大多数对话类输入(问问题、说一句话、对角色做出反应)→ "insert-beat" - "change-scene":大多数主动输入——问问题、说一句话、做一个动作、对角色做出反应、想去别的地方、做出决定、推动剧情 → 玩家花精力打字说明想让故事前进
- 明确要离开当前场景、去别的地方、跳过时间、做出改变人物关系的重大决定 → "change-scene" - "insert-beat"**仅限**纯粹的环境观察或无实际影响的自言自语
- 拿不准时偏向 "insert-beat"(场内互动成本低,体验更流畅) - 拿不准时偏向 "change-scene"——玩家主动输入说明想要推进剧情
必须输出严格 JSON 必须输出严格 JSON
{ {
+1 -1
View File
@@ -27,7 +27,7 @@ export async function interpret(
}>(raw); }>(raw);
const classify: VisionClassify = const classify: VisionClassify =
parsed.classify === "change-scene" ? "change-scene" : "insert-beat"; parsed.classify === "insert-beat" ? "insert-beat" : "change-scene";
return { return {
intent: { intent: {
+7
View File
@@ -695,8 +695,15 @@ export type InsertBeatPartial = {
lineDelivery?: string; lineDelivery?: string;
}; };
/**
 * Envelope for a multi-beat insert-beat payload: an object with a single
 * `beats` array of partial beats. Intended to hold 1-3 entries; the type
 * itself does not bound the array length.
 */
export type InsertBeatMulti = {
  beats: Array<InsertBeatPartial>;
};
export type InsertBeatResponse = { export type InsertBeatResponse = {
partial: InsertBeatPartial; partial: InsertBeatPartial;
/** Additional beats beyond the first (for richer insert-beat interactions). */
extraBeats?: InsertBeatPartial[];
characters: Character[]; characters: Character[];
}; };