feat: scene/beat architecture — decouple dialogue from image generation (#2)

Replace the one-image-per-interaction model with scenes that hold multiple
dialogue beats. The image regenerates only on scene-change actions; tapping
through beats and in-scene choices are instant and zero-network.

Squashed from #2:
- feat: scene/beat architecture — decouple dialogue from image generation
- fix: harden LLM-output parsing, prefetch lifecycle, and typewriter (PR review)
- fix: dedupe beat ids; fallback narration on empty insert-beat (PR review #2)

🤖 Generated with [Claude Code](https://claude.com/claude-code)
This commit is contained in:
Zonghao Yuan
2026-05-28 15:20:12 +08:00
committed by GitHub
parent d116c2e3b5
commit d1f13d51a3
13 changed files with 1275 additions and 402 deletions
+46 -36
View File
@@ -1,8 +1,9 @@
import type {
ClickIntent,
EngineConfig,
InteractRequest,
InteractResponse,
InsertBeatRequest,
InsertBeatResponse,
SceneRequest,
SceneResponse,
Session,
StartRequest,
StartResponse,
@@ -10,7 +11,7 @@ import type {
VisionResponse,
} from "@yume/types";
import { annotateClick } from "./annotate";
import { direct } from "./director";
import { directInsertBeat, directScene } from "./director";
import { render } from "./renderer";
import { interpret } from "./vision";
@@ -18,6 +19,10 @@ function newSessionId(): string {
return `s_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`;
}
// ──────────────────────────────────────────────────────────────────────
// startSession — first scene + image
// ──────────────────────────────────────────────────────────────────────
export async function startSession(
config: EngineConfig,
req: StartRequest,
@@ -30,51 +35,56 @@ export async function startSession(
history: [],
};
const frame = await direct(config.text, session);
const imageBase64 = await render(config.image, frame, session.styleGuide);
const scene = await directScene(config.text, session);
const imageBase64 = await render(config.image, scene, session.styleGuide);
return {
sessionId: session.id,
frame,
scene,
imageBase64,
};
}
export async function visionTurn(
// ──────────────────────────────────────────────────────────────────────
// requestScene — generate the NEXT scene + image.
// Frontend passes a session whose latest history entry has `exit` set.
// Also used for prefetch speculation (frontend synthesizes the exit).
// ──────────────────────────────────────────────────────────────────────
export async function requestScene(
config: EngineConfig,
req: SceneRequest,
): Promise<SceneResponse> {
const scene = await directScene(config.text, req.session);
const imageBase64 = await render(config.image, scene, req.session.styleGuide);
return { scene, imageBase64 };
}
// ──────────────────────────────────────────────────────────────────────
// visionDecide — interprets a background click into intent + classify.
// ──────────────────────────────────────────────────────────────────────
export async function visionDecide(
config: EngineConfig,
req: VisionRequest,
): Promise<VisionResponse> {
const annotated = await annotateClick(req.prevImageBase64, req.click);
const lastFrame = req.session.history.at(-1)?.frame;
const uiElements = lastFrame?.uiElements ?? [];
const intent = await interpret(config.vision, annotated, uiElements);
return { intent };
const current = req.session.history.at(-1)?.scene ?? null;
return interpret(config.vision, annotated, current);
}
export async function takeTurn(
// ──────────────────────────────────────────────────────────────────────
// requestInsertBeat — generates a transient in-scene beat (no image regen)
// ──────────────────────────────────────────────────────────────────────
export async function requestInsertBeat(
config: EngineConfig,
req: InteractRequest,
): Promise<InteractResponse> {
const updatedSession: Session = {
...req.session,
history: req.session.history.map((entry, idx, arr) =>
idx === arr.length - 1
? { ...entry, click: req.click, intent: req.intent }
: entry,
),
};
const nextFrame = await direct(config.text, updatedSession);
const nextImage = await render(
config.image,
nextFrame,
updatedSession.styleGuide,
req: InsertBeatRequest,
): Promise<InsertBeatResponse> {
const partial = await directInsertBeat(
config.text,
req.session,
req.freeformAction,
);
return {
session: updatedSession,
frame: nextFrame,
imageBase64: nextImage,
intent: req.intent,
};
return { partial };
}