Files
infiplot-web/packages/engine/src/vision.ts
T
Zonghao Yuan d1f13d51a3 feat: scene/beat architecture — decouple dialogue from image generation (#2)
Replace the one-image-per-interaction model with scenes that hold multiple
dialogue beats. The image regenerates only on scene-change actions; tapping
through beats and in-scene choices are instant and zero-network.

Squashed from #2:
- feat: scene/beat architecture — decouple dialogue from image generation
- fix: harden LLM-output parsing, prefetch lifecycle, and typewriter (PR review)
- fix: dedupe beat ids; fallback narration on empty insert-beat (PR review #2)

🤖 Generated with [Claude Code](https://claude.com/claude-code)
2026-05-28 15:20:12 +08:00

40 lines
1.1 KiB
TypeScript

import { interpretClick } from "@yume/ai-client";
import type {
ClickIntent,
ProviderConfig,
Scene,
VisionClassify,
} from "@yume/types";
import { parseJsonLoose } from "./jsonParser";
import { VISION_SYSTEM_PROMPT, buildVisionUserPrompt } from "./prompts";
/**
 * Outcome of interpreting a player's click on an annotated scene image,
 * as produced by `interpret`.
 */
export type VisionInterpretation = {
  /**
   * How the click should be handled. `interpret` emits "change-scene" only
   * when the model explicitly asks for it, and "insert-beat" otherwise.
   */
  classify: VisionClassify;
  /** What the player was trying to do, as described by the vision model. */
  intent: ClickIntent;
};
/**
 * Asks the vision model what the player meant by clicking on the annotated
 * image and normalises the reply into a `VisionInterpretation`.
 *
 * The system prompt and the scene-specific prompt are joined into a single
 * prompt string (separated by a blank line) and passed to `interpretClick`.
 * The raw reply is parsed with `parseJsonLoose` and then validated field by
 * field, because the model's output is untrusted: a field may be missing,
 * have the wrong type, or the whole payload may not be an object at all.
 *
 * @param config - Provider configuration forwarded to `interpretClick`.
 * @param annotatedImageBase64 - The annotated image, forwarded unchanged.
 * @param scene - The current scene, or `null`; forwarded to `buildVisionUserPrompt`.
 * @returns The click intent plus its classification. `classify` is
 *   "change-scene" only when the model says so (ignoring case and surrounding
 *   whitespace) and "insert-beat" in every other case. `freeformAction` falls
 *   back to a fixed "intent unclear" message and `reasoning` to an empty
 *   string when the model supplies no usable text.
 * @throws Whatever `interpretClick` or `parseJsonLoose` throw; nothing is
 *   caught here.
 */
export async function interpret(
  config: ProviderConfig,
  annotatedImageBase64: string,
  scene: Scene | null,
): Promise<VisionInterpretation> {
  const userPrompt = `${VISION_SYSTEM_PROMPT}\n\n${buildVisionUserPrompt(scene)}`;
  const raw = await interpretClick(config, annotatedImageBase64, userPrompt);

  // The generic argument only describes the shape we *hope* for; nothing
  // checks it at runtime, so the result is treated as `unknown` from here on.
  const reply: unknown = parseJsonLoose<{
    freeformAction?: string;
    classify?: string;
    reasoning?: string;
  }>(raw);

  // A non-object payload (null, string, number, ...) is treated as "no fields"
  // instead of crashing on property access.
  const fields =
    typeof reply === "object" && reply !== null
      ? (reply as { freeformAction?: unknown; classify?: unknown; reasoning?: unknown })
      : {};

  // Only genuine strings are usable text. `?.trim()` alone would throw on a
  // number, array or object because optional chaining only guards null/undefined.
  const textOf = (value: unknown): string =>
    typeof value === "string" ? value.trim() : "";

  // Tolerate casing/whitespace noise in the label; anything that is not an
  // explicit scene change is the cheaper, no-regeneration "insert-beat".
  const classify: VisionClassify =
    textOf(fields.classify).toLowerCase() === "change-scene"
      ? "change-scene"
      : "insert-beat";

  return {
    intent: {
      // Fallback text means: "The player clicked the image, but the intent is unclear".
      freeformAction: textOf(fields.freeformAction) || "玩家点了画面,但意图不明",
      reasoning: textOf(fields.reasoning),
    },
    classify,
  };
}