d1f13d51a3
Replace the one-image-per-interaction model with scenes that hold multiple dialogue beats. The image regenerates only on scene-change actions; tapping through beats and in-scene choices are instant and zero-network. Squashed from #2: - feat: scene/beat architecture — decouple dialogue from image generation - fix: harden LLM-output parsing, prefetch lifecycle, and typewriter (PR review) - fix: dedupe beat ids; fallback narration on empty insert-beat (PR review #2) 🤖 Generated with [Claude Code](https://claude.com/claude-code)
40 lines
1.1 KiB
TypeScript
40 lines
1.1 KiB
TypeScript
import { interpretClick } from "@yume/ai-client";
|
|
import type {
|
|
ClickIntent,
|
|
ProviderConfig,
|
|
Scene,
|
|
VisionClassify,
|
|
} from "@yume/types";
|
|
import { parseJsonLoose } from "./jsonParser";
|
|
import { VISION_SYSTEM_PROMPT, buildVisionUserPrompt } from "./prompts";
|
|
|
|
/**
 * Normalized result of asking the vision model what a click on the annotated
 * image meant.
 */
export type VisionInterpretation = {
  /** The interpreted click: a free-form action description plus the model's reasoning. */
  intent: ClickIntent;
  /** How the click should be handled; `interpret` yields "change-scene" or "insert-beat". */
  classify: VisionClassify;
};
|
|
|
|
/**
 * Returns `value` trimmed when it is a string, otherwise an empty string.
 * Model output is untrusted JSON, so a field declared as text may arrive as a
 * number, object, array, or be missing entirely.
 */
function trimmedString(value: unknown): string {
  return typeof value === "string" ? value.trim() : "";
}

/**
 * Asks the vision model how to interpret a click on the annotated image and
 * normalizes its JSON reply into a {@link VisionInterpretation}.
 *
 * Every field of the reply is validated at runtime: non-string or missing
 * fields are treated as empty rather than throwing, and `classify` is matched
 * after trimming and lower-casing. Errors raised by `interpretClick` or
 * `parseJsonLoose` are not caught here and propagate to the caller.
 *
 * @param config - Provider configuration, forwarded unchanged to `interpretClick`.
 * @param annotatedImageBase64 - Annotated image payload, forwarded unchanged to `interpretClick`.
 * @param scene - Current scene (or `null`), used only to build the user prompt.
 * @returns The click intent plus a classification that is always either
 *   "change-scene" or "insert-beat".
 */
export async function interpret(
  config: ProviderConfig,
  annotatedImageBase64: string,
  scene: Scene | null,
): Promise<VisionInterpretation> {
  // The system prompt and the scene-specific prompt are sent as one string.
  const userPrompt = `${VISION_SYSTEM_PROMPT}\n\n${buildVisionUserPrompt(scene)}`;
  const raw = await interpretClick(config, annotatedImageBase64, userPrompt);

  // Fields are typed `unknown` on purpose: the declared shape is only what we
  // asked the model for, not what it is guaranteed to return.
  const parsed = parseJsonLoose<{
    freeformAction?: unknown;
    classify?: unknown;
    reasoning?: unknown;
  }>(raw);

  // Anything other than an explicit "change-scene" falls back to "insert-beat".
  const classify: VisionClassify =
    trimmedString(parsed?.classify).toLowerCase() === "change-scene"
      ? "change-scene"
      : "insert-beat";

  return {
    intent: {
      // `||` (not `??`) so that an empty or whitespace-only action also gets
      // the placeholder ("the player tapped the image, but the intent is unclear").
      freeformAction:
        trimmedString(parsed?.freeformAction) || "玩家点了画面,但意图不明",
      reasoning: trimmedString(parsed?.reasoning),
    },
    classify,
  };
}
|