feat: separate UI choices from AI image (bypass vision)

HTML choice buttons now call /api/interact directly, bypassing the ~4s Vision roundtrip. Free-form background clicks still go through Vision as before.
This commit is contained in:
Qi Chen
2026-05-25 20:47:33 +08:00
committed by GitHub
parent bf8f356e37
commit d116c2e3b5
3 changed files with 314 additions and 99 deletions
+92 -57
View File
@@ -131,6 +131,7 @@ function PlayInner() {
worldSetting: finalPayload.worldSetting,
styleGuide: finalPayload.styleGuide,
history: [{ frame: data.frame }],
characters: [],
});
setFrame(data.frame);
setImageBase64(data.imageBase64);
@@ -183,6 +184,82 @@ function PlayInner() {
};
}, [frame?.id, session?.id]);
// ── Shared result applier ────────────────────────────────────────────
/**
 * Awaits an interact response and commits it to component state.
 *
 * The last history entry of the returned session is re-stamped with the
 * real `clickIntent` (and `click`, when given) in place of whatever intent
 * it carried, then the response's frame is appended as a fresh history
 * entry. Finally the UI is advanced to the next turn and returned to the
 * "ready" phase.
 *
 * @param resultPromise Pending (or already settled) interact response.
 * @param clickIntent   Intent recorded on the last history entry and shown in the UI.
 * @param click         Optional click coordinates recorded alongside the intent.
 */
async function applyInteractResult(
  resultPromise: Promise<InteractResponse>,
  clickIntent: ClickIntent,
  click?: { x: number; y: number },
) {
  const {
    session: nextSession,
    frame: nextFrame,
    imageBase64: nextImage,
  } = await resultPromise;

  // Replace the intent on the final entry with the real click intent
  const tail = nextSession.history.length - 1;
  const restamped = nextSession.history.map((entry, position) => {
    if (position !== tail) return entry;
    return { ...entry, click, intent: clickIntent };
  });

  // The new frame becomes the latest history entry
  setSession({
    ...nextSession,
    history: [...restamped, { frame: nextFrame }],
  });
  setFrame(nextFrame);
  setImageBase64(nextImage);
  setIntent(clickIntent);
  setPendingClick(null);
  setTurnNum((t) => t + 1);
  setPhase("ready");
}
// ── HTML button click — bypasses Vision entirely ──────────────────────
/**
 * Handles a click on an HTML choice button.
 *
 * Builds the click intent directly from the button's id and label, so no
 * request to /api/vision is made. If a prefetched result exists for the
 * choice id it is applied as-is; otherwise in-flight prefetches are aborted
 * and /api/interact is called live. Any failure is surfaced through
 * `setError` and the phase is reset to "ready".
 *
 * @param choiceId Id of the selected choice; also the prefetch cache key.
 * @param label    Visible label of the selected choice.
 */
async function handleChoiceSelect(choiceId: string, label: string) {
  if (!session || phase !== "ready") return;

  setPhase("interacting");
  setIntent(null);

  const clickIntent: ClickIntent = {
    targetId: choiceId,
    targetLabel: label,
    reasoning: "direct-button-click",
  };
  const prefetched = prefetchRef.current[choiceId];

  try {
    if (prefetched) {
      // Cache hit — zero extra wait
      await applyInteractResult(prefetched, clickIntent);
      return;
    }

    // Cache miss — call interact directly (no Vision roundtrip)
    prefetchAbortRef.current?.abort();
    const response = await fetch("/api/interact", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ session, intent: clickIntent }),
    });
    if (!response.ok) {
      // Prefer the server-provided error message; fall back to the status text
      const payload = (await response.json().catch(() => ({}))) as {
        error?: string;
      };
      throw new Error(payload.error ?? response.statusText);
    }
    await applyInteractResult(
      response.json() as Promise<InteractResponse>,
      clickIntent,
    );
  } catch (e) {
    setError(String(e));
    setPendingClick(null);
    setPhase("ready");
  }
}
// ── Background / free-form click — still uses Vision ─────────────────
async function handleClick(click: { x: number; y: number }) {
if (phase !== "ready" || !session || !imageBase64) return;
setPhase("interacting");
@@ -192,15 +269,10 @@ function PlayInner() {
const cacheSnapshot = prefetchRef.current;
try {
// Step 1: Vision (~4s) — figure out what the user actually clicked
const visionRes = await fetch("/api/vision", {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({
session,
prevImageBase64: imageBase64,
click,
}),
body: JSON.stringify({ session, prevImageBase64: imageBase64, click }),
});
if (!visionRes.ok) {
const j = (await visionRes.json().catch(() => ({}))) as {
@@ -211,31 +283,13 @@ function PlayInner() {
const { intent: clickIntent } =
(await visionRes.json()) as VisionResponse;
// Step 2: Cache lookup
const cached = clickIntent.targetId
? cacheSnapshot[clickIntent.targetId]
: undefined;
let result: InteractResponse;
if (cached) {
// Cache hit — await the prefetched promise (mostly already resolved)
result = await cached;
// Overwrite the synthetic prefetch intent on history with the real one
const lastIdx = result.session.history.length - 1;
result = {
...result,
intent: clickIntent,
session: {
...result.session,
history: result.session.history.map((entry, idx) =>
idx === lastIdx
? { ...entry, click, intent: clickIntent }
: entry,
),
},
};
await applyInteractResult(cached, clickIntent, click);
} else {
// Cache miss (free-form click) — abort wasted prefetches, run live
prefetchAbortRef.current?.abort();
const liveRes = await fetch("/api/interact", {
method: "POST",
@@ -248,18 +302,12 @@ function PlayInner() {
};
throw new Error(j.error ?? liveRes.statusText);
}
result = (await liveRes.json()) as InteractResponse;
await applyInteractResult(
liveRes.json() as Promise<InteractResponse>,
clickIntent,
click,
);
}
// Apply the result: append new frame to history
const updatedHistory = [...result.session.history, { frame: result.frame }];
setSession({ ...result.session, history: updatedHistory });
setFrame(result.frame);
setImageBase64(result.imageBase64);
setIntent(clickIntent);
setPendingClick(null);
setTurnNum((t) => t + 1);
setPhase("ready");
} catch (e) {
setError(String(e));
setPendingClick(null);
@@ -295,8 +343,10 @@ function PlayInner() {
<PlayCanvas
imageBase64={imageBase64}
phase={phase}
frame={frame}
pendingClick={pendingClick}
onClick={handleClick}
onSelectChoice={handleChoiceSelect}
fullViewport
/>
</div>
@@ -326,37 +376,22 @@ function PlayInner() {
<PlayCanvas
imageBase64={imageBase64}
phase={phase}
frame={frame}
pendingClick={pendingClick}
onClick={handleClick}
onSelectChoice={handleChoiceSelect}
/>
<div className="mt-7 md:mt-9 max-w-md w-full text-center min-h-[64px] flex items-center justify-center">
<div className="mt-4 max-w-md w-full text-center min-h-[28px] flex items-center justify-center">
{phase === "loading-first" && (
<p className="text-[10px] smallcaps text-clay-500 animate-slow-pulse">
· · · · · ·
</p>
)}
{phase === "interacting" && (
<div className="flex flex-col items-center gap-2 animate-fade-in">
<p className="text-[10px] smallcaps text-clay-500 animate-slow-pulse">
AI · · · · · · ·
</p>
<p className="font-serif italic text-clay-400 text-xs">
·
</p>
</div>
)}
{phase === "ready" && intent?.targetLabel && (
<p className="font-serif italic text-clay-500 text-base leading-relaxed animate-fade-in max-w-[320px]">
<span className="text-[9px] smallcaps not-italic text-clay-400 mr-2 align-middle">
· · ·
</span>
<span className="align-middle">{intent.targetLabel}</span>
</p>
)}
{phase === "ready" && !intent && turnNum > 0 && (
<p className="text-[10px] smallcaps text-clay-400 animate-fade-in">
· · · · · ·
<p className="text-[9px] smallcaps text-clay-400 animate-fade-in">
<span className="mr-2"> · · ·</span>
<span className="text-clay-600">{intent.targetLabel}</span>
</p>
)}
</div>
+207 -9
View File
@@ -1,28 +1,124 @@
"use client";
import { useRef, useState } from "react";
import { useEffect, useRef, useState } from "react";
import type { StoryFrame } from "@yume/types";
export type Phase = "loading-first" | "ready" | "interacting";
const SHADOW =
"0 1px 0 rgba(45,24,16,0.05), 0 36px 64px -28px rgba(45,24,16,0.25), 0 8px 18px -6px rgba(45,24,16,0.10)";
// ── Typewriter hook ────────────────────────────────────────────────────
/**
 * Reveals `text` progressively, one character per tick.
 *
 * Whenever `text` or `speed` changes, the displayed string is reset to ""
 * and the reveal restarts from the beginning. The timer is cleared once
 * the full text is shown and on effect cleanup.
 *
 * @param text  Full string to reveal. An empty string yields "" with no timer.
 * @param speed Delay between ticks in milliseconds.
 * @returns The currently revealed prefix of `text`.
 */
function useTypewriter(text: string, speed = 28): string {
  const [displayed, setDisplayed] = useState("");

  useEffect(() => {
    // Reset immediately when the text changes
    setDisplayed("");
    if (!text) return;

    // Split by code point rather than UTF-16 code unit, so a character
    // encoded as a surrogate pair is never shown half-revealed.
    const glyphs = Array.from(text);
    let shown = 0;
    const id = setInterval(() => {
      shown += 1;
      setDisplayed(glyphs.slice(0, shown).join(""));
      if (shown >= glyphs.length) clearInterval(id);
    }, speed);
    return () => clearInterval(id);
  }, [text, speed]);

  return displayed;
}
// ── Choice button ──────────────────────────────────────────────────────
/**
 * A single numbered choice rendered as an HTML `<button type="button">`.
 *
 * The button shows a 1-based ordinal (`index + 1`) followed by `label`.
 * While `disabled` is true the native `disabled` attribute is set and the
 * `disabled:` utility classes dim the button and show a wait cursor.
 * A decorative overlay span fades in on hover; it has
 * `pointer-events-none`, so it never intercepts the click.
 *
 * @param index    Zero-based position of the choice; displayed as `index + 1`.
 * @param label    Text of the choice.
 * @param disabled Whether the button is non-interactive.
 * @param onClick  Called with no arguments when the button is clicked.
 */
function ChoiceButton({
index,
label,
disabled,
onClick,
}: {
index: number;
label: string;
disabled: boolean;
onClick: () => void;
}) {
return (
<button
type="button"
disabled={disabled}
onClick={onClick}
className="group relative flex-1 min-w-0 px-4 py-3 text-left transition-all duration-200
disabled:opacity-50 disabled:cursor-wait"
style={{
background: "rgba(20, 14, 8, 0.68)",
border: "1.5px solid rgba(180, 140, 80, 0.65)",
borderRadius: "6px",
backdropFilter: "blur(8px)",
WebkitBackdropFilter: "blur(8px)",
boxShadow: "0 2px 12px rgba(0,0,0,0.4), inset 0 1px 0 rgba(200,165,90,0.12)",
}}
>
{/* Hover shimmer overlay */}
<span
className="absolute inset-0 rounded-[5px] opacity-0 group-hover:opacity-100 transition-opacity duration-200 pointer-events-none"
style={{
background: "rgba(180,140,60,0.10)",
border: "1.5px solid rgba(200,165,90,0.85)",
}}
/>
<span className="relative flex items-baseline gap-2">
<span
className="shrink-0 font-serif text-[11px] num"
style={{ color: "rgba(195,155,75,0.9)" }}
>
{index + 1}.
</span>
<span
className="font-serif text-[13px] md:text-[14px] leading-snug"
style={{ color: "rgba(245,235,210,0.95)" }}
>
{label}
</span>
</span>
</button>
);
}
// ── Main component ─────────────────────────────────────────────────────
export function PlayCanvas({
imageBase64,
phase,
frame,
pendingClick,
onClick,
onSelectChoice,
fullViewport = false,
}: {
imageBase64: string | null;
phase: Phase;
frame: StoryFrame | null;
pendingClick: { x: number; y: number } | null;
onClick: (click: { x: number; y: number }) => void;
onSelectChoice?: (choiceId: string, label: string) => void;
fullViewport?: boolean;
}) {
const imgRef = useRef<HTMLImageElement>(null);
const [dims, setDims] = useState<{ w: number; h: number } | null>(null);
const choices = frame?.uiElements.filter((e) => e.kind === "choice") ?? [];
const dialogueText = frame
? [frame.speaker ? `${frame.speaker}${frame.line ?? ""}` : frame.line, frame.narration]
.filter(Boolean)
.join("\n")
: "";
const narrationOnly = !frame?.speaker && !frame?.line && !!frame?.narration;
const displayBody = frame?.speaker
? frame.line ?? ""
: frame?.narration ?? "";
const typedBody = useTypewriter(displayBody, 30);
function handleClick(e: React.MouseEvent<HTMLImageElement>) {
if (phase !== "ready" || !imgRef.current) return;
const rect = imgRef.current.getBoundingClientRect();
@@ -37,16 +133,13 @@ export function PlayCanvas({
const interactive = phase === "ready" && !!imageBase64;
const dimmed = phase === "interacting";
// 16:9 sizing — letterbox into available viewport
const sizeStyle = fullViewport
? { maxWidth: "100vw", maxHeight: "100dvh" }
: { maxWidth: "96vw", maxHeight: "calc(100dvh - 280px)" };
: { maxWidth: "96vw", maxHeight: "calc(100dvh - 200px)" };
// Placeholder needs an explicit width for aspect-video to compute height.
// Pick the largest 16:9 box that fits in the available viewport.
const placeholderWidth = fullViewport
? "min(100vw, calc(100dvh * 16 / 9))"
: "min(96vw, calc((100dvh - 280px) * 16 / 9))";
: "min(96vw, calc((100dvh - 200px) * 16 / 9))";
return (
<div
@@ -57,6 +150,7 @@ export function PlayCanvas({
className="relative inline-block"
style={{ boxShadow: fullViewport ? "none" : SHADOW }}
>
{/* ── Background image ── */}
<img
key={imageBase64.slice(-48)}
ref={imgRef}
@@ -68,17 +162,121 @@ export function PlayCanvas({
setDims({ w: img.naturalWidth, h: img.naturalHeight });
}}
draggable={false}
className={`block w-auto h-auto select-none animate-fade-in transition-opacity duration-700 ease-out ${interactive ? "cursor-pointer" : "cursor-wait"} ${dimmed ? "opacity-30" : "opacity-100"}`}
className={`block w-auto h-auto select-none animate-fade-in transition-opacity duration-700 ease-out ${
interactive ? "cursor-pointer" : "cursor-wait"
} ${dimmed ? "opacity-40" : "opacity-100"}`}
style={sizeStyle}
/>
{/* ── Top/bottom gradient vignette ── */}
{!fullViewport && (
<>
<div className="absolute inset-x-0 top-0 h-10 bg-gradient-to-b from-clay-900/12 to-transparent pointer-events-none" />
<div className="absolute inset-x-0 bottom-0 h-10 bg-gradient-to-t from-clay-900/12 to-transparent pointer-events-none" />
</>
)}
{/* ══════════════════════════════════════════════════════════
PREFAB UI OVERLAY — rendered on top of image
══════════════════════════════════════════════════════════ */}
{frame && (
<div className="absolute inset-0 flex flex-col justify-end pointer-events-none select-none">
{/* ── Choices row ── */}
{choices.length > 0 && (
<div
className="pointer-events-auto px-[3%] pb-[1.5%] flex gap-[1.5%] items-stretch"
>
{choices.map((choice, i) => (
<ChoiceButton
key={choice.id}
index={i}
label={choice.label}
disabled={phase !== "ready"}
onClick={() => onSelectChoice?.(choice.id, choice.label)}
/>
))}
</div>
)}
{/* ── Dialogue / narration box ── */}
{(frame.narration || frame.line) && (
<div
className="pointer-events-none mx-[2%] mb-[2%] px-[3%] py-[2.2%] relative"
style={{
background: "rgba(14, 10, 6, 0.72)",
border: "1.5px solid rgba(175, 138, 72, 0.60)",
borderRadius: "6px",
backdropFilter: "blur(10px)",
WebkitBackdropFilter: "blur(10px)",
boxShadow:
"0 4px 24px rgba(0,0,0,0.55), inset 0 1px 0 rgba(200,165,90,0.10)",
}}
>
{/* Inner golden corner decoration */}
<span
className="absolute top-[6px] left-[8px] text-[10px] opacity-40 pointer-events-none"
style={{ color: "rgba(195,155,75,1)" }}
aria-hidden
>
</span>
<span
className="absolute top-[6px] right-[8px] text-[10px] opacity-40 pointer-events-none"
style={{ color: "rgba(195,155,75,1)" }}
aria-hidden
>
</span>
{/* Speaker name tag */}
{frame.speaker && (
<p
className="font-serif text-[11px] md:text-[12px] smallcaps mb-[0.6em]"
style={{ color: "rgba(205,165,90,0.92)" }}
>
{frame.speaker}
</p>
)}
{/* Main text */}
<p
className="font-serif leading-[1.85] text-[13px] md:text-[15px]"
style={{ color: "rgba(245,235,210,0.95)" }}
>
{typedBody}
{/* Speaker present — also show the narration as a secondary line */}
{frame.speaker && frame.narration && (
<span
className="block mt-[0.5em] italic text-[12px] md:text-[13px]"
style={{ color: "rgba(200,185,155,0.78)" }}
>
{frame.narration}
</span>
)}
</p>
{/* Scroll hint ▼ */}
<span
className="absolute bottom-[6px] right-[10px] text-[10px] animate-slow-pulse"
style={{ color: "rgba(195,155,75,0.7)" }}
aria-hidden
>
</span>
</div>
)}
</div>
)}
{/* Loading/interacting dim overlay */}
{phase === "interacting" && (
<div className="absolute inset-0 flex items-center justify-center pointer-events-none">
<p className="text-[10px] smallcaps text-cream-50/70 animate-slow-pulse">
AI · · · · · · ·
</p>
</div>
)}
{/* Click ripple indicator */}
{pendingClick && (
<>
<div
@@ -133,7 +331,7 @@ export function PlayCanvas({
{dims ? `${dims.w} × ${dims.h} · png` : "—"}
</span>
<span className="text-[9px] smallcaps text-clay-400">
{phase === "ready" ? "任 · 意 · 点 · 击" : "···"}
{phase === "ready" ? (choices.length > 0 ? "选 · 择 · 一 · 项" : "任 · 意 · 点 · 击") : "···"}
</span>
</div>
)}
+15 -33
View File
@@ -1,4 +1,5 @@
import type { Session, StoryFrame, UIElement } from "@yume/types";
import type { Character, Session, StoryFrame, UIElement } from "@yume/types";
export const DIRECTOR_SYSTEM = `你是一个交互视觉小说的编剧导演。每次根据世界观、画风和历史,输出当前画面要呈现的内容。
@@ -19,7 +20,7 @@ export const DIRECTOR_SYSTEM = `你是一个交互视觉小说的编剧导演。
- narration / line 中文,scenePrompt 英文
- 默认 3 个 choice 元素,可以根据情境额外加 menu/item/custom(罕见)
- 选项必须能切实推进剧情,且互不重复
- scenePrompt 描述当前的画面,不要包括 UI 元素UI 元素会另外渲染
- scenePrompt 描述当前的画面,不要包括 UI 元素
- 单帧旁白与台词加起来控制在 80 字以内
- 不要输出 JSON 以外的任何文本`;
@@ -55,44 +56,25 @@ export function buildImagePrompt(
frame: StoryFrame,
styleGuide: string,
): string {
const choiceList = frame.uiElements
.filter((e) => e.kind === "choice")
.map((e, i) => `${i + 1}. ${e.label}`)
.join("\n");
const extraUI = frame.uiElements
.filter((e) => e.kind !== "choice")
.map((e) => `- ${e.kind}: ${e.label}`)
.join("\n");
return `Generate a landscape 16:9 cinematic visual novel UI screen, widescreen format (1792x1024 or equivalent).
return `Generate a cinematic landscape background illustration, 16:9 widescreen (1792x1024).
ART STYLE: ${styleGuide}
(Match this style consistently — for the scene art AND the UI elements.
For example: anime → traditional galgame dialogue box; cyberpunk → neon HUD;
stick figure → hand-drawn paper UI; cinematic realism → minimalist film overlay.)
SCENE (fills the entire 16:9 canvas as a cinematic widescreen background):
SCENE (fill the ENTIRE canvas — no UI elements, no text overlays):
${frame.scenePrompt}
DIALOGUE PANEL (cinematic bottom band, semi-transparent, spans full width, occupies the lower ~25% of the frame):
${frame.speaker ? `Speaker name displayed prominently above the dialogue text: "${frame.speaker}"` : "Narration only — no speaker tag."}
${frame.line ? `Dialogue text: "${frame.line}"` : ""}
${frame.narration ? `Narration text (italic if speaker also present): "${frame.narration}"` : ""}
CHOICE PANEL (three clearly tappable buttons, arranged HORIZONTALLY in a row across the lower-third of the frame, ABOVE or overlaid on the dialogue band; equally sized; centered in the safe zone of the 16:9 canvas):
${choiceList}
${extraUI ? `\nADDITIONAL UI ELEMENTS:\n${extraUI}` : ""}
CRITICAL LAYOUT REQUIREMENTS:
- 16:9 LANDSCAPE orientation — wider than tall. Do NOT produce a portrait/square image.
- All text and buttons must be inside the central safe zone (avoid the outer 8% on every side), so the viewport can letterbox without cropping any UI.
- All text must be perfectly legible (high contrast, readable size).
- Choice buttons must be clearly distinguishable as interactive elements, arranged horizontally left-to-right in the order listed above.
- Choice text must NOT be cropped, NOT overlap with character faces or the dialogue panel.
- The image is the entire interface — no external chrome will be added.`;
STRICT RULES — NEVER violate these:
- DO NOT draw any dialogue boxes, speech bubbles, text panels, or any rectangular overlay.
- DO NOT draw any buttons, choice options, menu items, or interactive UI elements.
- DO NOT render any Chinese or English text anywhere in the image.
- DO NOT add any HUD, interface chrome, or game UI elements.
- The image is a PURE BACKGROUND SCENE ONLY. All UI will be added as HTML on top.
- 16:9 LANDSCAPE orientation — wider than tall. No portrait or square output.
- Leave the bottom 35% of the frame relatively uncluttered (darker or softer) so overlaid UI panels remain readable.
- Characters or key scene elements should be positioned in the upper 65% of the frame.`;
}
export const VISION_SYSTEM_PROMPT = `你是视觉理解助手。用户在视觉小说界面上点击了红色圆点位置,你要根据红点位置和图中可见的 UI 元素,判断用户的意图。
必须输出严格 JSON