feat: separate UI choices from AI image (bypass vision)

HTML choice buttons now call /api/interact directly, bypassing the ~4s Vision roundtrip. Free-form background clicks still go through Vision as before.
This commit is contained in:
Qi Chen
2026-05-25 20:47:33 +08:00
committed by GitHub
parent bf8f356e37
commit d116c2e3b5
3 changed files with 314 additions and 99 deletions
+92 -57
View File
@@ -131,6 +131,7 @@ function PlayInner() {
worldSetting: finalPayload.worldSetting,
styleGuide: finalPayload.styleGuide,
history: [{ frame: data.frame }],
characters: [],
});
setFrame(data.frame);
setImageBase64(data.imageBase64);
@@ -183,6 +184,82 @@ function PlayInner() {
};
}, [frame?.id, session?.id]);
// ── Shared result applier ────────────────────────────────────────────
/**
 * Waits for an interact response and commits it to component state.
 *
 * The last entry of the returned session history is re-stamped with the real
 * `click` and `clickIntent` (a prefetched response carries a synthetic intent
 * there), then the newly produced frame is appended as a fresh history entry.
 * Afterwards the frame, image, intent, pending click, turn counter and phase
 * are all updated so the UI returns to the "ready" state.
 *
 * @param resultPromise - In-flight or already-settled interact response.
 * @param clickIntent - The intent that actually triggered this turn.
 * @param click - Optional click coordinates to record on the last history entry.
 * @throws Re-throws whatever `resultPromise` rejects with; no state is touched in that case.
 */
async function applyInteractResult(
  resultPromise: Promise<InteractResponse>,
  clickIntent: ClickIntent,
  click?: { x: number; y: number },
) {
  const response = await resultPromise;
  const { session: respSession, frame: nextFrame } = response;

  // Only the final history entry is rewritten; earlier entries pass through as-is.
  const tailIndex = respSession.history.length - 1;
  const restampedHistory = respSession.history.map((entry, idx) => {
    if (idx !== tailIndex) return entry;
    return { ...entry, click, intent: clickIntent };
  });

  // Append the new frame as the open-ended last entry of the history.
  setSession({
    ...respSession,
    history: [...restampedHistory, { frame: nextFrame }],
  });
  setFrame(nextFrame);
  setImageBase64(response.imageBase64);
  setIntent(clickIntent);
  setPendingClick(null);
  setTurnNum((t) => t + 1);
  setPhase("ready");
}
// ── HTML button click — bypasses Vision entirely ──────────────────────
/**
 * Handles a click on an HTML choice button.
 *
 * Builds the click intent directly from the button's id and label, so no
 * request to the Vision endpoint is made. If a prefetched response exists for
 * this choice it is applied; otherwise outstanding prefetches are aborted and
 * `/api/interact` is called live.
 *
 * Does nothing unless the phase is "ready" and a session exists. On any
 * failure the error is stored via `setError`, the pending click is cleared and
 * the phase is reset to "ready".
 *
 * @param choiceId - Identifier of the selected choice; also the prefetch cache key.
 * @param label - Human-readable label of the selected choice.
 */
async function handleChoiceSelect(choiceId: string, label: string) {
  if (!session || phase !== "ready") return;

  setPhase("interacting");
  setIntent(null);

  const clickIntent: ClickIntent = {
    targetId: choiceId,
    targetLabel: label,
    reasoning: "direct-button-click",
  };

  // Read the cache before any await so a later prefetch reset cannot change the lookup.
  const prefetched = prefetchRef.current[choiceId];

  try {
    let pending: Promise<InteractResponse>;

    if (prefetched) {
      // Cache hit: reuse the prefetched promise as-is.
      pending = prefetched;
    } else {
      // Cache miss: stop the now-useless prefetches and ask the server directly.
      prefetchAbortRef.current?.abort();
      const res = await fetch("/api/interact", {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ session, intent: clickIntent }),
      });
      if (!res.ok) {
        // Prefer the server-provided error message; fall back to the HTTP status text.
        const j = (await res.json().catch(() => ({}))) as { error?: string };
        throw new Error(j.error ?? res.statusText);
      }
      pending = res.json() as Promise<InteractResponse>;
    }

    await applyInteractResult(pending, clickIntent);
  } catch (e) {
    setError(String(e));
    setPendingClick(null);
    setPhase("ready");
  }
}
// ── Background / free-form click — still uses Vision ─────────────────
async function handleClick(click: { x: number; y: number }) {
if (phase !== "ready" || !session || !imageBase64) return;
setPhase("interacting");
@@ -192,15 +269,10 @@ function PlayInner() {
const cacheSnapshot = prefetchRef.current;
try {
// Step 1: Vision (~4s) — figure out what the user actually clicked
const visionRes = await fetch("/api/vision", {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({
session,
prevImageBase64: imageBase64,
click,
}),
body: JSON.stringify({ session, prevImageBase64: imageBase64, click }),
});
if (!visionRes.ok) {
const j = (await visionRes.json().catch(() => ({}))) as {
@@ -211,31 +283,13 @@ function PlayInner() {
const { intent: clickIntent } =
(await visionRes.json()) as VisionResponse;
// Step 2: Cache lookup
const cached = clickIntent.targetId
? cacheSnapshot[clickIntent.targetId]
: undefined;
let result: InteractResponse;
if (cached) {
// Cache hit — await the prefetched promise (mostly already resolved)
result = await cached;
// Overwrite the synthetic prefetch intent on history with the real one
const lastIdx = result.session.history.length - 1;
result = {
...result,
intent: clickIntent,
session: {
...result.session,
history: result.session.history.map((entry, idx) =>
idx === lastIdx
? { ...entry, click, intent: clickIntent }
: entry,
),
},
};
await applyInteractResult(cached, clickIntent, click);
} else {
// Cache miss (free-form click) — abort wasted prefetches, run live
prefetchAbortRef.current?.abort();
const liveRes = await fetch("/api/interact", {
method: "POST",
@@ -248,18 +302,12 @@ function PlayInner() {
};
throw new Error(j.error ?? liveRes.statusText);
}
result = (await liveRes.json()) as InteractResponse;
await applyInteractResult(
liveRes.json() as Promise<InteractResponse>,
clickIntent,
click,
);
}
// Apply the result: append new frame to history
const updatedHistory = [...result.session.history, { frame: result.frame }];
setSession({ ...result.session, history: updatedHistory });
setFrame(result.frame);
setImageBase64(result.imageBase64);
setIntent(clickIntent);
setPendingClick(null);
setTurnNum((t) => t + 1);
setPhase("ready");
} catch (e) {
setError(String(e));
setPendingClick(null);
@@ -295,8 +343,10 @@ function PlayInner() {
<PlayCanvas
imageBase64={imageBase64}
phase={phase}
frame={frame}
pendingClick={pendingClick}
onClick={handleClick}
onSelectChoice={handleChoiceSelect}
fullViewport
/>
</div>
@@ -326,37 +376,22 @@ function PlayInner() {
<PlayCanvas
imageBase64={imageBase64}
phase={phase}
frame={frame}
pendingClick={pendingClick}
onClick={handleClick}
onSelectChoice={handleChoiceSelect}
/>
<div className="mt-7 md:mt-9 max-w-md w-full text-center min-h-[64px] flex items-center justify-center">
<div className="mt-4 max-w-md w-full text-center min-h-[28px] flex items-center justify-center">
{phase === "loading-first" && (
<p className="text-[10px] smallcaps text-clay-500 animate-slow-pulse">
· · · · · ·
</p>
)}
{phase === "interacting" && (
<div className="flex flex-col items-center gap-2 animate-fade-in">
<p className="text-[10px] smallcaps text-clay-500 animate-slow-pulse">
AI · · · · · · ·
</p>
<p className="font-serif italic text-clay-400 text-xs">
·
</p>
</div>
)}
{phase === "ready" && intent?.targetLabel && (
<p className="font-serif italic text-clay-500 text-base leading-relaxed animate-fade-in max-w-[320px]">
<span className="text-[9px] smallcaps not-italic text-clay-400 mr-2 align-middle">
· · ·
</span>
<span className="align-middle">{intent.targetLabel}</span>
</p>
)}
{phase === "ready" && !intent && turnNum > 0 && (
<p className="text-[10px] smallcaps text-clay-400 animate-fade-in">
· · · · · ·
<p className="text-[9px] smallcaps text-clay-400 animate-fade-in">
<span className="mr-2"> · · ·</span>
<span className="text-clay-600">{intent.targetLabel}</span>
</p>
)}
</div>