From 9cedfa66e45a71cd576367b50ce8e803b5c3f55d Mon Sep 17 00:00:00 2001 From: yuanzonghao Date: Tue, 12 May 2026 19:38:03 +0800 Subject: [PATCH] feat: prefetch, vision split, provider adapter, UI polish Engine - Split /api/vision out from /api/interact so client can drive prefetch + cache lookup independently of click interpretation - Image client switched to chat-completions+modalities API (OpenRouter/ provider style), supporting markdown image URL responses - annotateClick now resizes to 768w before composite to keep vision payloads small and avoid CDN timeouts - Prompts updated to mention "JSON" in user messages (required by Gemini's strict JSON mode) - Shared fetchWithRetry helper: 2 retries for chat/image, 0 for vision (with 60s hard timeout) Client - Parallel prefetch of all three choice branches on each new frame - Effect deliberately excludes phase from deps so user-click doesn't abort in-flight prefetches - Cache hit/miss/free-form fallback handled in handleClick - PlayCanvas reads img naturalWidth/Height and adapts container to whatever aspect AI returns (no more cropped third choice) - max-width raised to 560px, max-height calc(100dvh - 200px) Misc - README env-path corrected to apps/web/.env.local - users.md: BGM/TTS idea note - .env.example moved into apps/web alongside next config Co-Authored-By: Claude Opus 4.7 (1M context) --- .env.example | 21 ---- README.md | 4 +- apps/web/.env.example | 24 ++++ apps/web/app/api/interact/route.ts | 4 +- apps/web/app/api/vision/route.ts | 32 ++++++ apps/web/app/layout.tsx | 2 +- apps/web/app/play/page.tsx | 119 +++++++++++++++++--- apps/web/components/PlayCanvas.tsx | 133 +++++++++++++---------- apps/web/next-env.d.ts | 2 + packages/ai-client/src/chat.ts | 3 +- packages/ai-client/src/fetchWithRetry.ts | 39 +++++++ packages/ai-client/src/image.ts | 74 +++++++++---- packages/ai-client/src/vision.ts | 27 +++-- packages/engine/src/annotate.ts | 22 ++-- packages/engine/src/index.ts | 2 +- packages/engine/src/orchestrator.ts | 27 +++-- packages/engine/src/prompts.ts | 6 +- packages/engine/src/renderer.ts | 2 +- packages/types/src/index.ts | 12 +- vercel.json | 1 + 20 files changed, 405 insertions(+), 151 deletions(-) delete mode 100644 .env.example create mode 100644 apps/web/.env.example create mode 100644 apps/web/app/api/vision/route.ts create mode 100644 packages/ai-client/src/fetchWithRetry.ts diff --git a/.env.example b/.env.example deleted file mode 100644 index 6547b5c..0000000 --- a/.env.example +++ /dev/null @@ -1,21 +0,0 @@ -# ============================================================= -# Dada — AI Visual Novel -# Three independently configurable AI providers -# Any OpenAI-compatible endpoint works (OpenAI, Anthropic, Gemini, -# OpenRouter, DeepSeek, Ollama, ...). -# ============================================================= - -# ---- 1. Text LLM (story director) ----------------------------- -TEXT_BASE_URL=https://api.anthropic.com/v1 -TEXT_API_KEY=sk-ant-xxx -TEXT_MODEL=claude-opus-4-7 - -# ---- 2. Image generator (renders the whole UI screen) --------- -IMAGE_BASE_URL=https://api.openai.com/v1 -IMAGE_API_KEY=sk-xxx -IMAGE_MODEL=gpt-image-2 - -# ---- 3. Vision model (interprets where the user clicked) ------ -VISION_BASE_URL=https://generativelanguage.googleapis.com/v1beta/openai -VISION_API_KEY=xxx -VISION_MODEL=gemini-3-flash diff --git a/README.md b/README.md index 077afca..3bba298 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ Three providers, all independently configurable. Any OpenAI-compatible chat / im | Image · UI renderer | `IMAGE_BASE_URL` `IMAGE_API_KEY` `IMAGE_MODEL` | `gpt-image-2` via OpenAI | | Vision · click reader | `VISION_BASE_URL` `VISION_API_KEY` `VISION_MODEL` | `gemini-3-flash` via Google | -See `.env.example` for the exact shape. +See `apps/web/.env.example` for the exact shape. --- @@ -59,7 +59,7 @@ Requires Node 20+ and pnpm 9+. ```bash pnpm install -cp .env.example .env.local +cp apps/web/.env.example apps/web/.env.local # fill in the nine env vars pnpm dev # open http://localhost:3000 diff --git a/apps/web/.env.example b/apps/web/.env.example new file mode 100644 index 0000000..fe9c11e --- /dev/null +++ b/apps/web/.env.example @@ -0,0 +1,24 @@ +# ============================================================= +# Dada — AI Visual Novel +# Three independently configurable AI providers +# Any OpenAI-compatible endpoint works (OpenRouter, OpenAI, +# Anthropic via OpenAI-compat proxy, Gemini, DeepSeek, Ollama). +# +# Image generation uses the chat-completions + modalities API +# (OpenRouter-style), NOT the legacy /images/generations endpoint. +# ============================================================= + +# ---- 1. Text LLM (story director) ----------------------------- +TEXT_BASE_URL=https://openrouter.ai/api/v1 +TEXT_API_KEY=sk-or-v1-xxx +TEXT_MODEL=~anthropic/claude-sonnet-latest + +# ---- 2. Image generator (renders the whole UI screen) --------- +IMAGE_BASE_URL=https://openrouter.ai/api/v1 +IMAGE_API_KEY=sk-or-v1-xxx +IMAGE_MODEL=openai/gpt-5.4-image-2 + +# ---- 3. Vision model (interprets where the user clicked) ------ +VISION_BASE_URL=https://openrouter.ai/api/v1 +VISION_API_KEY=sk-or-v1-xxx +VISION_MODEL=~google/gemini-flash-latest diff --git a/apps/web/app/api/interact/route.ts b/apps/web/app/api/interact/route.ts index c33510a..45872cb 100644 --- a/apps/web/app/api/interact/route.ts +++ b/apps/web/app/api/interact/route.ts @@ -14,9 +14,9 @@ export async function POST(req: Request) { return NextResponse.json({ error: "Invalid JSON" }, { status: 400 }); } - if (!body.session || !body.prevImageBase64 || !body.click) { + if (!body.session || !body.intent) { return NextResponse.json( - { error: "session, prevImageBase64, click are required" }, + { error: "session and intent are required" }, { status: 400 }, ); } diff --git a/apps/web/app/api/vision/route.ts b/apps/web/app/api/vision/route.ts new file mode 100644 index 0000000..864d751 --- /dev/null +++ b/apps/web/app/api/vision/route.ts @@ -0,0 +1,32 @@ +import { visionTurn } from "@dada/engine"; +import type { VisionRequest } from "@dada/types"; +import { NextResponse } from "next/server"; +import { loadEngineConfig } from "@/lib/config"; + +export const runtime = "nodejs"; +export const maxDuration = 60; + +export async function POST(req: Request) { + let body: VisionRequest; + try { + body = (await req.json()) as VisionRequest; + } catch { + return NextResponse.json({ error: "Invalid JSON" }, { status: 400 }); + } + + if (!body.session || !body.prevImageBase64 || !body.click) { + return NextResponse.json( + { error: "session, prevImageBase64, click are required" }, + { status: 400 }, + ); + } + + try { + const config = loadEngineConfig(); + const result = await visionTurn(config, body); + return NextResponse.json(result); + } catch (err) { + const message = err instanceof Error ? err.message : "Unknown error"; + return NextResponse.json({ error: message }, { status: 500 }); + } +} diff --git a/apps/web/app/layout.tsx b/apps/web/app/layout.tsx index 6c73574..c09d866 100644 --- a/apps/web/app/layout.tsx +++ b/apps/web/app/layout.tsx @@ -13,7 +13,7 @@ export default function RootLayout({ children: React.ReactNode; }) { return ( - + (null); const [turnNum, setTurnNum] = useState(0); const [error, setError] = useState(null); + const startedRef = useRef(false); + const prefetchAbortRef = useRef(null); + const prefetchRef = useRef>>({}); useEffect(() => { if (startedRef.current) return; @@ -88,14 +92,60 @@ function PlayInner() { .catch((e) => setError(String(e))); }, [params, router]); + // Prefetch next-frame candidates whenever current frame becomes ready. + // All three fire in parallel for fastest cache fill. NOT depending on + // `phase` — we don't want to abort in-flight prefetches just because + // the user clicked. They should continue so handleClick can await them. + useEffect(() => { + if (!session || !frame) return; + + prefetchAbortRef.current?.abort(); + const ctrl = new AbortController(); + prefetchAbortRef.current = ctrl; + + const choices = frame.uiElements.filter((e) => e.kind === "choice"); + const promises: Record> = {}; + + for (const choice of choices) { + const syntheticIntent: ClickIntent = { + targetId: choice.id, + targetLabel: choice.label, + reasoning: "prefetch", + }; + const p = fetch("/api/interact", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ session, intent: syntheticIntent }), + signal: ctrl.signal, + }).then(async (r) => { + if (!r.ok) { + const j = (await r.json().catch(() => ({}))) as { error?: string }; + throw new Error(j.error ?? r.statusText); + } + return r.json() as Promise; + }); + p.catch(() => {}); + promises[choice.id] = p; + } + + prefetchRef.current = promises; + + return () => { + ctrl.abort(); + }; + }, [frame?.id, session?.id]); + async function handleClick(click: { x: number; y: number }) { if (phase !== "ready" || !session || !imageBase64) return; setPhase("interacting"); setPendingClick(click); setIntent(null); + const cacheSnapshot = prefetchRef.current; + try { - const res = await fetch("/api/interact", { + // Step 1: Vision (~4s) — figure out what the user actually clicked + const visionRes = await fetch("/api/vision", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ @@ -104,20 +154,61 @@ function PlayInner() { click, }), }); - if (!res.ok) { - const j = (await res.json().catch(() => ({}))) as { error?: string }; - throw new Error(j.error ?? res.statusText); + if (!visionRes.ok) { + const j = (await visionRes.json().catch(() => ({}))) as { + error?: string; + }; + throw new Error(j.error ?? visionRes.statusText); } - const data = (await res.json()) as InteractResponse; + const { intent: clickIntent } = + (await visionRes.json()) as VisionResponse; - const updatedHistory = [ - ...data.session.history, - { frame: data.frame }, - ]; - setSession({ ...data.session, history: updatedHistory }); - setFrame(data.frame); - setImageBase64(data.imageBase64); - setIntent(data.intent); + // Step 2: Cache lookup + const cached = clickIntent.targetId + ? cacheSnapshot[clickIntent.targetId] + : undefined; + + let result: InteractResponse; + if (cached) { + // Cache hit — await the prefetched promise (mostly already resolved) + result = await cached; + // Overwrite the synthetic prefetch intent on history with the real one + const lastIdx = result.session.history.length - 1; + result = { + ...result, + intent: clickIntent, + session: { + ...result.session, + history: result.session.history.map((entry, idx) => + idx === lastIdx + ? { ...entry, click, intent: clickIntent } + : entry, + ), + }, + }; + } else { + // Cache miss (free-form click) — abort wasted prefetches, run live + prefetchAbortRef.current?.abort(); + const liveRes = await fetch("/api/interact", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ session, intent: clickIntent, click }), + }); + if (!liveRes.ok) { + const j = (await liveRes.json().catch(() => ({}))) as { + error?: string; + }; + throw new Error(j.error ?? liveRes.statusText); + } + result = (await liveRes.json()) as InteractResponse; + } + + // Apply the result: append new frame to history + const updatedHistory = [...result.session.history, { frame: result.frame }]; + setSession({ ...result.session, history: updatedHistory }); + setFrame(result.frame); + setImageBase64(result.imageBase64); + setIntent(clickIntent); setPendingClick(null); setTurnNum((t) => t + 1); setPhase("ready"); @@ -189,7 +280,7 @@ function PlayInner() { AI · is · painting · the · next · moment

- this usually takes 12–20 seconds + cached choices resolve in seconds · free-form takes longer

)} diff --git a/apps/web/components/PlayCanvas.tsx b/apps/web/components/PlayCanvas.tsx index 3a60a79..e38a7fe 100644 --- a/apps/web/components/PlayCanvas.tsx +++ b/apps/web/components/PlayCanvas.tsx @@ -1,9 +1,12 @@ "use client"; -import { useRef } from "react"; +import { useRef, useState } from "react"; export type Phase = "loading-first" | "ready" | "interacting"; +const SHADOW = + "0 1px 0 rgba(45,24,16,0.05), 0 36px 64px -28px rgba(45,24,16,0.25), 0 8px 18px -6px rgba(45,24,16,0.10)"; + export function PlayCanvas({ imageBase64, phase, @@ -15,11 +18,12 @@ export function PlayCanvas({ pendingClick: { x: number; y: number } | null; onClick: (click: { x: number; y: number }) => void; }) { - const ref = useRef(null); + const imgRef = useRef(null); + const [dims, setDims] = useState<{ w: number; h: number } | null>(null); - function handleClick(e: React.MouseEvent) { - if (phase !== "ready" || !ref.current || !imageBase64) return; - const rect = ref.current.getBoundingClientRect(); + function handleClick(e: React.MouseEvent) { + if (phase !== "ready" || !imgRef.current) return; + const rect = imgRef.current.getBoundingClientRect(); const x = (e.clientX - rect.left) / rect.width; const y = (e.clientY - rect.top) / rect.height; onClick({ @@ -32,70 +36,81 @@ export function PlayCanvas({ const dimmed = phase === "interacting"; return ( -
-
- {imageBase64 ? ( +
+ {imageBase64 ? ( +
Generated frame { + const img = e.currentTarget; + setDims({ w: img.naturalWidth, h: img.naturalHeight }); + }} draggable={false} + className={`block w-auto h-auto select-none animate-fade-in transition-opacity duration-700 ease-out ${interactive ? "cursor-pointer" : "cursor-wait"} ${dimmed ? "opacity-30" : "opacity-100"}`} + style={{ + maxWidth: "min(560px, 92vw)", + maxHeight: "calc(100dvh - 200px)", + }} /> - ) : ( -
-
-

- Painting · the · first · frame -

-
- )} -
-
+
+
- {pendingClick && ( - <> -
-
- - )} -
+ {pendingClick && ( + <> +
+
+ + )} +
+ ) : ( +
+
+

+ Painting · the · first · frame +

+
+ )} -
+
- 1024 × 1536 · png + {dims ? `${dims.w} × ${dims.h} · png` : "—"} {phase === "ready" ? "Tap · anywhere" : "···"} diff --git a/apps/web/next-env.d.ts b/apps/web/next-env.d.ts index 84ab714..c4b7818 100644 --- a/apps/web/next-env.d.ts +++ b/apps/web/next-env.d.ts @@ -1,4 +1,6 @@ /// /// +import "./.next/dev/types/routes.d.ts"; // NOTE: This file should not be edited +// see https://nextjs.org/docs/app/api-reference/config/typescript for more information. diff --git a/packages/ai-client/src/chat.ts b/packages/ai-client/src/chat.ts index 41f4b6d..6e05afd 100644 --- a/packages/ai-client/src/chat.ts +++ b/packages/ai-client/src/chat.ts @@ -1,4 +1,5 @@ import type { ProviderConfig } from "@dada/types"; +import { fetchWithRetry } from "./fetchWithRetry"; export type ChatMessage = { role: "system" | "user" | "assistant"; @@ -20,7 +21,7 @@ export async function chat( body.response_format = { type: "json_object" }; } - const res = await fetch(url, { + const res = await fetchWithRetry(url, { method: "POST", headers: { "Content-Type": "application/json", diff --git a/packages/ai-client/src/fetchWithRetry.ts b/packages/ai-client/src/fetchWithRetry.ts new file mode 100644 index 0000000..3f6531f --- /dev/null +++ b/packages/ai-client/src/fetchWithRetry.ts @@ -0,0 +1,39 @@ +type RetryInit = RequestInit & { retries?: number; retryDelayMs?: number }; + +export async function fetchWithRetry( + url: string, + init: RetryInit, +): Promise { + const { retries = 2, retryDelayMs = 1500, ...fetchInit } = init; + + let lastError: unknown; + for (let attempt = 0; attempt <= retries; attempt++) { + try { + const res = await fetch(url, fetchInit); + if (res.ok) return res; + // Don't retry 4xx (client errors won't fix themselves) + if (res.status >= 400 && res.status < 500) return res; + // 5xx: retry if we have budget left + if (attempt < retries) { + await sleep(retryDelayMs * (attempt + 1)); + continue; + } + return res; + } catch (err) { + lastError = err; + const isAbort = + err instanceof DOMException && err.name === "AbortError"; + if (isAbort) throw err; + if (attempt < retries) { + await sleep(retryDelayMs * (attempt + 1)); + continue; + } + throw err; + } + } + throw lastError; +} + +function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} diff --git a/packages/ai-client/src/image.ts b/packages/ai-client/src/image.ts index 3296dc3..c87d12b 100644 --- a/packages/ai-client/src/image.ts +++ b/packages/ai-client/src/image.ts @@ -1,20 +1,29 @@ import type { ProviderConfig } from "@dada/types"; +import { fetchWithRetry } from "./fetchWithRetry"; + +type ImageUrlPart = { type: string; image_url?: { url?: string } }; +type ChatResponse = { + choices: { + message: { + content: string | ImageUrlPart[]; + images?: ImageUrlPart[]; + }; + }[]; +}; export async function generateImage( config: ProviderConfig, prompt: string, - opts?: { size?: string; quality?: "low" | "medium" | "high" | "auto" }, ): Promise { - const url = `${config.baseUrl.replace(/\/$/, "")}/images/generations`; - const body: Record = { + const url = `${config.baseUrl.replace(/\/$/, "")}/chat/completions`; + + const body = { model: config.model, - prompt, - size: opts?.size ?? "1024x1536", - quality: opts?.quality ?? "medium", - n: 1, + modalities: ["image", "text"], + messages: [{ role: "user", content: prompt }], }; - const res = await fetch(url, { + const res = await fetchWithRetry(url, { method: "POST", headers: { "Content-Type": "application/json", @@ -25,20 +34,45 @@ export async function generateImage( if (!res.ok) { const text = await res.text(); - throw new Error(`Image API error ${res.status}: ${text}`); + throw new Error(`Image API error ${res.status}: ${text.slice(0, 500)}`); } - const json = (await res.json()) as { - data: { b64_json?: string; url?: string }[]; - }; - const item = json.data[0]; - if (!item) throw new Error("Image API returned no data"); + const json = (await res.json()) as ChatResponse; + const msg = json.choices[0]?.message; + if (!msg) throw new Error("Image API returned no message"); - if (item.b64_json) return item.b64_json; - if (item.url) { - const imgRes = await fetch(item.url); - const buf = await imgRes.arrayBuffer(); - return Buffer.from(buf).toString("base64"); + // 1) OpenRouter-style: msg.images = [{ image_url: { url } }] + // 2) OpenAI multimodal: msg.content = [{ type: "image_url", image_url: { url } }] + const structured: ImageUrlPart[] = []; + if (msg.images) structured.push(...msg.images); + if (Array.isArray(msg.content)) structured.push(...msg.content); + for (const part of structured) { + const u = part.image_url?.url; + if (u) return await urlToBase64(u); } - throw new Error("Image API returned neither b64_json nor url"); + + // 3) provider-style: content is a string with markdown image ![alt](url) + // or a bare URL fragment + if (typeof msg.content === "string") { + const md = msg.content.match(/!\[[^\]]*\]\((https?:\/\/[^\s)]+)\)/); + if (md?.[1]) return await urlToBase64(md[1]); + const bare = msg.content.match(/https?:\/\/\S+?\.(?:png|jpg|jpeg|webp)/i); + if (bare?.[0]) return await urlToBase64(bare[0]); + } + + throw new Error( + `No image found in response: ${JSON.stringify(msg).slice(0, 300)}`, + ); +} + +async function urlToBase64(url: string): Promise { + if (url.startsWith("data:")) { + const idx = url.indexOf("base64,"); + if (idx === -1) throw new Error("data URL is not base64-encoded"); + return url.slice(idx + "base64,".length); + } + const res = await fetch(url); + if (!res.ok) throw new Error(`Failed to fetch image url: ${res.status}`); + const buf = await res.arrayBuffer(); + return Buffer.from(buf).toString("base64"); } diff --git a/packages/ai-client/src/vision.ts b/packages/ai-client/src/vision.ts index 0e3c91c..e652190 100644 --- a/packages/ai-client/src/vision.ts +++ b/packages/ai-client/src/vision.ts @@ -1,4 +1,5 @@ import type { ProviderConfig } from "@dada/types"; +import { fetchWithRetry } from "./fetchWithRetry"; export async function interpretClick( config: ProviderConfig, @@ -25,14 +26,24 @@ export async function interpretClick( response_format: { type: "json_object" }, }; - const res = await fetch(url, { - method: "POST", - headers: { - "Content-Type": "application/json", - Authorization: `Bearer ${config.apiKey}`, - }, - body: JSON.stringify(body), - }); + const timeoutCtrl = new AbortController(); + const timeoutId = setTimeout(() => timeoutCtrl.abort(), 60_000); + + let res: Response; + try { + res = await fetchWithRetry(url, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${config.apiKey}`, + }, + body: JSON.stringify(body), + signal: timeoutCtrl.signal, + retries: 0, + }); + } finally { + clearTimeout(timeoutId); + } if (!res.ok) { const text = await res.text(); diff --git a/packages/engine/src/annotate.ts b/packages/engine/src/annotate.ts index 3f7b412..77df5da 100644 --- a/packages/engine/src/annotate.ts +++ b/packages/engine/src/annotate.ts @@ -5,25 +5,31 @@ export async function annotateClick( click: { x: number; y: number }, ): Promise { const buf = Buffer.from(imageBase64, "base64"); - const meta = await sharp(buf).metadata(); - const w = meta.width ?? 1024; - const h = meta.height ?? 1536; + + const resized = await sharp(buf) + .resize({ width: 768, withoutEnlargement: true, fit: "inside" }) + .png() + .toBuffer(); + + const meta = await sharp(resized).metadata(); + const w = meta.width ?? 768; + const h = meta.height ?? 1152; const cx = Math.round(click.x * w); const cy = Math.round(click.y * h); - const r = Math.round(Math.min(w, h) * 0.025); - const stroke = Math.max(3, Math.round(r * 0.25)); + const r = Math.max(8, Math.round(Math.min(w, h) * 0.025)); + const stroke = Math.max(2, Math.round(r * 0.25)); - const svg = ` + const svg = ` `; - const out = await sharp(buf) + const out = await sharp(resized) .composite([{ input: Buffer.from(svg), top: 0, left: 0 }]) - .png() + .png({ compressionLevel: 9 }) .toBuffer(); return out.toString("base64"); diff --git a/packages/engine/src/index.ts b/packages/engine/src/index.ts index 080cc10..f4915fb 100644 --- a/packages/engine/src/index.ts +++ b/packages/engine/src/index.ts @@ -1,3 +1,3 @@ -export { startSession, takeTurn } from "./orchestrator"; +export { startSession, takeTurn, visionTurn } from "./orchestrator"; export { annotateClick } from "./annotate"; export * from "./prompts"; diff --git a/packages/engine/src/orchestrator.ts b/packages/engine/src/orchestrator.ts index 7c408ac..6fee9ba 100644 --- a/packages/engine/src/orchestrator.ts +++ b/packages/engine/src/orchestrator.ts @@ -1,10 +1,13 @@ import type { + ClickIntent, EngineConfig, InteractRequest, InteractResponse, Session, StartRequest, StartResponse, + VisionRequest, + VisionResponse, } from "@dada/types"; import { annotateClick } from "./annotate"; import { direct } from "./director"; @@ -37,21 +40,27 @@ export async function startSession( }; } +export async function visionTurn( + config: EngineConfig, + req: VisionRequest, +): Promise { + const annotated = await annotateClick(req.prevImageBase64, req.click); + const lastFrame = req.session.history.at(-1)?.frame; + const uiElements = lastFrame?.uiElements ?? []; + const intent = await interpret(config.vision, annotated, uiElements); + return { intent }; +} + export async function takeTurn( config: EngineConfig, req: InteractRequest, ): Promise { - const annotated = await annotateClick(req.prevImageBase64, req.click); - - const lastFrame = req.session.history.at(-1)?.frame; - const uiElements = lastFrame?.uiElements ?? []; - - const intent = await interpret(config.vision, annotated, uiElements); - const updatedSession: Session = { ...req.session, history: req.session.history.map((entry, idx, arr) => - idx === arr.length - 1 ? { ...entry, click: req.click, intent } : entry, + idx === arr.length - 1 + ? { ...entry, click: req.click, intent: req.intent } + : entry, ), }; @@ -66,6 +75,6 @@ export async function takeTurn( session: updatedSession, frame: nextFrame, imageBase64: nextImage, - intent, + intent: req.intent, }; } diff --git a/packages/engine/src/prompts.ts b/packages/engine/src/prompts.ts index 05ae03a..eae5177 100644 --- a/packages/engine/src/prompts.ts +++ b/packages/engine/src/prompts.ts @@ -29,7 +29,7 @@ export function buildDirectorUserMessage(session: Session): string { parts.push(`画风:${session.styleGuide}`); if (session.history.length === 0) { - parts.push("\n这是故事的开场。请生成开场画面。"); + parts.push("\n这是故事的开场。请生成开场画面,严格以 JSON 格式返回。"); return parts.join("\n"); } @@ -47,7 +47,7 @@ export function buildDirectorUserMessage(session: Session): string { parts.push(beat.join("\n")); }); - parts.push("\n请生成下一帧。"); + parts.push("\n请生成下一帧,严格以 JSON 格式返回。"); return parts.join("\n"); } @@ -111,5 +111,5 @@ export function buildVisionUserPrompt(uiElements: UIElement[]): string { return `当前画面包含以下已知 UI 元素: ${list} -红点位置即为用户点击位置。请判断用户的意图。`; +红点位置即为用户点击位置。请判断用户的意图,并以 JSON 格式返回结果。`; } diff --git a/packages/engine/src/renderer.ts b/packages/engine/src/renderer.ts index 3ff5535..0f5a9a9 100644 --- a/packages/engine/src/renderer.ts +++ b/packages/engine/src/renderer.ts @@ -8,5 +8,5 @@ export async function render( styleGuide: string, ): Promise { const prompt = buildImagePrompt(frame, styleGuide); - return generateImage(config, prompt, { size: "1024x1536", quality: "medium" }); + return generateImage(config, prompt); } diff --git a/packages/types/src/index.ts b/packages/types/src/index.ts index 892afae..f021148 100644 --- a/packages/types/src/index.ts +++ b/packages/types/src/index.ts @@ -60,12 +60,22 @@ export type StartResponse = { imageBase64: string; }; -export type InteractRequest = { +export type VisionRequest = { session: Session; prevImageBase64: string; click: { x: number; y: number }; }; +export type VisionResponse = { + intent: ClickIntent; +}; + +export type InteractRequest = { + session: Session; + intent: ClickIntent; + click?: { x: number; y: number }; +}; + export type InteractResponse = { session: Session; frame: StoryFrame; diff --git a/vercel.json b/vercel.json index 25544e5..5af8dcf 100644 --- a/vercel.json +++ b/vercel.json @@ -5,6 +5,7 @@ "installCommand": "pnpm install", "functions": { "apps/web/app/api/interact/route.ts": { "maxDuration": 60 }, + "apps/web/app/api/vision/route.ts": { "maxDuration": 60 }, "apps/web/app/api/start/route.ts": { "maxDuration": 60 } } }