From 9cedfa66e45a71cd576367b50ce8e803b5c3f55d Mon Sep 17 00:00:00 2001
From: yuanzonghao <yuanzonghao123@gmail.com>
Date: Tue, 12 May 2026 19:38:03 +0800
Subject: [PATCH] feat: prefetch, vision split, provider adapter, UI polish

Engine
- Split /api/vision out from /api/interact so client can drive
  prefetch + cache lookup independently of click interpretation
- Image client switched to the chat-completions + modalities API
  (OpenRouter / provider style); accepts structured image parts,
  data: URLs, and markdown or bare image URLs in the response
- annotateClick now downscales to at most 768px wide (never enlarging)
  before compositing, to keep vision payloads small and avoid CDN
  timeouts
- Prompts updated to mention "JSON" in user messages (required by
  Gemini's strict JSON mode)
- Shared fetchWithRetry helper: retries network errors and 5xx
  responses with linear backoff (never 4xx or aborts); 2 retries for
  chat/image, 0 for vision, which also gets a 60s hard timeout

Client
- Parallel prefetch of every choice branch (typically three) on each
  new frame
- Prefetch effect deliberately excludes phase from its deps so a user
  click doesn't abort in-flight prefetches
- handleClick awaits the prefetched promise on a cache hit; on a miss
  (free-form click) it aborts the prefetches and calls /api/interact
  live
- PlayCanvas lets the img size itself (w-auto/h-auto) so the container
  follows whatever aspect ratio the AI returns (no more cropped third
  choice); naturalWidth/naturalHeight now feed the size label
- max-width raised from 440px to min(560px, 92vw); max-height capped at
  calc(100dvh - 200px)

Misc
- README env-path corrected to apps/web/.env.local
- users.md: BGM/TTS idea note (NOTE: no users.md change appears in this
  patch's diffstat)
- .env.example moved into apps/web alongside next config

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .env.example                             |  21 ----
 README.md                                |   4 +-
 apps/web/.env.example                    |  24 ++++
 apps/web/app/api/interact/route.ts       |   4 +-
 apps/web/app/api/vision/route.ts         |  32 ++++++
 apps/web/app/layout.tsx                  |   2 +-
 apps/web/app/play/page.tsx               | 119 +++++++++++++++++---
 apps/web/components/PlayCanvas.tsx       | 133 +++++++++++++----------
 apps/web/next-env.d.ts                   |   2 +
 packages/ai-client/src/chat.ts           |   3 +-
 packages/ai-client/src/fetchWithRetry.ts |  39 +++++++
 packages/ai-client/src/image.ts          |  74 +++++++++----
 packages/ai-client/src/vision.ts         |  27 +++--
 packages/engine/src/annotate.ts          |  22 ++--
 packages/engine/src/index.ts             |   2 +-
 packages/engine/src/orchestrator.ts      |  27 +++--
 packages/engine/src/prompts.ts           |   6 +-
 packages/engine/src/renderer.ts          |   2 +-
 packages/types/src/index.ts              |  12 +-
 vercel.json                              |   1 +
 20 files changed, 405 insertions(+), 151 deletions(-)
 delete mode 100644 .env.example
 create mode 100644 apps/web/.env.example
 create mode 100644 apps/web/app/api/vision/route.ts
 create mode 100644 packages/ai-client/src/fetchWithRetry.ts

diff --git a/.env.example b/.env.example
deleted file mode 100644
index 6547b5c..0000000
--- a/.env.example
+++ /dev/null
@@ -1,21 +0,0 @@
-# =============================================================
-# Dada — AI Visual Novel
-# Three independently configurable AI providers
-# Any OpenAI-compatible endpoint works (OpenAI, Anthropic, Gemini,
-# OpenRouter, DeepSeek, Ollama, ...).
-# =============================================================
-
-# ---- 1. Text LLM (story director) -----------------------------
-TEXT_BASE_URL=https://api.anthropic.com/v1
-TEXT_API_KEY=sk-ant-xxx
-TEXT_MODEL=claude-opus-4-7
-
-# ---- 2. Image generator (renders the whole UI screen) ---------
-IMAGE_BASE_URL=https://api.openai.com/v1
-IMAGE_API_KEY=sk-xxx
-IMAGE_MODEL=gpt-image-2
-
-# ---- 3. Vision model (interprets where the user clicked) ------
-VISION_BASE_URL=https://generativelanguage.googleapis.com/v1beta/openai
-VISION_API_KEY=xxx
-VISION_MODEL=gemini-3-flash
diff --git a/README.md b/README.md
index 077afca..3bba298 100644
--- a/README.md
+++ b/README.md
@@ -49,7 +49,7 @@ Three providers, all independently configurable. Any OpenAI-compatible chat / im
 | Image · UI renderer   | `IMAGE_BASE_URL` `IMAGE_API_KEY` `IMAGE_MODEL` | `gpt-image-2` via OpenAI |
 | Vision · click reader | `VISION_BASE_URL` `VISION_API_KEY` `VISION_MODEL` | `gemini-3-flash` via Google |
 
-See `.env.example` for the exact shape.
+See `apps/web/.env.example` for the exact shape.
 
 ---
 
@@ -59,7 +59,7 @@ Requires Node 20+ and pnpm 9+.
 
 ```bash
 pnpm install
-cp .env.example .env.local
+cp apps/web/.env.example apps/web/.env.local
 # fill in the nine env vars
 pnpm dev
 # open http://localhost:3000
diff --git a/apps/web/.env.example b/apps/web/.env.example
new file mode 100644
index 0000000..fe9c11e
--- /dev/null
+++ b/apps/web/.env.example
@@ -0,0 +1,24 @@
+# =============================================================
+# Dada — AI Visual Novel
+# Three independently configurable AI providers
+# Any OpenAI-compatible endpoint works (OpenRouter, OpenAI,
+# Anthropic via OpenAI-compat proxy, Gemini, DeepSeek, Ollama).
+#
+# Image generation uses the chat-completions + modalities API
+# (OpenRouter-style), NOT the legacy /images/generations endpoint.
+# =============================================================
+
+# ---- 1. Text LLM (story director) -----------------------------
+TEXT_BASE_URL=https://openrouter.ai/api/v1
+TEXT_API_KEY=sk-or-v1-xxx
+TEXT_MODEL=~anthropic/claude-sonnet-latest
+
+# ---- 2. Image generator (renders the whole UI screen) ---------
+IMAGE_BASE_URL=https://openrouter.ai/api/v1
+IMAGE_API_KEY=sk-or-v1-xxx
+IMAGE_MODEL=openai/gpt-5.4-image-2
+
+# ---- 3. Vision model (interprets where the user clicked) ------
+VISION_BASE_URL=https://openrouter.ai/api/v1
+VISION_API_KEY=sk-or-v1-xxx
+VISION_MODEL=~google/gemini-flash-latest
diff --git a/apps/web/app/api/interact/route.ts b/apps/web/app/api/interact/route.ts
index c33510a..45872cb 100644
--- a/apps/web/app/api/interact/route.ts
+++ b/apps/web/app/api/interact/route.ts
@@ -14,9 +14,9 @@ export async function POST(req: Request) {
     return NextResponse.json({ error: "Invalid JSON" }, { status: 400 });
   }
 
-  if (!body.session || !body.prevImageBase64 || !body.click) {
+  if (!body.session || !body.intent) {
     return NextResponse.json(
-      { error: "session, prevImageBase64, click are required" },
+      { error: "session and intent are required" },
       { status: 400 },
     );
   }
diff --git a/apps/web/app/api/vision/route.ts b/apps/web/app/api/vision/route.ts
new file mode 100644
index 0000000..864d751
--- /dev/null
+++ b/apps/web/app/api/vision/route.ts
@@ -0,0 +1,32 @@
+import { visionTurn } from "@dada/engine";
+import type { VisionRequest } from "@dada/types";
+import { NextResponse } from "next/server";
+import { loadEngineConfig } from "@/lib/config";
+
+export const runtime = "nodejs";
+export const maxDuration = 60;
+
+export async function POST(req: Request) {
+  let body: VisionRequest;
+  try {
+    body = (await req.json()) as VisionRequest;
+  } catch {
+    return NextResponse.json({ error: "Invalid JSON" }, { status: 400 });
+  }
+
+  if (!body.session || !body.prevImageBase64 || !body.click) {
+    return NextResponse.json(
+      { error: "session, prevImageBase64, click are required" },
+      { status: 400 },
+    );
+  }
+
+  try {
+    const config = loadEngineConfig();
+    const result = await visionTurn(config, body);
+    return NextResponse.json(result);
+  } catch (err) {
+    const message = err instanceof Error ? err.message : "Unknown error";
+    return NextResponse.json({ error: message }, { status: 500 });
+  }
+}
diff --git a/apps/web/app/layout.tsx b/apps/web/app/layout.tsx
index 6c73574..c09d866 100644
--- a/apps/web/app/layout.tsx
+++ b/apps/web/app/layout.tsx
@@ -13,7 +13,7 @@ export default function RootLayout({
   children: React.ReactNode;
 }) {
   return (
-    <html lang="zh-CN">
+    <html lang="zh-CN" suppressHydrationWarning>
       <head>
         <link rel="preconnect" href="https://fonts.googleapis.com" />
         <link
diff --git a/apps/web/app/play/page.tsx b/apps/web/app/play/page.tsx
index 8104812..0ce2c20 100644
--- a/apps/web/app/play/page.tsx
+++ b/apps/web/app/play/page.tsx
@@ -11,6 +11,7 @@ import type {
   Session,
   StartResponse,
   StoryFrame,
+  VisionResponse,
 } from "@dada/types";
 
 function PlayInner() {
@@ -28,7 +29,10 @@ function PlayInner() {
   } | null>(null);
   const [turnNum, setTurnNum] = useState(0);
   const [error, setError] = useState<string | null>(null);
+
   const startedRef = useRef(false);
+  const prefetchAbortRef = useRef<AbortController | null>(null);
+  const prefetchRef = useRef<Record<string, Promise<InteractResponse>>>({});
 
   useEffect(() => {
     if (startedRef.current) return;
@@ -88,14 +92,60 @@ function PlayInner() {
       .catch((e) => setError(String(e)));
   }, [params, router]);
 
+  // Prefetch next-frame candidates whenever current frame becomes ready.
+  // All three fire in parallel for fastest cache fill. NOT depending on
+  // `phase` — we don't want to abort in-flight prefetches just because
+  // the user clicked. They should continue so handleClick can await them.
+  useEffect(() => {
+    if (!session || !frame) return;
+
+    prefetchAbortRef.current?.abort();
+    const ctrl = new AbortController();
+    prefetchAbortRef.current = ctrl;
+
+    const choices = frame.uiElements.filter((e) => e.kind === "choice");
+    const promises: Record<string, Promise<InteractResponse>> = {};
+
+    for (const choice of choices) {
+      const syntheticIntent: ClickIntent = {
+        targetId: choice.id,
+        targetLabel: choice.label,
+        reasoning: "prefetch",
+      };
+      const p = fetch("/api/interact", {
+        method: "POST",
+        headers: { "Content-Type": "application/json" },
+        body: JSON.stringify({ session, intent: syntheticIntent }),
+        signal: ctrl.signal,
+      }).then(async (r) => {
+        if (!r.ok) {
+          const j = (await r.json().catch(() => ({}))) as { error?: string };
+          throw new Error(j.error ?? r.statusText);
+        }
+        return r.json() as Promise<InteractResponse>;
+      });
+      p.catch(() => {});
+      promises[choice.id] = p;
+    }
+
+    prefetchRef.current = promises;
+
+    return () => {
+      ctrl.abort();
+    };
+  }, [frame?.id, session?.id]);
+
   async function handleClick(click: { x: number; y: number }) {
     if (phase !== "ready" || !session || !imageBase64) return;
     setPhase("interacting");
     setPendingClick(click);
     setIntent(null);
 
+    const cacheSnapshot = prefetchRef.current;
+
     try {
-      const res = await fetch("/api/interact", {
+      // Step 1: Vision (~4s) — figure out what the user actually clicked
+      const visionRes = await fetch("/api/vision", {
         method: "POST",
         headers: { "Content-Type": "application/json" },
         body: JSON.stringify({
@@ -104,20 +154,61 @@ function PlayInner() {
           click,
         }),
       });
-      if (!res.ok) {
-        const j = (await res.json().catch(() => ({}))) as { error?: string };
-        throw new Error(j.error ?? res.statusText);
+      if (!visionRes.ok) {
+        const j = (await visionRes.json().catch(() => ({}))) as {
+          error?: string;
+        };
+        throw new Error(j.error ?? visionRes.statusText);
       }
-      const data = (await res.json()) as InteractResponse;
+      const { intent: clickIntent } =
+        (await visionRes.json()) as VisionResponse;
 
-      const updatedHistory = [
-        ...data.session.history,
-        { frame: data.frame },
-      ];
-      setSession({ ...data.session, history: updatedHistory });
-      setFrame(data.frame);
-      setImageBase64(data.imageBase64);
-      setIntent(data.intent);
+      // Step 2: Cache lookup
+      const cached = clickIntent.targetId
+        ? cacheSnapshot[clickIntent.targetId]
+        : undefined;
+
+      let result: InteractResponse;
+      if (cached) {
+        // Cache hit — await the prefetched promise (mostly already resolved)
+        result = await cached;
+        // Overwrite the synthetic prefetch intent on history with the real one
+        const lastIdx = result.session.history.length - 1;
+        result = {
+          ...result,
+          intent: clickIntent,
+          session: {
+            ...result.session,
+            history: result.session.history.map((entry, idx) =>
+              idx === lastIdx
+                ? { ...entry, click, intent: clickIntent }
+                : entry,
+            ),
+          },
+        };
+      } else {
+        // Cache miss (free-form click) — abort wasted prefetches, run live
+        prefetchAbortRef.current?.abort();
+        const liveRes = await fetch("/api/interact", {
+          method: "POST",
+          headers: { "Content-Type": "application/json" },
+          body: JSON.stringify({ session, intent: clickIntent, click }),
+        });
+        if (!liveRes.ok) {
+          const j = (await liveRes.json().catch(() => ({}))) as {
+            error?: string;
+          };
+          throw new Error(j.error ?? liveRes.statusText);
+        }
+        result = (await liveRes.json()) as InteractResponse;
+      }
+
+      // Apply the result: append new frame to history
+      const updatedHistory = [...result.session.history, { frame: result.frame }];
+      setSession({ ...result.session, history: updatedHistory });
+      setFrame(result.frame);
+      setImageBase64(result.imageBase64);
+      setIntent(clickIntent);
       setPendingClick(null);
       setTurnNum((t) => t + 1);
       setPhase("ready");
@@ -189,7 +280,7 @@ function PlayInner() {
                 AI · is · painting · the · next · moment
               </p>
               <p className="font-serif italic text-clay-400 text-xs">
-                this usually takes 12–20 seconds
+                cached choices resolve in seconds · free-form takes longer
               </p>
             </div>
           )}
diff --git a/apps/web/components/PlayCanvas.tsx b/apps/web/components/PlayCanvas.tsx
index 3a60a79..e38a7fe 100644
--- a/apps/web/components/PlayCanvas.tsx
+++ b/apps/web/components/PlayCanvas.tsx
@@ -1,9 +1,12 @@
 "use client";
 
-import { useRef } from "react";
+import { useRef, useState } from "react";
 
 export type Phase = "loading-first" | "ready" | "interacting";
 
+const SHADOW =
+  "0 1px 0 rgba(45,24,16,0.05), 0 36px 64px -28px rgba(45,24,16,0.25), 0 8px 18px -6px rgba(45,24,16,0.10)";
+
 export function PlayCanvas({
   imageBase64,
   phase,
@@ -15,11 +18,12 @@ export function PlayCanvas({
   pendingClick: { x: number; y: number } | null;
   onClick: (click: { x: number; y: number }) => void;
 }) {
-  const ref = useRef<HTMLDivElement>(null);
+  const imgRef = useRef<HTMLImageElement>(null);
+  const [dims, setDims] = useState<{ w: number; h: number } | null>(null);
 
-  function handleClick(e: React.MouseEvent<HTMLDivElement>) {
-    if (phase !== "ready" || !ref.current || !imageBase64) return;
-    const rect = ref.current.getBoundingClientRect();
+  function handleClick(e: React.MouseEvent<HTMLImageElement>) {
+    if (phase !== "ready" || !imgRef.current) return;
+    const rect = imgRef.current.getBoundingClientRect();
     const x = (e.clientX - rect.left) / rect.width;
     const y = (e.clientY - rect.top) / rect.height;
     onClick({
@@ -32,70 +36,81 @@ export function PlayCanvas({
   const dimmed = phase === "interacting";
 
   return (
-    <div className="w-full max-w-[440px] mx-auto">
-      <div
-        ref={ref}
-        onClick={handleClick}
-        className={`relative aspect-[2/3] w-full overflow-hidden bg-cream-200 select-none ${interactive ? "cursor-pointer" : "cursor-wait"}`}
-        style={{
-          boxShadow:
-            "0 1px 0 rgba(45,24,16,0.05), 0 36px 64px -28px rgba(45,24,16,0.25), 0 8px 18px -6px rgba(45,24,16,0.10)",
-        }}
-      >
-        {imageBase64 ? (
+    <div className="w-full flex flex-col items-center">
+      {imageBase64 ? (
+        <div className="relative inline-block" style={{ boxShadow: SHADOW }}>
           <img
             key={imageBase64.slice(-48)}
+            ref={imgRef}
             src={`data:image/png;base64,${imageBase64}`}
             alt="Generated frame"
-            className={`absolute inset-0 w-full h-full object-cover animate-fade-in transition-opacity duration-700 ease-out ${dimmed ? "opacity-30" : "opacity-100"}`}
+            onClick={handleClick}
+            onLoad={(e) => {
+              const img = e.currentTarget;
+              setDims({ w: img.naturalWidth, h: img.naturalHeight });
+            }}
             draggable={false}
+            className={`block w-auto h-auto select-none animate-fade-in transition-opacity duration-700 ease-out ${interactive ? "cursor-pointer" : "cursor-wait"} ${dimmed ? "opacity-30" : "opacity-100"}`}
+            style={{
+              maxWidth: "min(560px, 92vw)",
+              maxHeight: "calc(100dvh - 200px)",
+            }}
           />
-        ) : (
-          <div className="absolute inset-0 flex flex-col items-center justify-center gap-4">
-            <div className="w-1.5 h-1.5 bg-clay-500 rounded-full animate-slow-pulse" />
-            <p className="text-[9px] smallcaps text-clay-500 animate-slow-pulse">
-              Painting · the · first · frame
-            </p>
-          </div>
-        )}
 
-        <div className="absolute inset-x-0 top-0 h-12 bg-gradient-to-b from-clay-900/15 to-transparent pointer-events-none" />
-        <div className="absolute inset-x-0 bottom-0 h-12 bg-gradient-to-t from-clay-900/15 to-transparent pointer-events-none" />
+          <div className="absolute inset-x-0 top-0 h-10 bg-gradient-to-b from-clay-900/12 to-transparent pointer-events-none" />
+          <div className="absolute inset-x-0 bottom-0 h-10 bg-gradient-to-t from-clay-900/12 to-transparent pointer-events-none" />
 
-        {pendingClick && (
-          <>
-            <div
-              className="absolute rounded-full border border-ember-500 pointer-events-none"
-              style={{
-                left: `${pendingClick.x * 100}%`,
-                top: `${pendingClick.y * 100}%`,
-                transform: "translate(-50%, -50%)",
-                width: 30,
-                height: 30,
-                animation:
-                  "dada-ripple 1.6s cubic-bezier(0.16,1,0.3,1) infinite",
-              }}
-            />
-            <div
-              className="absolute rounded-full pointer-events-none"
-              style={{
-                left: `${pendingClick.x * 100}%`,
-                top: `${pendingClick.y * 100}%`,
-                transform: "translate(-50%, -50%)",
-                width: 11,
-                height: 11,
-                background: "#D97A2E",
-                boxShadow:
-                  "0 0 0 3px rgba(251,247,240,0.95), 0 0 14px rgba(217,122,46,0.55)",
-              }}
-            />
-          </>
-        )}
-      </div>
+          {pendingClick && (
+            <>
+              <div
+                className="absolute rounded-full border border-ember-500 pointer-events-none"
+                style={{
+                  left: `${pendingClick.x * 100}%`,
+                  top: `${pendingClick.y * 100}%`,
+                  transform: "translate(-50%, -50%)",
+                  width: 30,
+                  height: 30,
+                  animation:
+                    "dada-ripple 1.6s cubic-bezier(0.16,1,0.3,1) infinite",
+                }}
+              />
+              <div
+                className="absolute rounded-full pointer-events-none"
+                style={{
+                  left: `${pendingClick.x * 100}%`,
+                  top: `${pendingClick.y * 100}%`,
+                  transform: "translate(-50%, -50%)",
+                  width: 11,
+                  height: 11,
+                  background: "#D97A2E",
+                  boxShadow:
+                    "0 0 0 3px rgba(251,247,240,0.95), 0 0 14px rgba(217,122,46,0.55)",
+                }}
+              />
+            </>
+          )}
+        </div>
+      ) : (
+        <div
+          className="relative aspect-[2/3] bg-cream-200 flex flex-col items-center justify-center gap-4"
+          style={{
+            width: "min(560px, calc((100dvh - 200px) * 2 / 3), 92vw)",
+            boxShadow: SHADOW,
+          }}
+        >
+          <div className="w-1.5 h-1.5 bg-clay-500 rounded-full animate-slow-pulse" />
+          <p className="text-[9px] smallcaps text-clay-500 animate-slow-pulse">
+            Painting · the · first · frame
+          </p>
+        </div>
+      )}
 
-      <div className="flex items-center justify-between mt-3 px-1">
+      <div
+        className="flex items-center justify-between mt-3 px-1 w-full"
+        style={{ maxWidth: "min(560px, 92vw)" }}
+      >
         <span className="text-[9px] smallcaps text-clay-400 num">
-          1024 × 1536 · png
+          {dims ? `${dims.w} × ${dims.h} · png` : "—"}
         </span>
         <span className="text-[9px] smallcaps text-clay-400">
           {phase === "ready" ? "Tap · anywhere" : "···"}
diff --git a/apps/web/next-env.d.ts b/apps/web/next-env.d.ts
index 84ab714..c4b7818 100644
--- a/apps/web/next-env.d.ts
+++ b/apps/web/next-env.d.ts
@@ -1,4 +1,6 @@
 /// <reference types="next" />
 /// <reference types="next/image-types/global" />
+import "./.next/dev/types/routes.d.ts";
 
 // NOTE: This file should not be edited
+// see https://nextjs.org/docs/app/api-reference/config/typescript for more information.
diff --git a/packages/ai-client/src/chat.ts b/packages/ai-client/src/chat.ts
index 41f4b6d..6e05afd 100644
--- a/packages/ai-client/src/chat.ts
+++ b/packages/ai-client/src/chat.ts
@@ -1,4 +1,5 @@
 import type { ProviderConfig } from "@dada/types";
+import { fetchWithRetry } from "./fetchWithRetry";
 
 export type ChatMessage = {
   role: "system" | "user" | "assistant";
@@ -20,7 +21,7 @@ export async function chat(
     body.response_format = { type: "json_object" };
   }
 
-  const res = await fetch(url, {
+  const res = await fetchWithRetry(url, {
     method: "POST",
     headers: {
       "Content-Type": "application/json",
diff --git a/packages/ai-client/src/fetchWithRetry.ts b/packages/ai-client/src/fetchWithRetry.ts
new file mode 100644
index 0000000..3f6531f
--- /dev/null
+++ b/packages/ai-client/src/fetchWithRetry.ts
@@ -0,0 +1,39 @@
+type RetryInit = RequestInit & { retries?: number; retryDelayMs?: number };
+
+export async function fetchWithRetry(
+  url: string,
+  init: RetryInit,
+): Promise<Response> {
+  const { retries = 2, retryDelayMs = 1500, ...fetchInit } = init;
+
+  let lastError: unknown;
+  for (let attempt = 0; attempt <= retries; attempt++) {
+    try {
+      const res = await fetch(url, fetchInit);
+      if (res.ok) return res;
+      // Don't retry 4xx (client errors won't fix themselves)
+      if (res.status >= 400 && res.status < 500) return res;
+      // 5xx: retry if we have budget left
+      if (attempt < retries) {
+        await sleep(retryDelayMs * (attempt + 1));
+        continue;
+      }
+      return res;
+    } catch (err) {
+      lastError = err;
+      const isAbort =
+        err instanceof DOMException && err.name === "AbortError";
+      if (isAbort) throw err;
+      if (attempt < retries) {
+        await sleep(retryDelayMs * (attempt + 1));
+        continue;
+      }
+      throw err;
+    }
+  }
+  throw lastError;
+}
+
+function sleep(ms: number): Promise<void> {
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
diff --git a/packages/ai-client/src/image.ts b/packages/ai-client/src/image.ts
index 3296dc3..c87d12b 100644
--- a/packages/ai-client/src/image.ts
+++ b/packages/ai-client/src/image.ts
@@ -1,20 +1,29 @@
 import type { ProviderConfig } from "@dada/types";
+import { fetchWithRetry } from "./fetchWithRetry";
+
+type ImageUrlPart = { type: string; image_url?: { url?: string } };
+type ChatResponse = {
+  choices: {
+    message: {
+      content: string | ImageUrlPart[];
+      images?: ImageUrlPart[];
+    };
+  }[];
+};
 
 export async function generateImage(
   config: ProviderConfig,
   prompt: string,
-  opts?: { size?: string; quality?: "low" | "medium" | "high" | "auto" },
 ): Promise<string> {
-  const url = `${config.baseUrl.replace(/\/$/, "")}/images/generations`;
-  const body: Record<string, unknown> = {
+  const url = `${config.baseUrl.replace(/\/$/, "")}/chat/completions`;
+
+  const body = {
     model: config.model,
-    prompt,
-    size: opts?.size ?? "1024x1536",
-    quality: opts?.quality ?? "medium",
-    n: 1,
+    modalities: ["image", "text"],
+    messages: [{ role: "user", content: prompt }],
   };
 
-  const res = await fetch(url, {
+  const res = await fetchWithRetry(url, {
     method: "POST",
     headers: {
       "Content-Type": "application/json",
@@ -25,20 +34,45 @@ export async function generateImage(
 
   if (!res.ok) {
     const text = await res.text();
-    throw new Error(`Image API error ${res.status}: ${text}`);
+    throw new Error(`Image API error ${res.status}: ${text.slice(0, 500)}`);
   }
 
-  const json = (await res.json()) as {
-    data: { b64_json?: string; url?: string }[];
-  };
-  const item = json.data[0];
-  if (!item) throw new Error("Image API returned no data");
+  const json = (await res.json()) as ChatResponse;
+  const msg = json.choices[0]?.message;
+  if (!msg) throw new Error("Image API returned no message");
 
-  if (item.b64_json) return item.b64_json;
-  if (item.url) {
-    const imgRes = await fetch(item.url);
-    const buf = await imgRes.arrayBuffer();
-    return Buffer.from(buf).toString("base64");
+  // 1) OpenRouter-style: msg.images = [{ image_url: { url } }]
+  // 2) OpenAI multimodal: msg.content = [{ type: "image_url", image_url: { url } }]
+  const structured: ImageUrlPart[] = [];
+  if (msg.images) structured.push(...msg.images);
+  if (Array.isArray(msg.content)) structured.push(...msg.content);
+  for (const part of structured) {
+    const u = part.image_url?.url;
+    if (u) return await urlToBase64(u);
   }
-  throw new Error("Image API returned neither b64_json nor url");
+
+  // 3) provider-style: content is a string with markdown image ![alt](url)
+  //    or a bare URL fragment
+  if (typeof msg.content === "string") {
+    const md = msg.content.match(/!\[[^\]]*\]\((https?:\/\/[^\s)]+)\)/);
+    if (md?.[1]) return await urlToBase64(md[1]);
+    const bare = msg.content.match(/https?:\/\/\S+?\.(?:png|jpg|jpeg|webp)/i);
+    if (bare?.[0]) return await urlToBase64(bare[0]);
+  }
+
+  throw new Error(
+    `No image found in response: ${JSON.stringify(msg).slice(0, 300)}`,
+  );
+}
+
+async function urlToBase64(url: string): Promise<string> {
+  if (url.startsWith("data:")) {
+    const idx = url.indexOf("base64,");
+    if (idx === -1) throw new Error("data URL is not base64-encoded");
+    return url.slice(idx + "base64,".length);
+  }
+  const res = await fetch(url);
+  if (!res.ok) throw new Error(`Failed to fetch image url: ${res.status}`);
+  const buf = await res.arrayBuffer();
+  return Buffer.from(buf).toString("base64");
 }
diff --git a/packages/ai-client/src/vision.ts b/packages/ai-client/src/vision.ts
index 0e3c91c..e652190 100644
--- a/packages/ai-client/src/vision.ts
+++ b/packages/ai-client/src/vision.ts
@@ -1,4 +1,5 @@
 import type { ProviderConfig } from "@dada/types";
+import { fetchWithRetry } from "./fetchWithRetry";
 
 export async function interpretClick(
   config: ProviderConfig,
@@ -25,14 +26,24 @@ export async function interpretClick(
     response_format: { type: "json_object" },
   };
 
-  const res = await fetch(url, {
-    method: "POST",
-    headers: {
-      "Content-Type": "application/json",
-      Authorization: `Bearer ${config.apiKey}`,
-    },
-    body: JSON.stringify(body),
-  });
+  const timeoutCtrl = new AbortController();
+  const timeoutId = setTimeout(() => timeoutCtrl.abort(), 60_000);
+
+  let res: Response;
+  try {
+    res = await fetchWithRetry(url, {
+      method: "POST",
+      headers: {
+        "Content-Type": "application/json",
+        Authorization: `Bearer ${config.apiKey}`,
+      },
+      body: JSON.stringify(body),
+      signal: timeoutCtrl.signal,
+      retries: 0,
+    });
+  } finally {
+    clearTimeout(timeoutId);
+  }
 
   if (!res.ok) {
     const text = await res.text();
diff --git a/packages/engine/src/annotate.ts b/packages/engine/src/annotate.ts
index 3f7b412..77df5da 100644
--- a/packages/engine/src/annotate.ts
+++ b/packages/engine/src/annotate.ts
@@ -5,25 +5,31 @@ export async function annotateClick(
   click: { x: number; y: number },
 ): Promise<string> {
   const buf = Buffer.from(imageBase64, "base64");
-  const meta = await sharp(buf).metadata();
-  const w = meta.width ?? 1024;
-  const h = meta.height ?? 1536;
+
+  const resized = await sharp(buf)
+    .resize({ width: 768, withoutEnlargement: true, fit: "inside" })
+    .png()
+    .toBuffer();
+
+  const meta = await sharp(resized).metadata();
+  const w = meta.width ?? 768;
+  const h = meta.height ?? 1152;
 
   const cx = Math.round(click.x * w);
   const cy = Math.round(click.y * h);
-  const r = Math.round(Math.min(w, h) * 0.025);
-  const stroke = Math.max(3, Math.round(r * 0.25));
+  const r = Math.max(8, Math.round(Math.min(w, h) * 0.025));
+  const stroke = Math.max(2, Math.round(r * 0.25));
 
-  const svg = `<svg xmlns="http://www.w3.org/2000/svg" width="${w}" height="${h}">
+  const svg = `<svg xmlns="http://www.w3.org/2000/svg" width="${w}" height="${h}" viewBox="0 0 ${w} ${h}">
     <circle cx="${cx}" cy="${cy}" r="${r}" fill="rgba(255,40,40,0.55)"
             stroke="rgba(255,255,255,0.95)" stroke-width="${stroke}" />
     <circle cx="${cx}" cy="${cy}" r="${Math.round(r * 0.25)}"
             fill="rgba(255,255,255,1)" />
   </svg>`;
 
-  const out = await sharp(buf)
+  const out = await sharp(resized)
     .composite([{ input: Buffer.from(svg), top: 0, left: 0 }])
-    .png()
+    .png({ compressionLevel: 9 })
     .toBuffer();
 
   return out.toString("base64");
diff --git a/packages/engine/src/index.ts b/packages/engine/src/index.ts
index 080cc10..f4915fb 100644
--- a/packages/engine/src/index.ts
+++ b/packages/engine/src/index.ts
@@ -1,3 +1,3 @@
-export { startSession, takeTurn } from "./orchestrator";
+export { startSession, takeTurn, visionTurn } from "./orchestrator";
 export { annotateClick } from "./annotate";
 export * from "./prompts";
diff --git a/packages/engine/src/orchestrator.ts b/packages/engine/src/orchestrator.ts
index 7c408ac..6fee9ba 100644
--- a/packages/engine/src/orchestrator.ts
+++ b/packages/engine/src/orchestrator.ts
@@ -1,10 +1,13 @@
 import type {
+  ClickIntent,
   EngineConfig,
   InteractRequest,
   InteractResponse,
   Session,
   StartRequest,
   StartResponse,
+  VisionRequest,
+  VisionResponse,
 } from "@dada/types";
 import { annotateClick } from "./annotate";
 import { direct } from "./director";
@@ -37,21 +40,27 @@ export async function startSession(
   };
 }
 
+export async function visionTurn(
+  config: EngineConfig,
+  req: VisionRequest,
+): Promise<VisionResponse> {
+  const annotated = await annotateClick(req.prevImageBase64, req.click);
+  const lastFrame = req.session.history.at(-1)?.frame;
+  const uiElements = lastFrame?.uiElements ?? [];
+  const intent = await interpret(config.vision, annotated, uiElements);
+  return { intent };
+}
+
 export async function takeTurn(
   config: EngineConfig,
   req: InteractRequest,
 ): Promise<InteractResponse> {
-  const annotated = await annotateClick(req.prevImageBase64, req.click);
-
-  const lastFrame = req.session.history.at(-1)?.frame;
-  const uiElements = lastFrame?.uiElements ?? [];
-
-  const intent = await interpret(config.vision, annotated, uiElements);
-
   const updatedSession: Session = {
     ...req.session,
     history: req.session.history.map((entry, idx, arr) =>
-      idx === arr.length - 1 ? { ...entry, click: req.click, intent } : entry,
+      idx === arr.length - 1
+        ? { ...entry, click: req.click, intent: req.intent }
+        : entry,
     ),
   };
 
@@ -66,6 +75,6 @@ export async function takeTurn(
     session: updatedSession,
     frame: nextFrame,
     imageBase64: nextImage,
-    intent,
+    intent: req.intent,
   };
 }
diff --git a/packages/engine/src/prompts.ts b/packages/engine/src/prompts.ts
index 05ae03a..eae5177 100644
--- a/packages/engine/src/prompts.ts
+++ b/packages/engine/src/prompts.ts
@@ -29,7 +29,7 @@ export function buildDirectorUserMessage(session: Session): string {
   parts.push(`画风：${session.styleGuide}`);
 
   if (session.history.length === 0) {
-    parts.push("\n这是故事的开场。请生成开场画面。");
+    parts.push("\n这是故事的开场。请生成开场画面，严格以 JSON 格式返回。");
     return parts.join("\n");
   }
 
@@ -47,7 +47,7 @@ export function buildDirectorUserMessage(session: Session): string {
     parts.push(beat.join("\n"));
   });
 
-  parts.push("\n请生成下一帧。");
+  parts.push("\n请生成下一帧，严格以 JSON 格式返回。");
   return parts.join("\n");
 }
 
@@ -111,5 +111,5 @@ export function buildVisionUserPrompt(uiElements: UIElement[]): string {
   return `当前画面包含以下已知 UI 元素：
 ${list}
 
-红点位置即为用户点击位置。请判断用户的意图。`;
+红点位置即为用户点击位置。请判断用户的意图，并以 JSON 格式返回结果。`;
 }
diff --git a/packages/engine/src/renderer.ts b/packages/engine/src/renderer.ts
index 3ff5535..0f5a9a9 100644
--- a/packages/engine/src/renderer.ts
+++ b/packages/engine/src/renderer.ts
@@ -8,5 +8,5 @@ export async function render(
   styleGuide: string,
 ): Promise<string> {
   const prompt = buildImagePrompt(frame, styleGuide);
-  return generateImage(config, prompt, { size: "1024x1536", quality: "medium" });
+  return generateImage(config, prompt);
 }
diff --git a/packages/types/src/index.ts b/packages/types/src/index.ts
index 892afae..f021148 100644
--- a/packages/types/src/index.ts
+++ b/packages/types/src/index.ts
@@ -60,12 +60,22 @@ export type StartResponse = {
   imageBase64: string;
 };
 
-export type InteractRequest = {
+export type VisionRequest = {
   session: Session;
   prevImageBase64: string;
   click: { x: number; y: number };
 };
 
+export type VisionResponse = {
+  intent: ClickIntent;
+};
+
+export type InteractRequest = {
+  session: Session;
+  intent: ClickIntent;
+  click?: { x: number; y: number };
+};
+
 export type InteractResponse = {
   session: Session;
   frame: StoryFrame;
diff --git a/vercel.json b/vercel.json
index 25544e5..5af8dcf 100644
--- a/vercel.json
+++ b/vercel.json
@@ -5,6 +5,7 @@
   "installCommand": "pnpm install",
   "functions": {
     "apps/web/app/api/interact/route.ts": { "maxDuration": 60 },
+    "apps/web/app/api/vision/route.ts": { "maxDuration": 60 },
     "apps/web/app/api/start/route.ts": { "maxDuration": 60 }
   }
 }