feat: prefetch, vision split, provider adapter, UI polish

Engine - Split /api/vision out from /api/interact so the client can drive prefetch + cache lookup independently of click interpretation - Image client switched to chat-completions+modalities API (OpenRouter-style), supporting markdown image URL responses - annotateClick now resizes to 768w before composite to keep vision payloads small and avoid CDN timeouts - Prompts updated to mention "JSON" in user messages (required by Gemini's strict JSON mode) - Shared fetchWithRetry helper: 2 retries for chat/image, 0 for vision (with 60s hard timeout) Client - Parallel prefetch of all three choice branches on each new frame - Effect deliberately excludes phase from deps so user-click doesn't abort in-flight prefetches - Cache hit/miss/free-form fallback handled in handleClick - PlayCanvas reads img naturalWidth/Height and adapts container to whatever aspect ratio the AI returns (no more cropped third choice) - max-width raised to 560px, max-height calc(100dvh - 200px) Misc - README env-path corrected to apps/web/.env.local - users.md: BGM/TTS idea note - .env.example moved into apps/web alongside next config Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-12 19:38:03 +08:00
parent ad4b09c744
commit 9cedfa66e4
20 changed files with 405 additions and 151 deletions
@@ -0,0 +1,24 @@
+# =============================================================
+# Dada — AI Visual Novel
+# Three independently configurable AI providers
+# Any OpenAI-compatible endpoint works (OpenRouter, OpenAI,
+# Anthropic via OpenAI-compat proxy, Gemini, DeepSeek, Ollama).
+#
+# Image generation uses the chat-completions + modalities API
+# (OpenRouter-style), NOT the legacy /images/generations endpoint.
+# =============================================================
+
+# ---- 1. Text LLM (story director) -----------------------------
+TEXT_BASE_URL=https://openrouter.ai/api/v1
+TEXT_API_KEY=sk-or-v1-xxx
+TEXT_MODEL=~anthropic/claude-sonnet-latest
+
+# ---- 2. Image generator (renders the whole UI screen) ---------
+IMAGE_BASE_URL=https://openrouter.ai/api/v1
+IMAGE_API_KEY=sk-or-v1-xxx
+IMAGE_MODEL=openai/gpt-5.4-image-2
+
+# ---- 3. Vision model (interprets where the user clicked) ------
+VISION_BASE_URL=https://openrouter.ai/api/v1
+VISION_API_KEY=sk-or-v1-xxx
+VISION_MODEL=~google/gemini-flash-latest
@@ -14,9 +14,9 @@ export async function POST(req: Request) {
    return NextResponse.json({ error: "Invalid JSON" }, { status: 400 });
  }

-  if (!body.session || !body.prevImageBase64 || !body.click) {
+  if (!body.session || !body.intent) {
    return NextResponse.json(
-      { error: "session, prevImageBase64, click are required" },
+      { error: "session and intent are required" },
      { status: 400 },
    );
  }
@@ -0,0 +1,32 @@
+import { visionTurn } from "@dada/engine";
+import type { VisionRequest } from "@dada/types";
+import { NextResponse } from "next/server";
+import { loadEngineConfig } from "@/lib/config";
+
+export const runtime = "nodejs";
+export const maxDuration = 60;
+
+export async function POST(req: Request) { // POST /api/vision: validates a VisionRequest body, runs visionTurn, returns its result as JSON
+  let body: VisionRequest;
+  try {
+    body = (await req.json()) as VisionRequest; // cast only; required fields are checked by the guard below
+  } catch {
+    return NextResponse.json({ error: "Invalid JSON" }, { status: 400 }); // body was not parseable JSON
+  }
+
+  if (!body?.session || !body.prevImageBase64 || !body.click) { // `?.`: a literal `null` body is valid JSON and must yield 400, not a TypeError
+    return NextResponse.json(
+      { error: "session, prevImageBase64, click are required" },
+      { status: 400 },
+    );
+  }
+
+  try {
+    const config = loadEngineConfig();
+    const result = await visionTurn(config, body);
+    return NextResponse.json(result);
+  } catch (err) {
+    const message = err instanceof Error ? err.message : "Unknown error"; // non-Error throws get a generic message
+    return NextResponse.json({ error: message }, { status: 500 }); // failure while loading config or running visionTurn
+  }
+}
@@ -13,7 +13,7 @@ export default function RootLayout({
  children: React.ReactNode;
 }) {
  return (
-    <html lang="zh-CN">
+    <html lang="zh-CN" suppressHydrationWarning>
      <head>
        <link rel="preconnect" href="https://fonts.googleapis.com" />
        <link
@@ -11,6 +11,7 @@ import type {
  Session,
  StartResponse,
  StoryFrame,
+  VisionResponse,
 } from "@dada/types";

 function PlayInner() {
@@ -28,7 +29,10 @@ function PlayInner() {
  } | null>(null);
  const [turnNum, setTurnNum] = useState(0);
  const [error, setError] = useState<string | null>(null);
+
  const startedRef = useRef(false);
+  const prefetchAbortRef = useRef<AbortController | null>(null);
+  const prefetchRef = useRef<Record<string, Promise<InteractResponse>>>({});

  useEffect(() => {
    if (startedRef.current) return;
@@ -88,14 +92,60 @@ function PlayInner() {
      .catch((e) => setError(String(e)));
  }, [params, router]);

+  // Prefetch next-frame candidates whenever current frame becomes ready.
+  // All three fire in parallel for fastest cache fill. NOT depending on
+  // `phase` — we don't want to abort in-flight prefetches just because
+  // the user clicked. They should continue so handleClick can await them.
+  useEffect(() => {
+    if (!session || !frame) return;
+
+    prefetchAbortRef.current?.abort();
+    const ctrl = new AbortController();
+    prefetchAbortRef.current = ctrl;
+
+    const choices = frame.uiElements.filter((e) => e.kind === "choice");
+    const promises: Record<string, Promise<InteractResponse>> = {};
+
+    for (const choice of choices) {
+      const syntheticIntent: ClickIntent = {
+        targetId: choice.id,
+        targetLabel: choice.label,
+        reasoning: "prefetch",
+      };
+      const p = fetch("/api/interact", {
+        method: "POST",
+        headers: { "Content-Type": "application/json" },
+        body: JSON.stringify({ session, intent: syntheticIntent }),
+        signal: ctrl.signal,
+      }).then(async (r) => {
+        if (!r.ok) {
+          const j = (await r.json().catch(() => ({}))) as { error?: string };
+          throw new Error(j.error ?? r.statusText);
+        }
+        return r.json() as Promise<InteractResponse>;
+      });
+      p.catch(() => {});
+      promises[choice.id] = p;
+    }
+
+    prefetchRef.current = promises;
+
+    return () => {
+      ctrl.abort();
+    };
+  }, [frame?.id, session?.id]);
+
  async function handleClick(click: { x: number; y: number }) {
    if (phase !== "ready" || !session || !imageBase64) return;
    setPhase("interacting");
    setPendingClick(click);
    setIntent(null);

+    const cacheSnapshot = prefetchRef.current;
+
    try {
-      const res = await fetch("/api/interact", {
+      // Step 1: Vision (~4s) — figure out what the user actually clicked
+      const visionRes = await fetch("/api/vision", {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({
@@ -104,20 +154,61 @@ function PlayInner() {
          click,
        }),
      });
-      if (!res.ok) {
-        const j = (await res.json().catch(() => ({}))) as { error?: string };
-        throw new Error(j.error ?? res.statusText);
+      if (!visionRes.ok) {
+        const j = (await visionRes.json().catch(() => ({}))) as {
+          error?: string;
+        };
+        throw new Error(j.error ?? visionRes.statusText);
      }
-      const data = (await res.json()) as InteractResponse;
+      const { intent: clickIntent } =
+        (await visionRes.json()) as VisionResponse;

-      const updatedHistory = [
-        ...data.session.history,
-        { frame: data.frame },
-      ];
-      setSession({ ...data.session, history: updatedHistory });
-      setFrame(data.frame);
-      setImageBase64(data.imageBase64);
-      setIntent(data.intent);
+      // Step 2: Cache lookup
+      const cached = clickIntent.targetId
+        ? cacheSnapshot[clickIntent.targetId]
+        : undefined;
+
+      let result: InteractResponse;
+      if (cached) {
+        // Cache hit — await the prefetched promise (mostly already resolved)
+        result = await cached;
+        // Overwrite the synthetic prefetch intent on history with the real one
+        const lastIdx = result.session.history.length - 1;
+        result = {
+          ...result,
+          intent: clickIntent,
+          session: {
+            ...result.session,
+            history: result.session.history.map((entry, idx) =>
+              idx === lastIdx
+                ? { ...entry, click, intent: clickIntent }
+                : entry,
+            ),
+          },
+        };
+      } else {
+        // Cache miss (free-form click) — abort wasted prefetches, run live
+        prefetchAbortRef.current?.abort();
+        const liveRes = await fetch("/api/interact", {
+          method: "POST",
+          headers: { "Content-Type": "application/json" },
+          body: JSON.stringify({ session, intent: clickIntent, click }),
+        });
+        if (!liveRes.ok) {
+          const j = (await liveRes.json().catch(() => ({}))) as {
+            error?: string;
+          };
+          throw new Error(j.error ?? liveRes.statusText);
+        }
+        result = (await liveRes.json()) as InteractResponse;
+      }
+
+      // Apply the result: append new frame to history
+      const updatedHistory = [...result.session.history, { frame: result.frame }];
+      setSession({ ...result.session, history: updatedHistory });
+      setFrame(result.frame);
+      setImageBase64(result.imageBase64);
+      setIntent(clickIntent);
      setPendingClick(null);
      setTurnNum((t) => t + 1);
      setPhase("ready");
@@ -189,7 +280,7 @@ function PlayInner() {
                AI · is · painting · the · next · moment
              </p>
              <p className="font-serif italic text-clay-400 text-xs">
-                this usually takes 12–20 seconds
+                cached choices resolve in seconds · free-form takes longer
              </p>
            </div>
          )}
@@ -1,9 +1,12 @@
 "use client";

-import { useRef } from "react";
+import { useRef, useState } from "react";

 export type Phase = "loading-first" | "ready" | "interacting";

+const SHADOW =
+  "0 1px 0 rgba(45,24,16,0.05), 0 36px 64px -28px rgba(45,24,16,0.25), 0 8px 18px -6px rgba(45,24,16,0.10)";
+
 export function PlayCanvas({
  imageBase64,
  phase,
@@ -15,11 +18,12 @@ export function PlayCanvas({
  pendingClick: { x: number; y: number } | null;
  onClick: (click: { x: number; y: number }) => void;
 }) {
-  const ref = useRef<HTMLDivElement>(null);
+  const imgRef = useRef<HTMLImageElement>(null);
+  const [dims, setDims] = useState<{ w: number; h: number } | null>(null);

-  function handleClick(e: React.MouseEvent<HTMLDivElement>) {
-    if (phase !== "ready" || !ref.current || !imageBase64) return;
-    const rect = ref.current.getBoundingClientRect();
+  function handleClick(e: React.MouseEvent<HTMLImageElement>) {
+    if (phase !== "ready" || !imgRef.current) return;
+    const rect = imgRef.current.getBoundingClientRect();
    const x = (e.clientX - rect.left) / rect.width;
    const y = (e.clientY - rect.top) / rect.height;
    onClick({
@@ -32,70 +36,81 @@ export function PlayCanvas({
  const dimmed = phase === "interacting";

  return (
-    <div className="w-full max-w-[440px] mx-auto">
-      <div
-        ref={ref}
-        onClick={handleClick}
-        className={`relative aspect-[2/3] w-full overflow-hidden bg-cream-200 select-none ${interactive ? "cursor-pointer" : "cursor-wait"}`}
-        style={{
-          boxShadow:
-            "0 1px 0 rgba(45,24,16,0.05), 0 36px 64px -28px rgba(45,24,16,0.25), 0 8px 18px -6px rgba(45,24,16,0.10)",
-        }}
-      >
-        {imageBase64 ? (
+    <div className="w-full flex flex-col items-center">
+      {imageBase64 ? (
+        <div className="relative inline-block" style={{ boxShadow: SHADOW }}>
          <img
            key={imageBase64.slice(-48)}
+            ref={imgRef}
            src={`data:image/png;base64,${imageBase64}`}
            alt="Generated frame"
-            className={`absolute inset-0 w-full h-full object-cover animate-fade-in transition-opacity duration-700 ease-out ${dimmed ? "opacity-30" : "opacity-100"}`}
+            onClick={handleClick}
+            onLoad={(e) => {
+              const img = e.currentTarget;
+              setDims({ w: img.naturalWidth, h: img.naturalHeight });
+            }}
            draggable={false}
+            className={`block w-auto h-auto select-none animate-fade-in transition-opacity duration-700 ease-out ${interactive ? "cursor-pointer" : "cursor-wait"} ${dimmed ? "opacity-30" : "opacity-100"}`}
+            style={{
+              maxWidth: "min(560px, 92vw)",
+              maxHeight: "calc(100dvh - 200px)",
+            }}
          />
-        ) : (
-          <div className="absolute inset-0 flex flex-col items-center justify-center gap-4">
-            <div className="w-1.5 h-1.5 bg-clay-500 rounded-full animate-slow-pulse" />
-            <p className="text-[9px] smallcaps text-clay-500 animate-slow-pulse">
-              Painting · the · first · frame
-            </p>
-          </div>
-        )}

-        <div className="absolute inset-x-0 top-0 h-12 bg-gradient-to-b from-clay-900/15 to-transparent pointer-events-none" />
-        <div className="absolute inset-x-0 bottom-0 h-12 bg-gradient-to-t from-clay-900/15 to-transparent pointer-events-none" />
+          <div className="absolute inset-x-0 top-0 h-10 bg-gradient-to-b from-clay-900/12 to-transparent pointer-events-none" />
+          <div className="absolute inset-x-0 bottom-0 h-10 bg-gradient-to-t from-clay-900/12 to-transparent pointer-events-none" />

-        {pendingClick && (
-          <>
-            <div
-              className="absolute rounded-full border border-ember-500 pointer-events-none"
-              style={{
-                left: `${pendingClick.x * 100}%`,
-                top: `${pendingClick.y * 100}%`,
-                transform: "translate(-50%, -50%)",
-                width: 30,
-                height: 30,
-                animation:
-                  "dada-ripple 1.6s cubic-bezier(0.16,1,0.3,1) infinite",
-              }}
-            />
-            <div
-              className="absolute rounded-full pointer-events-none"
-              style={{
-                left: `${pendingClick.x * 100}%`,
-                top: `${pendingClick.y * 100}%`,
-                transform: "translate(-50%, -50%)",
-                width: 11,
-                height: 11,
-                background: "#D97A2E",
-                boxShadow:
-                  "0 0 0 3px rgba(251,247,240,0.95), 0 0 14px rgba(217,122,46,0.55)",
-              }}
-            />
-          </>
-        )}
-      </div>
+          {pendingClick && (
+            <>
+              <div
+                className="absolute rounded-full border border-ember-500 pointer-events-none"
+                style={{
+                  left: `${pendingClick.x * 100}%`,
+                  top: `${pendingClick.y * 100}%`,
+                  transform: "translate(-50%, -50%)",
+                  width: 30,
+                  height: 30,
+                  animation:
+                    "dada-ripple 1.6s cubic-bezier(0.16,1,0.3,1) infinite",
+                }}
+              />
+              <div
+                className="absolute rounded-full pointer-events-none"
+                style={{
+                  left: `${pendingClick.x * 100}%`,
+                  top: `${pendingClick.y * 100}%`,
+                  transform: "translate(-50%, -50%)",
+                  width: 11,
+                  height: 11,
+                  background: "#D97A2E",
+                  boxShadow:
+                    "0 0 0 3px rgba(251,247,240,0.95), 0 0 14px rgba(217,122,46,0.55)",
+                }}
+              />
+            </>
+          )}
+        </div>
+      ) : (
+        <div
+          className="relative aspect-[2/3] bg-cream-200 flex flex-col items-center justify-center gap-4"
+          style={{
+            width: "min(560px, calc((100dvh - 200px) * 2 / 3), 92vw)",
+            boxShadow: SHADOW,
+          }}
+        >
+          <div className="w-1.5 h-1.5 bg-clay-500 rounded-full animate-slow-pulse" />
+          <p className="text-[9px] smallcaps text-clay-500 animate-slow-pulse">
+            Painting · the · first · frame
+          </p>
+        </div>
+      )}

-      <div className="flex items-center justify-between mt-3 px-1">
+      <div
+        className="flex items-center justify-between mt-3 px-1 w-full"
+        style={{ maxWidth: "min(560px, 92vw)" }}
+      >
        <span className="text-[9px] smallcaps text-clay-400 num">
-          1024 × 1536 · png
+          {dims ? `${dims.w} × ${dims.h} · png` : "—"}
        </span>
        <span className="text-[9px] smallcaps text-clay-400">
          {phase === "ready" ? "Tap · anywhere" : "···"}
@@ -1,4 +1,6 @@
 /// <reference types="next" />
 /// <reference types="next/image-types/global" />
+import "./.next/dev/types/routes.d.ts";

 // NOTE: This file should not be edited
+// see https://nextjs.org/docs/app/api-reference/config/typescript for more information.