feat: prefetch, vision split, provider adapter, UI polish

Engine
- Split /api/vision out from /api/interact so client can drive
  prefetch + cache lookup independently of click interpretation
- Image client switched to chat-completions+modalities API (OpenRouter/
  provider style), supporting markdown image URL responses
- annotateClick now resizes the image to 768px wide before compositing, to
  keep vision payloads small and avoid CDN timeouts
- Prompts updated to mention "JSON" in user messages (required by
  Gemini's strict JSON mode)
- Shared fetchWithRetry helper: 2 retries for chat/image, 0 for vision
  (with 60s hard timeout)

Client
- Parallel prefetch of all three choice branches on each new frame
- The prefetch effect deliberately excludes `phase` from its deps so that a
  user click doesn't abort in-flight prefetches
- Cache hit/miss/free-form fallback handled in handleClick
- PlayCanvas reads the img's naturalWidth/naturalHeight and adapts the
  container to whatever aspect ratio the AI returns (no more cropped third
  choice)
- max-width raised to 560px, max-height calc(100dvh - 200px)

Misc
- README env-path corrected to apps/web/.env.local
- users.md: BGM/TTS idea note
- .env.example moved into apps/web alongside next config

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
yuanzonghao
2026-05-12 19:38:03 +08:00
parent ad4b09c744
commit 9cedfa66e4
20 changed files with 405 additions and 151 deletions
+24
View File
@@ -0,0 +1,24 @@
# =============================================================
# Dada — AI Visual Novel
# Three independently configurable AI providers
# Any OpenAI-compatible endpoint works (OpenRouter, OpenAI,
# Anthropic via OpenAI-compat proxy, Gemini, DeepSeek, Ollama).
#
# Image generation uses the chat-completions + modalities API
# (OpenRouter-style), NOT the legacy /images/generations endpoint.
# =============================================================
# ---- 1. Text LLM (story director) -----------------------------
TEXT_BASE_URL=https://openrouter.ai/api/v1
TEXT_API_KEY=sk-or-v1-xxx
TEXT_MODEL=~anthropic/claude-sonnet-latest
# ---- 2. Image generator (renders the whole UI screen) ---------
IMAGE_BASE_URL=https://openrouter.ai/api/v1
IMAGE_API_KEY=sk-or-v1-xxx
IMAGE_MODEL=openai/gpt-5.4-image-2
# ---- 3. Vision model (interprets where the user clicked) ------
VISION_BASE_URL=https://openrouter.ai/api/v1
VISION_API_KEY=sk-or-v1-xxx
VISION_MODEL=~google/gemini-flash-latest
+2 -2
View File
@@ -14,9 +14,9 @@ export async function POST(req: Request) {
return NextResponse.json({ error: "Invalid JSON" }, { status: 400 });
}
if (!body.session || !body.prevImageBase64 || !body.click) {
if (!body.session || !body.intent) {
return NextResponse.json(
{ error: "session, prevImageBase64, click are required" },
{ error: "session and intent are required" },
{ status: 400 },
);
}
+32
View File
@@ -0,0 +1,32 @@
import { visionTurn } from "@dada/engine";
import type { VisionRequest } from "@dada/types";
import { NextResponse } from "next/server";
import { loadEngineConfig } from "@/lib/config";
export const runtime = "nodejs";
export const maxDuration = 60;
/**
 * POST handler for the vision route: resolves a user click on the previous
 * frame into a click intent by delegating to `visionTurn`.
 *
 * Responses:
 * - 400 `{ error: "Invalid JSON" }` when the request body cannot be parsed.
 * - 400 `{ error: "session, prevImageBase64, click are required" }` when the
 *   parsed body is null/non-object or any of those three fields is missing.
 * - 500 `{ error: <message> }` when loading the config or `visionTurn` throws.
 * - Otherwise the `visionTurn` result serialized as JSON.
 *
 * @param req - Incoming request whose JSON body should be a `VisionRequest`.
 * @returns A JSON response as described above.
 */
export async function POST(req: Request) {
  // Parse into `unknown` first: the payload is untrusted until validated.
  let parsed: unknown;
  try {
    parsed = await req.json();
  } catch {
    return NextResponse.json({ error: "Invalid JSON" }, { status: 400 });
  }

  // `null` is valid JSON. Without the optional chain below, reading
  // `body.session` on a null body would throw outside any try/catch and
  // surface as an unhandled error instead of a 400.
  const body = parsed as Partial<VisionRequest> | null;
  if (!body?.session || !body.prevImageBase64 || !body.click) {
    return NextResponse.json(
      { error: "session, prevImageBase64, click are required" },
      { status: 400 },
    );
  }

  try {
    const config = loadEngineConfig();
    // Required fields were checked above, so the cast only drops `Partial`.
    const result = await visionTurn(config, body as VisionRequest);
    return NextResponse.json(result);
  } catch (err) {
    const message = err instanceof Error ? err.message : "Unknown error";
    return NextResponse.json({ error: message }, { status: 500 });
  }
}
+1 -1
View File
@@ -13,7 +13,7 @@ export default function RootLayout({
children: React.ReactNode;
}) {
return (
<html lang="zh-CN">
<html lang="zh-CN" suppressHydrationWarning>
<head>
<link rel="preconnect" href="https://fonts.googleapis.com" />
<link
+105 -14
View File
@@ -11,6 +11,7 @@ import type {
Session,
StartResponse,
StoryFrame,
VisionResponse,
} from "@dada/types";
function PlayInner() {
@@ -28,7 +29,10 @@ function PlayInner() {
} | null>(null);
const [turnNum, setTurnNum] = useState(0);
const [error, setError] = useState<string | null>(null);
const startedRef = useRef(false);
const prefetchAbortRef = useRef<AbortController | null>(null);
const prefetchRef = useRef<Record<string, Promise<InteractResponse>>>({});
useEffect(() => {
if (startedRef.current) return;
@@ -88,14 +92,60 @@ function PlayInner() {
.catch((e) => setError(String(e)));
}, [params, router]);
// Prefetch next-frame candidates whenever current frame becomes ready.
// All three fire in parallel for fastest cache fill. NOT depending on
// `phase` — we don't want to abort in-flight prefetches just because
// the user clicked. They should continue so handleClick can await them.
useEffect(() => {
if (!session || !frame) return;
prefetchAbortRef.current?.abort();
const ctrl = new AbortController();
prefetchAbortRef.current = ctrl;
const choices = frame.uiElements.filter((e) => e.kind === "choice");
const promises: Record<string, Promise<InteractResponse>> = {};
for (const choice of choices) {
const syntheticIntent: ClickIntent = {
targetId: choice.id,
targetLabel: choice.label,
reasoning: "prefetch",
};
const p = fetch("/api/interact", {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({ session, intent: syntheticIntent }),
signal: ctrl.signal,
}).then(async (r) => {
if (!r.ok) {
const j = (await r.json().catch(() => ({}))) as { error?: string };
throw new Error(j.error ?? r.statusText);
}
return r.json() as Promise<InteractResponse>;
});
p.catch(() => {});
promises[choice.id] = p;
}
prefetchRef.current = promises;
return () => {
ctrl.abort();
};
}, [frame?.id, session?.id]);
async function handleClick(click: { x: number; y: number }) {
if (phase !== "ready" || !session || !imageBase64) return;
setPhase("interacting");
setPendingClick(click);
setIntent(null);
const cacheSnapshot = prefetchRef.current;
try {
const res = await fetch("/api/interact", {
// Step 1: Vision (~4s) — figure out what the user actually clicked
const visionRes = await fetch("/api/vision", {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({
@@ -104,20 +154,61 @@ function PlayInner() {
click,
}),
});
if (!res.ok) {
const j = (await res.json().catch(() => ({}))) as { error?: string };
throw new Error(j.error ?? res.statusText);
if (!visionRes.ok) {
const j = (await visionRes.json().catch(() => ({}))) as {
error?: string;
};
throw new Error(j.error ?? visionRes.statusText);
}
const data = (await res.json()) as InteractResponse;
const { intent: clickIntent } =
(await visionRes.json()) as VisionResponse;
const updatedHistory = [
...data.session.history,
{ frame: data.frame },
];
setSession({ ...data.session, history: updatedHistory });
setFrame(data.frame);
setImageBase64(data.imageBase64);
setIntent(data.intent);
// Step 2: Cache lookup
const cached = clickIntent.targetId
? cacheSnapshot[clickIntent.targetId]
: undefined;
let result: InteractResponse;
if (cached) {
// Cache hit — await the prefetched promise (mostly already resolved)
result = await cached;
// Overwrite the synthetic prefetch intent on history with the real one
const lastIdx = result.session.history.length - 1;
result = {
...result,
intent: clickIntent,
session: {
...result.session,
history: result.session.history.map((entry, idx) =>
idx === lastIdx
? { ...entry, click, intent: clickIntent }
: entry,
),
},
};
} else {
// Cache miss (free-form click) — abort wasted prefetches, run live
prefetchAbortRef.current?.abort();
const liveRes = await fetch("/api/interact", {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({ session, intent: clickIntent, click }),
});
if (!liveRes.ok) {
const j = (await liveRes.json().catch(() => ({}))) as {
error?: string;
};
throw new Error(j.error ?? liveRes.statusText);
}
result = (await liveRes.json()) as InteractResponse;
}
// Apply the result: append new frame to history
const updatedHistory = [...result.session.history, { frame: result.frame }];
setSession({ ...result.session, history: updatedHistory });
setFrame(result.frame);
setImageBase64(result.imageBase64);
setIntent(clickIntent);
setPendingClick(null);
setTurnNum((t) => t + 1);
setPhase("ready");
@@ -189,7 +280,7 @@ function PlayInner() {
AI · is · painting · the · next · moment
</p>
<p className="font-serif italic text-clay-400 text-xs">
this usually takes 1220 seconds
cached choices resolve in seconds · free-form takes longer
</p>
</div>
)}
+74 -59
View File
@@ -1,9 +1,12 @@
"use client";
import { useRef } from "react";
import { useRef, useState } from "react";
export type Phase = "loading-first" | "ready" | "interacting";
const SHADOW =
"0 1px 0 rgba(45,24,16,0.05), 0 36px 64px -28px rgba(45,24,16,0.25), 0 8px 18px -6px rgba(45,24,16,0.10)";
export function PlayCanvas({
imageBase64,
phase,
@@ -15,11 +18,12 @@ export function PlayCanvas({
pendingClick: { x: number; y: number } | null;
onClick: (click: { x: number; y: number }) => void;
}) {
const ref = useRef<HTMLDivElement>(null);
const imgRef = useRef<HTMLImageElement>(null);
const [dims, setDims] = useState<{ w: number; h: number } | null>(null);
function handleClick(e: React.MouseEvent<HTMLDivElement>) {
if (phase !== "ready" || !ref.current || !imageBase64) return;
const rect = ref.current.getBoundingClientRect();
function handleClick(e: React.MouseEvent<HTMLImageElement>) {
if (phase !== "ready" || !imgRef.current) return;
const rect = imgRef.current.getBoundingClientRect();
const x = (e.clientX - rect.left) / rect.width;
const y = (e.clientY - rect.top) / rect.height;
onClick({
@@ -32,70 +36,81 @@ export function PlayCanvas({
const dimmed = phase === "interacting";
return (
<div className="w-full max-w-[440px] mx-auto">
<div
ref={ref}
onClick={handleClick}
className={`relative aspect-[2/3] w-full overflow-hidden bg-cream-200 select-none ${interactive ? "cursor-pointer" : "cursor-wait"}`}
style={{
boxShadow:
"0 1px 0 rgba(45,24,16,0.05), 0 36px 64px -28px rgba(45,24,16,0.25), 0 8px 18px -6px rgba(45,24,16,0.10)",
}}
>
{imageBase64 ? (
<div className="w-full flex flex-col items-center">
{imageBase64 ? (
<div className="relative inline-block" style={{ boxShadow: SHADOW }}>
<img
key={imageBase64.slice(-48)}
ref={imgRef}
src={`data:image/png;base64,${imageBase64}`}
alt="Generated frame"
className={`absolute inset-0 w-full h-full object-cover animate-fade-in transition-opacity duration-700 ease-out ${dimmed ? "opacity-30" : "opacity-100"}`}
onClick={handleClick}
onLoad={(e) => {
const img = e.currentTarget;
setDims({ w: img.naturalWidth, h: img.naturalHeight });
}}
draggable={false}
className={`block w-auto h-auto select-none animate-fade-in transition-opacity duration-700 ease-out ${interactive ? "cursor-pointer" : "cursor-wait"} ${dimmed ? "opacity-30" : "opacity-100"}`}
style={{
maxWidth: "min(560px, 92vw)",
maxHeight: "calc(100dvh - 200px)",
}}
/>
) : (
<div className="absolute inset-0 flex flex-col items-center justify-center gap-4">
<div className="w-1.5 h-1.5 bg-clay-500 rounded-full animate-slow-pulse" />
<p className="text-[9px] smallcaps text-clay-500 animate-slow-pulse">
Painting · the · first · frame
</p>
</div>
)}
<div className="absolute inset-x-0 top-0 h-12 bg-gradient-to-b from-clay-900/15 to-transparent pointer-events-none" />
<div className="absolute inset-x-0 bottom-0 h-12 bg-gradient-to-t from-clay-900/15 to-transparent pointer-events-none" />
<div className="absolute inset-x-0 top-0 h-10 bg-gradient-to-b from-clay-900/12 to-transparent pointer-events-none" />
<div className="absolute inset-x-0 bottom-0 h-10 bg-gradient-to-t from-clay-900/12 to-transparent pointer-events-none" />
{pendingClick && (
<>
<div
className="absolute rounded-full border border-ember-500 pointer-events-none"
style={{
left: `${pendingClick.x * 100}%`,
top: `${pendingClick.y * 100}%`,
transform: "translate(-50%, -50%)",
width: 30,
height: 30,
animation:
"dada-ripple 1.6s cubic-bezier(0.16,1,0.3,1) infinite",
}}
/>
<div
className="absolute rounded-full pointer-events-none"
style={{
left: `${pendingClick.x * 100}%`,
top: `${pendingClick.y * 100}%`,
transform: "translate(-50%, -50%)",
width: 11,
height: 11,
background: "#D97A2E",
boxShadow:
"0 0 0 3px rgba(251,247,240,0.95), 0 0 14px rgba(217,122,46,0.55)",
}}
/>
</>
)}
</div>
{pendingClick && (
<>
<div
className="absolute rounded-full border border-ember-500 pointer-events-none"
style={{
left: `${pendingClick.x * 100}%`,
top: `${pendingClick.y * 100}%`,
transform: "translate(-50%, -50%)",
width: 30,
height: 30,
animation:
"dada-ripple 1.6s cubic-bezier(0.16,1,0.3,1) infinite",
}}
/>
<div
className="absolute rounded-full pointer-events-none"
style={{
left: `${pendingClick.x * 100}%`,
top: `${pendingClick.y * 100}%`,
transform: "translate(-50%, -50%)",
width: 11,
height: 11,
background: "#D97A2E",
boxShadow:
"0 0 0 3px rgba(251,247,240,0.95), 0 0 14px rgba(217,122,46,0.55)",
}}
/>
</>
)}
</div>
) : (
<div
className="relative aspect-[2/3] bg-cream-200 flex flex-col items-center justify-center gap-4"
style={{
width: "min(560px, calc((100dvh - 200px) * 2 / 3), 92vw)",
boxShadow: SHADOW,
}}
>
<div className="w-1.5 h-1.5 bg-clay-500 rounded-full animate-slow-pulse" />
<p className="text-[9px] smallcaps text-clay-500 animate-slow-pulse">
Painting · the · first · frame
</p>
</div>
)}
<div className="flex items-center justify-between mt-3 px-1">
<div
className="flex items-center justify-between mt-3 px-1 w-full"
style={{ maxWidth: "min(560px, 92vw)" }}
>
<span className="text-[9px] smallcaps text-clay-400 num">
1024 × 1536 · png
{dims ? `${dims.w} × ${dims.h} · png` : "—"}
</span>
<span className="text-[9px] smallcaps text-clay-400">
{phase === "ready" ? "Tap · anywhere" : "···"}
+2
View File
@@ -1,4 +1,6 @@
/// <reference types="next" />
/// <reference types="next/image-types/global" />
import "./.next/dev/types/routes.d.ts";
// NOTE: This file should not be edited
// see https://nextjs.org/docs/app/api-reference/config/typescript for more information.