feat: prefetch, vision split, provider adapter, UI polish
Engine - Split /api/vision out from /api/interact so the client can drive prefetch + cache lookup independently of click interpretation - Image client switched to the chat-completions + modalities API (OpenRouter/provider style), supporting markdown image URL responses - annotateClick now resizes to 768px wide before compositing to keep vision payloads small and avoid CDN timeouts - Prompts updated to mention "JSON" in user messages (required by Gemini's strict JSON mode) - Shared fetchWithRetry helper: 2 retries for chat/image, 0 for vision (with a 60s hard timeout) Client - Parallel prefetch of all three choice branches on each new frame - Effect deliberately excludes phase from deps so a user click doesn't abort in-flight prefetches - Cache hit/miss/free-form fallback handled in handleClick - PlayCanvas reads the img naturalWidth/naturalHeight and adapts the container to whatever aspect ratio the AI returns (no more cropped third choice) - max-width raised to 560px, max-height set to calc(100dvh - 200px) Misc - README env path corrected to apps/web/.env.local - users.md: BGM/TTS idea note - .env.example moved into apps/web alongside the Next config Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
import type { ProviderConfig } from "@dada/types";
|
||||
import { fetchWithRetry } from "./fetchWithRetry";
|
||||
|
||||
export type ChatMessage = {
|
||||
role: "system" | "user" | "assistant";
|
||||
@@ -20,7 +21,7 @@ export async function chat(
|
||||
body.response_format = { type: "json_object" };
|
||||
}
|
||||
|
||||
const res = await fetch(url, {
|
||||
const res = await fetchWithRetry(url, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
|
||||
@@ -0,0 +1,39 @@
|
||||
/** `fetch` options extended with the retry controls consumed by `fetchWithRetry`. */
type RetryInit = RequestInit & { retries?: number; retryDelayMs?: number };

/**
 * `fetch` with a bounded number of retries and linear backoff.
 *
 * A request is retried only when it fails with a network-level error or the
 * server answers with a 5xx status. Any other response (2xx, 3xx, 4xx) is
 * returned immediately, and so is the last 5xx once the retry budget is spent;
 * callers are expected to inspect `res.ok` themselves.
 *
 * @param url  Request URL.
 * @param init Regular `fetch` options plus:
 *   - `retries`: extra attempts after the first one (default 2). Negative or
 *     NaN values are treated as 0 and fractions are rounded down, so exactly
 *     one attempt is always made.
 *   - `retryDelayMs`: base delay; attempt `k` (1-based) waits `k * retryDelayMs`
 *     before the next try (default 1500).
 * @returns The first non-5xx response, or the final response/attempt result.
 * @throws The underlying `fetch` error when the request was aborted or when
 *   the retry budget is exhausted.
 */
export async function fetchWithRetry(
  url: string,
  init: RetryInit,
): Promise<Response> {
  const { retries = 2, retryDelayMs = 1500, ...fetchInit } = init;

  // Normalise the budget so the loop below always performs at least one
  // attempt and can only exit through `return` or `throw err` (never by
  // falling out and throwing an undefined "last error").
  const maxRetries = Number.isNaN(retries)
    ? 0
    : Math.max(0, Math.floor(retries));

  for (let attempt = 0; ; attempt++) {
    const canRetry = attempt < maxRetries;

    try {
      const res = await fetch(url, fetchInit);
      // Only 5xx is worth retrying; client errors won't fix themselves.
      if (res.status < 500 || !canRetry) return res;
      // The 5xx response is being dropped: release its body so the
      // underlying connection is not held open while we back off.
      await discardBody(res);
    } catch (err) {
      // Never retry a request the caller (or a timeout) aborted.
      if (!canRetry || isAbort(err, fetchInit.signal)) throw err;
    }

    await sleep(retryDelayMs * (attempt + 1));
  }
}

/** True when `err` is an abort error or the request's own signal has fired. */
function isAbort(err: unknown, signal: AbortSignal | null | undefined): boolean {
  // Covers aborts with a custom reason, where the thrown value is not an
  // "AbortError" DOMException.
  if (signal?.aborted === true) return true;
  return (
    typeof err === "object" &&
    err !== null &&
    (err as { name?: unknown }).name === "AbortError"
  );
}

/** Best-effort release of an unread response body; failures are ignored. */
async function discardBody(res: Response): Promise<void> {
  try {
    await res.body?.cancel();
  } catch {
    // Nothing useful to do: the response is being thrown away anyway.
  }
}

/** Resolves after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
|
||||
@@ -1,20 +1,29 @@
|
||||
import type { ProviderConfig } from "@dada/types";
|
||||
import { fetchWithRetry } from "./fetchWithRetry";
|
||||
|
||||
type ImageUrlPart = { type: string; image_url?: { url?: string } };
|
||||
type ChatResponse = {
|
||||
choices: {
|
||||
message: {
|
||||
content: string | ImageUrlPart[];
|
||||
images?: ImageUrlPart[];
|
||||
};
|
||||
}[];
|
||||
};
|
||||
|
||||
export async function generateImage(
|
||||
config: ProviderConfig,
|
||||
prompt: string,
|
||||
opts?: { size?: string; quality?: "low" | "medium" | "high" | "auto" },
|
||||
): Promise<string> {
|
||||
const url = `${config.baseUrl.replace(/\/$/, "")}/images/generations`;
|
||||
const body: Record<string, unknown> = {
|
||||
const url = `${config.baseUrl.replace(/\/$/, "")}/chat/completions`;
|
||||
|
||||
const body = {
|
||||
model: config.model,
|
||||
prompt,
|
||||
size: opts?.size ?? "1024x1536",
|
||||
quality: opts?.quality ?? "medium",
|
||||
n: 1,
|
||||
modalities: ["image", "text"],
|
||||
messages: [{ role: "user", content: prompt }],
|
||||
};
|
||||
|
||||
const res = await fetch(url, {
|
||||
const res = await fetchWithRetry(url, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
@@ -25,20 +34,45 @@ export async function generateImage(
|
||||
|
||||
if (!res.ok) {
|
||||
const text = await res.text();
|
||||
throw new Error(`Image API error ${res.status}: ${text}`);
|
||||
throw new Error(`Image API error ${res.status}: ${text.slice(0, 500)}`);
|
||||
}
|
||||
|
||||
const json = (await res.json()) as {
|
||||
data: { b64_json?: string; url?: string }[];
|
||||
};
|
||||
const item = json.data[0];
|
||||
if (!item) throw new Error("Image API returned no data");
|
||||
const json = (await res.json()) as ChatResponse;
|
||||
const msg = json.choices[0]?.message;
|
||||
if (!msg) throw new Error("Image API returned no message");
|
||||
|
||||
if (item.b64_json) return item.b64_json;
|
||||
if (item.url) {
|
||||
const imgRes = await fetch(item.url);
|
||||
const buf = await imgRes.arrayBuffer();
|
||||
return Buffer.from(buf).toString("base64");
|
||||
// 1) OpenRouter-style: msg.images = [{ image_url: { url } }]
|
||||
// 2) OpenAI multimodal: msg.content = [{ type: "image_url", image_url: { url } }]
|
||||
const structured: ImageUrlPart[] = [];
|
||||
if (msg.images) structured.push(...msg.images);
|
||||
if (Array.isArray(msg.content)) structured.push(...msg.content);
|
||||
for (const part of structured) {
|
||||
const u = part.image_url?.url;
|
||||
if (u) return await urlToBase64(u);
|
||||
}
|
||||
throw new Error("Image API returned neither b64_json nor url");
|
||||
|
||||
// 3) provider-style: content is a string with a markdown image, e.g. ![alt](https://...),
|
||||
// or a bare URL fragment
|
||||
if (typeof msg.content === "string") {
|
||||
const md = msg.content.match(/!\[[^\]]*\]\((https?:\/\/[^\s)]+)\)/);
|
||||
if (md?.[1]) return await urlToBase64(md[1]);
|
||||
const bare = msg.content.match(/https?:\/\/\S+?\.(?:png|jpg|jpeg|webp)/i);
|
||||
if (bare?.[0]) return await urlToBase64(bare[0]);
|
||||
}
|
||||
|
||||
throw new Error(
|
||||
`No image found in response: ${JSON.stringify(msg).slice(0, 300)}`,
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Converts an image URL into a bare base64 string (no `data:` prefix).
 *
 * - `data:` URLs are decoded locally: the payload after the first comma is
 *   returned as-is, provided the header declares `;base64`.
 * - Any other URL is downloaded with `fetch` and its bytes are base64-encoded.
 *
 * @param url A `data:` URL or a fetchable URL pointing at the image.
 * @returns The image content encoded as base64.
 * @throws If a `data:` URL is not base64-encoded, or the download responds
 *   with a non-2xx status.
 */
async function urlToBase64(url: string): Promise<string> {
  if (url.startsWith("data:")) {
    // A data URL is `data:[<mediatype>][;base64],<payload>`. Look for the
    // marker only in the header, so a plain-text payload that happens to
    // contain "base64," is not mistaken for base64 content.
    const comma = url.indexOf(",");
    const header = comma === -1 ? "" : url.slice(0, comma);
    if (!/;base64$/i.test(header)) {
      throw new Error("data URL is not base64-encoded");
    }
    return url.slice(comma + 1);
  }

  const res = await fetch(url);
  if (!res.ok) throw new Error(`Failed to fetch image url: ${res.status}`);
  const bytes = await res.arrayBuffer();
  return Buffer.from(bytes).toString("base64");
}
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import type { ProviderConfig } from "@dada/types";
|
||||
import { fetchWithRetry } from "./fetchWithRetry";
|
||||
|
||||
export async function interpretClick(
|
||||
config: ProviderConfig,
|
||||
@@ -25,14 +26,24 @@ export async function interpretClick(
|
||||
response_format: { type: "json_object" },
|
||||
};
|
||||
|
||||
const res = await fetch(url, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
Authorization: `Bearer ${config.apiKey}`,
|
||||
},
|
||||
body: JSON.stringify(body),
|
||||
});
|
||||
const timeoutCtrl = new AbortController();
|
||||
const timeoutId = setTimeout(() => timeoutCtrl.abort(), 60_000);
|
||||
|
||||
let res: Response;
|
||||
try {
|
||||
res = await fetchWithRetry(url, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
Authorization: `Bearer ${config.apiKey}`,
|
||||
},
|
||||
body: JSON.stringify(body),
|
||||
signal: timeoutCtrl.signal,
|
||||
retries: 0,
|
||||
});
|
||||
} finally {
|
||||
clearTimeout(timeoutId);
|
||||
}
|
||||
|
||||
if (!res.ok) {
|
||||
const text = await res.text();
|
||||
|
||||
@@ -5,25 +5,31 @@ export async function annotateClick(
|
||||
click: { x: number; y: number },
|
||||
): Promise<string> {
|
||||
const buf = Buffer.from(imageBase64, "base64");
|
||||
const meta = await sharp(buf).metadata();
|
||||
const w = meta.width ?? 1024;
|
||||
const h = meta.height ?? 1536;
|
||||
|
||||
const resized = await sharp(buf)
|
||||
.resize({ width: 768, withoutEnlargement: true, fit: "inside" })
|
||||
.png()
|
||||
.toBuffer();
|
||||
|
||||
const meta = await sharp(resized).metadata();
|
||||
const w = meta.width ?? 768;
|
||||
const h = meta.height ?? 1152;
|
||||
|
||||
const cx = Math.round(click.x * w);
|
||||
const cy = Math.round(click.y * h);
|
||||
const r = Math.round(Math.min(w, h) * 0.025);
|
||||
const stroke = Math.max(3, Math.round(r * 0.25));
|
||||
const r = Math.max(8, Math.round(Math.min(w, h) * 0.025));
|
||||
const stroke = Math.max(2, Math.round(r * 0.25));
|
||||
|
||||
const svg = `<svg xmlns="http://www.w3.org/2000/svg" width="${w}" height="${h}">
|
||||
const svg = `<svg xmlns="http://www.w3.org/2000/svg" width="${w}" height="${h}" viewBox="0 0 ${w} ${h}">
|
||||
<circle cx="${cx}" cy="${cy}" r="${r}" fill="rgba(255,40,40,0.55)"
|
||||
stroke="rgba(255,255,255,0.95)" stroke-width="${stroke}" />
|
||||
<circle cx="${cx}" cy="${cy}" r="${Math.round(r * 0.25)}"
|
||||
fill="rgba(255,255,255,1)" />
|
||||
</svg>`;
|
||||
|
||||
const out = await sharp(buf)
|
||||
const out = await sharp(resized)
|
||||
.composite([{ input: Buffer.from(svg), top: 0, left: 0 }])
|
||||
.png()
|
||||
.png({ compressionLevel: 9 })
|
||||
.toBuffer();
|
||||
|
||||
return out.toString("base64");
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
export { startSession, takeTurn } from "./orchestrator";
|
||||
export { startSession, takeTurn, visionTurn } from "./orchestrator";
|
||||
export { annotateClick } from "./annotate";
|
||||
export * from "./prompts";
|
||||
|
||||
@@ -1,10 +1,13 @@
|
||||
import type {
|
||||
ClickIntent,
|
||||
EngineConfig,
|
||||
InteractRequest,
|
||||
InteractResponse,
|
||||
Session,
|
||||
StartRequest,
|
||||
StartResponse,
|
||||
VisionRequest,
|
||||
VisionResponse,
|
||||
} from "@dada/types";
|
||||
import { annotateClick } from "./annotate";
|
||||
import { direct } from "./director";
|
||||
@@ -37,21 +40,27 @@ export async function startSession(
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Resolves the intent behind a click on the previous frame, without
 * producing a next frame.
 *
 * The previous image is passed through `annotateClick` together with the
 * click position, and the result is handed to `interpret` along with the UI
 * elements recorded on the most recent history entry (an empty list when the
 * session has no history or the last frame lists none).
 *
 * @param config Engine configuration; only `config.vision` is used here.
 * @param req    Session, previous image (base64) and click position.
 * @returns The interpreted click intent wrapped in a `VisionResponse`.
 */
export async function visionTurn(
  config: EngineConfig,
  req: VisionRequest,
): Promise<VisionResponse> {
  const { session, prevImageBase64, click } = req;

  const markedImage = await annotateClick(prevImageBase64, click);
  const knownElements = session.history.at(-1)?.frame?.uiElements ?? [];

  return {
    intent: await interpret(config.vision, markedImage, knownElements),
  };
}
|
||||
|
||||
export async function takeTurn(
|
||||
config: EngineConfig,
|
||||
req: InteractRequest,
|
||||
): Promise<InteractResponse> {
|
||||
const annotated = await annotateClick(req.prevImageBase64, req.click);
|
||||
|
||||
const lastFrame = req.session.history.at(-1)?.frame;
|
||||
const uiElements = lastFrame?.uiElements ?? [];
|
||||
|
||||
const intent = await interpret(config.vision, annotated, uiElements);
|
||||
|
||||
const updatedSession: Session = {
|
||||
...req.session,
|
||||
history: req.session.history.map((entry, idx, arr) =>
|
||||
idx === arr.length - 1 ? { ...entry, click: req.click, intent } : entry,
|
||||
idx === arr.length - 1
|
||||
? { ...entry, click: req.click, intent: req.intent }
|
||||
: entry,
|
||||
),
|
||||
};
|
||||
|
||||
@@ -66,6 +75,6 @@ export async function takeTurn(
|
||||
session: updatedSession,
|
||||
frame: nextFrame,
|
||||
imageBase64: nextImage,
|
||||
intent,
|
||||
intent: req.intent,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -29,7 +29,7 @@ export function buildDirectorUserMessage(session: Session): string {
|
||||
parts.push(`画风:${session.styleGuide}`);
|
||||
|
||||
if (session.history.length === 0) {
|
||||
parts.push("\n这是故事的开场。请生成开场画面。");
|
||||
parts.push("\n这是故事的开场。请生成开场画面,严格以 JSON 格式返回。");
|
||||
return parts.join("\n");
|
||||
}
|
||||
|
||||
@@ -47,7 +47,7 @@ export function buildDirectorUserMessage(session: Session): string {
|
||||
parts.push(beat.join("\n"));
|
||||
});
|
||||
|
||||
parts.push("\n请生成下一帧。");
|
||||
parts.push("\n请生成下一帧,严格以 JSON 格式返回。");
|
||||
return parts.join("\n");
|
||||
}
|
||||
|
||||
@@ -111,5 +111,5 @@ export function buildVisionUserPrompt(uiElements: UIElement[]): string {
|
||||
return `当前画面包含以下已知 UI 元素:
|
||||
${list}
|
||||
|
||||
红点位置即为用户点击位置。请判断用户的意图。`;
|
||||
红点位置即为用户点击位置。请判断用户的意图,并以 JSON 格式返回结果。`;
|
||||
}
|
||||
|
||||
@@ -8,5 +8,5 @@ export async function render(
|
||||
styleGuide: string,
|
||||
): Promise<string> {
|
||||
const prompt = buildImagePrompt(frame, styleGuide);
|
||||
return generateImage(config, prompt, { size: "1024x1536", quality: "medium" });
|
||||
return generateImage(config, prompt);
|
||||
}
|
||||
|
||||
@@ -60,12 +60,22 @@ export type StartResponse = {
|
||||
imageBase64: string;
|
||||
};
|
||||
|
||||
export type InteractRequest = {
|
||||
export type VisionRequest = {
|
||||
session: Session;
|
||||
prevImageBase64: string;
|
||||
click: { x: number; y: number };
|
||||
};
|
||||
|
||||
export type VisionResponse = {
|
||||
intent: ClickIntent;
|
||||
};
|
||||
|
||||
export type InteractRequest = {
|
||||
session: Session;
|
||||
intent: ClickIntent;
|
||||
click?: { x: number; y: number };
|
||||
};
|
||||
|
||||
export type InteractResponse = {
|
||||
session: Session;
|
||||
frame: StoryFrame;
|
||||
|
||||
Reference in New Issue
Block a user