feat: prefetch, vision split, provider adapter, UI polish

Engine
- Split /api/vision out from /api/interact so client can drive
  prefetch + cache lookup independently of click interpretation
- Image client switched to chat-completions+modalities API (OpenRouter/
  provider style), supporting markdown image URL responses
- annotateClick now resizes the image to at most 768px wide before
  compositing, to keep vision payloads small and avoid CDN timeouts
- Prompts updated to mention "JSON" in user messages (required by
  Gemini's strict JSON mode)
- Shared fetchWithRetry helper: 2 retries for chat/image, 0 for vision
  (with 60s hard timeout)

Client
- Parallel prefetch of all three choice branches on each new frame
- The prefetch effect deliberately excludes phase from its deps so a
  user click doesn't abort in-flight prefetches
- Cache hit/miss/free-form fallback handled in handleClick
- PlayCanvas reads the img naturalWidth/naturalHeight and adapts its
  container to whatever aspect ratio the AI returns (no more cropped
  third choice)
- max-width raised to 560px, max-height calc(100dvh - 200px)

Misc
- README env-path corrected to apps/web/.env.local
- users.md: BGM/TTS idea note
- .env.example moved into apps/web alongside next config

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
yuanzonghao
2026-05-12 19:38:03 +08:00
parent ad4b09c744
commit 9cedfa66e4
20 changed files with 405 additions and 151 deletions
+14 -8
View File
@@ -5,25 +5,31 @@ export async function annotateClick(
click: { x: number; y: number },
): Promise<string> {
const buf = Buffer.from(imageBase64, "base64");
const meta = await sharp(buf).metadata();
const w = meta.width ?? 1024;
const h = meta.height ?? 1536;
const resized = await sharp(buf)
.resize({ width: 768, withoutEnlargement: true, fit: "inside" })
.png()
.toBuffer();
const meta = await sharp(resized).metadata();
const w = meta.width ?? 768;
const h = meta.height ?? 1152;
const cx = Math.round(click.x * w);
const cy = Math.round(click.y * h);
const r = Math.round(Math.min(w, h) * 0.025);
const stroke = Math.max(3, Math.round(r * 0.25));
const r = Math.max(8, Math.round(Math.min(w, h) * 0.025));
const stroke = Math.max(2, Math.round(r * 0.25));
const svg = `<svg xmlns="http://www.w3.org/2000/svg" width="${w}" height="${h}">
const svg = `<svg xmlns="http://www.w3.org/2000/svg" width="${w}" height="${h}" viewBox="0 0 ${w} ${h}">
<circle cx="${cx}" cy="${cy}" r="${r}" fill="rgba(255,40,40,0.55)"
stroke="rgba(255,255,255,0.95)" stroke-width="${stroke}" />
<circle cx="${cx}" cy="${cy}" r="${Math.round(r * 0.25)}"
fill="rgba(255,255,255,1)" />
</svg>`;
const out = await sharp(buf)
const out = await sharp(resized)
.composite([{ input: Buffer.from(svg), top: 0, left: 0 }])
.png()
.png({ compressionLevel: 9 })
.toBuffer();
return out.toString("base64");
+1 -1
View File
@@ -1,3 +1,3 @@
export { startSession, takeTurn } from "./orchestrator";
export { startSession, takeTurn, visionTurn } from "./orchestrator";
export { annotateClick } from "./annotate";
export * from "./prompts";
+18 -9
View File
@@ -1,10 +1,13 @@
import type {
ClickIntent,
EngineConfig,
InteractRequest,
InteractResponse,
Session,
StartRequest,
StartResponse,
VisionRequest,
VisionResponse,
} from "@dada/types";
import { annotateClick } from "./annotate";
import { direct } from "./director";
@@ -37,21 +40,27 @@ export async function startSession(
};
}
/**
 * Interprets a click on the previous frame without advancing the story.
 *
 * The click is first drawn onto the previous image via `annotateClick`, then
 * the annotated image is handed to `interpret` together with the UI elements
 * of the most recent history entry's frame (an empty list when there is no
 * such entry or it carries no `uiElements`).
 *
 * @param config - Engine configuration; only `config.vision` is read here.
 * @param req - Carries the previous image (base64), the click, and the session.
 * @returns An object holding the interpreted `intent`. The session is not modified.
 */
export async function visionTurn(
  config: EngineConfig,
  req: VisionRequest,
): Promise<VisionResponse> {
  const { prevImageBase64, click, session } = req;
  const markedImage = await annotateClick(prevImageBase64, click);
  // Fall back to an empty list so the interpreter always receives an array.
  const knownElements = session.history.at(-1)?.frame?.uiElements ?? [];
  const intent = await interpret(config.vision, markedImage, knownElements);
  return { intent };
}
export async function takeTurn(
config: EngineConfig,
req: InteractRequest,
): Promise<InteractResponse> {
const annotated = await annotateClick(req.prevImageBase64, req.click);
const lastFrame = req.session.history.at(-1)?.frame;
const uiElements = lastFrame?.uiElements ?? [];
const intent = await interpret(config.vision, annotated, uiElements);
const updatedSession: Session = {
...req.session,
history: req.session.history.map((entry, idx, arr) =>
idx === arr.length - 1 ? { ...entry, click: req.click, intent } : entry,
idx === arr.length - 1
? { ...entry, click: req.click, intent: req.intent }
: entry,
),
};
@@ -66,6 +75,6 @@ export async function takeTurn(
session: updatedSession,
frame: nextFrame,
imageBase64: nextImage,
intent,
intent: req.intent,
};
}
+3 -3
View File
@@ -29,7 +29,7 @@ export function buildDirectorUserMessage(session: Session): string {
parts.push(`画风:${session.styleGuide}`);
if (session.history.length === 0) {
parts.push("\n这是故事的开场。请生成开场画面。");
parts.push("\n这是故事的开场。请生成开场画面,严格以 JSON 格式返回。");
return parts.join("\n");
}
@@ -47,7 +47,7 @@ export function buildDirectorUserMessage(session: Session): string {
parts.push(beat.join("\n"));
});
parts.push("\n请生成下一帧。");
parts.push("\n请生成下一帧,严格以 JSON 格式返回。");
return parts.join("\n");
}
@@ -111,5 +111,5 @@ export function buildVisionUserPrompt(uiElements: UIElement[]): string {
return `当前画面包含以下已知 UI 元素:
${list}
红点位置即为用户点击位置。请判断用户的意图。`;
红点位置即为用户点击位置。请判断用户的意图,并以 JSON 格式返回结果`;
}
+1 -1
View File
@@ -8,5 +8,5 @@ export async function render(
styleGuide: string,
): Promise<string> {
const prompt = buildImagePrompt(frame, styleGuide);
return generateImage(config, prompt, { size: "1024x1536", quality: "medium" });
return generateImage(config, prompt);
}