feat: prefetch, vision split, provider adapter, UI polish

Engine
- Split /api/vision out from /api/interact so the client can drive prefetch and cache lookup independently of click interpretation
- Image client switched to the chat-completions + modalities API (OpenRouter-style providers), supporting markdown image URL responses
- annotateClick now resizes to 768px wide before compositing, to keep vision payloads small and avoid CDN timeouts
- Prompts updated to mention "JSON" in user messages (required by Gemini's strict JSON mode)
- Shared fetchWithRetry helper: 2 retries for chat/image, 0 for vision (with a 60s hard timeout)

Client
- Parallel prefetch of all three choice branches on each new frame
- The effect deliberately excludes phase from its deps so a user click doesn't abort in-flight prefetches
- Cache hit / miss / free-form fallback handled in handleClick
- PlayCanvas reads the img naturalWidth/naturalHeight and adapts the container to whatever aspect ratio the AI returns (no more cropped third choice)
- max-width raised to 560px, max-height calc(100dvh - 200px)

Misc
- README env path corrected to apps/web/.env.local
- users.md: BGM/TTS idea note
- .env.example moved into apps/web alongside the next config

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-12 19:38:03 +08:00
parent ad4b09c744
commit 9cedfa66e4
20 changed files with 405 additions and 151 deletions
@@ -5,25 +5,31 @@ export async function annotateClick(
  click: { x: number; y: number },
 ): Promise<string> {
  const buf = Buffer.from(imageBase64, "base64");
-  const meta = await sharp(buf).metadata();
-  const w = meta.width ?? 1024;
-  const h = meta.height ?? 1536;
+
+  const resized = await sharp(buf)
+    .resize({ width: 768, withoutEnlargement: true, fit: "inside" })
+    .png()
+    .toBuffer();
+
+  const meta = await sharp(resized).metadata();
+  const w = meta.width ?? 768;
+  const h = meta.height ?? 1152;

  const cx = Math.round(click.x * w);
  const cy = Math.round(click.y * h);
-  const r = Math.round(Math.min(w, h) * 0.025);
-  const stroke = Math.max(3, Math.round(r * 0.25));
+  const r = Math.max(8, Math.round(Math.min(w, h) * 0.025));
+  const stroke = Math.max(2, Math.round(r * 0.25));

-  const svg = `<svg xmlns="http://www.w3.org/2000/svg" width="${w}" height="${h}">
+  const svg = `<svg xmlns="http://www.w3.org/2000/svg" width="${w}" height="${h}" viewBox="0 0 ${w} ${h}">
    <circle cx="${cx}" cy="${cy}" r="${r}" fill="rgba(255,40,40,0.55)"
            stroke="rgba(255,255,255,0.95)" stroke-width="${stroke}" />
    <circle cx="${cx}" cy="${cy}" r="${Math.round(r * 0.25)}"
            fill="rgba(255,255,255,1)" />
  </svg>`;

-  const out = await sharp(buf)
+  const out = await sharp(resized)
    .composite([{ input: Buffer.from(svg), top: 0, left: 0 }])
-    .png()
+    .png({ compressionLevel: 9 })
    .toBuffer();

  return out.toString("base64");
@@ -1,3 +1,3 @@
-export { startSession, takeTurn } from "./orchestrator";
+export { startSession, takeTurn, visionTurn } from "./orchestrator";
 export { annotateClick } from "./annotate";
 export * from "./prompts";
@@ -1,10 +1,13 @@
 import type {
+  ClickIntent,
  EngineConfig,
  InteractRequest,
  InteractResponse,
  Session,
  StartRequest,
  StartResponse,
+  VisionRequest,
+  VisionResponse,
 } from "@dada/types";
 import { annotateClick } from "./annotate";
 import { direct } from "./director";
@@ -37,21 +40,27 @@ export async function startSession(
  };
 }

+export async function visionTurn( // Vision-only turn: turns a click into an intent; no next frame is generated here.
+  config: EngineConfig, // only config.vision is read by this function
+  req: VisionRequest, // supplies prevImageBase64, click, and session
+): Promise<VisionResponse> { // resolves to { intent }
+  const annotated = await annotateClick(req.prevImageBase64, req.click); // mark the click position on the previous image
+  const lastFrame = req.session.history.at(-1)?.frame; // frame of the most recent history entry; undefined when history is empty
+  const uiElements = lastFrame?.uiElements ?? []; // fall back to an empty list when there is no frame or no uiElements
+  const intent = await interpret(config.vision, annotated, uiElements); // NOTE(review): interpret is defined outside this view — presumably the vision-model call; confirm
+  return { intent };
+}
+
 export async function takeTurn(
  config: EngineConfig,
  req: InteractRequest,
 ): Promise<InteractResponse> {
-  const annotated = await annotateClick(req.prevImageBase64, req.click);
-
-  const lastFrame = req.session.history.at(-1)?.frame;
-  const uiElements = lastFrame?.uiElements ?? [];
-
-  const intent = await interpret(config.vision, annotated, uiElements);
-
  const updatedSession: Session = {
    ...req.session,
    history: req.session.history.map((entry, idx, arr) =>
-      idx === arr.length - 1 ? { ...entry, click: req.click, intent } : entry,
+      idx === arr.length - 1
+        ? { ...entry, click: req.click, intent: req.intent }
+        : entry,
    ),
  };

@@ -66,6 +75,6 @@ export async function takeTurn(
    session: updatedSession,
    frame: nextFrame,
    imageBase64: nextImage,
-    intent,
+    intent: req.intent,
  };
 }
@@ -29,7 +29,7 @@ export function buildDirectorUserMessage(session: Session): string {
  parts.push(`画风：${session.styleGuide}`);

  if (session.history.length === 0) {
-    parts.push("\n这是故事的开场。请生成开场画面。");
+    parts.push("\n这是故事的开场。请生成开场画面，严格以 JSON 格式返回。");
    return parts.join("\n");
  }

@@ -47,7 +47,7 @@ export function buildDirectorUserMessage(session: Session): string {
    parts.push(beat.join("\n"));
  });

-  parts.push("\n请生成下一帧。");
+  parts.push("\n请生成下一帧，严格以 JSON 格式返回。");
  return parts.join("\n");
 }

@@ -111,5 +111,5 @@ export function buildVisionUserPrompt(uiElements: UIElement[]): string {
  return `当前画面包含以下已知 UI 元素：
 ${list}

-红点位置即为用户点击位置。请判断用户的意图。`;
+红点位置即为用户点击位置。请判断用户的意图，并以 JSON 格式返回结果。`;
 }
@@ -8,5 +8,5 @@ export async function render(
  styleGuide: string,
 ): Promise<string> {
  const prompt = buildImagePrompt(frame, styleGuide);
-  return generateImage(config, prompt, { size: "1024x1536", quality: "medium" });
+  return generateImage(config, prompt);
 }