refactor(engine): move click annotation from sharp to browser Canvas

The vision pipeline used sharp to draw a click marker on the scene image
server-side (engine/src/annotate.ts) and to render the MOCK_IMAGE
placeholder PNG (engine/src/mockImage.ts). Neither needs sharp at runtime
any more:

- annotateClick → apps/web/lib/annotateClient.ts (Canvas 2D in the
  browser; toDataURL → raw PNG base64 forwarded to /api/vision). Saves
  a server-side image re-fetch per click and frees the engine from
  sharp's native binding (which doesn't run on Cloudflare Workers).
- mockImageDataUri → self-describing SVG data URI (no rendering needed).

VisionRequest contract changes: prevImageUrl + click → annotatedImageBase64.
Server forwards the bytes straight to the vision LLM as image_url.

sharp is removed from packages/engine entirely and from next.config.ts's
serverExternalPackages. apps/web/package.json + lockfile cleanup ships
in the follow-up Cloudflare deployment commit.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
yuanzonghao
2026-06-02 21:46:45 +08:00
parent dd8b60c06b
commit 346d5359d4
10 changed files with 119 additions and 154 deletions
+1 -2
View File
@@ -15,7 +15,6 @@
"@infiplot/ai-client": "workspace:*",
"@infiplot/tts-client": "workspace:*",
"@infiplot/types": "workspace:*",
"jsonrepair": "^3.14.0",
"sharp": "^0.33.5"
"jsonrepair": "^3.14.0"
}
}
-111
View File
@@ -1,111 +0,0 @@
import sharp from "sharp";
// How long (in milliseconds) loadImageBuffer waits on the remote image fetch
// before aborting it via AbortController.
const FETCH_TIMEOUT_MS = 5000;
// Largest image payload loadImageBuffer will accept from a remote URL; checked
// against both the Content-Length header and the actual body size.
const MAX_IMAGE_BYTES = 10 * 1024 * 1024; // 10 MB
// Validate that an imageUrl is safe to fetch server-side.
// Only https: and data: URIs are allowed; http: is rejected, and https: URLs
// whose host is a loopback / private / link-local address literal (IPv4 or
// IPv6) or a localhost name are refused, to prevent SSRF via private IPs and
// cloud metadata endpoints.
//
// This is a lexical check on the URL string only: it does not resolve DNS, so
// a public hostname that resolves to a private address is NOT caught here.
//
// Throws Error when the URL cannot be parsed, uses a disallowed protocol, or
// names a private/reserved host. Returns normally otherwise.
function assertSafeUrl(url: string): void {
  // data: URIs are decoded inline by the caller and never touch the network.
  if (url.startsWith("data:")) return;

  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    // Surface a message that names the offending field instead of the bare
    // "Invalid URL" TypeError the URL constructor throws.
    throw new Error(`prevImageUrl is not a valid URL: ${url.slice(0, 120)}`);
  }

  if (parsed.protocol !== "https:") {
    throw new Error(
      `prevImageUrl must use https: or data: protocol, got ${parsed.protocol}`,
    );
  }

  const host = parsed.hostname;
  if (isPrivateHost(host)) {
    throw new Error(
      `prevImageUrl resolves to a private/reserved IP: ${host}`,
    );
  }
}

// True when a URL hostname is a localhost name or a private/reserved IP literal.
function isPrivateHost(hostname: string): boolean {
  let host = hostname.toLowerCase();
  // "localhost." (explicit root dot) names the same host as "localhost".
  if (host.endsWith(".")) host = host.slice(0, -1);
  // The URL API reports IPv6 literals wrapped in brackets.
  if (host.startsWith("[") && host.endsWith("]")) host = host.slice(1, -1);

  if (host === "localhost" || host.endsWith(".localhost")) return true;

  const octets = parseIpv4(host);
  if (octets) return isPrivateIpv4(octets);

  // Only IPv6 literals can contain ":" in a parsed hostname.
  if (host.includes(":")) return isPrivateIpv6(host);

  // A regular DNS name (including ones that merely start with digits, such as
  // "10.example.com") is not an address literal.
  return false;
}

// Parses a dotted-quad IPv4 literal into its four octets, or null if not one.
function parseIpv4(host: string): number[] | null {
  const match = /^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/.exec(host);
  if (!match) return null;
  const octets = match.slice(1, 5).map(Number);
  return octets.every((octet) => octet <= 255) ? octets : null;
}

// True for IPv4 ranges that must never be fetched from the server.
function isPrivateIpv4(octets: readonly number[]): boolean {
  // Missing octets default to 0, which lands in the blocked 0.0.0.0/8 range.
  const [a = 0, b = 0] = octets;
  return (
    a === 0 || // 0.0.0.0/8 "this network"
    a === 10 || // 10.0.0.0/8 private
    a === 127 || // 127.0.0.0/8 loopback
    (a === 100 && b >= 64 && b <= 127) || // 100.64.0.0/10 shared address space
    (a === 169 && b === 254) || // 169.254.0.0/16 link-local (cloud metadata)
    (a === 172 && b >= 16 && b <= 31) || // 172.16.0.0/12 private
    (a === 192 && b === 168) // 192.168.0.0/16 private
  );
}

// True for IPv6 loopback, unspecified, unique-local, link-local, and
// IPv4-mapped addresses whose embedded IPv4 address is private.
function isPrivateIpv6(address: string): boolean {
  if (address === "::" || address === "::1") return true;

  // IPv4-mapped form as serialized by the URL parser, e.g. ::ffff:7f00:1.
  const mapped = /^::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/.exec(address);
  if (mapped) {
    const high = parseInt(mapped[1] ?? "0", 16);
    const low = parseInt(mapped[2] ?? "0", 16);
    return isPrivateIpv4([high >> 8, high & 0xff, low >> 8, low & 0xff]);
  }

  const firstGroup = parseInt(address.split(":")[0] || "0", 16);
  return (
    (firstGroup & 0xfe00) === 0xfc00 || // fc00::/7 unique local
    (firstGroup & 0xffc0) === 0xfe80 // fe80::/10 link-local
  );
}
// Resolve an image reference (https: URL or data: URI) to its raw bytes as a
// Buffer that sharp can consume.
//
// - data: URIs are base64-decoded in place; no network access happens.
// - https: URLs are fetched under a FETCH_TIMEOUT_MS abort timer. If Runware's
//   CDN is slow we would rather fail the vision step quickly than tie up a
//   60s Vercel function on a single image read.
//
// Rejects when the URL fails assertSafeUrl, the data URI has no comma, the
// HTTP response is not ok, or the payload exceeds MAX_IMAGE_BYTES.
async function loadImageBuffer(imageUrl: string): Promise<Buffer> {
  assertSafeUrl(imageUrl);

  if (imageUrl.startsWith("data:")) {
    // Everything after the first comma is the base64 payload.
    const separatorAt = imageUrl.indexOf(",");
    if (separatorAt === -1) {
      throw new Error("Malformed data URI in prevImageUrl");
    }
    return Buffer.from(imageUrl.slice(separatorAt + 1), "base64");
  }

  // Shared formatter for the two size-limit failures below.
  const tooLarge = (size: string | number): Error =>
    new Error(
      `prevImageUrl response too large (${size} bytes, max ${MAX_IMAGE_BYTES})`,
    );

  const aborter = new AbortController();
  const deadline = setTimeout(() => aborter.abort(), FETCH_TIMEOUT_MS);
  try {
    const response = await fetch(imageUrl, { signal: aborter.signal });
    if (!response.ok) {
      throw new Error(
        `Failed to fetch prevImageUrl (${response.status}): ${imageUrl.slice(0, 120)}`,
      );
    }

    // Cheap early exit when the server declares an oversized body up front.
    const declaredLength = response.headers.get("content-length");
    if (declaredLength && Number(declaredLength) > MAX_IMAGE_BYTES) {
      throw tooLarge(declaredLength);
    }

    // The header may be absent or wrong, so re-check the bytes actually read.
    const body = await response.arrayBuffer();
    if (body.byteLength > MAX_IMAGE_BYTES) {
      throw tooLarge(body.byteLength);
    }

    return Buffer.from(body);
  } finally {
    // Always disarm the abort timer, whether the fetch succeeded or threw.
    clearTimeout(deadline);
  }
}
// Draws a marker at the player's click point on the scene image so the vision
// LLM can see WHERE they tapped, and returns the result as base64-encoded PNG.
//
// Base64 is used because the vision LLM is called over the OpenAI-compatible
// chat endpoint, which only accepts image_url data URIs — a Runware CDN URL
// cannot be handed to it directly.
//
// `click.x` / `click.y` are multiplied by the resized image's width / height,
// i.e. they are treated as fractions of the image dimensions.
export async function annotateClick(
  imageUrl: string,
  click: { x: number; y: number },
): Promise<string> {
  const sourceBytes = await loadImageBuffer(imageUrl);

  // Cap the width at 768px (never upscaling) and normalize to PNG.
  const baseImage = await sharp(sourceBytes)
    .resize({ width: 768, withoutEnlargement: true, fit: "inside" })
    .png()
    .toBuffer();

  // Read back the real post-resize dimensions; fall back to 768x1152 if sharp
  // does not report them.
  const { width, height } = await sharp(baseImage).metadata();
  const imgW = width ?? 768;
  const imgH = height ?? 1152;

  // Convert the fractional click into pixel coordinates.
  const markerX = Math.round(click.x * imgW);
  const markerY = Math.round(click.y * imgH);

  // Marker radius is 2.5% of the shorter side (minimum 8px); the outline is a
  // quarter of the radius (minimum 2px).
  const radius = Math.max(8, Math.round(Math.min(imgW, imgH) * 0.025));
  const outline = Math.max(2, Math.round(radius * 0.25));

  // Full-size SVG overlay: translucent red disc with a white ring, plus a
  // solid white dot at the exact click position.
  const overlaySvg = [
    `<svg xmlns="http://www.w3.org/2000/svg" width="${imgW}" height="${imgH}" viewBox="0 0 ${imgW} ${imgH}">`,
    `<circle cx="${markerX}" cy="${markerY}" r="${radius}" fill="rgba(255,40,40,0.55)"`,
    `stroke="rgba(255,255,255,0.95)" stroke-width="${outline}" />`,
    `<circle cx="${markerX}" cy="${markerY}" r="${Math.round(radius * 0.25)}"`,
    `fill="rgba(255,255,255,1)" />`,
    `</svg>`,
  ].join("\n");

  const annotated = await sharp(baseImage)
    .composite([{ input: Buffer.from(overlaySvg), top: 0, left: 0 }])
    .png({ compressionLevel: 9 })
    .toBuffer();

  return annotated.toString("base64");
}
-1
View File
@@ -5,7 +5,6 @@ export {
requestInsertBeat,
requestBeatAudio,
} from "./orchestrator";
export { annotateClick } from "./annotate";
export { synthesizeBeat } from "./voice";
export { mergeCharacters } from "./director";
export type { SceneResult } from "./director";
+21 -25
View File
@@ -1,29 +1,25 @@
import sharp from "sharp";
// Static SVG placeholder used when MOCK_IMAGE=true, so we can exercise the
// TTS path without paying for image generation. Returned as a data URI so the
// rest of the pipeline can treat it as an `imageUrl` interchangeably with
// real Runware URLs (the client's <img src> accepts both, and we never feed
// a mock image to Runware's referenceImages because mockImage mode
// short-circuits the Painter entirely).
//
// Previously rendered to PNG via sharp; switched to a self-describing SVG
// data URI so the engine has zero Node-native dependencies and runs on
// Cloudflare Workers. SVG also stays crisp at any display size.
let cachedDataUri: string | undefined;
const W = 1792;
const H = 1024;
const SVG = `<svg xmlns="http://www.w3.org/2000/svg" width="${W}" height="${H}">
<rect width="${W}" height="${H}" fill="#161109"/>
<rect x="2" y="2" width="${W - 4}" height="${H - 4}" fill="none" stroke="#5a4628" stroke-width="3" stroke-dasharray="14 10"/>
<text x="50%" y="45%" fill="#b88f4a" font-family="Georgia, serif" font-size="72" letter-spacing="6" text-anchor="middle">MOCK IMAGE</text>
<text x="50%" y="53%" fill="#6e5430" font-family="Georgia, serif" font-size="30" letter-spacing="3" text-anchor="middle">TTS TEST — image generation skipped</text>
</svg>`;
const DATA_URI = `data:image/svg+xml;charset=utf-8,${encodeURIComponent(SVG)}`;
// A static 16:9 placeholder used when MOCK_IMAGE=true, so we can exercise the
// TTS path without paying for image generation. Generated once, then memoized.
// Returned as a data URI so the rest of the pipeline can treat it as an
// `imageUrl` interchangeably with real Runware URLs (the client's <img src>
// accepts both, and we never feed a mock image to Runware's referenceImages
// because mockImage mode short-circuits the Painter entirely).
export async function mockImageDataUri(): Promise<string> {
if (cachedDataUri) return cachedDataUri;
const W = 1792;
const H = 1024;
const svg = `<svg xmlns="http://www.w3.org/2000/svg" width="${W}" height="${H}">
<rect width="${W}" height="${H}" fill="#161109"/>
<rect x="2" y="2" width="${W - 4}" height="${H - 4}" fill="none"
stroke="#5a4628" stroke-width="3" stroke-dasharray="14 10"/>
<text x="50%" y="45%" fill="#b88f4a" font-family="Georgia, serif"
font-size="72" letter-spacing="6" text-anchor="middle">MOCK IMAGE</text>
<text x="50%" y="53%" fill="#6e5430" font-family="Georgia, serif"
font-size="30" letter-spacing="3" text-anchor="middle">TTS TEST — image generation skipped</text>
</svg>`;
const png = await sharp(Buffer.from(svg)).png().toBuffer();
cachedDataUri = `data:image/png;base64,${png.toString("base64")}`;
return cachedDataUri;
return DATA_URI;
}
+1 -3
View File
@@ -13,7 +13,6 @@ import type {
VisionResponse,
} from "@infiplot/types";
import { runArchitect } from "./agents/architect";
import { annotateClick } from "./annotate";
import { directInsertBeat, directScene } from "./director";
import { synthesizeBeat } from "./voice";
import { interpret } from "./vision";
@@ -109,9 +108,8 @@ export async function visionDecide(
config: EngineConfig,
req: VisionRequest,
): Promise<VisionResponse> {
const annotated = await annotateClick(req.prevImageUrl, req.click);
const current = req.session.history.at(-1)?.scene ?? null;
return interpret(config.vision, annotated, current);
return interpret(config.vision, req.annotatedImageBase64, current);
}
// ──────────────────────────────────────────────────────────────────────
+13 -8
View File
@@ -67,10 +67,11 @@ export type Scene = {
imageUuid?: string;
/**
* Public CDN URL of this Scene's generated image. Returned to the client for
* `<img src>` rendering, and is what the client passes back to `/api/vision`
* as `prevImageUrl` so the server can re-fetch the bytes for click annotation.
* `<img src>` rendering; the client also feeds it through a Canvas 2D click
* annotator before posting to `/api/vision` (see
* `VisionRequest.annotatedImageBase64`).
*
* For MOCK_IMAGE=true this is a `data:image/png;base64,...` data URI, not a
* For MOCK_IMAGE=true this is a `data:image/svg+xml;...` data URI, not a
* Runware URL — the client renders both forms transparently.
*/
imageUrl?: string;
@@ -306,12 +307,16 @@ export type BeatAudioResponse = {
export type VisionRequest = {
session: Session;
/**
* Public CDN URL (or data URI in MOCK_IMAGE mode) of the scene the player
* just clicked. The server re-fetches the bytes to annotate the click and
* pass an OpenAI-compatible image_url to the vision LLM.
* Raw PNG base64 (no `data:` prefix) of the scene image WITH the player's
* click marker already drawn on it by the browser's Canvas 2D. The server
* forwards this straight to the vision LLM as an OpenAI-compatible
* image_url.
*
* Annotation lives client-side so the engine has no Node-native image
* dependency (sharp doesn't run on Cloudflare Workers) and we save a
* server-side image re-fetch per click.
*/
prevImageUrl: string;
click: { x: number; y: number };
annotatedImageBase64: string;
};
export type VisionResponse = {