refactor(engine): move click annotation from sharp to browser Canvas
The vision pipeline used sharp server-side to draw a click marker on the scene image (engine/src/annotate.ts) and to render the MOCK_IMAGE placeholder PNG (engine/src/mockImage.ts). Both have moved off the server runtime:

- annotateClick → apps/web/lib/annotateClient.ts (Canvas 2D in the browser; toDataURL → raw PNG base64 forwarded to /api/vision). This saves a server-side image re-fetch per click and frees the engine from sharp's native binding (which doesn't run on Cloudflare Workers).
- mockImageDataUri → self-describing SVG data URI (no rendering needed).

VisionRequest contract change: prevImageUrl + click → annotatedImageBase64. The server forwards the bytes straight to the vision LLM as image_url.

sharp is removed from packages/engine entirely and from next.config.ts's serverExternalPackages. The apps/web/package.json + lockfile cleanup ships in the follow-up Cloudflare deployment commit.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -14,9 +14,9 @@ export async function POST(req: Request) {
|
|||||||
return NextResponse.json({ error: "Invalid JSON" }, { status: 400 });
|
return NextResponse.json({ error: "Invalid JSON" }, { status: 400 });
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!body.session || !body.prevImageUrl || !body.click) {
|
if (!body.session || !body.annotatedImageBase64) {
|
||||||
return NextResponse.json(
|
return NextResponse.json(
|
||||||
{ error: "session, prevImageUrl, click are required" },
|
{ error: "session and annotatedImageBase64 are required" },
|
||||||
{ status: 400 },
|
{ status: 400 },
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ import {
|
|||||||
useState,
|
useState,
|
||||||
} from "react";
|
} from "react";
|
||||||
import { PlayCanvas, type Phase } from "@/components/PlayCanvas";
|
import { PlayCanvas, type Phase } from "@/components/PlayCanvas";
|
||||||
|
import { annotateClick } from "@/lib/annotateClient";
|
||||||
import { PRESETS } from "@/lib/presets";
|
import { PRESETS } from "@/lib/presets";
|
||||||
import type {
|
import type {
|
||||||
Beat,
|
Beat,
|
||||||
@@ -746,10 +747,11 @@ function PlayInner() {
|
|||||||
setPendingClick(click);
|
setPendingClick(click);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
const annotatedImageBase64 = await annotateClick(imageUrl, click);
|
||||||
const visionRes = await fetch("/api/vision", {
|
const visionRes = await fetch("/api/vision", {
|
||||||
method: "POST",
|
method: "POST",
|
||||||
headers: { "Content-Type": "application/json" },
|
headers: { "Content-Type": "application/json" },
|
||||||
body: JSON.stringify({ session, prevImageUrl: imageUrl, click }),
|
body: JSON.stringify({ session, annotatedImageBase64 }),
|
||||||
});
|
});
|
||||||
if (!visionRes.ok) {
|
if (!visionRes.ok) {
|
||||||
const j = (await visionRes.json().catch(() => ({}))) as {
|
const j = (await visionRes.json().catch(() => ({}))) as {
|
||||||
|
|||||||
@@ -0,0 +1,78 @@
|
|||||||
|
const TARGET_WIDTH = 768;
|
||||||
|
|
||||||
|
/**
 * Browser-side equivalent of the former engine/src/annotate.ts. Redraws the
 * scene image with the player's click marker on a Canvas 2D and returns the
 * raw PNG base64 (no `data:` prefix); the server-side vision step wraps it
 * back into a data URL before posting to the vision LLM.
 *
 * crossOrigin="anonymous" + the CDN's Access-Control-Allow-Origin header are
 * both required to keep the canvas un-tainted; without them toDataURL throws
 * SecurityError. Runware's image CDN supports anonymous CORS; data: URIs
 * (MOCK_IMAGE mode) load without CORS.
 *
 * @param imageUrl URL (or data: URI) of the scene image to annotate.
 * @param click Click position as fractions of the image's width (`x`) and
 *   height (`y`). Values outside 0..1 are clamped to the image edge.
 * @returns The annotated image as PNG base64, without the `data:` prefix.
 * @throws Error if a click coordinate is not a finite number, if the image
 *   fails to load or times out, or if no Canvas 2D context is available.
 */
export async function annotateClick(
  imageUrl: string,
  click: { x: number; y: number },
): Promise<string> {
  // Canvas path methods silently ignore non-finite arguments, which would
  // send an image with NO marker to the vision LLM. Fail loudly instead.
  if (!Number.isFinite(click.x) || !Number.isFinite(click.y)) {
    throw new Error("Click coordinates must be finite numbers");
  }
  const clamp01 = (v: number): number => Math.min(1, Math.max(0, v));

  const img = await loadImage(imageUrl);

  // Downscale to TARGET_WIDTH at most; never enlarge. Math.max(1, …) keeps the
  // canvas from collapsing to a zero-sized dimension after rounding.
  const scale = Math.min(1, TARGET_WIDTH / img.naturalWidth);
  const w = Math.max(1, Math.round(img.naturalWidth * scale));
  const h = Math.max(1, Math.round(img.naturalHeight * scale));

  const canvas = document.createElement("canvas");
  canvas.width = w;
  canvas.height = h;
  const ctx = canvas.getContext("2d");
  if (!ctx) throw new Error("Canvas 2D context unavailable");

  ctx.drawImage(img, 0, 0, w, h);

  // Marker geometry scales with the shorter canvas side, with pixel minimums
  // so it stays visible on small images.
  const cx = Math.round(clamp01(click.x) * w);
  const cy = Math.round(clamp01(click.y) * h);
  const r = Math.max(8, Math.round(Math.min(w, h) * 0.025));
  const stroke = Math.max(2, Math.round(r * 0.25));

  // Outer disc: translucent red fill with a near-opaque white outline.
  ctx.beginPath();
  ctx.arc(cx, cy, r, 0, Math.PI * 2);
  ctx.fillStyle = "rgba(255,40,40,0.55)";
  ctx.fill();
  ctx.lineWidth = stroke;
  ctx.strokeStyle = "rgba(255,255,255,0.95)";
  ctx.stroke();

  // Inner dot: solid white, radius equal to the outline width.
  ctx.beginPath();
  ctx.arc(cx, cy, stroke, 0, Math.PI * 2);
  ctx.fillStyle = "rgba(255,255,255,1)";
  ctx.fill();

  const dataUrl = canvas.toDataURL("image/png");
  return dataUrl.replace(/^data:image\/png;base64,/, "");
}
|
||||||
|
|
||||||
|
/**
 * Loads `url` into a new image element with `crossOrigin = "anonymous"` and
 * resolves with the element once its `load` event fires.
 *
 * The 10s default timeout mirrors the old server-side annotator's 5s fetch
 * budget + headroom for browser decode. Without it a hung CDN response would
 * strand the player in `vision-thinking` forever.
 *
 * @param url Image URL or data: URI to load.
 * @param timeoutMs Milliseconds to wait before giving up (default 10000).
 * @returns The loaded image element.
 * @throws Error (via rejection) when the image fires `error` or the timeout
 *   elapses first.
 */
function loadImage(
  url: string,
  timeoutMs = 10_000,
): Promise<HTMLImageElement> {
  return new Promise((resolve, reject) => {
    const img = new Image();

    // Stop the timer and detach both handlers so nothing can fire after the
    // promise has settled (e.g. an `error` event triggered by clearing `src`
    // in the timeout path below).
    const cleanup = (): void => {
      clearTimeout(timer);
      img.onload = null;
      img.onerror = null;
    };

    const timer = setTimeout(() => {
      cleanup();
      // Clearing src asks the browser to drop the still-pending load.
      img.src = "";
      reject(new Error(`Image load timed out after ${timeoutMs}ms`));
    }, timeoutMs);

    // Must be set before `src` so the request is made in CORS mode.
    img.crossOrigin = "anonymous";
    img.onload = () => {
      cleanup();
      resolve(img);
    };
    img.onerror = () => {
      cleanup();
      reject(
        // Truncate so a long data: URI does not flood the error message.
        new Error(`Failed to load image for annotation: ${url.slice(0, 80)}`),
      );
    };
    img.src = url;
  });
}
|
||||||
@@ -10,7 +10,6 @@ const config: NextConfig = {
|
|||||||
"@infiplot/types",
|
"@infiplot/types",
|
||||||
"@infiplot/tts-client",
|
"@infiplot/tts-client",
|
||||||
],
|
],
|
||||||
serverExternalPackages: ["sharp"],
|
|
||||||
turbopack: {
|
turbopack: {
|
||||||
root: path.join(__dirname, "..", ".."),
|
root: path.join(__dirname, "..", ".."),
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -15,7 +15,6 @@
|
|||||||
"@infiplot/ai-client": "workspace:*",
|
"@infiplot/ai-client": "workspace:*",
|
||||||
"@infiplot/tts-client": "workspace:*",
|
"@infiplot/tts-client": "workspace:*",
|
||||||
"@infiplot/types": "workspace:*",
|
"@infiplot/types": "workspace:*",
|
||||||
"jsonrepair": "^3.14.0",
|
"jsonrepair": "^3.14.0"
|
||||||
"sharp": "^0.33.5"
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,111 +0,0 @@
|
|||||||
import sharp from "sharp";
|
|
||||||
|
|
||||||
const FETCH_TIMEOUT_MS = 5000;
|
|
||||||
const MAX_IMAGE_BYTES = 10 * 1024 * 1024; // 10 MB
|
|
||||||
|
|
||||||
/**
 * Validate that an imageUrl is safe to fetch server-side.
 *
 * Only https: and data: URIs are allowed; http: is rejected, and https: URLs
 * whose host is a loopback / private / link-local literal are rejected to
 * limit SSRF against internal services and cloud metadata endpoints.
 *
 * This is a lexical check on the URL's host only: it performs no DNS lookup,
 * so a public hostname that resolves to a private address is NOT caught here.
 *
 * @param url Candidate image URL or data: URI.
 * @throws TypeError if `url` is neither a data: URI nor parseable by `URL`.
 * @throws Error if the protocol is not https:, or the host is private/reserved.
 */
function assertSafeUrl(url: string): void {
  if (url.startsWith("data:")) return;

  const parsed = new URL(url);
  if (parsed.protocol !== "https:") {
    throw new Error(
      `prevImageUrl must use https: or data: protocol, got ${parsed.protocol}`,
    );
  }

  // Drop a trailing root dot ("localhost.") so it cannot dodge the checks.
  const host = parsed.hostname.toLowerCase().replace(/\.$/, "");
  if (isPrivateOrReservedHost(host)) {
    throw new Error(
      `prevImageUrl resolves to a private/reserved IP: ${host}`,
    );
  }
}

// True when `host` (as serialized by URL.hostname) is localhost or an IP
// literal in a loopback, private, link-local or "this network" range.
function isPrivateOrReservedHost(host: string): boolean {
  if (host === "localhost" || host.endsWith(".localhost")) return true;

  // Match a full dotted quad only, so DNS names such as "10.example.com"
  // are not mistaken for addresses in 10.0.0.0/8.
  const v4 = /^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/.exec(host);
  if (v4) {
    const a = Number(v4[1]);
    const b = Number(v4[2]);
    return (
      a === 0 || // 0.0.0.0/8 "this network"
      a === 10 || // 10.0.0.0/8 private
      a === 127 || // 127.0.0.0/8 loopback
      (a === 169 && b === 254) || // 169.254.0.0/16 link-local, incl. metadata
      (a === 172 && b >= 16 && b <= 31) || // 172.16.0.0/12 private
      (a === 192 && b === 168) // 192.168.0.0/16 private
    );
  }

  // URL.hostname keeps the brackets around IPv6 literals.
  if (host.startsWith("[") && host.endsWith("]")) {
    const v6 = host.slice(1, -1);
    return (
      v6 === "::1" || // loopback
      v6 === "::" || // unspecified
      /^f[cd][0-9a-f]{2}:/.test(v6) || // fc00::/7 unique-local
      /^fe[89ab][0-9a-f]:/.test(v6) || // fe80::/10 link-local
      v6.startsWith("::ffff:") // IPv4-mapped; rejected wholesale to be safe
    );
  }

  return false;
}
|
|
||||||
|
|
||||||
/**
 * Pull the bytes from an image URL or data URI into a Buffer suitable for
 * sharp. Data URIs are decoded inline (no network); https: URLs are fetched
 * with a short timeout — if Runware's CDN is slow we'd rather fail the vision
 * step quickly than tie up a 60s Vercel function on a single image read.
 *
 * Both paths enforce MAX_IMAGE_BYTES. The network path reads the body
 * incrementally so an oversized response without a Content-Length header is
 * abandoned as soon as the cap is crossed rather than after buffering it all.
 *
 * @param imageUrl https: URL or data: URI; validated with assertSafeUrl first.
 * @returns The decoded / downloaded image bytes.
 * @throws Error on an unsafe URL, malformed data URI, non-2xx response, or a
 *   payload larger than MAX_IMAGE_BYTES. An aborted fetch (timeout) rejects
 *   with whatever error `fetch` raises for an aborted signal.
 */
async function loadImageBuffer(imageUrl: string): Promise<Buffer> {
  assertSafeUrl(imageUrl);

  if (imageUrl.startsWith("data:")) {
    const comma = imageUrl.indexOf(",");
    if (comma === -1) throw new Error("Malformed data URI in prevImageUrl");
    const payload = imageUrl.slice(comma + 1);
    // Only payloads flagged `;base64` are base64; anything else is
    // percent-encoded text (e.g. an inline SVG) and must not be base64-decoded.
    const isBase64 = /;base64$/i.test(imageUrl.slice(0, comma));
    const bytes = isBase64
      ? Buffer.from(payload, "base64")
      : Buffer.from(decodeURIComponent(payload), "utf8");
    if (bytes.byteLength > MAX_IMAGE_BYTES) {
      throw new Error(
        `prevImageUrl response too large (${bytes.byteLength} bytes, max ${MAX_IMAGE_BYTES})`,
      );
    }
    return bytes;
  }

  const ctrl = new AbortController();
  const timer = setTimeout(() => ctrl.abort(), FETCH_TIMEOUT_MS);
  try {
    const res = await fetch(imageUrl, { signal: ctrl.signal });

    // fetch follows redirects; re-validate where we ended up so a redirect
    // to a disallowed URL cannot hand its body back to us. (The redirected
    // request has already been sent at this point.)
    if (res.redirected) assertSafeUrl(res.url);

    if (!res.ok) {
      throw new Error(
        `Failed to fetch prevImageUrl (${res.status}): ${imageUrl.slice(0, 120)}`,
      );
    }

    // Cheap early exit when the server declares an oversized body.
    const contentLength = res.headers.get("content-length");
    if (contentLength && Number(contentLength) > MAX_IMAGE_BYTES) {
      throw new Error(
        `prevImageUrl response too large (${contentLength} bytes, max ${MAX_IMAGE_BYTES})`,
      );
    }

    // No readable stream exposed: fall back to buffering, then check size.
    if (!res.body) {
      const arr = await res.arrayBuffer();
      if (arr.byteLength > MAX_IMAGE_BYTES) {
        throw new Error(
          `prevImageUrl response too large (${arr.byteLength} bytes, max ${MAX_IMAGE_BYTES})`,
        );
      }
      return Buffer.from(arr);
    }

    // Stream the body so the cap is enforced while reading, not afterwards.
    const reader = res.body.getReader();
    const chunks: Uint8Array[] = [];
    let received = 0;
    for (;;) {
      const { done, value } = await reader.read();
      if (done) break;
      received += value.byteLength;
      if (received > MAX_IMAGE_BYTES) {
        await reader.cancel().catch(() => undefined);
        throw new Error(
          `prevImageUrl response too large (more than ${MAX_IMAGE_BYTES} bytes)`,
        );
      }
      chunks.push(value);
    }
    return Buffer.concat(chunks, received);
  } finally {
    // The timer stays armed during the body read, so a stalled download is
    // aborted too; always clear it on the way out.
    clearTimeout(timer);
  }
}
|
|
||||||
|
|
||||||
/**
 * Marks the player's click point on the scene image so the vision LLM can see
 * WHERE they tapped. Output is base64 because the vision LLM is called over
 * the OpenAI-compatible chat endpoint, which only accepts image_url data URIs
 * — we can't hand it a Runware CDN URL directly.
 *
 * @param imageUrl https: URL or data: URI of the scene image.
 * @param click Click position as fractions of the image's width (`x`) and
 *   height (`y`). Values outside 0..1 are clamped to the image edge.
 * @returns The annotated image as PNG base64 (no `data:` prefix).
 * @throws Error if a click coordinate is not a finite number, or if loading
 *   the image fails; errors raised by sharp propagate unchanged.
 */
export async function annotateClick(
  imageUrl: string,
  click: { x: number; y: number },
): Promise<string> {
  // NaN/Infinity would end up as "NaN" attributes in the overlay SVG below.
  if (!Number.isFinite(click.x) || !Number.isFinite(click.y)) {
    throw new Error("Click coordinates must be finite numbers");
  }
  const clamp01 = (v: number): number => Math.min(1, Math.max(0, v));

  const buf = await loadImageBuffer(imageUrl);

  // Resize to at most 768px wide (never enlarging). resolveWithObject returns
  // the output dimensions alongside the bytes, so no second decode is needed
  // just to read width/height.
  const { data: resized, info } = await sharp(buf)
    .resize({ width: 768, withoutEnlargement: true, fit: "inside" })
    .png()
    .toBuffer({ resolveWithObject: true });
  const w = info.width;
  const h = info.height;

  // Marker geometry scales with the shorter side, with pixel minimums so it
  // stays visible on small images.
  const cx = Math.round(clamp01(click.x) * w);
  const cy = Math.round(clamp01(click.y) * h);
  const r = Math.max(8, Math.round(Math.min(w, h) * 0.025));
  const stroke = Math.max(2, Math.round(r * 0.25));

  // Full-size transparent SVG overlay: translucent red disc with a white
  // outline, plus a solid white centre dot.
  const svg = `<svg xmlns="http://www.w3.org/2000/svg" width="${w}" height="${h}" viewBox="0 0 ${w} ${h}">
  <circle cx="${cx}" cy="${cy}" r="${r}" fill="rgba(255,40,40,0.55)"
    stroke="rgba(255,255,255,0.95)" stroke-width="${stroke}" />
  <circle cx="${cx}" cy="${cy}" r="${Math.round(r * 0.25)}"
    fill="rgba(255,255,255,1)" />
</svg>`;

  const out = await sharp(resized)
    .composite([{ input: Buffer.from(svg), top: 0, left: 0 }])
    .png({ compressionLevel: 9 })
    .toBuffer();

  return out.toString("base64");
}
|
|
||||||
@@ -5,7 +5,6 @@ export {
|
|||||||
requestInsertBeat,
|
requestInsertBeat,
|
||||||
requestBeatAudio,
|
requestBeatAudio,
|
||||||
} from "./orchestrator";
|
} from "./orchestrator";
|
||||||
export { annotateClick } from "./annotate";
|
|
||||||
export { synthesizeBeat } from "./voice";
|
export { synthesizeBeat } from "./voice";
|
||||||
export { mergeCharacters } from "./director";
|
export { mergeCharacters } from "./director";
|
||||||
export type { SceneResult } from "./director";
|
export type { SceneResult } from "./director";
|
||||||
|
|||||||
@@ -1,29 +1,25 @@
|
|||||||
import sharp from "sharp";
|
// Static SVG placeholder used when MOCK_IMAGE=true, so we can exercise the
|
||||||
|
// TTS path without paying for image generation. Returned as a data URI so the
|
||||||
let cachedDataUri: string | undefined;
|
// rest of the pipeline can treat it as an `imageUrl` interchangeably with
|
||||||
|
// real Runware URLs (the client's <img src> accepts both, and we never feed
|
||||||
// A static 16:9 placeholder used when MOCK_IMAGE=true, so we can exercise the
|
// a mock image to Runware's referenceImages because mockImage mode
|
||||||
// TTS path without paying for image generation. Generated once, then memoized.
|
// short-circuits the Painter entirely).
|
||||||
// Returned as a data URI so the rest of the pipeline can treat it as an
|
//
|
||||||
// `imageUrl` interchangeably with real Runware URLs (the client's <img src>
|
// Previously rendered to PNG via sharp; switched to a self-describing SVG
|
||||||
// accepts both, and we never feed a mock image to Runware's referenceImages
|
// data URI so the engine has zero Node-native dependencies and runs on
|
||||||
// because mockImage mode short-circuits the Painter entirely).
|
// Cloudflare Workers. SVG also stays crisp at any display size.
|
||||||
export async function mockImageDataUri(): Promise<string> {
|
|
||||||
if (cachedDataUri) return cachedDataUri;
|
|
||||||
|
|
||||||
const W = 1792;
|
const W = 1792;
|
||||||
const H = 1024;
|
const H = 1024;
|
||||||
const svg = `<svg xmlns="http://www.w3.org/2000/svg" width="${W}" height="${H}">
|
const SVG = `<svg xmlns="http://www.w3.org/2000/svg" width="${W}" height="${H}">
|
||||||
<rect width="${W}" height="${H}" fill="#161109"/>
|
<rect width="${W}" height="${H}" fill="#161109"/>
|
||||||
<rect x="2" y="2" width="${W - 4}" height="${H - 4}" fill="none"
|
<rect x="2" y="2" width="${W - 4}" height="${H - 4}" fill="none" stroke="#5a4628" stroke-width="3" stroke-dasharray="14 10"/>
|
||||||
stroke="#5a4628" stroke-width="3" stroke-dasharray="14 10"/>
|
<text x="50%" y="45%" fill="#b88f4a" font-family="Georgia, serif" font-size="72" letter-spacing="6" text-anchor="middle">MOCK IMAGE</text>
|
||||||
<text x="50%" y="45%" fill="#b88f4a" font-family="Georgia, serif"
|
<text x="50%" y="53%" fill="#6e5430" font-family="Georgia, serif" font-size="30" letter-spacing="3" text-anchor="middle">TTS TEST — image generation skipped</text>
|
||||||
font-size="72" letter-spacing="6" text-anchor="middle">MOCK IMAGE</text>
|
|
||||||
<text x="50%" y="53%" fill="#6e5430" font-family="Georgia, serif"
|
|
||||||
font-size="30" letter-spacing="3" text-anchor="middle">TTS TEST — image generation skipped</text>
|
|
||||||
</svg>`;
|
</svg>`;
|
||||||
|
|
||||||
const png = await sharp(Buffer.from(svg)).png().toBuffer();
|
const DATA_URI = `data:image/svg+xml;charset=utf-8,${encodeURIComponent(SVG)}`;
|
||||||
cachedDataUri = `data:image/png;base64,${png.toString("base64")}`;
|
|
||||||
return cachedDataUri;
|
export async function mockImageDataUri(): Promise<string> {
|
||||||
|
return DATA_URI;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -13,7 +13,6 @@ import type {
|
|||||||
VisionResponse,
|
VisionResponse,
|
||||||
} from "@infiplot/types";
|
} from "@infiplot/types";
|
||||||
import { runArchitect } from "./agents/architect";
|
import { runArchitect } from "./agents/architect";
|
||||||
import { annotateClick } from "./annotate";
|
|
||||||
import { directInsertBeat, directScene } from "./director";
|
import { directInsertBeat, directScene } from "./director";
|
||||||
import { synthesizeBeat } from "./voice";
|
import { synthesizeBeat } from "./voice";
|
||||||
import { interpret } from "./vision";
|
import { interpret } from "./vision";
|
||||||
@@ -109,9 +108,8 @@ export async function visionDecide(
|
|||||||
config: EngineConfig,
|
config: EngineConfig,
|
||||||
req: VisionRequest,
|
req: VisionRequest,
|
||||||
): Promise<VisionResponse> {
|
): Promise<VisionResponse> {
|
||||||
const annotated = await annotateClick(req.prevImageUrl, req.click);
|
|
||||||
const current = req.session.history.at(-1)?.scene ?? null;
|
const current = req.session.history.at(-1)?.scene ?? null;
|
||||||
return interpret(config.vision, annotated, current);
|
return interpret(config.vision, req.annotatedImageBase64, current);
|
||||||
}
|
}
|
||||||
|
|
||||||
// ──────────────────────────────────────────────────────────────────────
|
// ──────────────────────────────────────────────────────────────────────
|
||||||
|
|||||||
@@ -67,10 +67,11 @@ export type Scene = {
|
|||||||
imageUuid?: string;
|
imageUuid?: string;
|
||||||
/**
|
/**
|
||||||
* Public CDN URL of this Scene's generated image. Returned to the client for
|
* Public CDN URL of this Scene's generated image. Returned to the client for
|
||||||
* `<img src>` rendering, and is what the client passes back to `/api/vision`
|
* `<img src>` rendering; the client also feeds it through a Canvas 2D click
|
||||||
* as `prevImageUrl` so the server can re-fetch the bytes for click annotation.
|
* annotator before posting to `/api/vision` (see
|
||||||
|
* `VisionRequest.annotatedImageBase64`).
|
||||||
*
|
*
|
||||||
* For MOCK_IMAGE=true this is a `data:image/png;base64,...` data URI, not a
|
* For MOCK_IMAGE=true this is a `data:image/svg+xml;...` data URI, not a
|
||||||
* Runware URL — the client renders both forms transparently.
|
* Runware URL — the client renders both forms transparently.
|
||||||
*/
|
*/
|
||||||
imageUrl?: string;
|
imageUrl?: string;
|
||||||
@@ -306,12 +307,16 @@ export type BeatAudioResponse = {
|
|||||||
export type VisionRequest = {
|
export type VisionRequest = {
|
||||||
session: Session;
|
session: Session;
|
||||||
/**
|
/**
|
||||||
* Public CDN URL (or data URI in MOCK_IMAGE mode) of the scene the player
|
* Raw PNG base64 (no `data:` prefix) of the scene image WITH the player's
|
||||||
* just clicked. The server re-fetches the bytes to annotate the click and
|
* click marker already drawn on it by the browser's Canvas 2D. The server
|
||||||
* pass an OpenAI-compatible image_url to the vision LLM.
|
* forwards this straight to the vision LLM as an OpenAI-compatible
|
||||||
|
* image_url.
|
||||||
|
*
|
||||||
|
* Annotation lives client-side so the engine has no Node-native image
|
||||||
|
* dependency (sharp doesn't run on Cloudflare Workers) and we save a
|
||||||
|
* server-side image re-fetch per click.
|
||||||
*/
|
*/
|
||||||
prevImageUrl: string;
|
annotatedImageBase64: string;
|
||||||
click: { x: number; y: number };
|
|
||||||
};
|
};
|
||||||
|
|
||||||
export type VisionResponse = {
|
export type VisionResponse = {
|
||||||
|
|||||||
Reference in New Issue
Block a user