fix: address Copilot review — SSRF validation + log truncation

- annotate.ts: add assertSafeUrl() to reject non-https/data URLs and
  private/reserved IPs (SSRF prevention); cap response body to 10 MB
- jsonParser.ts: truncate raw model output in error log to first 800
  chars to avoid flooding logs / leaking sensitive content
This commit is contained in:
yuanzonghao
2026-06-01 16:29:08 +08:00
parent addbede929
commit 42a09c42f8
2 changed files with 46 additions and 4 deletions
+42 -1
View File
@@ -1,12 +1,42 @@
import sharp from "sharp";
// Timeout budget in milliseconds (5 s). Presumably the abort deadline for the
// image fetch in loadImageBuffer — the call site is outside this view; confirm.
const FETCH_TIMEOUT_MS = 5000;
// Upper bound on a fetched image body. loadImageBuffer compares both the
// advertised Content-Length and the actual downloaded byte length against it.
const MAX_IMAGE_BYTES = 10 * 1024 * 1024; // 10 MB
// True when the IPv4 address whose first two octets are (a, b) lies in a
// loopback, private, link-local, CGNAT, multicast or otherwise reserved range.
function isReservedIpv4(a: number, b: number): boolean {
  return (
    a === 0 || // 0.0.0.0/8 "this network"
    a === 10 || // 10.0.0.0/8 private
    a === 127 || // 127.0.0.0/8 loopback
    (a === 100 && b >= 64 && b <= 127) || // 100.64.0.0/10 carrier-grade NAT
    (a === 169 && b === 254) || // 169.254.0.0/16 link-local (cloud metadata)
    (a === 172 && b >= 16 && b <= 31) || // 172.16.0.0/12 private
    (a === 192 && b === 168) || // 192.168.0.0/16 private
    a >= 224 // 224.0.0.0/4 multicast + 240.0.0.0/4 reserved
  );
}

// Expand a bracket-less IPv6 literal (hex groups, at most one "::") into its
// eight 16-bit groups. Returns null when the text is not in that form.
function parseIpv6Groups(addr: string): number[] | null {
  const halves = addr.split("::");
  if (halves.length > 2) return null;
  const toGroups = (part: string): number[] =>
    part === ""
      ? []
      : part
          .split(":")
          .map((g) => (/^[0-9a-f]{1,4}$/i.test(g) ? parseInt(g, 16) : NaN));
  const head = toGroups(halves[0] ?? "");
  const tail = halves.length === 2 ? toGroups(halves[1] ?? "") : [];
  const fill = 8 - head.length - tail.length;
  // Without "::" all eight groups must be present; with it, it must stand for
  // at least one zero group.
  if (halves.length === 1 ? fill !== 0 : fill < 1) return null;
  const groups = [...head, ...new Array<number>(fill).fill(0), ...tail];
  return groups.some((g) => Number.isNaN(g)) ? null : groups;
}

// True when the eight-group IPv6 address is loopback/unspecified, embeds a
// reserved IPv4 address, or is unique-local, link-local or multicast.
function isReservedIpv6(groups: readonly number[]): boolean {
  const [g0 = 0, g1 = 0, , , , g5 = 0, g6 = 0, g7 = 0] = groups;
  const zeroFrom = (start: number, end: number): boolean =>
    groups.slice(start, end).every((g) => g === 0);
  const embeddedV4Reserved = isReservedIpv4(g6 >> 8, g6 & 0xff);

  // ::/96 (IPv4-compatible; also covers "::" and "::1", which decode to
  // 0.0.0.0 and 0.0.0.1) and ::ffff:0:0/96 (IPv4-mapped).
  if (zeroFrom(0, 5) && (g5 === 0 || g5 === 0xffff)) return embeddedV4Reserved;
  // 64:ff9b::/96 NAT64 — the low 32 bits are an IPv4 address.
  if (g0 === 0x64 && g1 === 0xff9b && zeroFrom(2, 6)) return embeddedV4Reserved;
  void g7; // low group only matters through the embedded-IPv4 decode above
  return (
    (g0 & 0xfe00) === 0xfc00 || // fc00::/7 unique local
    (g0 & 0xffc0) === 0xfe80 || // fe80::/10 link-local
    (g0 & 0xff00) === 0xff00 // ff00::/8 multicast
  );
}

// True when a URL hostname (as produced by the URL parser) names localhost or
// is an IP literal in a private/reserved range.
function isReservedHost(hostname: string): boolean {
  // A fully-qualified name may carry a trailing dot ("localhost.").
  const host = hostname.replace(/\.$/, "");
  if (host === "") return true; // fail closed on an empty host
  if (host === "localhost" || host.endsWith(".localhost")) return true;

  // The URL parser keeps IPv6 literals wrapped in brackets.
  if (host.startsWith("[") && host.endsWith("]")) {
    const groups = parseIpv6Groups(host.slice(1, -1));
    return groups === null || isReservedIpv6(groups);
  }

  // The URL parser normalises every numeric IPv4 spelling (decimal, hex,
  // octal, short forms) to dotted-quad, so one pattern is enough. Matching the
  // whole host avoids rejecting public DNS names such as "10.example.com".
  const v4 = /^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/.exec(host);
  if (v4) {
    const [a = 0, b = 0] = v4.slice(1).map(Number);
    return isReservedIpv4(a, b);
  }
  return false;
}

// Validate that an imageUrl is safe to fetch server-side (SSRF guard).
//
// - data: URIs are accepted as-is (they are decoded inline, no network).
// - Any other URL must parse and use https:.
// - The host must not be localhost or an IPv4/IPv6 literal in a loopback,
//   private, link-local (incl. the 169.254.169.254 metadata endpoint) or
//   otherwise reserved range.
//
// Throws Error when a rule is violated; `new URL` throws TypeError when the
// string is not a parseable URL.
//
// NOTE: only the literal host is inspected — no DNS lookup happens here, so a
// public-looking name that resolves to a private address (or an https
// redirect to one) is not caught by this check.
function assertSafeUrl(url: string): void {
  if (url.startsWith("data:")) return;
  const parsed = new URL(url);
  if (parsed.protocol !== "https:") {
    throw new Error(
      `prevImageUrl must use https: or data: protocol, got ${parsed.protocol}`,
    );
  }
  const host = parsed.hostname;
  if (isReservedHost(host)) {
    throw new Error(
      `prevImageUrl resolves to a private/reserved IP: ${host}`,
    );
  }
}
// Pull the bytes from an image URL or data URI into a Buffer suitable for
// sharp. Data URIs are decoded inline (no network); http(s) URLs are fetched
// sharp. Data URIs are decoded inline (no network); https: URLs are fetched
// with a short timeout — if Runware's CDN is slow we'd rather fail the vision
// step quickly than tie up a 60s Vercel function on a single image read.
async function loadImageBuffer(imageUrl: string): Promise<Buffer> {
assertSafeUrl(imageUrl);
if (imageUrl.startsWith("data:")) {
const comma = imageUrl.indexOf(",");
if (comma === -1) throw new Error("Malformed data URI in prevImageUrl");
@@ -23,7 +53,18 @@ async function loadImageBuffer(imageUrl: string): Promise<Buffer> {
`Failed to fetch prevImageUrl (${res.status}): ${imageUrl.slice(0, 120)}`,
);
}
const contentLength = res.headers.get("content-length");
if (contentLength && Number(contentLength) > MAX_IMAGE_BYTES) {
throw new Error(
`prevImageUrl response too large (${contentLength} bytes, max ${MAX_IMAGE_BYTES})`,
);
}
const arr = await res.arrayBuffer();
if (arr.byteLength > MAX_IMAGE_BYTES) {
throw new Error(
`prevImageUrl response too large (${arr.byteLength} bytes, max ${MAX_IMAGE_BYTES})`,
);
}
return Buffer.from(arr);
} finally {
clearTimeout(timer);
+4 -3
View File
@@ -6,8 +6,9 @@ import { jsonrepair, JSONRepairError } from "jsonrepair";
// 3. Slice between first { and last } and parse.
// 4. Apply targeted regex pre-repairs (see preRepair) and try jsonrepair.
//
// On final failure, logs the FULL raw model output so we can diagnose the
// actual syntax error.
// On final failure, logs the first 800 chars of the raw model output so we
// can diagnose the actual syntax error without flooding logs or leaking
// sensitive content.
//
// jsonrepair (npm package josdejong/jsonrepair — 2.3k+ stars) handles the
// broad LLM-output failure modes: truncated JSON, missing commas/brackets,
@@ -86,7 +87,7 @@ export function parseJsonLoose<T>(raw: string): T {
} catch (err) {
const isRepairErr = err instanceof JSONRepairError;
console.error(
`[parseJsonLoose] jsonrepair ${isRepairErr ? "could not repair" : "succeeded but JSON.parse rejected its output"}. Full raw model output:\n${raw}`,
`[parseJsonLoose] jsonrepair ${isRepairErr ? "could not repair" : "succeeded but JSON.parse rejected its output"}. Raw output (first 800 chars):\n${raw.slice(0, 800)}`,
);
throw err;
}