fix: address Copilot review — SSRF validation + log truncation

- annotate.ts: add assertSafeUrl() to reject URLs that are neither https: nor data:, and https: URLs whose host is a private/reserved IP (SSRF prevention); cap response body to 10 MB
- jsonParser.ts: truncate raw model output in error log to first 800 chars to avoid flooding logs / leaking sensitive content
2026-06-01 16:29:08 +08:00
parent addbede929
commit 42a09c42f8
2 changed files with 46 additions and 4 deletions
@@ -1,12 +1,42 @@
 import sharp from "sharp";

 const FETCH_TIMEOUT_MS = 5000;
+const MAX_IMAGE_BYTES = 10 * 1024 * 1024; // 10 MB
+
+// Validate that an imageUrl is safe to fetch server-side.
+// Only https: and data: URIs are allowed; http: is rejected, and https: hosts
+// that are loopback, private-range, or the cloud metadata address are refused
+// to prevent SSRF.
+// Reject image URLs that are unsafe to fetch server-side.
+//
+// Accepts data: URIs unconditionally (they involve no network access) and
+// https: URLs whose host is not a literal loopback, private, link-local or
+// unspecified address. Everything else throws.
+//
+// This is a check on the URL text only: no DNS lookup is performed, so a
+// public hostname that points at an internal address is not detected here.
+//
+// @param url  Candidate image URL or data URI.
+// @throws TypeError (from `new URL`) if `url` is not a parseable absolute URL.
+// @throws Error if the protocol is not https:, or the host is blocked.
+function assertSafeUrl(url: string): void {
+  if (url.startsWith("data:")) return;
+  const parsed = new URL(url);
+  if (parsed.protocol !== "https:") {
+    throw new Error(
+      `prevImageUrl must use https: or data: protocol, got ${parsed.protocol}`,
+    );
+  }
+  // The URL parser lowercases the host and rewrites numeric IPv4 spellings
+  // (decimal, hex, short forms) to dotted-quad, so matching the parsed
+  // hostname covers those variants. Drop a trailing root dot ("localhost.").
+  const host = parsed.hostname.replace(/\.$/, "");
+  const quad = /^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/.exec(host);
+  let blocked: boolean;
+  if (quad) {
+    // Compare numerically so whole ranges are covered, and so DNS names that
+    // merely begin with "10." or "192.168." are not rejected by accident.
+    const a = Number(quad[1]);
+    const b = Number(quad[2]);
+    blocked =
+      a === 0 || // 0.0.0.0/8 "this network"
+      a === 10 || // 10.0.0.0/8 private
+      a === 127 || // 127.0.0.0/8 loopback
+      (a === 169 && b === 254) || // 169.254.0.0/16 link-local, incl. metadata
+      (a === 172 && b >= 16 && b <= 31) || // 172.16.0.0/12 private
+      (a === 192 && b === 168); // 192.168.0.0/16 private
+  } else if (host.startsWith("[")) {
+    // IPv6 literals keep their brackets in `hostname`.
+    const v6 = host.slice(1, -1);
+    blocked =
+      v6 === "::" || // unspecified
+      v6 === "::1" || // loopback
+      v6.startsWith("::ffff:") || // IPv4-mapped; refused wholesale
+      /^f[cd][0-9a-f]{2}:/.test(v6) || // fc00::/7 unique local
+      /^fe[89ab][0-9a-f]:/.test(v6); // fe80::/10 link-local
+  } else {
+    blocked = host === "localhost" || host.endsWith(".localhost");
+  }
+  if (blocked) {
+    throw new Error(
+      `prevImageUrl resolves to a private/reserved IP: ${host}`,
+    );
+  }
+}

 // Pull the bytes from an image URL or data URI into a Buffer suitable for
-// sharp. Data URIs are decoded inline (no network); http(s) URLs are fetched
+// sharp. Data URIs are decoded inline (no network); https: URLs are fetched
 // with a short timeout — if Runware's CDN is slow we'd rather fail the vision
 // step quickly than tie up a 60s Vercel function on a single image read.
 async function loadImageBuffer(imageUrl: string): Promise<Buffer> {
+  assertSafeUrl(imageUrl);
+
  if (imageUrl.startsWith("data:")) {
    const comma = imageUrl.indexOf(",");
    if (comma === -1) throw new Error("Malformed data URI in prevImageUrl");
@@ -23,7 +53,18 @@ async function loadImageBuffer(imageUrl: string): Promise<Buffer> {
        `Failed to fetch prevImageUrl (${res.status}): ${imageUrl.slice(0, 120)}`,
      );
    }
+    const contentLength = res.headers.get("content-length");
+    if (contentLength && Number(contentLength) > MAX_IMAGE_BYTES) {
+      throw new Error(
+        `prevImageUrl response too large (${contentLength} bytes, max ${MAX_IMAGE_BYTES})`,
+      );
+    }
    const arr = await res.arrayBuffer();
+    if (arr.byteLength > MAX_IMAGE_BYTES) {
+      throw new Error(
+        `prevImageUrl response too large (${arr.byteLength} bytes, max ${MAX_IMAGE_BYTES})`,
+      );
+    }
    return Buffer.from(arr);
  } finally {
    clearTimeout(timer);
@@ -6,8 +6,9 @@ import { jsonrepair, JSONRepairError } from "jsonrepair";
 //   3. Slice between first { and last } and parse.
 //   4. Apply targeted regex pre-repairs (see preRepair) and try jsonrepair.
 //
-// On final failure, logs the FULL raw model output so we can diagnose the
-// actual syntax error.
+// On final failure, logs the first 800 chars of the raw model output so we
+// can diagnose the actual syntax error without flooding logs or leaking
+// sensitive content.
 //
 // jsonrepair (npm package josdejong/jsonrepair — 2.3k+ stars) handles the
 // broad LLM-output failure modes: truncated JSON, missing commas/brackets,
@@ -86,7 +87,7 @@ export function parseJsonLoose<T>(raw: string): T {
    } catch (err) {
      const isRepairErr = err instanceof JSONRepairError;
      console.error(
-        `[parseJsonLoose] jsonrepair ${isRepairErr ? "could not repair" : "succeeded but JSON.parse rejected its output"}. Full raw model output:\n${raw}`,
+        `[parseJsonLoose] jsonrepair ${isRepairErr ? "could not repair" : "succeeded but JSON.parse rejected its output"}. Raw output (first 800 chars):\n${raw.slice(0, 800)}`,
      );
      throw err;
    }