From addbede9291cac2cdd6636cfa2d62352a007b337 Mon Sep 17 00:00:00 2001
From: yuanzonghao <yuanzonghao123@gmail.com>
Date: Mon, 1 Jun 2026 16:04:13 +0800
Subject: [PATCH] =?UTF-8?q?feat:=20Vercel=20Hobby=20deploy=20readiness=20?=
 =?UTF-8?q?=E2=80=94=20image=20URLs,=20jsonrepair,=20DeepSeek?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Move vercel.json to apps/web/ with correct route paths; cap scene route
  maxDuration 120→60s for Hobby. Root vercel.json removed. Vercel project's
  Root Directory must be set to apps/web (Deploy button URL passes this).
- Switch image transport from base64-in-JSON to Runware-hosted URLs:
  generateImage now uses outputType=URL and returns {imageUrl, imageUuid};
  StartResponse/SceneResponse carry imageUrl; VisionRequest carries
  prevImageUrl (server re-fetches the bytes for click annotation). This
  eliminates the 4.5MB serverless body-size risk.
- Painter and director prefer URL over UUID for referenceImages — the UUID
  returned by Runware imageInference isn't always recognized in the refs
  pipeline (surfaces as `failedToTransferImage`).
- Client preloads scene images via `new Image().decode()` before committing
  to React state, so URL transitions render instantly; prefetched scenes
  also warm the HTTP cache.
- jsonParser uses the jsonrepair package (replaces hand-rolled repair) and
  adds a targeted preRepair regex for the missing-key-close-quote pattern
  that jsonrepair couldn't disambiguate. Full raw model output dumped on
  failure for diagnostic visibility.
- Default text provider switched to DeepSeek v4-flash via direct API
  (significantly more stable JSON than MiMo v2.5-pro). VISION/TTS stay on
  MiMo (DeepSeek has no multimodal / TTS offerings).
- next.config: drop dead experimental.serverActions.bodySizeLimit (no
  server actions used).
- README: real Deploy button URL (zonghaoyuan/yume + root-directory=apps/web
  + TTS/MOCK_IMAGE in env list); refreshed env vars table with optional
  TTS section.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 README.md                                     |  32 +++--
 apps/web/.env.example                         |  22 ++--
 apps/web/app/api/scene/route.ts               |   6 +-
 apps/web/app/api/vision/route.ts              |   4 +-
 apps/web/app/play/page.tsx                    |  71 ++++++++--
 apps/web/components/PlayCanvas.tsx            |  14 +-
 apps/web/next.config.ts                       |   5 -
 apps/web/vercel.json                          |  11 ++
 packages/ai-client/src/image.ts               | 122 ++++++------------
 packages/ai-client/src/index.ts               |   4 +-
 packages/engine/package.json                  |   1 +
 .../engine/src/agents/characterDesigner.ts    |  70 ++++------
 packages/engine/src/agents/painter.ts         |  40 ++++--
 packages/engine/src/annotate.ts               |  38 +++++-
 packages/engine/src/director.ts               |  68 +++-------
 packages/engine/src/jsonParser.ts             | 115 +++++++++--------
 packages/engine/src/mockImage.ts              |  14 +-
 packages/engine/src/orchestrator.ts           |  10 +-
 packages/types/src/index.ts                   |  50 ++++---
 pnpm-lock.yaml                                |   9 ++
 vercel.json                                   |  11 --
 21 files changed, 392 insertions(+), 325 deletions(-)
 create mode 100644 apps/web/vercel.json
 delete mode 100644 vercel.json

diff --git a/README.md b/README.md
index 5b8bc23..6f65317 100644
--- a/README.md
+++ b/README.md
@@ -37,21 +37,28 @@ There is no traditional game UI baked into the art. The AI paints the world in w
 
 ## One-click deploy
 
-[![Deploy with Vercel](https://vercel.com/button)](https://vercel.com/new/clone?repository-url=https://github.com/YOUR_USERNAME/yume&env=TEXT_BASE_URL,TEXT_API_KEY,TEXT_MODEL,IMAGE_BASE_URL,IMAGE_API_KEY,IMAGE_MODEL,VISION_BASE_URL,VISION_API_KEY,VISION_MODEL&envDescription=Three%20independently%20configurable%20providers.%20Any%20OpenAI-compatible%20endpoint%20works.&envLink=https://github.com/YOUR_USERNAME/yume%23environment-variables)
+[![Deploy with Vercel](https://vercel.com/button)](https://vercel.com/new/clone?repository-url=https://github.com/zonghaoyuan/yume&root-directory=apps/web&env=TEXT_BASE_URL,TEXT_API_KEY,TEXT_MODEL,IMAGE_BASE_URL,IMAGE_API_KEY,IMAGE_MODEL,VISION_BASE_URL,VISION_API_KEY,VISION_MODEL,TTS_BASE_URL,TTS_API_KEY,TTS_SPEECH_MODEL,MOCK_IMAGE&envDescription=Three%20required%20providers%20%2B%20optional%20TTS.%20Any%20OpenAI-compatible%20endpoint%20works%20for%20text%2Fvision%2Ftts.&envLink=https://github.com/zonghaoyuan/yume%23environment-variables)
 
-After deploy, set the nine environment variables (see below) in your Vercel project. That's it.
+After deploy, set the environment variables (see below) in your Vercel project. Nine are required; TTS is optional (leave blank to run silently); `MOCK_IMAGE=true` skips image generation for cheap TTS-only testing. The Vercel project's **Root Directory** must be set to `apps/web` (the deploy button passes this; if you configure manually, set it in Project Settings).
 
 ---
 
 ## Environment variables
 
-Three providers, all independently configurable. Text and Vision accept any OpenAI-compatible endpoint (OpenAI, Anthropic via OpenAI-compat proxy, Gemini, OpenRouter, DeepSeek, local Ollama, …). Image goes to **Runware** (its own task-array protocol, not OpenAI-compatible).
+Three required providers + optional TTS. Text, Vision, and TTS accept any OpenAI-compatible endpoint (OpenAI, Anthropic via OpenAI-compat proxy, Gemini, OpenRouter, DeepSeek, local Ollama, …). Image goes to **Runware** (its own task-array protocol, not OpenAI-compatible).
 
-| Provider | Variables | Recommended |
-|---|---|---|
-| Text · story director | `TEXT_BASE_URL` `TEXT_API_KEY` `TEXT_MODEL` | `claude-opus-4-7` via Anthropic |
-| Image · UI renderer   | `IMAGE_BASE_URL` `IMAGE_API_KEY` `IMAGE_MODEL` | `runware:400@6` (FLUX.2 [klein] 9B KV) via [Runware](https://runware.ai) |
-| Vision · click reader | `VISION_BASE_URL` `VISION_API_KEY` `VISION_MODEL` | `gemini-3-flash` via Google |
+| Provider | Variables | Required? | Recommended |
+|---|---|---|---|
+| Text · story director  | `TEXT_BASE_URL` `TEXT_API_KEY` `TEXT_MODEL`        | ✅ | `claude-opus-4-7` via Anthropic |
+| Image · UI renderer    | `IMAGE_BASE_URL` `IMAGE_API_KEY` `IMAGE_MODEL`     | ✅ | `runware:400@6` (FLUX.2 [klein] 9B KV) via [Runware](https://runware.ai) |
+| Vision · click reader  | `VISION_BASE_URL` `VISION_API_KEY` `VISION_MODEL`  | ✅ | `gemini-3-flash` via Google |
+| TTS · per-character voice | `TTS_BASE_URL` `TTS_API_KEY` `TTS_SPEECH_MODEL` | optional — leave blank to run silently | `mimo-v2.5-tts` via Xiaomi MiMo |
+
+There's also a flag for cheap testing:
+
+| Variable | Effect |
+|---|---|
+| `MOCK_IMAGE=true` | Skip image generation; the renderer returns a static placeholder. Story, voice, and choices still run normally. Great for iterating on TTS without burning Runware credits. |
 
 See `apps/web/.env.example` for the exact shape.
 
@@ -64,7 +71,7 @@ Requires Node 20+ and pnpm 9+.
 ```bash
 pnpm install
 cp apps/web/.env.example apps/web/.env.local
-# fill in the nine env vars
+# fill in env vars (9 required + optional TTS/MOCK_IMAGE)
 pnpm dev
 # open http://localhost:3000
 ```
@@ -75,11 +82,12 @@ pnpm dev
 
 ```
 yume/
-├── apps/web/              Next.js 16 app — pages + API routes
+├── apps/web/              Next.js 16 app — pages + API routes (Vercel root)
 └── packages/
     ├── types/             shared TypeScript types
-    ├── ai-client/         unified OpenAI-compatible clients
-    └── engine/            three-stage AI orchestration (open core)
+    ├── ai-client/         unified OpenAI-compatible clients + Runware adapter
+    ├── tts-client/        Xiaomi MiMo TTS adapter
+    └── engine/            multi-agent AI orchestration (open core)
 ```
 
 `packages/engine` is the open core — pure TS, no Next.js or browser dependency. Import it directly to build your own visual-novel front-end (Tauri, Electron, CLI, anywhere).
diff --git a/apps/web/.env.example b/apps/web/.env.example
index 20b7700..6cff139 100644
--- a/apps/web/.env.example
+++ b/apps/web/.env.example
@@ -12,12 +12,18 @@
 # =============================================================
 
 # ---- 1. Text LLM · scene director ----------------------------------
-# Recommended: MiMo V2.5 Pro (1M context, native JSON-mode, strong CN)
-# Token Plan host: https://token-plan-sgp.xiaomimimo.com/v1
-# Pay-as-you-go host: https://api.xiaomimimo.com/v1 (sk- keys)
-TEXT_BASE_URL=https://token-plan-sgp.xiaomimimo.com/v1
-TEXT_API_KEY=tp-xxx
-TEXT_MODEL=mimo-v2.5-pro
+# Any OpenAI-compatible endpoint works: OpenAI, Anthropic (via proxy),
+# Gemini, OpenRouter, DeepSeek, OpenCode, MiMo, local Ollama, …
+# Recommended starters:
+#   A. DeepSeek v4-flash direct (https://api.deepseek.com/v1) — pay-as-you-go,
+#      fastest first-token latency, very stable JSON output.
+#   B. OpenCode Go (https://opencode.ai/zen/go/v1) — $10/mo flat-rate bundle of
+#      12 open-source models (DeepSeek v4-flash, Qwen, Kimi, GLM, MiMo, …).
+#      Cheaper at high volume, slower at the tail.
+#   C. MiMo v2.5 via Xiaomi Token Plan — bundles VISION + TTS in one tp- key.
+TEXT_BASE_URL=https://api.deepseek.com/v1
+TEXT_API_KEY=sk-xxx
+TEXT_MODEL=deepseek-v4-flash
 
 # ---- 2. Image generator (renders the scene background) -------------
 # Recommended: Runware + FLUX.2 [klein] 9B KV — distilled 4-step model,
@@ -30,9 +36,7 @@ IMAGE_API_KEY=runware-xxx
 IMAGE_MODEL=runware:400@6
 
 # ---- 3. Vision model · multimodal click interpretation -------------
-# Recommended: MiMo V2.5 omni — multimodal.
-# ⚠️ DO NOT use mimo-v2.5-pro for this slot — Pro is text-only and
-#    rejects image_url content parts.
+# Recommended: MiMo V2.5 — multimodal, accepts image_url content parts.
 VISION_BASE_URL=https://token-plan-sgp.xiaomimimo.com/v1
 VISION_API_KEY=tp-xxx
 VISION_MODEL=mimo-v2.5
diff --git a/apps/web/app/api/scene/route.ts b/apps/web/app/api/scene/route.ts
index bcec19b..641e173 100644
--- a/apps/web/app/api/scene/route.ts
+++ b/apps/web/app/api/scene/route.ts
@@ -4,7 +4,11 @@ import { NextResponse } from "next/server";
 import { loadEngineConfig } from "@/lib/config";
 
 export const runtime = "nodejs";
-export const maxDuration = 120;
+// Capped at 60 for Vercel Hobby (300 allowed on Pro). The scene pipeline is
+// Writer + CharDesigner×N + Cinematographer + Painter — happy path 9–12s; the
+// tail (cold provider, multiple new characters) can push 30–45s, so 60 still
+// leaves reasonable headroom on Hobby.
+export const maxDuration = 60;
 
 export async function POST(req: Request) {
   let body: SceneRequest;
diff --git a/apps/web/app/api/vision/route.ts b/apps/web/app/api/vision/route.ts
index 81d0487..d093209 100644
--- a/apps/web/app/api/vision/route.ts
+++ b/apps/web/app/api/vision/route.ts
@@ -14,9 +14,9 @@ export async function POST(req: Request) {
     return NextResponse.json({ error: "Invalid JSON" }, { status: 400 });
   }
 
-  if (!body.session || !body.prevImageBase64 || !body.click) {
+  if (!body.session || !body.prevImageUrl || !body.click) {
     return NextResponse.json(
-      { error: "session, prevImageBase64, click are required" },
+      { error: "session, prevImageUrl, click are required" },
       { status: 400 },
     );
   }
diff --git a/apps/web/app/play/page.tsx b/apps/web/app/play/page.tsx
index b71f21e..88b0308 100644
--- a/apps/web/app/play/page.tsx
+++ b/apps/web/app/play/page.tsx
@@ -28,6 +28,42 @@ import type {
 
 const MUTED_STORAGE_KEY = "yume:muted";
 
+// Cap how long we wait for the browser to download + decode a scene image
+// before giving up and rendering anyway. Runware's CDN is normally <2s for a
+// 1792×1024 PNG; tolerate up to 8s before the typewriter starts so a slow
+// download can't strand the player on a blank screen forever.
+const IMAGE_PRELOAD_TIMEOUT_MS = 8000;
+
+// ──────────────────────────────────────────────────────────────────────
+//  Image preload — decode the Runware URL in memory before committing to
+//  React state, so when the <img> mounts, the browser cache is warm and
+//  rendering is instant. Without this the user sees a blank canvas during
+//  the Runware-CDN download (~1-3s) after /api/scene returns.
+//
+//  Data URIs (MOCK_IMAGE mode) and prefetched-then-cached real URLs both
+//  resolve near-instantly. Errors and timeouts resolve quietly — better
+//  to render a broken image than to hang the play loop indefinitely.
+// ──────────────────────────────────────────────────────────────────────
+
+function preloadImage(url: string): Promise<void> {
+  return new Promise<void>((resolve) => {
+    const img = new Image();
+    const done = () => resolve();
+    const timer = setTimeout(done, IMAGE_PRELOAD_TIMEOUT_MS);
+    img.onload = () => {
+      clearTimeout(timer);
+      // .decode() forces the bitmap to be fully decoded before we proceed —
+      // without it, a slow decode could still cause a flash on first paint.
+      img.decode().then(done, done);
+    };
+    img.onerror = () => {
+      clearTimeout(timer);
+      done();
+    };
+    img.src = url;
+  });
+}
+
 // ──────────────────────────────────────────────────────────────────────
 //  Prefetch pool — speculative SceneResponses keyed by choice path.
 //
@@ -123,6 +159,12 @@ function prefetchScenePath(
     }
     const data = (await res.json()) as SceneResponse;
 
+    // Warm the browser's HTTP + image-decode cache for this URL so when the
+    // player eventually picks this choice and we render the <img>, it's
+    // instant. Don't await — let the bytes stream in the background; the
+    // transition path will await its own preloadImage() before committing.
+    void preloadImage(data.imageUrl);
+
     // Recursive: if the resulting scene has exactly one change-scene exit,
     // it is a must-pass node — prefetch its child too.
     if (depth + 1 < PREFETCH_MAX_DEPTH) {
@@ -193,7 +235,7 @@ function PlayInner() {
   const [session, setSession] = useState<Session | null>(null);
   const [currentScene, setCurrentScene] = useState<Scene | null>(null);
   const [currentBeatId, setCurrentBeatId] = useState<string | null>(null);
-  const [imageBase64, setImageBase64] = useState<string | null>(null);
+  const [imageUrl, setImageUrl] = useState<string | null>(null);
   const [beatAudioMap, setBeatAudioMap] = useState<Record<string, BeatAudio>>({});
   // Lazy-initialize from localStorage so PlayCanvas never mounts with the
   // wrong muted value (an effect-based read would briefly let audio play
@@ -434,7 +476,12 @@ function PlayInner() {
         }
         return (await r.json()) as StartResponse;
       })
-      .then((data) => {
+      .then(async (data) => {
+        // Decode the Runware image in memory before committing to state, so
+        // the <img> renders instantly when it mounts (same rationale as the
+        // performSceneTransition path).
+        await preloadImage(data.imageUrl);
+
         const initial: Session = {
           id: data.sessionId,
           createdAt: Date.now(),
@@ -452,7 +499,7 @@ function PlayInner() {
         setSession(initial);
         setCurrentScene(data.scene);
         setCurrentBeatId(data.scene.entryBeatId);
-        setImageBase64(data.imageBase64);
+        setImageUrl(data.imageUrl);
         // beatAudioMap is populated lazily by the per-beat fetch effect once
         // currentScene becomes non-null (see fetchBeatAudio).
         setPhase("ready");
@@ -520,6 +567,14 @@ function PlayInner() {
       const base = sessionRef.current;
       if (!base) throw new Error("Session lost mid-transition");
 
+      // Wait for the browser to download + decode the Runware-hosted image
+      // BEFORE committing it to state, so the <img> renders instantly when it
+      // mounts. For prefetched scenes the preloadImage call inside
+      // prefetchScenePath has already warmed the cache, so this resolves
+      // almost immediately. For cold transitions we accept an extra ~1-3s of
+      // "transitioning" overlay to avoid an image-pop-in-from-blank flash.
+      await preloadImage(result.imageUrl);
+
       const closedHistory = base.history.map((h, i, arr) =>
         i === arr.length - 1
           ? { ...h, visitedBeatIds: visitedForCurrent, exit }
@@ -540,7 +595,7 @@ function PlayInner() {
       setSession(newSession);
       setCurrentScene(result.scene);
       setCurrentBeatId(result.scene.entryBeatId);
-      setImageBase64(result.imageBase64);
+      setImageUrl(result.imageUrl);
       // beatAudioMap reset + per-beat fetches kicked off by the scene effect.
       setLastExitLabel(exitLabel);
       setPhase("ready");
@@ -607,7 +662,7 @@ function PlayInner() {
   }
 
   async function onBackgroundClick(click: { x: number; y: number }) {
-    if (phase !== "ready" || !session || !currentScene || !imageBase64) return;
+    if (phase !== "ready" || !session || !currentScene || !imageUrl) return;
     setPhase("vision-thinking");
     setPendingClick(click);
 
@@ -615,7 +670,7 @@ function PlayInner() {
       const visionRes = await fetch("/api/vision", {
         method: "POST",
         headers: { "Content-Type": "application/json" },
-        body: JSON.stringify({ session, prevImageBase64: imageBase64, click }),
+        body: JSON.stringify({ session, prevImageUrl: imageUrl, click }),
       });
       if (!visionRes.ok) {
         const j = (await visionRes.json().catch(() => ({}))) as {
@@ -763,7 +818,7 @@ function PlayInner() {
     return (
       <div className="fixed inset-0 bg-black flex items-center justify-center z-50">
         <PlayCanvas
-          imageBase64={imageBase64}
+          imageUrl={imageUrl}
           audioBase64={audioBase64}
           audioMime={audioMime}
           muted={muted}
@@ -805,7 +860,7 @@ function PlayInner() {
 
       <main className="flex-1 flex flex-col items-center justify-center px-4 md:px-8 py-6 md:py-10">
         <PlayCanvas
-          imageBase64={imageBase64}
+          imageUrl={imageUrl}
           audioBase64={audioBase64}
           audioMime={audioMime}
           muted={muted}
diff --git a/apps/web/components/PlayCanvas.tsx b/apps/web/components/PlayCanvas.tsx
index 9e77dde..0c4b185 100644
--- a/apps/web/components/PlayCanvas.tsx
+++ b/apps/web/components/PlayCanvas.tsx
@@ -159,7 +159,7 @@ function ChoiceButton({
 
 // ── Main component ─────────────────────────────────────────────────────
 export function PlayCanvas({
-  imageBase64,
+  imageUrl,
   audioBase64,
   audioMime,
   muted,
@@ -171,7 +171,7 @@ export function PlayCanvas({
   onSelectChoice,
   fullViewport = false,
 }: {
-  imageBase64: string | null;
+  imageUrl: string | null;
   audioBase64: string | null;
   audioMime: string | null;
   muted: boolean;
@@ -271,7 +271,7 @@ export function PlayCanvas({
     });
   }
 
-  const interactive = phase === "ready" && !!imageBase64;
+  const interactive = phase === "ready" && !!imageUrl;
   const dimmed = phase === "transitioning";
 
   const sizeStyle = fullViewport
@@ -306,16 +306,16 @@ export function PlayCanvas({
         />
       )}
 
-      {imageBase64 ? (
+      {imageUrl ? (
         <div
           className="relative inline-block"
           style={{ boxShadow: fullViewport ? "none" : SHADOW }}
         >
-          {/* Background image */}
+          {/* Background image — Runware CDN URL or data URI (mock mode) */}
           <img
-            key={imageBase64.slice(-48)}
+            key={imageUrl.slice(-48)}
             ref={imgRef}
-            src={`data:image/png;base64,${imageBase64}`}
+            src={imageUrl}
             alt="Generated scene"
             onClick={handleImageClick}
             onLoad={(e) => {
diff --git a/apps/web/next.config.ts b/apps/web/next.config.ts
index f13a9bd..f178b6a 100644
--- a/apps/web/next.config.ts
+++ b/apps/web/next.config.ts
@@ -14,11 +14,6 @@ const config: NextConfig = {
   turbopack: {
     root: path.join(__dirname, "..", ".."),
   },
-  experimental: {
-    serverActions: {
-      bodySizeLimit: "10mb",
-    },
-  },
 };
 
 export default config;
diff --git a/apps/web/vercel.json b/apps/web/vercel.json
new file mode 100644
index 0000000..4ccec08
--- /dev/null
+++ b/apps/web/vercel.json
@@ -0,0 +1,11 @@
+{
+  "$schema": "https://openapi.vercel.sh/vercel.json",
+  "framework": "nextjs",
+  "functions": {
+    "app/api/start/route.ts":       { "maxDuration": 60 },
+    "app/api/scene/route.ts":       { "maxDuration": 60 },
+    "app/api/vision/route.ts":      { "maxDuration": 60 },
+    "app/api/insert-beat/route.ts": { "maxDuration": 60 },
+    "app/api/beat-audio/route.ts":  { "maxDuration": 30 }
+  }
+}
diff --git a/packages/ai-client/src/image.ts b/packages/ai-client/src/image.ts
index 4710e16..557a4a3 100644
--- a/packages/ai-client/src/image.ts
+++ b/packages/ai-client/src/image.ts
@@ -4,21 +4,23 @@ import { fetchWithRetry } from "./fetchWithRetry";
 // Runware uses its own task-array protocol (not OpenAI-compatible).
 // POST <baseUrl> with [{ taskType: "imageInference", ... }]; errors come
 // back as a 200 with `errors[]`, so we have to inspect the body either way.
-
-// FLUX img2img specifics:
-// - strength < 0.8 has minimal-to-no visible effect on FLUX models (per
-//   Runware docs); we default to 0.85 which leaves room to deviate while
-//   still anchoring on the seed image's composition.
-// - referenceImages caps at 4 per request; the FLUX.2 [klein] 9B KV model
-//   (runware:400@6) accelerates multi-reference inference by ~2.5× via its
-//   KV cache for reference latents (cached only WITHIN one inference run —
-//   not persisted across API calls, hence the upload-once-then-reference
-//   strategy below).
+//
+// referenceImages accepts UUIDs, public URLs, or base64. UUID is cheapest
+// in transport cost; URL is next; base64 last resort. The FLUX.2 [klein] 9B
+// KV variant (runware:400@6) accelerates multi-reference inference ~2.5× via
+// its KV cache for reference latents (cached only within one inference run,
+// not persisted across calls — hence the need to keep stable UUIDs/URLs for
+// later reuse).
+//
+// We request outputType=URL so Runware persists the image and returns a CDN
+// link the client can render directly. The same response also carries the
+// image UUID, so we never need a separate uploadImage round-trip to anchor
+// future referenceImages.
 const DEFAULT_IMG2IMG_STRENGTH = 0.85;
 const MAX_REFERENCE_IMAGES = 4;
 
 type RunwareImageResult = {
-  imageBase64Data?: string;
+  imageURL?: string;
   imageUUID?: string;
 };
 type RunwareError = {
@@ -33,32 +35,40 @@ type RunwareResponse = {
 
 export type GenerateImageOptions = {
   /**
-   * Reference image (UUID, plain base64, or data URI) to use as the
-   * img2img starting point. When set, FLUX preserves the seed image's
-   * composition and applies `strength` to allow deviation from it.
-   * Used for cross-scene visual continuity when sceneKey hits.
+   * Reference image (UUID, public URL, or base64) for img2img. When set,
+   * FLUX preserves the seed image's composition and applies `strength` to
+   * deviate. NOTE: FLUX.2 [klein] 9B KV does NOT support seedImage — use
+   * `referenceImages` for visual continuity instead.
    */
   seedImage?: string;
   /**
-   * Reference images (UUIDs or base64) to condition the generation on —
-   * typically character portraits to anchor identity / outfit / style
-   * across scenes. Runware caps at 4; we silently truncate beyond that.
+   * Reference images (UUIDs, URLs, or base64) to condition generation on —
+   * typically character portraits + the prior scene image. Runware caps at 4;
+   * we silently truncate beyond that.
    */
   referenceImages?: string[];
   /** 0–1, FLUX needs ≥ 0.8 to actually have an effect. */
   strength?: number;
 };
 
+export type GenerateImageResult = {
+  /** Public CDN URL of the generated image (Runware-hosted). */
+  imageUrl: string;
+  /** Stable UUID for cheap re-reference in later `referenceImages`. */
+  imageUuid: string;
+};
+
 // ──────────────────────────────────────────────────────────────────────
-//  generateImage — text-to-image (default) or img2img / multi-reference
-//  when seedImage / referenceImages are supplied. Returns base64.
+//  generateImage — text-to-image (default) or referenceImages-conditioned.
+//  Returns both the public URL (for client display + future references)
+//  and the UUID (cheapest reference form for subsequent calls).
 // ──────────────────────────────────────────────────────────────────────
 
 export async function generateImage(
   config: ProviderConfig,
   prompt: string,
   options?: GenerateImageOptions,
-): Promise<string> {
+): Promise<GenerateImageResult> {
   const url = config.baseUrl.replace(/\/$/, "");
 
   const task: Record<string, unknown> = {
@@ -71,8 +81,9 @@ export async function generateImage(
     steps: 4,
     CFGScale: 3.5,
     numberResults: 1,
-    outputType: "base64Data",
+    outputType: "URL",
     outputFormat: "PNG",
+    includeCost: false,
   };
 
   if (options?.seedImage) {
@@ -109,66 +120,11 @@ export async function generateImage(
     );
   }
 
-  const b64 = json.data?.[0]?.imageBase64Data;
-  if (!b64) {
-    throw new Error(`No image in Runware response: ${text.slice(0, 300)}`);
+  const result = json.data?.[0];
+  const imageUrl = result?.imageURL;
+  const imageUuid = result?.imageUUID;
+  if (!imageUrl || !imageUuid) {
+    throw new Error(`No image URL/UUID in Runware response: ${text.slice(0, 300)}`);
   }
-  return b64;
-}
-
-// ──────────────────────────────────────────────────────────────────────
-//  uploadImage — registers a base64 image on Runware and returns its
-//  UUID, so subsequent generateImage calls can pass the UUID in
-//  referenceImages / seedImage instead of resending the base64 payload
-//  every time. Character base portraits and scene snapshots both flow
-//  through this path.
-//
-//  Runware exposes the imageUpload taskType for exactly this purpose.
-//  Returns the UUID. Caller treats a thrown error as "fall back to
-//  sending base64 next time" — non-fatal.
-// ──────────────────────────────────────────────────────────────────────
-
-export async function uploadImage(
-  config: ProviderConfig,
-  base64: string,
-): Promise<string> {
-  const url = config.baseUrl.replace(/\/$/, "");
-
-  const body = [
-    {
-      taskType: "imageUpload",
-      taskUUID: crypto.randomUUID(),
-      image: `data:image/png;base64,${base64}`,
-    },
-  ];
-
-  const res = await fetchWithRetry(url, {
-    method: "POST",
-    headers: {
-      "Content-Type": "application/json",
-      Authorization: `Bearer ${config.apiKey}`,
-    },
-    body: JSON.stringify(body),
-  });
-
-  const text = await res.text();
-  let json: RunwareResponse;
-  try {
-    json = JSON.parse(text) as RunwareResponse;
-  } catch {
-    throw new Error(`Image upload API error ${res.status}: ${text.slice(0, 500)}`);
-  }
-
-  if (json.errors?.length) {
-    const e = json.errors[0]!;
-    throw new Error(
-      `Runware upload error [${e.code ?? "unknown"}]: ${e.message ?? "no message"}`,
-    );
-  }
-
-  const uuid = json.data?.[0]?.imageUUID;
-  if (!uuid) {
-    throw new Error(`No UUID in upload response: ${text.slice(0, 300)}`);
-  }
-  return uuid;
+  return { imageUrl, imageUuid };
 }
diff --git a/packages/ai-client/src/index.ts b/packages/ai-client/src/index.ts
index 13fa290..0153e48 100644
--- a/packages/ai-client/src/index.ts
+++ b/packages/ai-client/src/index.ts
@@ -1,5 +1,5 @@
 export { chat } from "./chat";
-export { generateImage, uploadImage } from "./image";
-export type { GenerateImageOptions } from "./image";
+export { generateImage } from "./image";
+export type { GenerateImageOptions, GenerateImageResult } from "./image";
 export { interpretClick } from "./vision";
 export type { ChatMessage } from "./chat";
diff --git a/packages/engine/package.json b/packages/engine/package.json
index 1b51280..0ed11ab 100644
--- a/packages/engine/package.json
+++ b/packages/engine/package.json
@@ -15,6 +15,7 @@
     "@yume/ai-client": "workspace:*",
     "@yume/tts-client": "workspace:*",
     "@yume/types": "workspace:*",
+    "jsonrepair": "^3.14.0",
     "sharp": "^0.33.5"
   }
 }
diff --git a/packages/engine/src/agents/characterDesigner.ts b/packages/engine/src/agents/characterDesigner.ts
index ae5f505..81dc4be 100644
--- a/packages/engine/src/agents/characterDesigner.ts
+++ b/packages/engine/src/agents/characterDesigner.ts
@@ -1,4 +1,4 @@
-import { chat, generateImage, uploadImage } from "@yume/ai-client";
+import { chat, generateImage } from "@yume/ai-client";
 import { provisionVoice } from "@yume/tts-client";
 import type {
   Character,
@@ -7,7 +7,7 @@ import type {
   Session,
 } from "@yume/types";
 import { parseJsonLoose } from "../jsonParser";
-import { mockImageBase64 } from "../mockImage";
+import { mockImageDataUri } from "../mockImage";
 import {
   CHARACTER_DESIGNER_SYSTEM,
   buildCharacterDesignerUserMessage,
@@ -24,8 +24,8 @@ import {
 //        which keeps appearance and vocal personality coherent)
 //
 //    2. In parallel:
-//       a. Image gen — base portrait from visualDescription + styleGuide
-//          then upload to Runware → get UUID for cheap re-reference
+//       a. Image gen — base portrait (Runware returns URL + UUID in one shot;
+//          no separate upload round-trip is needed for cheap re-reference)
 //       b. Voice provisioning — Xiaomi MiMo voicedesign from voiceDescription
 //          → reference audio for later voiceclone synth
 //
@@ -66,57 +66,39 @@ async function runDesignLLM(
   return parseJsonLoose<CharacterDesignOutput>(raw);
 }
 
-// Generate the per-character base portrait and upload it. The portrait is
-// a "concept sheet" — single character, neutral pose, plain background —
-// so it works well as a Runware referenceImages anchor for later scenes.
+// Generate the per-character base portrait. The portrait is a "concept
+// sheet" — single character, neutral pose, plain background — so it works
+// well as a Runware referenceImages anchor for later scenes.
 //
-// Returns both the base64 (for client-side asset use, e.g., 立绘登场
-// animations) and the Runware UUID (for cheap referencing in subsequent
-// Painter calls without resending the 100KB+ base64 each time).
+// Returns the URL (for any client display + URL-form references) and the
+// UUID (cheapest reference form for subsequent Painter calls). Both come
+// back in one `imageInference` response now that we use outputType=URL —
+// no separate upload step needed.
 //
-// The upload step is best-effort: if it fails, we still return the base64
-// so the next scene can pass it as a referenceImages entry directly (just
-// pays the bandwidth cost each call instead of once).
-async function renderAndUploadPortrait(
+// In mock mode we return the data URI as basePortraitUrl with no UUID
+// (Painter is short-circuited anyway, so the lack of a UUID is moot).
+async function renderPortrait(
   config: EngineConfig,
   charName: string,
   visualDescription: string,
   styleGuide: string,
-): Promise<{ basePortraitBase64?: string; basePortraitUuid?: string }> {
-  let base64: string;
+): Promise<{ basePortraitUrl?: string; basePortraitUuid?: string }> {
   try {
     if (config.mockImage) {
-      base64 = await mockImageBase64();
-    } else {
-      const prompt = buildCharacterPortraitPrompt(
-        charName,
-        visualDescription,
-        styleGuide,
-      );
-      base64 = await generateImage(config.image, prompt);
+      return { basePortraitUrl: await mockImageDataUri() };
     }
+    const prompt = buildCharacterPortraitPrompt(
+      charName,
+      visualDescription,
+      styleGuide,
+    );
+    const { imageUrl, imageUuid } = await generateImage(config.image, prompt);
+    return { basePortraitUrl: imageUrl, basePortraitUuid: imageUuid };
   } catch (err) {
     const msg = err instanceof Error ? err.message : String(err);
     console.error(`[characterDesigner] portrait gen failed for ${charName}: ${msg}`);
     return {}; // no portrait at all — degrade gracefully
   }
-
-  // Skip upload in mock mode — the mock image is the same static SVG every
-  // time and uploading it gives us a UUID that points to a useless asset.
-  if (config.mockImage) {
-    return { basePortraitBase64: base64 };
-  }
-
-  try {
-    const uuid = await uploadImage(config.image, base64);
-    return { basePortraitBase64: base64, basePortraitUuid: uuid };
-  } catch (err) {
-    const msg = err instanceof Error ? err.message : String(err);
-    console.warn(
-      `[characterDesigner] portrait upload failed for ${charName}: ${msg} — will pass base64 in subsequent calls`,
-    );
-    return { basePortraitBase64: base64 };
-  }
 }
 
 async function provisionVoiceSafe(
@@ -157,8 +139,8 @@ export async function designCharacter(
   // Step 2 — parallel: portrait + voice provisioning.
   const tProvision = Date.now();
   const portraitPromise = visualDescription
-    ? renderAndUploadPortrait(config, charName, visualDescription, session.styleGuide)
-    : Promise.resolve({} as Awaited<ReturnType<typeof renderAndUploadPortrait>>);
+    ? renderPortrait(config, charName, visualDescription, session.styleGuide)
+    : Promise.resolve({} as Awaited<ReturnType<typeof renderPortrait>>);
   const voicePromise = provisionVoiceSafe(config, voiceDescription, charName);
 
   const [portrait, voice] = await Promise.all([portraitPromise, voicePromise]);
@@ -170,7 +152,7 @@ export async function designCharacter(
     name: charName,
     voiceDescription,
     visualDescription,
-    basePortraitBase64: portrait.basePortraitBase64,
+    basePortraitUrl: portrait.basePortraitUrl,
     basePortraitUuid: portrait.basePortraitUuid,
     voice,
   };
diff --git a/packages/engine/src/agents/painter.ts b/packages/engine/src/agents/painter.ts
index e9d6e00..1f99128 100644
--- a/packages/engine/src/agents/painter.ts
+++ b/packages/engine/src/agents/painter.ts
@@ -1,12 +1,12 @@
 import { generateImage } from "@yume/ai-client";
-import type { GenerateImageOptions } from "@yume/ai-client";
+import type { GenerateImageOptions, GenerateImageResult } from "@yume/ai-client";
 import type {
   Beat,
   Character,
   EngineConfig,
   ProviderConfig,
 } from "@yume/types";
-import { mockImageBase64 } from "../mockImage";
+import { mockImageDataUri } from "../mockImage";
 import { buildPainterPrompt } from "../prompts";
 
 // ──────────────────────────────────────────────────────────────────────
@@ -24,6 +24,11 @@ import { buildPainterPrompt } from "../prompts";
 //       (most visually prominent)
 //    3. Other on-stage NPCs' portraits — secondary characters in the frame
 //
+//  References are sent as URLs (preferred — Runware can always fetch its own
+//  CDN link) or UUIDs (backstop when a URL is missing). Base64 fallback was
+//  removed when generateImage switched to outputType=URL, which always returns
+//  both a UUID and a URL so we never lack a cheap reference handle.
+//
 //  Failure handling — two-tier degradation:
 //    A. referenceImages call           (preferred — full visual anchoring)
 //    B. pure text-to-image fallback    (last resort if Runware refs API errors)
@@ -36,8 +41,8 @@ export type PainterInput = {
   styleGuide: string;
   onStageCharacters: Character[];
   /**
-   * Prior scene's Runware UUID or base64. When set (= sceneKey hit a
-   * prior scene), it slots into referenceImages[0] for spatial continuity.
+   * Prior scene's Runware UUID or URL. When set (= sceneKey hit a prior
+   * scene), it slots into referenceImages[0] for spatial continuity.
    * Capacity-wise this displaces ONE character portrait — slot is shared
    * with character refs, capped at 4 total per Runware spec.
    */
@@ -67,10 +72,16 @@ export function collectReferenceImages(
   }
 
   // Slot 1+ — character portraits, speaker-first.
+  //
+  // Prefer URL over UUID: Runware's `imageInference` returns a UUID, but that
+  // UUID isn't always recognized by the `referenceImages` pipeline (the error
+  // surfaces as `failedToTransferImage`). The URL is Runware's own CDN link —
+  // they can always fetch it from their own infra. UUID is kept as a backstop
+  // for any edge case where URL is missing (e.g., legacy session state).
   const speakerName = entryBeat?.speaker;
   if (speakerName) {
     const speaker = characters.find((c) => c.name === speakerName);
-    const ref = speaker?.basePortraitUuid ?? speaker?.basePortraitBase64;
+    const ref = speaker?.basePortraitUrl ?? speaker?.basePortraitUuid;
     if (ref && refs.length < MAX_REFERENCE_IMAGES) {
       refs.push(ref);
       seen.add(speakerName);
@@ -81,7 +92,7 @@ export function collectReferenceImages(
     if (refs.length >= MAX_REFERENCE_IMAGES) break;
     if (seen.has(c.name)) continue;
     const char = characters.find((x) => x.name === c.name);
-    const ref = char?.basePortraitUuid ?? char?.basePortraitBase64;
+    const ref = char?.basePortraitUrl ?? char?.basePortraitUuid;
     if (ref) {
       refs.push(ref);
       seen.add(c.name);
@@ -96,7 +107,7 @@ async function tryGenerate(
   prompt: string,
   options: GenerateImageOptions,
   label: string,
-): Promise<string | null> {
+): Promise<GenerateImageResult | null> {
   try {
     return await generateImage(config, prompt, options);
   } catch (err) {
@@ -106,12 +117,18 @@ async function tryGenerate(
   }
 }
 
+export type PainterResult =
+  | { kind: "real"; imageUrl: string; imageUuid: string }
+  | { kind: "mock"; imageUrl: string };
+
 export async function runPainter(
   config: EngineConfig,
   input: PainterInput,
   entryBeat: Beat | undefined,
-): Promise<string> {
-  if (config.mockImage) return mockImageBase64();
+): Promise<PainterResult> {
+  if (config.mockImage) {
+    return { kind: "mock", imageUrl: await mockImageDataUri() };
+  }
 
   const prompt = buildPainterPrompt(
     input.integratedPrompt,
@@ -135,11 +152,12 @@ export async function runPainter(
       { referenceImages: refs },
       `referenceImages (${refs.length})`,
     );
-    if (r) return r;
+    if (r) return { kind: "real", imageUrl: r.imageUrl, imageUuid: r.imageUuid };
   }
 
   // Tier B — pure text-to-image. Last resort, used when Tier A failed OR
   // there are no references to send (first scene with no characters yet).
   // Errors here propagate to the caller.
-  return generateImage(config.image, prompt);
+  const r = await generateImage(config.image, prompt);
+  return { kind: "real", imageUrl: r.imageUrl, imageUuid: r.imageUuid };
 }
diff --git a/packages/engine/src/annotate.ts b/packages/engine/src/annotate.ts
index 77df5da..6991301 100644
--- a/packages/engine/src/annotate.ts
+++ b/packages/engine/src/annotate.ts
@@ -1,10 +1,44 @@
 import sharp from "sharp";
 
+const FETCH_TIMEOUT_MS = 5000;
+
+// Pull the bytes from an image URL or data URI into a Buffer suitable for
+// sharp. Data URIs are decoded inline (no network); http(s) URLs are fetched
+// with a short timeout — if Runware's CDN is slow we'd rather fail the vision
+// step quickly than tie up a 60s Vercel function on a single image read.
+async function loadImageBuffer(imageUrl: string): Promise<Buffer> {
+  if (imageUrl.startsWith("data:")) {
+    const comma = imageUrl.indexOf(",");
+    if (comma === -1) throw new Error("Malformed data URI in prevImageUrl");
+    const b64 = imageUrl.slice(comma + 1);
+    return Buffer.from(b64, "base64");
+  }
+
+  const ctrl = new AbortController();
+  const timer = setTimeout(() => ctrl.abort(), FETCH_TIMEOUT_MS);
+  try {
+    const res = await fetch(imageUrl, { signal: ctrl.signal });
+    if (!res.ok) {
+      throw new Error(
+        `Failed to fetch prevImageUrl (${res.status}): ${imageUrl.slice(0, 120)}`,
+      );
+    }
+    const arr = await res.arrayBuffer();
+    return Buffer.from(arr);
+  } finally {
+    clearTimeout(timer);
+  }
+}
+
+// Marks the player's click point on the scene image so the vision LLM can see
+// WHERE they tapped. Output is base64 because the vision LLM is called over
+// the OpenAI-compatible chat endpoint, which only accepts image_url data URIs
+// — we can't hand it a Runware CDN URL directly.
 export async function annotateClick(
-  imageBase64: string,
+  imageUrl: string,
   click: { x: number; y: number },
 ): Promise<string> {
-  const buf = Buffer.from(imageBase64, "base64");
+  const buf = await loadImageBuffer(imageUrl);
 
   const resized = await sharp(buf)
     .resize({ width: 768, withoutEnlargement: true, fit: "inside" })
diff --git a/packages/engine/src/director.ts b/packages/engine/src/director.ts
index df7bde2..a1526e0 100644
--- a/packages/engine/src/director.ts
+++ b/packages/engine/src/director.ts
@@ -1,4 +1,4 @@
-import { chat, uploadImage } from "@yume/ai-client";
+import { chat } from "@yume/ai-client";
 import type {
   Character,
   EngineConfig,
@@ -29,7 +29,7 @@ import { INSERT_BEAT_SYSTEM, buildInsertBeatUserMessage } from "./prompts";
 //      │
 //      ├─ CharacterDesigner LLM × N    (parallel per new char)
 //      │     │
-//      │     ├─ portrait gen + upload  (parallel within agent)
+//      │     ├─ portrait gen (Runware returns URL + UUID in one call)
 //      │     └─ voice provisioning     (parallel within agent)
 //      │
 //      ├─ Cinematographer LLM          (parallel with all of the above)
@@ -37,13 +37,11 @@ import { INSERT_BEAT_SYSTEM, buildInsertBeatUserMessage } from "./prompts";
 //      └─ wait for all parallel branches
 //      │
 //      ▼
-//    Painter (FLUX referenceImages — two-tier degradation chain)
+//    Painter — generateImage with referenceImages (UUID/URL refs only;
+//              no base64 to upload, since outputType=URL gives both back)
 //      │
 //      ▼
-//    upload final scene image → Scene.imageUuid
-//      │
-//      ▼
-//    return { scene, sceneImageBase64, characters }
+//    return { scene, sceneImageUrl, characters }
 //
 //  The Cinematographer intentionally does NOT depend on CharacterDesigner
 //  output — it only positions named characters in the frame, not their
@@ -80,7 +78,7 @@ export function mergeCharacters(
       ...u,
       voice: u.voice ?? prev.voice,
       visualDescription: u.visualDescription ?? prev.visualDescription,
-      basePortraitBase64: u.basePortraitBase64 ?? prev.basePortraitBase64,
+      basePortraitUrl: u.basePortraitUrl ?? prev.basePortraitUrl,
       basePortraitUuid: u.basePortraitUuid ?? prev.basePortraitUuid,
       voiceDescription: u.voiceDescription || prev.voiceDescription,
     });
@@ -92,27 +90,22 @@ export function mergeCharacters(
 // scene — used by the Painter as one of the `referenceImages` (NOT as a
 // seedImage, because FLUX.2 [klein] 9B KV does not support seedImage).
 //
-// Returns the UUID if available (cheap reference, ~36 chars over the wire),
-// else the base64 of the most recent matching scene's image. Returns
-// undefined when no prior scene shares the current sceneKey.
+// Prefer URL over UUID for the same reason painter.collectReferenceImages
+// does: the UUID returned by `imageInference` isn't always recognized by
+// Runware's `referenceImages` pipeline, surfacing as `failedToTransferImage`.
+// The URL is Runware's own CDN link — they can always fetch it. UUID is kept
+// as a backstop. Returns undefined when no prior scene shares the sceneKey.
 function pickPriorSceneReference(
   session: Session,
   currentSceneKey: string | undefined,
-  priorImageBase64ByUuid: Map<string, string>,
 ): { priorSceneReference?: string; priorSceneKey?: string } {
   if (!currentSceneKey) return {};
   for (let i = session.history.length - 1; i >= 0; i--) {
     const prior = session.history[i]!.scene;
     if (prior.sceneKey === currentSceneKey) {
-      if (prior.imageUuid) {
-        return {
-          priorSceneReference: prior.imageUuid,
-          priorSceneKey: prior.sceneKey,
-        };
-      }
-      const cached = priorImageBase64ByUuid.get(prior.id);
-      if (cached) {
-        return { priorSceneReference: cached, priorSceneKey: prior.sceneKey };
+      const ref = prior.imageUrl ?? prior.imageUuid;
+      if (ref) {
+        return { priorSceneReference: ref, priorSceneKey: prior.sceneKey };
       }
     }
   }
@@ -121,25 +114,18 @@ function pickPriorSceneReference(
 
 export type SceneResult = {
   scene: Scene;
-  sceneImageBase64: string;
+  sceneImageUrl: string;
   characters: Character[];
 };
 
 // ──────────────────────────────────────────────────────────────────────
 //  directScene — the multi-agent pipeline. Used by orchestrator's
 //  startSession and requestScene.
-//
-//  priorImageBase64ByUuid: optional map from prior Scene.id → base64
-//  the caller has on-hand. If a sceneKey-hit scene's imageUuid is missing
-//  but the base64 is cached locally, we can still feed it as one of the
-//  Painter's referenceImages. Pass an empty map when caller has no cache
-//  (orchestrator does pass it for the start-session bootstrap).
 // ──────────────────────────────────────────────────────────────────────
 
 export async function directScene(
   config: EngineConfig,
   session: Session,
-  priorImageBase64ByUuid: Map<string, string> = new Map(),
 ): Promise<SceneResult> {
   const tTotal = Date.now();
 
@@ -168,7 +154,6 @@ export async function directScene(
   const { priorSceneReference, priorSceneKey } = pickPriorSceneReference(
     session,
     writerOut.sceneKey,
-    priorImageBase64ByUuid,
   );
 
   // Stage 2 — parallel: CharacterDesigner(s) and Cinematographer.
@@ -237,7 +222,7 @@ export async function directScene(
   );
 
   const tPainter = Date.now();
-  const sceneImageBase64 = await runPainter(
+  const painted = await runPainter(
     config,
     {
       integratedPrompt: cinemaOut.integratedPrompt,
@@ -249,22 +234,6 @@ export async function directScene(
   );
   tlog("[directScene] Painter", tPainter);
 
-  // Stage 4 — best-effort upload of the final scene image so the NEXT
-  // sceneKey-match call can reference its UUID instead of carrying base64.
-  // If upload fails, the scene still works; only loses cheap referencing
-  // on the next hop. Don't wait on mock images (static placeholder).
-  let imageUuid: string | undefined;
-  if (!config.mockImage) {
-    try {
-      const tUpload = Date.now();
-      imageUuid = await uploadImage(config.image, sceneImageBase64);
-      tlog("[directScene] image upload", tUpload);
-    } catch (err) {
-      const msg = err instanceof Error ? err.message : String(err);
-      console.warn(`[directScene] scene image upload failed: ${msg} — sceneKey reuse will need base64 fallback`);
-    }
-  }
-
   const scene: Scene = {
     id: newSceneId(),
     // scenePrompt is the cinematographer's English compositional output;
@@ -276,12 +245,13 @@ export async function directScene(
     beats: writerOut.beats,
     entryBeatId: writerOut.entryBeatId,
     sceneKey: writerOut.sceneKey,
-    imageUuid,
+    imageUuid: painted.kind === "real" ? painted.imageUuid : undefined,
+    imageUrl: painted.imageUrl,
   };
 
   tlog("[directScene] TOTAL", tTotal);
 
-  return { scene, sceneImageBase64, characters };
+  return { scene, sceneImageUrl: painted.imageUrl, characters };
 }
 
 // ──────────────────────────────────────────────────────────────────────
diff --git a/packages/engine/src/jsonParser.ts b/packages/engine/src/jsonParser.ts
index 20130fc..68d9de9 100644
--- a/packages/engine/src/jsonParser.ts
+++ b/packages/engine/src/jsonParser.ts
@@ -1,13 +1,44 @@
+import { jsonrepair, JSONRepairError } from "jsonrepair";
+
 // Strict-then-forgiving JSON parser for LLM output. Tries in order:
 //   1. Direct JSON.parse on the trimmed text.
 //   2. Extract from ```json``` fenced block.
 //   3. Slice between first { and last } and parse.
-//   4. Apply best-effort regex repair (trailing commas, missing commas
-//      between adjacent values) and try again.
+//   4. Apply targeted regex pre-repairs (see preRepair) and try jsonrepair.
 //
-// On final failure, logs the first 800 chars of the raw model output so we
-// can see what the LLM did wrong (the standard error message only shows
-// the position, not the surrounding context).
+// On final failure, logs the FULL raw model output so we can diagnose the
+// actual syntax error.
+//
+// jsonrepair (npm package josdejong/jsonrepair — 2.3k+ stars) handles the
+// broad LLM-output failure modes: truncated JSON, missing commas/brackets,
+// single quotes, Python None/True/False, JS comments. We layer a small set
+// of targeted pre-repairs in front of it for failure modes jsonrepair can't
+// disambiguate on its own (see preRepair).
+
+// ──────────────────────────────────────────────────────────────────────
+//  preRepair — fix specific LLM error patterns before handing to jsonrepair.
+//
+//  Pattern 1: missing closing quote on a key.
+//     Broken:  "lineDelivery: "语速稍快...",
+//     Correct: "lineDelivery": "语速稍快...",
+//
+//  jsonrepair fails on this because it's ambiguous — "lineDelivery: " could
+//  be a complete string value, leaving "语速稍快..." as a syntax error. But
+//  if we see  "<key-like>:<whitespace>"  we know structurally it should be
+//  a key-colon-value triplet.
+//
+//  Match constraints:
+//    - The key match excludes  "  \n  :  so we can't overrun into adjacent
+//      fields or absorb the colon as part of the key name.
+//    - The colon must be followed by whitespace and another  "  (the value
+//      string's opening quote). This is what disambiguates from a value
+//      string that happens to contain a colon.
+// ──────────────────────────────────────────────────────────────────────
+
+function preRepair(s: string): string {
+  return s.replace(/"([^"\n:]+):(\s+)"/g, '"$1":$2"');
+}
+
 export function parseJsonLoose<T>(raw: string): T {
   const trimmed = raw.trim();
 
@@ -28,54 +59,36 @@ export function parseJsonLoose<T>(raw: string): T {
 
   const first = trimmed.indexOf("{");
   const last = trimmed.lastIndexOf("}");
-  if (first !== -1 && last > first) {
-    const slice = trimmed.slice(first, last + 1);
-    try {
-      return JSON.parse(slice) as T;
-    } catch {
-      // Last resort: try repairing common LLM-output malformations.
-      const repaired = repairJsonString(slice);
+  const slice =
+    first !== -1 && last > first ? trimmed.slice(first, last + 1) : trimmed;
+
+  // Try the brace-sliced version first; if there were no braces at all
+  // (slice === trimmed), this is just a second attempt at the raw text.
+  try {
+    return JSON.parse(slice) as T;
+  } catch {
+    // Targeted pre-repair (only reached once strict parsing failed) → jsonrepair.
+    const prefixed = preRepair(slice);
+
+    // If preRepair changed something, give the cheap path another shot —
+    // the input might already be valid now without needing jsonrepair.
+    if (prefixed !== slice) {
       try {
-        return JSON.parse(repaired) as T;
-      } catch (err) {
-        console.error(
-          `[parseJsonLoose] all strategies failed. Raw output (first 800 chars):\n${raw.slice(0, 800)}`,
-        );
-        throw err;
+        return JSON.parse(prefixed) as T;
+      } catch {
+        // fall through to jsonrepair
       }
     }
+
+    try {
+      const repaired = jsonrepair(prefixed);
+      return JSON.parse(repaired) as T;
+    } catch (err) {
+      const isRepairErr = err instanceof JSONRepairError;
+      console.error(
+        `[parseJsonLoose] jsonrepair ${isRepairErr ? "could not repair" : "succeeded but JSON.parse rejected its output"}. Full raw model output:\n${raw}`,
+      );
+      throw err;
+    }
   }
-
-  console.error(
-    `[parseJsonLoose] no { ... } found. Raw output (first 800 chars):\n${raw.slice(0, 800)}`,
-  );
-  throw new Error(`Failed to parse JSON from model output: ${raw.slice(0, 200)}`);
-}
-
-// Best-effort repair of LLM-typical JSON syntax errors. Targeted at the two
-// most common failures we see in practice:
-//   1. Trailing comma before } or ].
-//   2. Missing comma between two adjacent JSON values (the specific error
-//      mode we hit at position 3390).
-//
-// Deliberately conservative — does NOT try to fix unclosed strings,
-// unbalanced braces, or strip JS-style comments. The comment-stripping
-// path was previously included but would corrupt JSON string values
-// containing `//` (e.g. URLs like "https://example.com"); since LLMs in
-// `responseFormat: "json_object"` mode essentially never emit comments,
-// dropping that step is a net win for safety.
-function repairJsonString(s: string): string {
-  return s
-    // 1. Strip trailing commas before } or ].
-    .replace(/,(\s*[}\]])/g, "$1")
-    // 2. Insert missing commas between two adjacent JSON values. The cases:
-    //      } { → },{        ] [ → ],[        } [ → },[        ] { → ],{
-    //      "string" "key"   "string" {       "string" [
-    //      number then "key" / { / [
-    //
-    //    The regex looks for a closing token (} ] " or a digit) followed by
-    //    a newline and an opening token (} ] " a letter), and inserts a
-    //    comma between them. Requires the newline (\s*\n\s*) so it only
-    //    fires across line boundaries, never within a single-line value.
-    .replace(/(\}|\]|"|\d)(\s*\n\s*)(\{|\[|")/g, "$1,$2$3");
 }
diff --git a/packages/engine/src/mockImage.ts b/packages/engine/src/mockImage.ts
index 3d3b2ae..fcc0d5c 100644
--- a/packages/engine/src/mockImage.ts
+++ b/packages/engine/src/mockImage.ts
@@ -1,11 +1,15 @@
 import sharp from "sharp";
 
-let cached: string | undefined;
+let cachedDataUri: string | undefined;
 
 // A static 16:9 placeholder used when MOCK_IMAGE=true, so we can exercise the
 // TTS path without paying for image generation. Generated once, then memoized.
-export async function mockImageBase64(): Promise<string> {
-  if (cached) return cached;
+// Returned as a data URI so the rest of the pipeline can treat it as an
+// `imageUrl` interchangeably with real Runware URLs (the client's <img src>
+// accepts both, and we never feed a mock image to Runware's referenceImages
+// because mockImage mode short-circuits the Painter entirely).
+export async function mockImageDataUri(): Promise<string> {
+  if (cachedDataUri) return cachedDataUri;
 
   const W = 1792;
   const H = 1024;
@@ -20,6 +24,6 @@ export async function mockImageBase64(): Promise<string> {
   </svg>`;
 
   const png = await sharp(Buffer.from(svg)).png().toBuffer();
-  cached = png.toString("base64");
-  return cached;
+  cachedDataUri = `data:image/png;base64,${png.toString("base64")}`;
+  return cachedDataUri;
 }
diff --git a/packages/engine/src/orchestrator.ts b/packages/engine/src/orchestrator.ts
index 87a8e6a..7813d27 100644
--- a/packages/engine/src/orchestrator.ts
+++ b/packages/engine/src/orchestrator.ts
@@ -49,14 +49,14 @@ export async function startSession(
     characters: [],
   };
 
-  const { scene, sceneImageBase64, characters } = await directScene(config, session);
+  const { scene, sceneImageUrl, characters } = await directScene(config, session);
 
   tlog("[start] TOTAL", tTotal);
 
   return {
     sessionId: session.id,
     scene,
-    imageBase64: sceneImageBase64,
+    imageUrl: sceneImageUrl,
     characters,
   };
 }
@@ -71,7 +71,7 @@ export async function requestScene(
 ): Promise<SceneResponse> {
   const tTotal = Date.now();
 
-  const { scene, sceneImageBase64, characters } = await directScene(
+  const { scene, sceneImageUrl, characters } = await directScene(
     config,
     req.session,
   );
@@ -80,7 +80,7 @@ export async function requestScene(
 
   return {
     scene,
-    imageBase64: sceneImageBase64,
+    imageUrl: sceneImageUrl,
     characters,
   };
 }
@@ -95,7 +95,7 @@ export async function visionDecide(
   config: EngineConfig,
   req: VisionRequest,
 ): Promise<VisionResponse> {
-  const annotated = await annotateClick(req.prevImageBase64, req.click);
+  const annotated = await annotateClick(req.prevImageUrl, req.click);
   const current = req.session.history.at(-1)?.scene ?? null;
   return interpret(config.vision, annotated, current);
 }
diff --git a/packages/types/src/index.ts b/packages/types/src/index.ts
index 01be754..e98503f 100644
--- a/packages/types/src/index.ts
+++ b/packages/types/src/index.ts
@@ -56,17 +56,24 @@ export type Scene = {
    * e.g. "classroom-dusk", "rooftop-night". When the next Scene shares this
    * key, the Painter slots the previous Scene's image into Runware's
    * `referenceImages` (alongside character portraits) so the same physical
-   * space stays visually consistent across cuts. (Originally planned as a
-   * seedImage / img2img anchor, but FLUX.2 [klein] 9B KV does not support
-   * seedImage — referenceImages serves the same purpose with the model.)
+   * space stays visually consistent across cuts.
    */
   sceneKey?: string;
   /**
-   * Runware UUID of this Scene's generated image — once uploaded, subsequent
-   * Scenes that match sceneKey can reference it via `referenceImages`
-   * without resending base64.
+   * Runware UUID of this Scene's generated image. Kept as a backstop reference
+   * for Runware's `referenceImages` when `imageUrl` is missing (the URL is
+   * preferred). Not shown to the client — `imageUrl` is what renders.
    */
   imageUuid?: string;
+  /**
+   * Public CDN URL of this Scene's generated image. Returned to the client for
+   * `<img src>` rendering, and is what the client passes back to `/api/vision`
+   * as `prevImageUrl` so the server can re-fetch the bytes for click annotation.
+   *
+   * For MOCK_IMAGE=true this is a `data:image/png;base64,...` data URI, not a
+   * Runware URL — the client renders both forms transparently.
+   */
+  imageUrl?: string;
 };
 
 export type SceneExit =
@@ -111,17 +118,17 @@ export type Character = {
    */
   visualDescription?: string;
   /**
-   * Base portrait image generated by the CharacterDesigner once, then reused
-   * as a Runware `referenceImages` entry in every subsequent scene the
-   * character appears in. Stored as base64 for client display.
-   */
-  basePortraitBase64?: string;
-  /**
-   * Runware UUID for the base portrait. Once uploaded via the image-upload
-   * endpoint, subsequent Painter calls reference this UUID instead of
-   * resending the full base64 payload.
+   * Runware UUID for the base portrait. Generated by the CharacterDesigner
+   * once; used as a `referenceImages` entry on later scenes only when
+   * `basePortraitUrl` is missing (the URL is the preferred reference form).
    */
   basePortraitUuid?: string;
+  /**
+   * Public CDN URL for the base portrait. Same image as `basePortraitUuid`;
+   * kept around for the client (if it ever wants to render character cards)
+   * and as the preferred reference form for `referenceImages` (UUID is backstop).
+   */
+  basePortraitUrl?: string;
   /** Xiaomi MiMo voice reference audio. */
   voice?: CharacterVoice;
 };
@@ -196,7 +203,8 @@ export type StartRequest = {
 export type StartResponse = {
   sessionId: string;
   scene: Scene;
-  imageBase64: string;
+  /** Public CDN URL (or data URI in MOCK_IMAGE mode) for the rendered scene background. */
+  imageUrl: string;
   /** Character registry with voice references + visual cards provisioned. */
   characters: Character[];
 };
@@ -210,7 +218,8 @@ export type SceneRequest = {
 
 export type SceneResponse = {
   scene: Scene;
-  imageBase64: string;
+  /** Public CDN URL (or data URI in MOCK_IMAGE mode) for the rendered scene background. */
+  imageUrl: string;
   characters: Character[];
 };
 
@@ -235,7 +244,12 @@ export type BeatAudioResponse = {
 // trigger a scene change.
 export type VisionRequest = {
   session: Session;
-  prevImageBase64: string;
+  /**
+   * Public CDN URL (or data URI in MOCK_IMAGE mode) of the scene the player
+   * just clicked. The server re-fetches the bytes to annotate the click and
+   * pass an OpenAI-compatible image_url to the vision LLM.
+   */
+  prevImageUrl: string;
   click: { x: number; y: number };
 };
 
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 483ebce..8607276 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -75,6 +75,9 @@ importers:
       '@yume/types':
         specifier: workspace:*
         version: link:../types
+      jsonrepair:
+        specifier: ^3.14.0
+        version: 3.14.0
       sharp:
         specifier: ^0.33.5
         version: 0.33.5
@@ -594,6 +597,10 @@ packages:
     resolution: {integrity: sha512-/imKNG4EbWNrVjoNC/1H5/9GFy+tqjGBHCaSsN+P2RnPqjsLmv6UD3Ej+Kj8nBWaRAwyk7kK5ZUc+OEatnTR3A==}
     hasBin: true
 
+  jsonrepair@3.14.0:
+    resolution: {integrity: sha512-tWPGKMZf/8UPim+fcW2EfcQ/d/7aKUrP6IECz9G3Tu6Q5dX0orSleqJ9z6sSw7qrQkjF8/Edo4DvsWBZ8H+HNg==}
+    hasBin: true
+
   lilconfig@3.1.3:
     resolution: {integrity: sha512-/vlFKAoH5Cgt3Ie+JLhRbwOsCQePABiU3tJ1egGvyQ+33R/vcwM2Zl2QR/LzjsBeItPt3oSVXapn+m4nQDvpzw==}
     engines: {node: '>=14'}
@@ -1240,6 +1247,8 @@ snapshots:
 
   jiti@1.21.7: {}
 
+  jsonrepair@3.14.0: {}
+
   lilconfig@3.1.3: {}
 
   lines-and-columns@1.2.4: {}
diff --git a/vercel.json b/vercel.json
deleted file mode 100644
index 5af8dcf..0000000
--- a/vercel.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-  "$schema": "https://openapi.vercel.sh/vercel.json",
-  "framework": "nextjs",
-  "buildCommand": "pnpm build",
-  "installCommand": "pnpm install",
-  "functions": {
-    "apps/web/app/api/interact/route.ts": { "maxDuration": 60 },
-    "apps/web/app/api/vision/route.ts": { "maxDuration": 60 },
-    "apps/web/app/api/start/route.ts": { "maxDuration": 60 }
-  }
-}