- {/* Background image */}
+ {/* Background image — Runware CDN URL or data URI (mock mode) */}

{
diff --git a/apps/web/next.config.ts b/apps/web/next.config.ts
index f13a9bd..f178b6a 100644
--- a/apps/web/next.config.ts
+++ b/apps/web/next.config.ts
@@ -14,11 +14,6 @@ const config: NextConfig = {
turbopack: {
root: path.join(__dirname, "..", ".."),
},
- experimental: {
- serverActions: {
- bodySizeLimit: "10mb",
- },
- },
};
export default config;
diff --git a/apps/web/vercel.json b/apps/web/vercel.json
new file mode 100644
index 0000000..4ccec08
--- /dev/null
+++ b/apps/web/vercel.json
@@ -0,0 +1,11 @@
+{
+ "$schema": "https://openapi.vercel.sh/vercel.json",
+ "framework": "nextjs",
+ "functions": {
+ "app/api/start/route.ts": { "maxDuration": 60 },
+ "app/api/scene/route.ts": { "maxDuration": 60 },
+ "app/api/vision/route.ts": { "maxDuration": 60 },
+ "app/api/insert-beat/route.ts": { "maxDuration": 60 },
+ "app/api/beat-audio/route.ts": { "maxDuration": 30 }
+ }
+}
diff --git a/packages/ai-client/src/image.ts b/packages/ai-client/src/image.ts
index 4710e16..557a4a3 100644
--- a/packages/ai-client/src/image.ts
+++ b/packages/ai-client/src/image.ts
@@ -4,21 +4,23 @@ import { fetchWithRetry } from "./fetchWithRetry";
// Runware uses its own task-array protocol (not OpenAI-compatible).
// POST <baseUrl> with [{ taskType: "imageInference", ... }]; errors come
// back as a 200 with `errors[]`, so we have to inspect the body either way.
-
-// FLUX img2img specifics:
-// - strength < 0.8 has minimal-to-no visible effect on FLUX models (per
-// Runware docs); we default to 0.85 which leaves room to deviate while
-// still anchoring on the seed image's composition.
-// - referenceImages caps at 4 per request; the FLUX.2 [klein] 9B KV model
-// (runware:400@6) accelerates multi-reference inference by ~2.5× via its
-// KV cache for reference latents (cached only WITHIN one inference run —
-// not persisted across API calls, hence the upload-once-then-reference
-// strategy below).
+//
+// referenceImages accepts UUIDs, public URLs, or base64. UUID is cheapest
+// in transport cost; URL is next; base64 last resort. The FLUX.2 [klein] 9B
+// KV variant (runware:400@6) accelerates multi-reference inference ~2.5× via
+// its KV cache for reference latents (cached only within one inference run,
+// not persisted across calls — hence the need to keep stable UUIDs/URLs for
+// later reuse).
+//
+// We request outputType=URL so Runware persists the image and returns a CDN
+// link the client can render directly. The same response also carries the
+// image UUID, so we never need a separate uploadImage round-trip to anchor
+// future referenceImages.
const DEFAULT_IMG2IMG_STRENGTH = 0.85;
const MAX_REFERENCE_IMAGES = 4;
type RunwareImageResult = {
- imageBase64Data?: string;
+ imageURL?: string;
imageUUID?: string;
};
type RunwareError = {
@@ -33,32 +35,40 @@ type RunwareResponse = {
export type GenerateImageOptions = {
/**
- * Reference image (UUID, plain base64, or data URI) to use as the
- * img2img starting point. When set, FLUX preserves the seed image's
- * composition and applies `strength` to allow deviation from it.
- * Used for cross-scene visual continuity when sceneKey hits.
+ * Reference image (UUID, public URL, or base64) for img2img. When set,
+ * FLUX preserves the seed image's composition and applies `strength` to
+ * deviate. NOTE: FLUX.2 [klein] 9B KV does NOT support seedImage — use
+ * `referenceImages` for visual continuity instead.
*/
seedImage?: string;
/**
- * Reference images (UUIDs or base64) to condition the generation on —
- * typically character portraits to anchor identity / outfit / style
- * across scenes. Runware caps at 4; we silently truncate beyond that.
+ * Reference images (UUIDs, URLs, or base64) to condition generation on —
+ * typically character portraits + the prior scene image. Runware caps at 4;
+ * we silently truncate beyond that.
*/
referenceImages?: string[];
/** 0–1, FLUX needs ≥ 0.8 to actually have an effect. */
strength?: number;
};
+export type GenerateImageResult = {
+ /** Public CDN URL of the generated image (Runware-hosted). */
+ imageUrl: string;
+ /** Stable UUID for cheap re-reference in later `referenceImages`. */
+ imageUuid: string;
+};
+
// ──────────────────────────────────────────────────────────────────────
-// generateImage — text-to-image (default) or img2img / multi-reference
-// when seedImage / referenceImages are supplied. Returns base64.
+// generateImage — text-to-image (default) or referenceImages-conditioned.
+// Returns both the public URL (for client display + future references)
+// and the UUID (cheapest reference form for subsequent calls).
// ──────────────────────────────────────────────────────────────────────
export async function generateImage(
config: ProviderConfig,
prompt: string,
options?: GenerateImageOptions,
-): Promise {
+): Promise {
const url = config.baseUrl.replace(/\/$/, "");
const task: Record = {
@@ -71,8 +81,9 @@ export async function generateImage(
steps: 4,
CFGScale: 3.5,
numberResults: 1,
- outputType: "base64Data",
+ outputType: "URL",
outputFormat: "PNG",
+ includeCost: false,
};
if (options?.seedImage) {
@@ -109,66 +120,11 @@ export async function generateImage(
);
}
- const b64 = json.data?.[0]?.imageBase64Data;
- if (!b64) {
- throw new Error(`No image in Runware response: ${text.slice(0, 300)}`);
+ const result = json.data?.[0];
+ const imageUrl = result?.imageURL;
+ const imageUuid = result?.imageUUID;
+ if (!imageUrl || !imageUuid) {
+ throw new Error(`No image URL/UUID in Runware response: ${text.slice(0, 300)}`);
}
- return b64;
-}
-
-// ──────────────────────────────────────────────────────────────────────
-// uploadImage — registers a base64 image on Runware and returns its
-// UUID, so subsequent generateImage calls can pass the UUID in
-// referenceImages / seedImage instead of resending the base64 payload
-// every time. Character base portraits and scene snapshots both flow
-// through this path.
-//
-// Runware exposes the imageUpload taskType for exactly this purpose.
-// Returns the UUID. Caller treats a thrown error as "fall back to
-// sending base64 next time" — non-fatal.
-// ──────────────────────────────────────────────────────────────────────
-
-export async function uploadImage(
- config: ProviderConfig,
- base64: string,
-): Promise {
- const url = config.baseUrl.replace(/\/$/, "");
-
- const body = [
- {
- taskType: "imageUpload",
- taskUUID: crypto.randomUUID(),
- image: `data:image/png;base64,${base64}`,
- },
- ];
-
- const res = await fetchWithRetry(url, {
- method: "POST",
- headers: {
- "Content-Type": "application/json",
- Authorization: `Bearer ${config.apiKey}`,
- },
- body: JSON.stringify(body),
- });
-
- const text = await res.text();
- let json: RunwareResponse;
- try {
- json = JSON.parse(text) as RunwareResponse;
- } catch {
- throw new Error(`Image upload API error ${res.status}: ${text.slice(0, 500)}`);
- }
-
- if (json.errors?.length) {
- const e = json.errors[0]!;
- throw new Error(
- `Runware upload error [${e.code ?? "unknown"}]: ${e.message ?? "no message"}`,
- );
- }
-
- const uuid = json.data?.[0]?.imageUUID;
- if (!uuid) {
- throw new Error(`No UUID in upload response: ${text.slice(0, 300)}`);
- }
- return uuid;
+ return { imageUrl, imageUuid };
}
diff --git a/packages/ai-client/src/index.ts b/packages/ai-client/src/index.ts
index 13fa290..0153e48 100644
--- a/packages/ai-client/src/index.ts
+++ b/packages/ai-client/src/index.ts
@@ -1,5 +1,5 @@
export { chat } from "./chat";
-export { generateImage, uploadImage } from "./image";
-export type { GenerateImageOptions } from "./image";
+export { generateImage } from "./image";
+export type { GenerateImageOptions, GenerateImageResult } from "./image";
export { interpretClick } from "./vision";
export type { ChatMessage } from "./chat";
diff --git a/packages/engine/package.json b/packages/engine/package.json
index 1b51280..0ed11ab 100644
--- a/packages/engine/package.json
+++ b/packages/engine/package.json
@@ -15,6 +15,7 @@
"@yume/ai-client": "workspace:*",
"@yume/tts-client": "workspace:*",
"@yume/types": "workspace:*",
+ "jsonrepair": "^3.14.0",
"sharp": "^0.33.5"
}
}
diff --git a/packages/engine/src/agents/characterDesigner.ts b/packages/engine/src/agents/characterDesigner.ts
index ae5f505..81dc4be 100644
--- a/packages/engine/src/agents/characterDesigner.ts
+++ b/packages/engine/src/agents/characterDesigner.ts
@@ -1,4 +1,4 @@
-import { chat, generateImage, uploadImage } from "@yume/ai-client";
+import { chat, generateImage } from "@yume/ai-client";
import { provisionVoice } from "@yume/tts-client";
import type {
Character,
@@ -7,7 +7,7 @@ import type {
Session,
} from "@yume/types";
import { parseJsonLoose } from "../jsonParser";
-import { mockImageBase64 } from "../mockImage";
+import { mockImageDataUri } from "../mockImage";
import {
CHARACTER_DESIGNER_SYSTEM,
buildCharacterDesignerUserMessage,
@@ -24,8 +24,8 @@ import {
// which keeps appearance and vocal personality coherent)
//
// 2. In parallel:
-// a. Image gen — base portrait from visualDescription + styleGuide
-// then upload to Runware → get UUID for cheap re-reference
+// a. Image gen — base portrait (Runware returns URL + UUID in one shot;
+// no separate upload round-trip is needed for cheap re-reference)
// b. Voice provisioning — Xiaomi MiMo voicedesign from voiceDescription
// → reference audio for later voiceclone synth
//
@@ -66,57 +66,39 @@ async function runDesignLLM(
return parseJsonLoose(raw);
}
-// Generate the per-character base portrait and upload it. The portrait is
-// a "concept sheet" — single character, neutral pose, plain background —
-// so it works well as a Runware referenceImages anchor for later scenes.
+// Generate the per-character base portrait. The portrait is a "concept
+// sheet" — single character, neutral pose, plain background — so it works
+// well as a Runware referenceImages anchor for later scenes.
//
-// Returns both the base64 (for client-side asset use, e.g., 立绘登场
-// animations) and the Runware UUID (for cheap referencing in subsequent
-// Painter calls without resending the 100KB+ base64 each time).
+// Returns the URL (for any client display + URL-form references) and the
+// UUID (cheapest reference form for subsequent Painter calls). Both come
+// back in one `imageInference` response now that we use outputType=URL —
+// no separate upload step needed.
//
-// The upload step is best-effort: if it fails, we still return the base64
-// so the next scene can pass it as a referenceImages entry directly (just
-// pays the bandwidth cost each call instead of once).
-async function renderAndUploadPortrait(
+// In mock mode we return the data URI as basePortraitUrl with no UUID
+// (Painter is short-circuited anyway, so the lack of a UUID is moot).
+async function renderPortrait(
config: EngineConfig,
charName: string,
visualDescription: string,
styleGuide: string,
-): Promise<{ basePortraitBase64?: string; basePortraitUuid?: string }> {
- let base64: string;
+): Promise<{ basePortraitUrl?: string; basePortraitUuid?: string }> {
try {
if (config.mockImage) {
- base64 = await mockImageBase64();
- } else {
- const prompt = buildCharacterPortraitPrompt(
- charName,
- visualDescription,
- styleGuide,
- );
- base64 = await generateImage(config.image, prompt);
+ return { basePortraitUrl: await mockImageDataUri() };
}
+ const prompt = buildCharacterPortraitPrompt(
+ charName,
+ visualDescription,
+ styleGuide,
+ );
+ const { imageUrl, imageUuid } = await generateImage(config.image, prompt);
+ return { basePortraitUrl: imageUrl, basePortraitUuid: imageUuid };
} catch (err) {
const msg = err instanceof Error ? err.message : String(err);
console.error(`[characterDesigner] portrait gen failed for ${charName}: ${msg}`);
return {}; // no portrait at all — degrade gracefully
}
-
- // Skip upload in mock mode — the mock image is the same static SVG every
- // time and uploading it gives us a UUID that points to a useless asset.
- if (config.mockImage) {
- return { basePortraitBase64: base64 };
- }
-
- try {
- const uuid = await uploadImage(config.image, base64);
- return { basePortraitBase64: base64, basePortraitUuid: uuid };
- } catch (err) {
- const msg = err instanceof Error ? err.message : String(err);
- console.warn(
- `[characterDesigner] portrait upload failed for ${charName}: ${msg} — will pass base64 in subsequent calls`,
- );
- return { basePortraitBase64: base64 };
- }
}
async function provisionVoiceSafe(
@@ -157,8 +139,8 @@ export async function designCharacter(
// Step 2 — parallel: portrait + voice provisioning.
const tProvision = Date.now();
const portraitPromise = visualDescription
- ? renderAndUploadPortrait(config, charName, visualDescription, session.styleGuide)
- : Promise.resolve({} as Awaited>);
+ ? renderPortrait(config, charName, visualDescription, session.styleGuide)
+ : Promise.resolve({} as Awaited>);
const voicePromise = provisionVoiceSafe(config, voiceDescription, charName);
const [portrait, voice] = await Promise.all([portraitPromise, voicePromise]);
@@ -170,7 +152,7 @@ export async function designCharacter(
name: charName,
voiceDescription,
visualDescription,
- basePortraitBase64: portrait.basePortraitBase64,
+ basePortraitUrl: portrait.basePortraitUrl,
basePortraitUuid: portrait.basePortraitUuid,
voice,
};
diff --git a/packages/engine/src/agents/painter.ts b/packages/engine/src/agents/painter.ts
index e9d6e00..1f99128 100644
--- a/packages/engine/src/agents/painter.ts
+++ b/packages/engine/src/agents/painter.ts
@@ -1,12 +1,12 @@
import { generateImage } from "@yume/ai-client";
-import type { GenerateImageOptions } from "@yume/ai-client";
+import type { GenerateImageOptions, GenerateImageResult } from "@yume/ai-client";
import type {
Beat,
Character,
EngineConfig,
ProviderConfig,
} from "@yume/types";
-import { mockImageBase64 } from "../mockImage";
+import { mockImageDataUri } from "../mockImage";
import { buildPainterPrompt } from "../prompts";
// ──────────────────────────────────────────────────────────────────────
@@ -24,6 +24,11 @@ import { buildPainterPrompt } from "../prompts";
// (most visually prominent)
// 3. Other on-stage NPCs' portraits — secondary characters in the frame
//
+// References are sent as URLs (preferred — Runware can always fetch its own
+// CDN link) or UUIDs (backstop when no URL is stored). Base64 fallback was
+// removed when generateImage switched to outputType=URL, which always returns
+// both a UUID and a URL so we never lack a cheap reference handle.
+//
// Failure handling — two-tier degradation:
// A. referenceImages call (preferred — full visual anchoring)
// B. pure text-to-image fallback (last resort if Runware refs API errors)
@@ -36,8 +41,8 @@ export type PainterInput = {
styleGuide: string;
onStageCharacters: Character[];
/**
- * Prior scene's Runware UUID or base64. When set (= sceneKey hit a
- * prior scene), it slots into referenceImages[0] for spatial continuity.
+ * Prior scene's Runware UUID or URL. When set (= sceneKey hit a prior
+ * scene), it slots into referenceImages[0] for spatial continuity.
* Capacity-wise this displaces ONE character portrait — slot is shared
* with character refs, capped at 4 total per Runware spec.
*/
@@ -67,10 +72,16 @@ export function collectReferenceImages(
}
// Slot 1+ — character portraits, speaker-first.
+ //
+ // Prefer URL over UUID: Runware's `imageInference` returns a UUID, but that
+ // UUID isn't always recognized by the `referenceImages` pipeline (the error
+ // surfaces as `failedToTransferImage`). The URL is Runware's own CDN link —
+ // they can always fetch it from their own infra. UUID is kept as a backstop
+ // for any edge case where URL is missing (e.g., legacy session state).
const speakerName = entryBeat?.speaker;
if (speakerName) {
const speaker = characters.find((c) => c.name === speakerName);
- const ref = speaker?.basePortraitUuid ?? speaker?.basePortraitBase64;
+ const ref = speaker?.basePortraitUrl ?? speaker?.basePortraitUuid;
if (ref && refs.length < MAX_REFERENCE_IMAGES) {
refs.push(ref);
seen.add(speakerName);
@@ -81,7 +92,7 @@ export function collectReferenceImages(
if (refs.length >= MAX_REFERENCE_IMAGES) break;
if (seen.has(c.name)) continue;
const char = characters.find((x) => x.name === c.name);
- const ref = char?.basePortraitUuid ?? char?.basePortraitBase64;
+ const ref = char?.basePortraitUrl ?? char?.basePortraitUuid;
if (ref) {
refs.push(ref);
seen.add(c.name);
@@ -96,7 +107,7 @@ async function tryGenerate(
prompt: string,
options: GenerateImageOptions,
label: string,
-): Promise {
+): Promise {
try {
return await generateImage(config, prompt, options);
} catch (err) {
@@ -106,12 +117,18 @@ async function tryGenerate(
}
}
+export type PainterResult =
+ | { kind: "real"; imageUrl: string; imageUuid: string }
+ | { kind: "mock"; imageUrl: string };
+
export async function runPainter(
config: EngineConfig,
input: PainterInput,
entryBeat: Beat | undefined,
-): Promise {
- if (config.mockImage) return mockImageBase64();
+): Promise {
+ if (config.mockImage) {
+ return { kind: "mock", imageUrl: await mockImageDataUri() };
+ }
const prompt = buildPainterPrompt(
input.integratedPrompt,
@@ -135,11 +152,12 @@ export async function runPainter(
{ referenceImages: refs },
`referenceImages (${refs.length})`,
);
- if (r) return r;
+ if (r) return { kind: "real", imageUrl: r.imageUrl, imageUuid: r.imageUuid };
}
// Tier B — pure text-to-image. Last resort, used when Tier A failed OR
// there are no references to send (first scene with no characters yet).
// Errors here propagate to the caller.
- return generateImage(config.image, prompt);
+ const r = await generateImage(config.image, prompt);
+ return { kind: "real", imageUrl: r.imageUrl, imageUuid: r.imageUuid };
}
diff --git a/packages/engine/src/annotate.ts b/packages/engine/src/annotate.ts
index 77df5da..6991301 100644
--- a/packages/engine/src/annotate.ts
+++ b/packages/engine/src/annotate.ts
@@ -1,10 +1,44 @@
import sharp from "sharp";
+const FETCH_TIMEOUT_MS = 5000;
+
+// Pull the bytes from an image URL or data URI into a Buffer suitable for
+// sharp. Data URIs are decoded inline (no network); http(s) URLs are fetched
+// with a short timeout — if Runware's CDN is slow we'd rather fail the vision
+// step quickly than tie up a 60s Vercel function on a single image read.
+async function loadImageBuffer(imageUrl: string): Promise {
+ if (imageUrl.startsWith("data:")) {
+ const comma = imageUrl.indexOf(",");
+ if (comma === -1) throw new Error("Malformed data URI in prevImageUrl");
+ const b64 = imageUrl.slice(comma + 1);
+ return Buffer.from(b64, "base64");
+ }
+
+ const ctrl = new AbortController();
+ const timer = setTimeout(() => ctrl.abort(), FETCH_TIMEOUT_MS);
+ try {
+ const res = await fetch(imageUrl, { signal: ctrl.signal });
+ if (!res.ok) {
+ throw new Error(
+ `Failed to fetch prevImageUrl (${res.status}): ${imageUrl.slice(0, 120)}`,
+ );
+ }
+ const arr = await res.arrayBuffer();
+ return Buffer.from(arr);
+ } finally {
+ clearTimeout(timer);
+ }
+}
+
+// Marks the player's click point on the scene image so the vision LLM can see
+// WHERE they tapped. Output is base64 because the vision LLM is called over
+// the OpenAI-compatible chat endpoint, which only accepts image_url data URIs
+// — we can't hand it a Runware CDN URL directly.
export async function annotateClick(
- imageBase64: string,
+ imageUrl: string,
click: { x: number; y: number },
): Promise {
- const buf = Buffer.from(imageBase64, "base64");
+ const buf = await loadImageBuffer(imageUrl);
const resized = await sharp(buf)
.resize({ width: 768, withoutEnlargement: true, fit: "inside" })
diff --git a/packages/engine/src/director.ts b/packages/engine/src/director.ts
index df7bde2..a1526e0 100644
--- a/packages/engine/src/director.ts
+++ b/packages/engine/src/director.ts
@@ -1,4 +1,4 @@
-import { chat, uploadImage } from "@yume/ai-client";
+import { chat } from "@yume/ai-client";
import type {
Character,
EngineConfig,
@@ -29,7 +29,7 @@ import { INSERT_BEAT_SYSTEM, buildInsertBeatUserMessage } from "./prompts";
// │
// ├─ CharacterDesigner LLM × N (parallel per new char)
// │ │
-// │ ├─ portrait gen + upload (parallel within agent)
+// │ ├─ portrait gen (Runware returns URL + UUID in one call)
// │ └─ voice provisioning (parallel within agent)
// │
// ├─ Cinematographer LLM (parallel with all of the above)
@@ -37,13 +37,11 @@ import { INSERT_BEAT_SYSTEM, buildInsertBeatUserMessage } from "./prompts";
// └─ wait for all parallel branches
// │
// ▼
-// Painter (FLUX referenceImages — two-tier degradation chain)
+// Painter — generateImage with referenceImages (UUID/URL refs only;
+// no base64 to upload, since outputType=URL gives both back)
// │
// ▼
-// upload final scene image → Scene.imageUuid
-// │
-// ▼
-// return { scene, sceneImageBase64, characters }
+// return { scene, sceneImageUrl, characters }
//
// The Cinematographer intentionally does NOT depend on CharacterDesigner
// output — it only positions named characters in the frame, not their
@@ -80,7 +78,7 @@ export function mergeCharacters(
...u,
voice: u.voice ?? prev.voice,
visualDescription: u.visualDescription ?? prev.visualDescription,
- basePortraitBase64: u.basePortraitBase64 ?? prev.basePortraitBase64,
+ basePortraitUrl: u.basePortraitUrl ?? prev.basePortraitUrl,
basePortraitUuid: u.basePortraitUuid ?? prev.basePortraitUuid,
voiceDescription: u.voiceDescription || prev.voiceDescription,
});
@@ -92,27 +90,22 @@ export function mergeCharacters(
// scene — used by the Painter as one of the `referenceImages` (NOT as a
// seedImage, because FLUX.2 [klein] 9B KV does not support seedImage).
//
-// Returns the UUID if available (cheap reference, ~36 chars over the wire),
-// else the base64 of the most recent matching scene's image. Returns
-// undefined when no prior scene shares the current sceneKey.
+// Prefer URL over UUID for the same reason painter.collectReferenceImages
+// does: the UUID returned by `imageInference` isn't always recognized by
+// Runware's `referenceImages` pipeline, surfacing as `failedToTransferImage`.
+// The URL is Runware's own CDN link — they can always fetch it. UUID is kept
+// as a backstop. Returns undefined when no prior scene shares the sceneKey.
function pickPriorSceneReference(
session: Session,
currentSceneKey: string | undefined,
- priorImageBase64ByUuid: Map,
): { priorSceneReference?: string; priorSceneKey?: string } {
if (!currentSceneKey) return {};
for (let i = session.history.length - 1; i >= 0; i--) {
const prior = session.history[i]!.scene;
if (prior.sceneKey === currentSceneKey) {
- if (prior.imageUuid) {
- return {
- priorSceneReference: prior.imageUuid,
- priorSceneKey: prior.sceneKey,
- };
- }
- const cached = priorImageBase64ByUuid.get(prior.id);
- if (cached) {
- return { priorSceneReference: cached, priorSceneKey: prior.sceneKey };
+ const ref = prior.imageUrl ?? prior.imageUuid;
+ if (ref) {
+ return { priorSceneReference: ref, priorSceneKey: prior.sceneKey };
}
}
}
@@ -121,25 +114,18 @@ function pickPriorSceneReference(
export type SceneResult = {
scene: Scene;
- sceneImageBase64: string;
+ sceneImageUrl: string;
characters: Character[];
};
// ──────────────────────────────────────────────────────────────────────
// directScene — the multi-agent pipeline. Used by orchestrator's
// startSession and requestScene.
-//
-// priorImageBase64ByUuid: optional map from prior Scene.id → base64
-// the caller has on-hand. If a sceneKey-hit scene's imageUuid is missing
-// but the base64 is cached locally, we can still feed it as one of the
-// Painter's referenceImages. Pass an empty map when caller has no cache
-// (orchestrator does pass it for the start-session bootstrap).
// ──────────────────────────────────────────────────────────────────────
export async function directScene(
config: EngineConfig,
session: Session,
- priorImageBase64ByUuid: Map = new Map(),
): Promise {
const tTotal = Date.now();
@@ -168,7 +154,6 @@ export async function directScene(
const { priorSceneReference, priorSceneKey } = pickPriorSceneReference(
session,
writerOut.sceneKey,
- priorImageBase64ByUuid,
);
// Stage 2 — parallel: CharacterDesigner(s) and Cinematographer.
@@ -237,7 +222,7 @@ export async function directScene(
);
const tPainter = Date.now();
- const sceneImageBase64 = await runPainter(
+ const painted = await runPainter(
config,
{
integratedPrompt: cinemaOut.integratedPrompt,
@@ -249,22 +234,6 @@ export async function directScene(
);
tlog("[directScene] Painter", tPainter);
- // Stage 4 — best-effort upload of the final scene image so the NEXT
- // sceneKey-match call can reference its UUID instead of carrying base64.
- // If upload fails, the scene still works; only loses cheap referencing
- // on the next hop. Don't wait on mock images (static placeholder).
- let imageUuid: string | undefined;
- if (!config.mockImage) {
- try {
- const tUpload = Date.now();
- imageUuid = await uploadImage(config.image, sceneImageBase64);
- tlog("[directScene] image upload", tUpload);
- } catch (err) {
- const msg = err instanceof Error ? err.message : String(err);
- console.warn(`[directScene] scene image upload failed: ${msg} — sceneKey reuse will need base64 fallback`);
- }
- }
-
const scene: Scene = {
id: newSceneId(),
// scenePrompt is the cinematographer's English compositional output;
@@ -276,12 +245,13 @@ export async function directScene(
beats: writerOut.beats,
entryBeatId: writerOut.entryBeatId,
sceneKey: writerOut.sceneKey,
- imageUuid,
+ imageUuid: painted.kind === "real" ? painted.imageUuid : undefined,
+ imageUrl: painted.imageUrl,
};
tlog("[directScene] TOTAL", tTotal);
- return { scene, sceneImageBase64, characters };
+ return { scene, sceneImageUrl: painted.imageUrl, characters };
}
// ──────────────────────────────────────────────────────────────────────
diff --git a/packages/engine/src/jsonParser.ts b/packages/engine/src/jsonParser.ts
index 20130fc..68d9de9 100644
--- a/packages/engine/src/jsonParser.ts
+++ b/packages/engine/src/jsonParser.ts
@@ -1,13 +1,44 @@
+import { jsonrepair, JSONRepairError } from "jsonrepair";
+
// Strict-then-forgiving JSON parser for LLM output. Tries in order:
// 1. Direct JSON.parse on the trimmed text.
// 2. Extract from ```json``` fenced block.
// 3. Slice between first { and last } and parse.
-// 4. Apply best-effort regex repair (trailing commas, missing commas
-// between adjacent values) and try again.
+// 4. Apply targeted regex pre-repairs (see preRepair) and try jsonrepair.
//
-// On final failure, logs the first 800 chars of the raw model output so we
-// can see what the LLM did wrong (the standard error message only shows
-// the position, not the surrounding context).
+// On final failure, logs the FULL raw model output so we can diagnose the
+// actual syntax error.
+//
+// jsonrepair (npm package josdejong/jsonrepair — 2.3k+ stars) handles the
+// broad LLM-output failure modes: truncated JSON, missing commas/brackets,
+// single quotes, Python None/True/False, JS comments. We layer a small set
+// of targeted pre-repairs in front of it for failure modes jsonrepair can't
+// disambiguate on its own (see preRepair).
+
+// ──────────────────────────────────────────────────────────────────────
+// preRepair — fix specific LLM error patterns before handing to jsonrepair.
+//
+// Pattern 1: missing closing quote on a key.
+// Broken: "lineDelivery: "语速稍快...",
+// Correct: "lineDelivery": "语速稍快...",
+//
+// jsonrepair fails on this because it's ambiguous — "lineDelivery: " could
+// be a complete string value, leaving "语速稍快..." as a syntax error. But
+// if we see "<key>:<whitespace>" we know structurally it should be
+// a key-colon-value triplet.
+//
+// Match constraints:
+// - The key match excludes " \n : so we can't overrun into adjacent
+// fields or absorb the colon as part of the key name.
+// - The colon must be followed by whitespace and another " (the value
+// string's opening quote). This is what disambiguates from a value
+// string that happens to contain a colon.
+// ──────────────────────────────────────────────────────────────────────
+
+function preRepair(s: string): string {
+ return s.replace(/"([^"\n:]+):(\s+)"/g, '"$1":$2"');
+}
+
export function parseJsonLoose(raw: string): T {
const trimmed = raw.trim();
@@ -28,54 +59,36 @@ export function parseJsonLoose(raw: string): T {
const first = trimmed.indexOf("{");
const last = trimmed.lastIndexOf("}");
- if (first !== -1 && last > first) {
- const slice = trimmed.slice(first, last + 1);
- try {
- return JSON.parse(slice) as T;
- } catch {
- // Last resort: try repairing common LLM-output malformations.
- const repaired = repairJsonString(slice);
+ const slice =
+ first !== -1 && last > first ? trimmed.slice(first, last + 1) : trimmed;
+
+ // Try the brace-sliced version first; if there were no braces at all
+ // (slice === trimmed), this is just a second attempt at the raw text.
+ try {
+ return JSON.parse(slice) as T;
+ } catch {
+ // Targeted pre-repair (no-op on already-valid JSON) → jsonrepair.
+ const prefixed = preRepair(slice);
+
+ // If preRepair changed something, give the cheap path another shot —
+ // the input might already be valid now without needing jsonrepair.
+ if (prefixed !== slice) {
try {
- return JSON.parse(repaired) as T;
- } catch (err) {
- console.error(
- `[parseJsonLoose] all strategies failed. Raw output (first 800 chars):\n${raw.slice(0, 800)}`,
- );
- throw err;
+ return JSON.parse(prefixed) as T;
+ } catch {
+ // fall through to jsonrepair
}
}
+
+ try {
+ const repaired = jsonrepair(prefixed);
+ return JSON.parse(repaired) as T;
+ } catch (err) {
+ const isRepairErr = err instanceof JSONRepairError;
+ console.error(
+ `[parseJsonLoose] jsonrepair ${isRepairErr ? "could not repair" : "succeeded but JSON.parse rejected its output"}. Full raw model output:\n${raw}`,
+ );
+ throw err;
+ }
}
-
- console.error(
- `[parseJsonLoose] no { ... } found. Raw output (first 800 chars):\n${raw.slice(0, 800)}`,
- );
- throw new Error(`Failed to parse JSON from model output: ${raw.slice(0, 200)}`);
-}
-
-// Best-effort repair of LLM-typical JSON syntax errors. Targeted at the two
-// most common failures we see in practice:
-// 1. Trailing comma before } or ].
-// 2. Missing comma between two adjacent JSON values (the specific error
-// mode we hit at position 3390).
-//
-// Deliberately conservative — does NOT try to fix unclosed strings,
-// unbalanced braces, or strip JS-style comments. The comment-stripping
-// path was previously included but would corrupt JSON string values
-// containing `//` (e.g. URLs like "https://example.com"); since LLMs in
-// `responseFormat: "json_object"` mode essentially never emit comments,
-// dropping that step is a net win for safety.
-function repairJsonString(s: string): string {
- return s
- // 1. Strip trailing commas before } or ].
- .replace(/,(\s*[}\]])/g, "$1")
- // 2. Insert missing commas between two adjacent JSON values. The cases:
- // } { → },{ ] [ → ],[ } [ → },[ ] { → ],{
- // "string" "key" "string" { "string" [
- // number then "key" / { / [
- //
- // The regex looks for a closing token (} ] " or a digit) followed by
- // a newline and an opening token (} ] " a letter), and inserts a
- // comma between them. Requires the newline (\s*\n\s*) so it only
- // fires across line boundaries, never within a single-line value.
- .replace(/(\}|\]|"|\d)(\s*\n\s*)(\{|\[|")/g, "$1,$2$3");
}
diff --git a/packages/engine/src/mockImage.ts b/packages/engine/src/mockImage.ts
index 3d3b2ae..fcc0d5c 100644
--- a/packages/engine/src/mockImage.ts
+++ b/packages/engine/src/mockImage.ts
@@ -1,11 +1,15 @@
import sharp from "sharp";
-let cached: string | undefined;
+let cachedDataUri: string | undefined;
// A static 16:9 placeholder used when MOCK_IMAGE=true, so we can exercise the
// TTS path without paying for image generation. Generated once, then memoized.
-export async function mockImageBase64(): Promise<string> {
- if (cached) return cached;
+// Returned as a data URI so the rest of the pipeline can treat it as an
+// `imageUrl` interchangeably with real Runware URLs (the client's `<img>`
+// accepts both, and we never feed a mock image to Runware's referenceImages
+// because mockImage mode short-circuits the Painter entirely).
+export async function mockImageDataUri(): Promise<string> {
+ if (cachedDataUri) return cachedDataUri;
const W = 1792;
const H = 1024;
@@ -20,6 +24,6 @@ export async function mockImageBase64(): Promise<string> {
`;
const png = await sharp(Buffer.from(svg)).png().toBuffer();
- cached = png.toString("base64");
- return cached;
+ cachedDataUri = `data:image/png;base64,${png.toString("base64")}`;
+ return cachedDataUri;
}
diff --git a/packages/engine/src/orchestrator.ts b/packages/engine/src/orchestrator.ts
index 87a8e6a..7813d27 100644
--- a/packages/engine/src/orchestrator.ts
+++ b/packages/engine/src/orchestrator.ts
@@ -49,14 +49,14 @@ export async function startSession(
characters: [],
};
- const { scene, sceneImageBase64, characters } = await directScene(config, session);
+ const { scene, sceneImageUrl, characters } = await directScene(config, session);
tlog("[start] TOTAL", tTotal);
return {
sessionId: session.id,
scene,
- imageBase64: sceneImageBase64,
+ imageUrl: sceneImageUrl,
characters,
};
}
@@ -71,7 +71,7 @@ export async function requestScene(
): Promise {
const tTotal = Date.now();
- const { scene, sceneImageBase64, characters } = await directScene(
+ const { scene, sceneImageUrl, characters } = await directScene(
config,
req.session,
);
@@ -80,7 +80,7 @@ export async function requestScene(
return {
scene,
- imageBase64: sceneImageBase64,
+ imageUrl: sceneImageUrl,
characters,
};
}
@@ -95,7 +95,7 @@ export async function visionDecide(
config: EngineConfig,
req: VisionRequest,
): Promise {
- const annotated = await annotateClick(req.prevImageBase64, req.click);
+ const annotated = await annotateClick(req.prevImageUrl, req.click);
const current = req.session.history.at(-1)?.scene ?? null;
return interpret(config.vision, annotated, current);
}
diff --git a/packages/types/src/index.ts b/packages/types/src/index.ts
index 01be754..e98503f 100644
--- a/packages/types/src/index.ts
+++ b/packages/types/src/index.ts
@@ -56,17 +56,24 @@ export type Scene = {
* e.g. "classroom-dusk", "rooftop-night". When the next Scene shares this
* key, the Painter slots the previous Scene's image into Runware's
* `referenceImages` (alongside character portraits) so the same physical
- * space stays visually consistent across cuts. (Originally planned as a
- * seedImage / img2img anchor, but FLUX.2 [klein] 9B KV does not support
- * seedImage — referenceImages serves the same purpose with the model.)
+ * space stays visually consistent across cuts.
*/
sceneKey?: string;
/**
- * Runware UUID of this Scene's generated image — once uploaded, subsequent
- * Scenes that match sceneKey can reference it via `referenceImages`
- * without resending base64.
+ * Runware UUID of this Scene's generated image. Cheapest form to send back
+ * to Runware's `referenceImages` in subsequent calls (transport cost, lowest
+ * to highest: UUID, URL, base64). Not shown to the client — `imageUrl` is what renders.
*/
imageUuid?: string;
+ /**
+ * Public CDN URL of this Scene's generated image. Returned to the client for
+ * `<img>` rendering, and is what the client passes back to `/api/vision`
+ * as `prevImageUrl` so the server can re-fetch the bytes for click annotation.
+ *
+ * For MOCK_IMAGE=true this is a `data:image/png;base64,...` data URI, not a
+ * Runware URL — the client renders both forms transparently.
+ */
+ imageUrl?: string;
};
export type SceneExit =
@@ -111,17 +118,17 @@ export type Character = {
*/
visualDescription?: string;
/**
- * Base portrait image generated by the CharacterDesigner once, then reused
- * as a Runware `referenceImages` entry in every subsequent scene the
- * character appears in. Stored as base64 for client display.
- */
- basePortraitBase64?: string;
- /**
- * Runware UUID for the base portrait. Once uploaded via the image-upload
- * endpoint, subsequent Painter calls reference this UUID instead of
- * resending the full base64 payload.
+ * Runware UUID for the base portrait. Generated by the CharacterDesigner
+ * once, reused as a `referenceImages` entry on every subsequent scene the
+ * character appears in. UUID is the cheapest reference form for Runware.
*/
basePortraitUuid?: string;
+ /**
+ * Public CDN URL for the base portrait. Same image as `basePortraitUuid`;
+ * kept around for the client (if it ever wants to render character cards)
+ * and as a fallback reference form for `referenceImages` when UUID is absent.
+ */
+ basePortraitUrl?: string;
/** Xiaomi MiMo voice reference audio. */
voice?: CharacterVoice;
};
@@ -196,7 +203,8 @@ export type StartRequest = {
export type StartResponse = {
sessionId: string;
scene: Scene;
- imageBase64: string;
+ /** Public CDN URL (or data URI in MOCK_IMAGE mode) for the rendered scene background. */
+ imageUrl: string;
/** Character registry with voice references + visual cards provisioned. */
characters: Character[];
};
@@ -210,7 +218,8 @@ export type SceneRequest = {
export type SceneResponse = {
scene: Scene;
- imageBase64: string;
+ /** Public CDN URL (or data URI in MOCK_IMAGE mode) for the rendered scene background. */
+ imageUrl: string;
characters: Character[];
};
@@ -235,7 +244,12 @@ export type BeatAudioResponse = {
// trigger a scene change.
export type VisionRequest = {
session: Session;
- prevImageBase64: string;
+ /**
+ * Public CDN URL (or data URI in MOCK_IMAGE mode) of the scene the player
+ * just clicked. The server re-fetches the bytes to annotate the click and
+ * pass an OpenAI-compatible image_url to the vision LLM.
+ */
+ prevImageUrl: string;
click: { x: number; y: number };
};
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 483ebce..8607276 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -75,6 +75,9 @@ importers:
'@yume/types':
specifier: workspace:*
version: link:../types
+ jsonrepair:
+ specifier: ^3.14.0
+ version: 3.14.0
sharp:
specifier: ^0.33.5
version: 0.33.5
@@ -594,6 +597,10 @@ packages:
resolution: {integrity: sha512-/imKNG4EbWNrVjoNC/1H5/9GFy+tqjGBHCaSsN+P2RnPqjsLmv6UD3Ej+Kj8nBWaRAwyk7kK5ZUc+OEatnTR3A==}
hasBin: true
+ jsonrepair@3.14.0:
+ resolution: {integrity: sha512-tWPGKMZf/8UPim+fcW2EfcQ/d/7aKUrP6IECz9G3Tu6Q5dX0orSleqJ9z6sSw7qrQkjF8/Edo4DvsWBZ8H+HNg==}
+ hasBin: true
+
lilconfig@3.1.3:
resolution: {integrity: sha512-/vlFKAoH5Cgt3Ie+JLhRbwOsCQePABiU3tJ1egGvyQ+33R/vcwM2Zl2QR/LzjsBeItPt3oSVXapn+m4nQDvpzw==}
engines: {node: '>=14'}
@@ -1240,6 +1247,8 @@ snapshots:
jiti@1.21.7: {}
+ jsonrepair@3.14.0: {}
+
lilconfig@3.1.3: {}
lines-and-columns@1.2.4: {}
diff --git a/vercel.json b/vercel.json
deleted file mode 100644
index 5af8dcf..0000000
--- a/vercel.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
- "$schema": "https://openapi.vercel.sh/vercel.json",
- "framework": "nextjs",
- "buildCommand": "pnpm build",
- "installCommand": "pnpm install",
- "functions": {
- "apps/web/app/api/interact/route.ts": { "maxDuration": 60 },
- "apps/web/app/api/vision/route.ts": { "maxDuration": 60 },
- "apps/web/app/api/start/route.ts": { "maxDuration": 60 }
- }
-}