From addbede9291cac2cdd6636cfa2d62352a007b337 Mon Sep 17 00:00:00 2001 From: yuanzonghao Date: Mon, 1 Jun 2026 16:04:13 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20Vercel=20Hobby=20deploy=20readiness=20?= =?UTF-8?q?=E2=80=94=20image=20URLs,=20jsonrepair,=20DeepSeek?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Move vercel.json to apps/web/ with correct route paths; cap scene route maxDuration 120→60s for Hobby. Root vercel.json removed. Vercel project's Root Directory must be set to apps/web (Deploy button URL passes this). - Switch image transport from base64-in-JSON to Runware-hosted URLs: generateImage now uses outputType=URL and returns {imageUrl, imageUuid}; StartResponse/SceneResponse carry imageUrl; VisionRequest carries prevImageUrl (server re-fetches the bytes for click annotation). This eliminates the 4.5MB serverless body-size risk. - Painter and director prefer URL over UUID for referenceImages — the UUID returned by Runware imageInference isn't always recognized in the refs pipeline (surfaces as `failedToTransferImage`). - Client preloads scene images via `new Image().decode()` before committing to React state, so URL transitions render instantly; prefetched scenes also warm the HTTP cache. - jsonParser uses the jsonrepair package (replaces hand-rolled repair) and adds a targeted preRepair regex for the missing-key-close-quote pattern that jsonrepair couldn't disambiguate. Full raw model output dumped on failure for diagnostic visibility. - Default text provider switched to DeepSeek v4-flash via direct API (significantly more stable JSON than MiMo v2.5-pro). VISION/TTS stay on MiMo (DeepSeek has no multimodal / TTS offerings). - next.config: drop dead experimental.serverActions.bodySizeLimit (no server actions used). - README: real Deploy button URL (zonghaoyuan/yume + root-directory=apps/web + TTS/MOCK_IMAGE in env list); refreshed env vars table with optional TTS section. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 32 +++-- apps/web/.env.example | 22 ++-- apps/web/app/api/scene/route.ts | 6 +- apps/web/app/api/vision/route.ts | 4 +- apps/web/app/play/page.tsx | 71 ++++++++-- apps/web/components/PlayCanvas.tsx | 14 +- apps/web/next.config.ts | 5 - apps/web/vercel.json | 11 ++ packages/ai-client/src/image.ts | 122 ++++++------------ packages/ai-client/src/index.ts | 4 +- packages/engine/package.json | 1 + .../engine/src/agents/characterDesigner.ts | 70 ++++------ packages/engine/src/agents/painter.ts | 40 ++++-- packages/engine/src/annotate.ts | 38 +++++- packages/engine/src/director.ts | 68 +++------- packages/engine/src/jsonParser.ts | 115 +++++++++-------- packages/engine/src/mockImage.ts | 14 +- packages/engine/src/orchestrator.ts | 10 +- packages/types/src/index.ts | 50 ++++--- pnpm-lock.yaml | 9 ++ vercel.json | 11 -- 21 files changed, 392 insertions(+), 325 deletions(-) create mode 100644 apps/web/vercel.json delete mode 100644 vercel.json diff --git a/README.md b/README.md index 5b8bc23..6f65317 100644 --- a/README.md +++ b/README.md @@ -37,21 +37,28 @@ There is no traditional game UI baked into the art. 
The AI paints the world in w ## One-click deploy -[![Deploy with Vercel](https://vercel.com/button)](https://vercel.com/new/clone?repository-url=https://github.com/YOUR_USERNAME/yume&env=TEXT_BASE_URL,TEXT_API_KEY,TEXT_MODEL,IMAGE_BASE_URL,IMAGE_API_KEY,IMAGE_MODEL,VISION_BASE_URL,VISION_API_KEY,VISION_MODEL&envDescription=Three%20independently%20configurable%20providers.%20Any%20OpenAI-compatible%20endpoint%20works.&envLink=https://github.com/YOUR_USERNAME/yume%23environment-variables) +[![Deploy with Vercel](https://vercel.com/button)](https://vercel.com/new/clone?repository-url=https://github.com/zonghaoyuan/yume&root-directory=apps/web&env=TEXT_BASE_URL,TEXT_API_KEY,TEXT_MODEL,IMAGE_BASE_URL,IMAGE_API_KEY,IMAGE_MODEL,VISION_BASE_URL,VISION_API_KEY,VISION_MODEL,TTS_BASE_URL,TTS_API_KEY,TTS_SPEECH_MODEL,MOCK_IMAGE&envDescription=Three%20required%20providers%20%2B%20optional%20TTS.%20Any%20OpenAI-compatible%20endpoint%20works%20for%20text%2Fvision%2Ftts.&envLink=https://github.com/zonghaoyuan/yume%23environment-variables) -After deploy, set the nine environment variables (see below) in your Vercel project. That's it. +After deploy, set the environment variables (see below) in your Vercel project. Nine are required; TTS is optional (leave blank to run silently); `MOCK_IMAGE=true` skips image generation for cheap TTS-only testing. The Vercel project's **Root Directory** must be set to `apps/web` (the deploy button passes this; if you configure manually, set it in Project Settings). --- ## Environment variables -Three providers, all independently configurable. Text and Vision accept any OpenAI-compatible endpoint (OpenAI, Anthropic via OpenAI-compat proxy, Gemini, OpenRouter, DeepSeek, local Ollama, …). Image goes to **Runware** (its own task-array protocol, not OpenAI-compatible). +Three required providers + optional TTS. 
Text, Vision, and TTS accept any OpenAI-compatible endpoint (OpenAI, Anthropic via OpenAI-compat proxy, Gemini, OpenRouter, DeepSeek, local Ollama, …). Image goes to **Runware** (its own task-array protocol, not OpenAI-compatible). -| Provider | Variables | Recommended | -|---|---|---| -| Text · story director | `TEXT_BASE_URL` `TEXT_API_KEY` `TEXT_MODEL` | `claude-opus-4-7` via Anthropic | -| Image · UI renderer | `IMAGE_BASE_URL` `IMAGE_API_KEY` `IMAGE_MODEL` | `runware:400@6` (FLUX.2 [klein] 9B KV) via [Runware](https://runware.ai) | -| Vision · click reader | `VISION_BASE_URL` `VISION_API_KEY` `VISION_MODEL` | `gemini-3-flash` via Google | +| Provider | Variables | Required? | Recommended | +|---|---|---|---| +| Text · story director | `TEXT_BASE_URL` `TEXT_API_KEY` `TEXT_MODEL` | ✅ | `claude-opus-4-7` via Anthropic | +| Image · UI renderer | `IMAGE_BASE_URL` `IMAGE_API_KEY` `IMAGE_MODEL` | ✅ | `runware:400@6` (FLUX.2 [klein] 9B KV) via [Runware](https://runware.ai) | +| Vision · click reader | `VISION_BASE_URL` `VISION_API_KEY` `VISION_MODEL` | ✅ | `gemini-3-flash` via Google | +| TTS · per-character voice | `TTS_BASE_URL` `TTS_API_KEY` `TTS_SPEECH_MODEL` | optional — leave blank to run silently | `mimo-v2.5-tts` via Xiaomi MiMo | + +There's also a flag for cheap testing: + +| Variable | Effect | +|---|---| +| `MOCK_IMAGE=true` | Skip image generation; the renderer returns a static placeholder. Story, voice, and choices still run normally. Great for iterating on TTS without burning Runware credits. | See `apps/web/.env.example` for the exact shape. @@ -64,7 +71,7 @@ Requires Node 20+ and pnpm 9+. 
```bash pnpm install cp apps/web/.env.example apps/web/.env.local -# fill in the nine env vars +# fill in env vars (9 required + optional TTS/MOCK_IMAGE) pnpm dev # open http://localhost:3000 ``` @@ -75,11 +82,12 @@ pnpm dev ``` yume/ -├── apps/web/ Next.js 16 app — pages + API routes +├── apps/web/ Next.js 16 app — pages + API routes (Vercel root) └── packages/ ├── types/ shared TypeScript types - ├── ai-client/ unified OpenAI-compatible clients - └── engine/ three-stage AI orchestration (open core) + ├── ai-client/ unified OpenAI-compatible clients + Runware adapter + ├── tts-client/ Xiaomi MiMo TTS adapter + └── engine/ multi-agent AI orchestration (open core) ``` `packages/engine` is the open core — pure TS, no Next.js or browser dependency. Import it directly to build your own visual-novel front-end (Tauri, Electron, CLI, anywhere). diff --git a/apps/web/.env.example b/apps/web/.env.example index 20b7700..6cff139 100644 --- a/apps/web/.env.example +++ b/apps/web/.env.example @@ -12,12 +12,18 @@ # ============================================================= # ---- 1. Text LLM · scene director ---------------------------------- -# Recommended: MiMo V2.5 Pro (1M context, native JSON-mode, strong CN) -# Token Plan host: https://token-plan-sgp.xiaomimimo.com/v1 -# Pay-as-you-go host: https://api.xiaomimimo.com/v1 (sk- keys) -TEXT_BASE_URL=https://token-plan-sgp.xiaomimimo.com/v1 -TEXT_API_KEY=tp-xxx -TEXT_MODEL=mimo-v2.5-pro +# Any OpenAI-compatible endpoint works: OpenAI, Anthropic (via proxy), +# Gemini, OpenRouter, DeepSeek, OpenCode, MiMo, local Ollama, … +# Recommended starters: +# A. DeepSeek v4-flash direct (https://api.deepseek.com/v1) — pay-as-you-go, +# fastest first-token latency, very stable JSON output. +# B. OpenCode Go (https://opencode.ai/zen/go/v1) — $10/mo flat-rate bundle of +# 12 open-source models (DeepSeek v4-flash, Qwen, Kimi, GLM, MiMo, …). +# Cheaper at high volume, slower at the tail. +# C. 
MiMo v2.5 via Xiaomi Token Plan — bundles VISION + TTS in one tp- key. +TEXT_BASE_URL=https://api.deepseek.com/v1 +TEXT_API_KEY=sk-xxx +TEXT_MODEL=deepseek-v4-flash # ---- 2. Image generator (renders the scene background) ------------- # Recommended: Runware + FLUX.2 [klein] 9B KV — distilled 4-step model, @@ -30,9 +36,7 @@ IMAGE_API_KEY=runware-xxx IMAGE_MODEL=runware:400@6 # ---- 3. Vision model · multimodal click interpretation ------------- -# Recommended: MiMo V2.5 omni — multimodal. -# ⚠️ DO NOT use mimo-v2.5-pro for this slot — Pro is text-only and -# rejects image_url content parts. +# Recommended: MiMo V2.5 — multimodal, accepts image_url content parts. VISION_BASE_URL=https://token-plan-sgp.xiaomimimo.com/v1 VISION_API_KEY=tp-xxx VISION_MODEL=mimo-v2.5 diff --git a/apps/web/app/api/scene/route.ts b/apps/web/app/api/scene/route.ts index bcec19b..641e173 100644 --- a/apps/web/app/api/scene/route.ts +++ b/apps/web/app/api/scene/route.ts @@ -4,7 +4,11 @@ import { NextResponse } from "next/server"; import { loadEngineConfig } from "@/lib/config"; export const runtime = "nodejs"; -export const maxDuration = 120; +// Capped at 60 for Vercel Hobby (300 allowed on Pro). The scene pipeline is +// Writer + CharDesigner×N + Cinematographer + Painter — happy path 9–12s; the +// tail (cold provider, multiple new characters) can push 30–45s, so 60 is a +// reasonable headroom on Hobby. 
+export const maxDuration = 60; export async function POST(req: Request) { let body: SceneRequest; diff --git a/apps/web/app/api/vision/route.ts b/apps/web/app/api/vision/route.ts index 81d0487..d093209 100644 --- a/apps/web/app/api/vision/route.ts +++ b/apps/web/app/api/vision/route.ts @@ -14,9 +14,9 @@ export async function POST(req: Request) { return NextResponse.json({ error: "Invalid JSON" }, { status: 400 }); } - if (!body.session || !body.prevImageBase64 || !body.click) { + if (!body.session || !body.prevImageUrl || !body.click) { return NextResponse.json( - { error: "session, prevImageBase64, click are required" }, + { error: "session, prevImageUrl, click are required" }, { status: 400 }, ); } diff --git a/apps/web/app/play/page.tsx b/apps/web/app/play/page.tsx index b71f21e..88b0308 100644 --- a/apps/web/app/play/page.tsx +++ b/apps/web/app/play/page.tsx @@ -28,6 +28,42 @@ import type { const MUTED_STORAGE_KEY = "yume:muted"; +// Cap how long we wait for the browser to download + decode a scene image +// before giving up and rendering anyway. Runware's CDN is normally <2s for a +// 1792×1024 PNG; tolerate up to 8s before the typewriter starts so a slow +// download can't strand the player on a blank screen forever. +const IMAGE_PRELOAD_TIMEOUT_MS = 8000; + +// ────────────────────────────────────────────────────────────────────── +// Image preload — decode the Runware URL in memory before committing to +// React state, so when the mounts, the browser cache is warm and +// rendering is instant. Without this the user sees a blank canvas during +// the Runware-CDN download (~1-3s) after /api/scene returns. +// +// Data URIs (MOCK_IMAGE mode) and prefetched-then-cached real URLs both +// resolve fast / instantly. Errors and timeouts resolve quietly — better +// to render a broken-image than to hang the play loop indefinitely. 
+// ────────────────────────────────────────────────────────────────────── + +function preloadImage(url: string): Promise { + return new Promise((resolve) => { + const img = new Image(); + const done = () => resolve(); + const timer = setTimeout(done, IMAGE_PRELOAD_TIMEOUT_MS); + img.onload = () => { + clearTimeout(timer); + // .decode() forces the bitmap to be fully decoded before we proceed — + // without it, a slow decode could still cause a flash on first paint. + img.decode().then(done, done); + }; + img.onerror = () => { + clearTimeout(timer); + done(); + }; + img.src = url; + }); +} + // ────────────────────────────────────────────────────────────────────── // Prefetch pool — speculative SceneResponses keyed by choice path. // @@ -123,6 +159,12 @@ function prefetchScenePath( } const data = (await res.json()) as SceneResponse; + // Warm the browser's HTTP + image-decode cache for this URL so when the + // player eventually picks this choice and we render the , it's + // instant. Don't await — let the bytes stream in the background; the + // transition path will await its own preloadImage() before committing. + void preloadImage(data.imageUrl); + // Recursive: if the resulting scene has exactly one change-scene exit, // it is a must-pass node — prefetch its child too. 
if (depth + 1 < PREFETCH_MAX_DEPTH) { @@ -193,7 +235,7 @@ function PlayInner() { const [session, setSession] = useState(null); const [currentScene, setCurrentScene] = useState(null); const [currentBeatId, setCurrentBeatId] = useState(null); - const [imageBase64, setImageBase64] = useState(null); + const [imageUrl, setImageUrl] = useState(null); const [beatAudioMap, setBeatAudioMap] = useState>({}); // Lazy-initialize from localStorage so PlayCanvas never mounts with the // wrong muted value (an effect-based read would briefly let audio play @@ -434,7 +476,12 @@ function PlayInner() { } return (await r.json()) as StartResponse; }) - .then((data) => { + .then(async (data) => { + // Decode the Runware image in memory before committing to state, so + // the renders instantly when it mounts (same rationale as the + // performSceneTransition path). + await preloadImage(data.imageUrl); + const initial: Session = { id: data.sessionId, createdAt: Date.now(), @@ -452,7 +499,7 @@ function PlayInner() { setSession(initial); setCurrentScene(data.scene); setCurrentBeatId(data.scene.entryBeatId); - setImageBase64(data.imageBase64); + setImageUrl(data.imageUrl); // beatAudioMap is populated lazily by the per-beat fetch effect once // currentScene becomes non-null (see fetchBeatAudio). setPhase("ready"); @@ -520,6 +567,14 @@ function PlayInner() { const base = sessionRef.current; if (!base) throw new Error("Session lost mid-transition"); + // Wait for the browser to download + decode the Runware-hosted image + // BEFORE committing it to state, so the renders instantly when it + // mounts. For prefetched scenes the preloadImage call inside + // prefetchScenePath has already warmed the cache, so this resolves + // almost immediately. For cold transitions we trade an extra ~1-3s of + // "transitioning" overlay for an image-pop-in-from-blank flash. + await preloadImage(result.imageUrl); + const closedHistory = base.history.map((h, i, arr) => i === arr.length - 1 ? 
{ ...h, visitedBeatIds: visitedForCurrent, exit } @@ -540,7 +595,7 @@ function PlayInner() { setSession(newSession); setCurrentScene(result.scene); setCurrentBeatId(result.scene.entryBeatId); - setImageBase64(result.imageBase64); + setImageUrl(result.imageUrl); // beatAudioMap reset + per-beat fetches kicked off by the scene effect. setLastExitLabel(exitLabel); setPhase("ready"); @@ -607,7 +662,7 @@ function PlayInner() { } async function onBackgroundClick(click: { x: number; y: number }) { - if (phase !== "ready" || !session || !currentScene || !imageBase64) return; + if (phase !== "ready" || !session || !currentScene || !imageUrl) return; setPhase("vision-thinking"); setPendingClick(click); @@ -615,7 +670,7 @@ function PlayInner() { const visionRes = await fetch("/api/vision", { method: "POST", headers: { "Content-Type": "application/json" }, - body: JSON.stringify({ session, prevImageBase64: imageBase64, click }), + body: JSON.stringify({ session, prevImageUrl: imageUrl, click }), }); if (!visionRes.ok) { const j = (await visionRes.json().catch(() => ({}))) as { @@ -763,7 +818,7 @@ function PlayInner() { return (
)} - {imageBase64 ? ( + {imageUrl ? (
- {/* Background image */} + {/* Background image — Runware CDN URL or data URI (mock mode) */} Generated scene { diff --git a/apps/web/next.config.ts b/apps/web/next.config.ts index f13a9bd..f178b6a 100644 --- a/apps/web/next.config.ts +++ b/apps/web/next.config.ts @@ -14,11 +14,6 @@ const config: NextConfig = { turbopack: { root: path.join(__dirname, "..", ".."), }, - experimental: { - serverActions: { - bodySizeLimit: "10mb", - }, - }, }; export default config; diff --git a/apps/web/vercel.json b/apps/web/vercel.json new file mode 100644 index 0000000..4ccec08 --- /dev/null +++ b/apps/web/vercel.json @@ -0,0 +1,11 @@ +{ + "$schema": "https://openapi.vercel.sh/vercel.json", + "framework": "nextjs", + "functions": { + "app/api/start/route.ts": { "maxDuration": 60 }, + "app/api/scene/route.ts": { "maxDuration": 60 }, + "app/api/vision/route.ts": { "maxDuration": 60 }, + "app/api/insert-beat/route.ts": { "maxDuration": 60 }, + "app/api/beat-audio/route.ts": { "maxDuration": 30 } + } +} diff --git a/packages/ai-client/src/image.ts b/packages/ai-client/src/image.ts index 4710e16..557a4a3 100644 --- a/packages/ai-client/src/image.ts +++ b/packages/ai-client/src/image.ts @@ -4,21 +4,23 @@ import { fetchWithRetry } from "./fetchWithRetry"; // Runware uses its own task-array protocol (not OpenAI-compatible). // POST with [{ taskType: "imageInference", ... }]; errors come // back as a 200 with `errors[]`, so we have to inspect the body either way. - -// FLUX img2img specifics: -// - strength < 0.8 has minimal-to-no visible effect on FLUX models (per -// Runware docs); we default to 0.85 which leaves room to deviate while -// still anchoring on the seed image's composition. 
-// - referenceImages caps at 4 per request; the FLUX.2 [klein] 9B KV model -// (runware:400@6) accelerates multi-reference inference by ~2.5× via its -// KV cache for reference latents (cached only WITHIN one inference run — -// not persisted across API calls, hence the upload-once-then-reference -// strategy below). +// +// referenceImages accepts UUIDs, public URLs, or base64. UUID is cheapest +// in transport cost; URL is next; base64 last resort. The FLUX.2 [klein] 9B +// KV variant (runware:400@6) accelerates multi-reference inference ~2.5× via +// its KV cache for reference latents (cached only within one inference run, +// not persisted across calls — hence the need to keep stable UUIDs/URLs for +// later reuse). +// +// We request outputType=URL so Runware persists the image and returns a CDN +// link the client can render directly. The same response also carries the +// image UUID, so we never need a separate uploadImage round-trip to anchor +// future referenceImages. const DEFAULT_IMG2IMG_STRENGTH = 0.85; const MAX_REFERENCE_IMAGES = 4; type RunwareImageResult = { - imageBase64Data?: string; + imageURL?: string; imageUUID?: string; }; type RunwareError = { @@ -33,32 +35,40 @@ type RunwareResponse = { export type GenerateImageOptions = { /** - * Reference image (UUID, plain base64, or data URI) to use as the - * img2img starting point. When set, FLUX preserves the seed image's - * composition and applies `strength` to allow deviation from it. - * Used for cross-scene visual continuity when sceneKey hits. + * Reference image (UUID, public URL, or base64) for img2img. When set, + * FLUX preserves the seed image's composition and applies `strength` to + * deviate. NOTE: FLUX.2 [klein] 9B KV does NOT support seedImage — use + * `referenceImages` for visual continuity instead. 
*/ seedImage?: string; /** - * Reference images (UUIDs or base64) to condition the generation on — - * typically character portraits to anchor identity / outfit / style - * across scenes. Runware caps at 4; we silently truncate beyond that. + * Reference images (UUIDs, URLs, or base64) to condition generation on — + * typically character portraits + the prior scene image. Runware caps at 4; + * we silently truncate beyond that. */ referenceImages?: string[]; /** 0–1, FLUX needs ≥ 0.8 to actually have an effect. */ strength?: number; }; +export type GenerateImageResult = { + /** Public CDN URL of the generated image (Runware-hosted). */ + imageUrl: string; + /** Stable UUID for cheap re-reference in later `referenceImages`. */ + imageUuid: string; +}; + // ────────────────────────────────────────────────────────────────────── -// generateImage — text-to-image (default) or img2img / multi-reference -// when seedImage / referenceImages are supplied. Returns base64. +// generateImage — text-to-image (default) or referenceImages-conditioned. +// Returns both the public URL (for client display + future references) +// and the UUID (cheapest reference form for subsequent calls). 
// ────────────────────────────────────────────────────────────────────── export async function generateImage( config: ProviderConfig, prompt: string, options?: GenerateImageOptions, -): Promise { +): Promise { const url = config.baseUrl.replace(/\/$/, ""); const task: Record = { @@ -71,8 +81,9 @@ export async function generateImage( steps: 4, CFGScale: 3.5, numberResults: 1, - outputType: "base64Data", + outputType: "URL", outputFormat: "PNG", + includeCost: false, }; if (options?.seedImage) { @@ -109,66 +120,11 @@ export async function generateImage( ); } - const b64 = json.data?.[0]?.imageBase64Data; - if (!b64) { - throw new Error(`No image in Runware response: ${text.slice(0, 300)}`); + const result = json.data?.[0]; + const imageUrl = result?.imageURL; + const imageUuid = result?.imageUUID; + if (!imageUrl || !imageUuid) { + throw new Error(`No image URL/UUID in Runware response: ${text.slice(0, 300)}`); } - return b64; -} - -// ────────────────────────────────────────────────────────────────────── -// uploadImage — registers a base64 image on Runware and returns its -// UUID, so subsequent generateImage calls can pass the UUID in -// referenceImages / seedImage instead of resending the base64 payload -// every time. Character base portraits and scene snapshots both flow -// through this path. -// -// Runware exposes the imageUpload taskType for exactly this purpose. -// Returns the UUID. Caller treats a thrown error as "fall back to -// sending base64 next time" — non-fatal. 
-// ────────────────────────────────────────────────────────────────────── - -export async function uploadImage( - config: ProviderConfig, - base64: string, -): Promise { - const url = config.baseUrl.replace(/\/$/, ""); - - const body = [ - { - taskType: "imageUpload", - taskUUID: crypto.randomUUID(), - image: `data:image/png;base64,${base64}`, - }, - ]; - - const res = await fetchWithRetry(url, { - method: "POST", - headers: { - "Content-Type": "application/json", - Authorization: `Bearer ${config.apiKey}`, - }, - body: JSON.stringify(body), - }); - - const text = await res.text(); - let json: RunwareResponse; - try { - json = JSON.parse(text) as RunwareResponse; - } catch { - throw new Error(`Image upload API error ${res.status}: ${text.slice(0, 500)}`); - } - - if (json.errors?.length) { - const e = json.errors[0]!; - throw new Error( - `Runware upload error [${e.code ?? "unknown"}]: ${e.message ?? "no message"}`, - ); - } - - const uuid = json.data?.[0]?.imageUUID; - if (!uuid) { - throw new Error(`No UUID in upload response: ${text.slice(0, 300)}`); - } - return uuid; + return { imageUrl, imageUuid }; } diff --git a/packages/ai-client/src/index.ts b/packages/ai-client/src/index.ts index 13fa290..0153e48 100644 --- a/packages/ai-client/src/index.ts +++ b/packages/ai-client/src/index.ts @@ -1,5 +1,5 @@ export { chat } from "./chat"; -export { generateImage, uploadImage } from "./image"; -export type { GenerateImageOptions } from "./image"; +export { generateImage } from "./image"; +export type { GenerateImageOptions, GenerateImageResult } from "./image"; export { interpretClick } from "./vision"; export type { ChatMessage } from "./chat"; diff --git a/packages/engine/package.json b/packages/engine/package.json index 1b51280..0ed11ab 100644 --- a/packages/engine/package.json +++ b/packages/engine/package.json @@ -15,6 +15,7 @@ "@yume/ai-client": "workspace:*", "@yume/tts-client": "workspace:*", "@yume/types": "workspace:*", + "jsonrepair": "^3.14.0", "sharp": 
"^0.33.5" } } diff --git a/packages/engine/src/agents/characterDesigner.ts b/packages/engine/src/agents/characterDesigner.ts index ae5f505..81dc4be 100644 --- a/packages/engine/src/agents/characterDesigner.ts +++ b/packages/engine/src/agents/characterDesigner.ts @@ -1,4 +1,4 @@ -import { chat, generateImage, uploadImage } from "@yume/ai-client"; +import { chat, generateImage } from "@yume/ai-client"; import { provisionVoice } from "@yume/tts-client"; import type { Character, @@ -7,7 +7,7 @@ import type { Session, } from "@yume/types"; import { parseJsonLoose } from "../jsonParser"; -import { mockImageBase64 } from "../mockImage"; +import { mockImageDataUri } from "../mockImage"; import { CHARACTER_DESIGNER_SYSTEM, buildCharacterDesignerUserMessage, @@ -24,8 +24,8 @@ import { // which keeps appearance and vocal personality coherent) // // 2. In parallel: -// a. Image gen — base portrait from visualDescription + styleGuide -// then upload to Runware → get UUID for cheap re-reference +// a. Image gen — base portrait (Runware returns URL + UUID in one shot; +// no separate upload round-trip is needed for cheap re-reference) // b. Voice provisioning — Xiaomi MiMo voicedesign from voiceDescription // → reference audio for later voiceclone synth // @@ -66,57 +66,39 @@ async function runDesignLLM( return parseJsonLoose(raw); } -// Generate the per-character base portrait and upload it. The portrait is -// a "concept sheet" — single character, neutral pose, plain background — -// so it works well as a Runware referenceImages anchor for later scenes. +// Generate the per-character base portrait. The portrait is a "concept +// sheet" — single character, neutral pose, plain background — so it works +// well as a Runware referenceImages anchor for later scenes. // -// Returns both the base64 (for client-side asset use, e.g., 立绘登场 -// animations) and the Runware UUID (for cheap referencing in subsequent -// Painter calls without resending the 100KB+ base64 each time). 
+// Returns the URL (for any client display + URL-form references) and the +// UUID (cheapest reference form for subsequent Painter calls). Both come +// back in one `imageInference` response now that we use outputType=URL — +// no separate upload step needed. // -// The upload step is best-effort: if it fails, we still return the base64 -// so the next scene can pass it as a referenceImages entry directly (just -// pays the bandwidth cost each call instead of once). -async function renderAndUploadPortrait( +// In mock mode we return the data URI as basePortraitUrl with no UUID +// (Painter is short-circuited anyway, so the lack of a UUID is moot). +async function renderPortrait( config: EngineConfig, charName: string, visualDescription: string, styleGuide: string, -): Promise<{ basePortraitBase64?: string; basePortraitUuid?: string }> { - let base64: string; +): Promise<{ basePortraitUrl?: string; basePortraitUuid?: string }> { try { if (config.mockImage) { - base64 = await mockImageBase64(); - } else { - const prompt = buildCharacterPortraitPrompt( - charName, - visualDescription, - styleGuide, - ); - base64 = await generateImage(config.image, prompt); + return { basePortraitUrl: await mockImageDataUri() }; } + const prompt = buildCharacterPortraitPrompt( + charName, + visualDescription, + styleGuide, + ); + const { imageUrl, imageUuid } = await generateImage(config.image, prompt); + return { basePortraitUrl: imageUrl, basePortraitUuid: imageUuid }; } catch (err) { const msg = err instanceof Error ? err.message : String(err); console.error(`[characterDesigner] portrait gen failed for ${charName}: ${msg}`); return {}; // no portrait at all — degrade gracefully } - - // Skip upload in mock mode — the mock image is the same static SVG every - // time and uploading it gives us a UUID that points to a useless asset. 
- if (config.mockImage) { - return { basePortraitBase64: base64 }; - } - - try { - const uuid = await uploadImage(config.image, base64); - return { basePortraitBase64: base64, basePortraitUuid: uuid }; - } catch (err) { - const msg = err instanceof Error ? err.message : String(err); - console.warn( - `[characterDesigner] portrait upload failed for ${charName}: ${msg} — will pass base64 in subsequent calls`, - ); - return { basePortraitBase64: base64 }; - } } async function provisionVoiceSafe( @@ -157,8 +139,8 @@ export async function designCharacter( // Step 2 — parallel: portrait + voice provisioning. const tProvision = Date.now(); const portraitPromise = visualDescription - ? renderAndUploadPortrait(config, charName, visualDescription, session.styleGuide) - : Promise.resolve({} as Awaited>); + ? renderPortrait(config, charName, visualDescription, session.styleGuide) + : Promise.resolve({} as Awaited>); const voicePromise = provisionVoiceSafe(config, voiceDescription, charName); const [portrait, voice] = await Promise.all([portraitPromise, voicePromise]); @@ -170,7 +152,7 @@ export async function designCharacter( name: charName, voiceDescription, visualDescription, - basePortraitBase64: portrait.basePortraitBase64, + basePortraitUrl: portrait.basePortraitUrl, basePortraitUuid: portrait.basePortraitUuid, voice, }; diff --git a/packages/engine/src/agents/painter.ts b/packages/engine/src/agents/painter.ts index e9d6e00..1f99128 100644 --- a/packages/engine/src/agents/painter.ts +++ b/packages/engine/src/agents/painter.ts @@ -1,12 +1,12 @@ import { generateImage } from "@yume/ai-client"; -import type { GenerateImageOptions } from "@yume/ai-client"; +import type { GenerateImageOptions, GenerateImageResult } from "@yume/ai-client"; import type { Beat, Character, EngineConfig, ProviderConfig, } from "@yume/types"; -import { mockImageBase64 } from "../mockImage"; +import { mockImageDataUri } from "../mockImage"; import { buildPainterPrompt } from "../prompts"; // 
────────────────────────────────────────────────────────────────────── @@ -24,6 +24,11 @@ import { buildPainterPrompt } from "../prompts"; // (most visually prominent) // 3. Other on-stage NPCs' portraits — secondary characters in the frame // +// References are sent as UUIDs (preferred — cheapest in transport) or URLs +// (fallback — still cheaper than base64). Base64 fallback was removed when +// generateImage switched to outputType=URL, which always returns both a UUID +// and a URL so we never lack a cheap reference handle. +// // Failure handling — two-tier degradation: // A. referenceImages call (preferred — full visual anchoring) // B. pure text-to-image fallback (last resort if Runware refs API errors) @@ -36,8 +41,8 @@ export type PainterInput = { styleGuide: string; onStageCharacters: Character[]; /** - * Prior scene's Runware UUID or base64. When set (= sceneKey hit a - * prior scene), it slots into referenceImages[0] for spatial continuity. + * Prior scene's Runware UUID or URL. When set (= sceneKey hit a prior + * scene), it slots into referenceImages[0] for spatial continuity. * Capacity-wise this displaces ONE character portrait — slot is shared * with character refs, capped at 4 total per Runware spec. */ @@ -67,10 +72,16 @@ export function collectReferenceImages( } // Slot 1+ — character portraits, speaker-first. + // + // Prefer URL over UUID: Runware's `imageInference` returns a UUID, but that + // UUID isn't always recognized by the `referenceImages` pipeline (the error + // surfaces as `failedToTransferImage`). The URL is Runware's own CDN link — + // they can always fetch it from their own infra. UUID is kept as a backstop + // for any edge case where URL is missing (e.g., legacy session state). const speakerName = entryBeat?.speaker; if (speakerName) { const speaker = characters.find((c) => c.name === speakerName); - const ref = speaker?.basePortraitUuid ?? speaker?.basePortraitBase64; + const ref = speaker?.basePortraitUrl ?? 
speaker?.basePortraitUuid; if (ref && refs.length < MAX_REFERENCE_IMAGES) { refs.push(ref); seen.add(speakerName); @@ -81,7 +92,7 @@ export function collectReferenceImages( if (refs.length >= MAX_REFERENCE_IMAGES) break; if (seen.has(c.name)) continue; const char = characters.find((x) => x.name === c.name); - const ref = char?.basePortraitUuid ?? char?.basePortraitBase64; + const ref = char?.basePortraitUrl ?? char?.basePortraitUuid; if (ref) { refs.push(ref); seen.add(c.name); @@ -96,7 +107,7 @@ async function tryGenerate( prompt: string, options: GenerateImageOptions, label: string, -): Promise { +): Promise { try { return await generateImage(config, prompt, options); } catch (err) { @@ -106,12 +117,18 @@ async function tryGenerate( } } +export type PainterResult = + | { kind: "real"; imageUrl: string; imageUuid: string } + | { kind: "mock"; imageUrl: string }; + export async function runPainter( config: EngineConfig, input: PainterInput, entryBeat: Beat | undefined, -): Promise { - if (config.mockImage) return mockImageBase64(); +): Promise { + if (config.mockImage) { + return { kind: "mock", imageUrl: await mockImageDataUri() }; + } const prompt = buildPainterPrompt( input.integratedPrompt, @@ -135,11 +152,12 @@ export async function runPainter( { referenceImages: refs }, `referenceImages (${refs.length})`, ); - if (r) return r; + if (r) return { kind: "real", imageUrl: r.imageUrl, imageUuid: r.imageUuid }; } // Tier B — pure text-to-image. Last resort, used when Tier A failed OR // there are no references to send (first scene with no characters yet). // Errors here propagate to the caller. 
- return generateImage(config.image, prompt); + const r = await generateImage(config.image, prompt); + return { kind: "real", imageUrl: r.imageUrl, imageUuid: r.imageUuid }; } diff --git a/packages/engine/src/annotate.ts b/packages/engine/src/annotate.ts index 77df5da..6991301 100644 --- a/packages/engine/src/annotate.ts +++ b/packages/engine/src/annotate.ts @@ -1,10 +1,44 @@ import sharp from "sharp"; +const FETCH_TIMEOUT_MS = 5000; + +// Pull the bytes from an image URL or data URI into a Buffer suitable for +// sharp. Data URIs are decoded inline (no network); http(s) URLs are fetched +// with a short timeout — if Runware's CDN is slow we'd rather fail the vision +// step quickly than tie up a 60s Vercel function on a single image read. +async function loadImageBuffer(imageUrl: string): Promise { + if (imageUrl.startsWith("data:")) { + const comma = imageUrl.indexOf(","); + if (comma === -1) throw new Error("Malformed data URI in prevImageUrl"); + const b64 = imageUrl.slice(comma + 1); + return Buffer.from(b64, "base64"); + } + + const ctrl = new AbortController(); + const timer = setTimeout(() => ctrl.abort(), FETCH_TIMEOUT_MS); + try { + const res = await fetch(imageUrl, { signal: ctrl.signal }); + if (!res.ok) { + throw new Error( + `Failed to fetch prevImageUrl (${res.status}): ${imageUrl.slice(0, 120)}`, + ); + } + const arr = await res.arrayBuffer(); + return Buffer.from(arr); + } finally { + clearTimeout(timer); + } +} + +// Marks the player's click point on the scene image so the vision LLM can see +// WHERE they tapped. Output is base64 because the vision LLM is called over +// the OpenAI-compatible chat endpoint, which only accepts image_url data URIs +// — we can't hand it a Runware CDN URL directly. 
export async function annotateClick( - imageBase64: string, + imageUrl: string, click: { x: number; y: number }, ): Promise { - const buf = Buffer.from(imageBase64, "base64"); + const buf = await loadImageBuffer(imageUrl); const resized = await sharp(buf) .resize({ width: 768, withoutEnlargement: true, fit: "inside" }) diff --git a/packages/engine/src/director.ts b/packages/engine/src/director.ts index df7bde2..a1526e0 100644 --- a/packages/engine/src/director.ts +++ b/packages/engine/src/director.ts @@ -1,4 +1,4 @@ -import { chat, uploadImage } from "@yume/ai-client"; +import { chat } from "@yume/ai-client"; import type { Character, EngineConfig, @@ -29,7 +29,7 @@ import { INSERT_BEAT_SYSTEM, buildInsertBeatUserMessage } from "./prompts"; // │ // ├─ CharacterDesigner LLM × N (parallel per new char) // │ │ -// │ ├─ portrait gen + upload (parallel within agent) +// │ ├─ portrait gen (Runware returns URL + UUID in one call) // │ └─ voice provisioning (parallel within agent) // │ // ├─ Cinematographer LLM (parallel with all of the above) @@ -37,13 +37,11 @@ import { INSERT_BEAT_SYSTEM, buildInsertBeatUserMessage } from "./prompts"; // └─ wait for all parallel branches // │ // ▼ -// Painter (FLUX referenceImages — two-tier degradation chain) +// Painter — generateImage with referenceImages (UUID/URL refs only; +// no base64 to upload, since outputType=URL gives both back) // │ // ▼ -// upload final scene image → Scene.imageUuid -// │ -// ▼ -// return { scene, sceneImageBase64, characters } +// return { scene, sceneImageUrl, characters } // // The Cinematographer intentionally does NOT depend on CharacterDesigner // output — it only positions named characters in the frame, not their @@ -80,7 +78,7 @@ export function mergeCharacters( ...u, voice: u.voice ?? prev.voice, visualDescription: u.visualDescription ?? prev.visualDescription, - basePortraitBase64: u.basePortraitBase64 ?? prev.basePortraitBase64, + basePortraitUrl: u.basePortraitUrl ?? 
prev.basePortraitUrl, basePortraitUuid: u.basePortraitUuid ?? prev.basePortraitUuid, voiceDescription: u.voiceDescription || prev.voiceDescription, }); @@ -92,27 +90,22 @@ export function mergeCharacters( // scene — used by the Painter as one of the `referenceImages` (NOT as a // seedImage, because FLUX.2 [klein] 9B KV does not support seedImage). // -// Returns the UUID if available (cheap reference, ~36 chars over the wire), -// else the base64 of the most recent matching scene's image. Returns -// undefined when no prior scene shares the current sceneKey. +// Prefer URL over UUID for the same reason painter.collectReferenceImages +// does: the UUID returned by `imageInference` isn't always recognized by +// Runware's `referenceImages` pipeline, surfacing as `failedToTransferImage`. +// The URL is Runware's own CDN link — they can always fetch it. UUID is kept +// as a backstop. Returns undefined when no prior scene shares the sceneKey. function pickPriorSceneReference( session: Session, currentSceneKey: string | undefined, - priorImageBase64ByUuid: Map, ): { priorSceneReference?: string; priorSceneKey?: string } { if (!currentSceneKey) return {}; for (let i = session.history.length - 1; i >= 0; i--) { const prior = session.history[i]!.scene; if (prior.sceneKey === currentSceneKey) { - if (prior.imageUuid) { - return { - priorSceneReference: prior.imageUuid, - priorSceneKey: prior.sceneKey, - }; - } - const cached = priorImageBase64ByUuid.get(prior.id); - if (cached) { - return { priorSceneReference: cached, priorSceneKey: prior.sceneKey }; + const ref = prior.imageUrl ?? 
prior.imageUuid; + if (ref) { + return { priorSceneReference: ref, priorSceneKey: prior.sceneKey }; } } } @@ -121,25 +114,18 @@ function pickPriorSceneReference( export type SceneResult = { scene: Scene; - sceneImageBase64: string; + sceneImageUrl: string; characters: Character[]; }; // ────────────────────────────────────────────────────────────────────── // directScene — the multi-agent pipeline. Used by orchestrator's // startSession and requestScene. -// -// priorImageBase64ByUuid: optional map from prior Scene.id → base64 -// the caller has on-hand. If a sceneKey-hit scene's imageUuid is missing -// but the base64 is cached locally, we can still feed it as one of the -// Painter's referenceImages. Pass an empty map when caller has no cache -// (orchestrator does pass it for the start-session bootstrap). // ────────────────────────────────────────────────────────────────────── export async function directScene( config: EngineConfig, session: Session, - priorImageBase64ByUuid: Map = new Map(), ): Promise { const tTotal = Date.now(); @@ -168,7 +154,6 @@ export async function directScene( const { priorSceneReference, priorSceneKey } = pickPriorSceneReference( session, writerOut.sceneKey, - priorImageBase64ByUuid, ); // Stage 2 — parallel: CharacterDesigner(s) and Cinematographer. @@ -237,7 +222,7 @@ export async function directScene( ); const tPainter = Date.now(); - const sceneImageBase64 = await runPainter( + const painted = await runPainter( config, { integratedPrompt: cinemaOut.integratedPrompt, @@ -249,22 +234,6 @@ export async function directScene( ); tlog("[directScene] Painter", tPainter); - // Stage 4 — best-effort upload of the final scene image so the NEXT - // sceneKey-match call can reference its UUID instead of carrying base64. - // If upload fails, the scene still works; only loses cheap referencing - // on the next hop. Don't wait on mock images (static placeholder). 
- let imageUuid: string | undefined; - if (!config.mockImage) { - try { - const tUpload = Date.now(); - imageUuid = await uploadImage(config.image, sceneImageBase64); - tlog("[directScene] image upload", tUpload); - } catch (err) { - const msg = err instanceof Error ? err.message : String(err); - console.warn(`[directScene] scene image upload failed: ${msg} — sceneKey reuse will need base64 fallback`); - } - } - const scene: Scene = { id: newSceneId(), // scenePrompt is the cinematographer's English compositional output; @@ -276,12 +245,13 @@ export async function directScene( beats: writerOut.beats, entryBeatId: writerOut.entryBeatId, sceneKey: writerOut.sceneKey, - imageUuid, + imageUuid: painted.kind === "real" ? painted.imageUuid : undefined, + imageUrl: painted.imageUrl, }; tlog("[directScene] TOTAL", tTotal); - return { scene, sceneImageBase64, characters }; + return { scene, sceneImageUrl: painted.imageUrl, characters }; } // ────────────────────────────────────────────────────────────────────── diff --git a/packages/engine/src/jsonParser.ts b/packages/engine/src/jsonParser.ts index 20130fc..68d9de9 100644 --- a/packages/engine/src/jsonParser.ts +++ b/packages/engine/src/jsonParser.ts @@ -1,13 +1,44 @@ +import { jsonrepair, JSONRepairError } from "jsonrepair"; + // Strict-then-forgiving JSON parser for LLM output. Tries in order: // 1. Direct JSON.parse on the trimmed text. // 2. Extract from ```json``` fenced block. // 3. Slice between first { and last } and parse. -// 4. Apply best-effort regex repair (trailing commas, missing commas -// between adjacent values) and try again. +// 4. Apply targeted regex pre-repairs (see preRepair) and try jsonrepair. // -// On final failure, logs the first 800 chars of the raw model output so we -// can see what the LLM did wrong (the standard error message only shows -// the position, not the surrounding context). +// On final failure, logs the FULL raw model output so we can diagnose the +// actual syntax error. 
+// +// jsonrepair (npm package josdejong/jsonrepair — 2.3k+ stars) handles the +// broad LLM-output failure modes: truncated JSON, missing commas/brackets, +// single quotes, Python None/True/False, JS comments. We layer a small set +// of targeted pre-repairs in front of it for failure modes jsonrepair can't +// disambiguate on its own (see preRepair). + +// ────────────────────────────────────────────────────────────────────── +// preRepair — fix specific LLM error patterns before handing to jsonrepair. +// +// Pattern 1: missing closing quote on a key. +// Broken: "lineDelivery: "语速稍快...", +// Correct: "lineDelivery": "语速稍快...", +// +// jsonrepair fails on this because it's ambiguous — "lineDelivery: " could +// be a complete string value, leaving "语速稍快..." as a syntax error. But +// if we see ":" we know structurally it should be +// a key-colon-value triplet. +// +// Match constraints: +// - The key match excludes " \n : so we can't overrun into adjacent +// fields or absorb the colon as part of the key name. +// - The colon must be followed by whitespace and another " (the value +// string's opening quote). This is what disambiguates from a value +// string that happens to contain a colon. +// ────────────────────────────────────────────────────────────────────── + +function preRepair(s: string): string { + return s.replace(/"([^"\n:]+):(\s+)"/g, '"$1":$2"'); +} + export function parseJsonLoose(raw: string): T { const trimmed = raw.trim(); @@ -28,54 +59,36 @@ export function parseJsonLoose(raw: string): T { const first = trimmed.indexOf("{"); const last = trimmed.lastIndexOf("}"); - if (first !== -1 && last > first) { - const slice = trimmed.slice(first, last + 1); - try { - return JSON.parse(slice) as T; - } catch { - // Last resort: try repairing common LLM-output malformations. - const repaired = repairJsonString(slice); + const slice = + first !== -1 && last > first ? 
trimmed.slice(first, last + 1) : trimmed; + + // Try the brace-sliced version first; if there were no braces at all + // (slice === trimmed), this is just a second attempt at the raw text. + try { + return JSON.parse(slice) as T; + } catch { + // Targeted pre-repair (no-op on already-valid JSON) → jsonrepair. + const prefixed = preRepair(slice); + + // If preRepair changed something, give the cheap path another shot — + // the input might already be valid now without needing jsonrepair. + if (prefixed !== slice) { try { - return JSON.parse(repaired) as T; - } catch (err) { - console.error( - `[parseJsonLoose] all strategies failed. Raw output (first 800 chars):\n${raw.slice(0, 800)}`, - ); - throw err; + return JSON.parse(prefixed) as T; + } catch { + // fall through to jsonrepair } } + + try { + const repaired = jsonrepair(prefixed); + return JSON.parse(repaired) as T; + } catch (err) { + const isRepairErr = err instanceof JSONRepairError; + console.error( + `[parseJsonLoose] jsonrepair ${isRepairErr ? "could not repair" : "succeeded but JSON.parse rejected its output"}. Full raw model output:\n${raw}`, + ); + throw err; + } } - - console.error( - `[parseJsonLoose] no { ... } found. Raw output (first 800 chars):\n${raw.slice(0, 800)}`, - ); - throw new Error(`Failed to parse JSON from model output: ${raw.slice(0, 200)}`); -} - -// Best-effort repair of LLM-typical JSON syntax errors. Targeted at the two -// most common failures we see in practice: -// 1. Trailing comma before } or ]. -// 2. Missing comma between two adjacent JSON values (the specific error -// mode we hit at position 3390). -// -// Deliberately conservative — does NOT try to fix unclosed strings, -// unbalanced braces, or strip JS-style comments. The comment-stripping -// path was previously included but would corrupt JSON string values -// containing `//` (e.g. 
URLs like "https://example.com"); since LLMs in -// `responseFormat: "json_object"` mode essentially never emit comments, -// dropping that step is a net win for safety. -function repairJsonString(s: string): string { - return s - // 1. Strip trailing commas before } or ]. - .replace(/,(\s*[}\]])/g, "$1") - // 2. Insert missing commas between two adjacent JSON values. The cases: - // } { → },{ ] [ → ],[ } [ → },[ ] { → ],{ - // "string" "key" "string" { "string" [ - // number then "key" / { / [ - // - // The regex looks for a closing token (} ] " or a digit) followed by - // a newline and an opening token (} ] " a letter), and inserts a - // comma between them. Requires the newline (\s*\n\s*) so it only - // fires across line boundaries, never within a single-line value. - .replace(/(\}|\]|"|\d)(\s*\n\s*)(\{|\[|")/g, "$1,$2$3"); } diff --git a/packages/engine/src/mockImage.ts b/packages/engine/src/mockImage.ts index 3d3b2ae..fcc0d5c 100644 --- a/packages/engine/src/mockImage.ts +++ b/packages/engine/src/mockImage.ts @@ -1,11 +1,15 @@ import sharp from "sharp"; -let cached: string | undefined; +let cachedDataUri: string | undefined; // A static 16:9 placeholder used when MOCK_IMAGE=true, so we can exercise the // TTS path without paying for image generation. Generated once, then memoized. -export async function mockImageBase64(): Promise { - if (cached) return cached; +// Returned as a data URI so the rest of the pipeline can treat it as an +// `imageUrl` interchangeably with real Runware URLs (the client's <img> +// accepts both, and we never feed a mock image to Runware's referenceImages +// because mockImage mode short-circuits the Painter entirely). 
+export async function mockImageDataUri(): Promise { + if (cachedDataUri) return cachedDataUri; const W = 1792; const H = 1024; @@ -20,6 +24,6 @@ export async function mockImageBase64(): Promise { `; const png = await sharp(Buffer.from(svg)).png().toBuffer(); - cached = png.toString("base64"); - return cached; + cachedDataUri = `data:image/png;base64,${png.toString("base64")}`; + return cachedDataUri; } diff --git a/packages/engine/src/orchestrator.ts b/packages/engine/src/orchestrator.ts index 87a8e6a..7813d27 100644 --- a/packages/engine/src/orchestrator.ts +++ b/packages/engine/src/orchestrator.ts @@ -49,14 +49,14 @@ export async function startSession( characters: [], }; - const { scene, sceneImageBase64, characters } = await directScene(config, session); + const { scene, sceneImageUrl, characters } = await directScene(config, session); tlog("[start] TOTAL", tTotal); return { sessionId: session.id, scene, - imageBase64: sceneImageBase64, + imageUrl: sceneImageUrl, characters, }; } @@ -71,7 +71,7 @@ export async function requestScene( ): Promise { const tTotal = Date.now(); - const { scene, sceneImageBase64, characters } = await directScene( + const { scene, sceneImageUrl, characters } = await directScene( config, req.session, ); @@ -80,7 +80,7 @@ export async function requestScene( return { scene, - imageBase64: sceneImageBase64, + imageUrl: sceneImageUrl, characters, }; } @@ -95,7 +95,7 @@ export async function visionDecide( config: EngineConfig, req: VisionRequest, ): Promise { - const annotated = await annotateClick(req.prevImageBase64, req.click); + const annotated = await annotateClick(req.prevImageUrl, req.click); const current = req.session.history.at(-1)?.scene ?? null; return interpret(config.vision, annotated, current); } diff --git a/packages/types/src/index.ts b/packages/types/src/index.ts index 01be754..e98503f 100644 --- a/packages/types/src/index.ts +++ b/packages/types/src/index.ts @@ -56,17 +56,24 @@ export type Scene = { * e.g. 
"classroom-dusk", "rooftop-night". When the next Scene shares this * key, the Painter slots the previous Scene's image into Runware's * `referenceImages` (alongside character portraits) so the same physical - * space stays visually consistent across cuts. (Originally planned as a - * seedImage / img2img anchor, but FLUX.2 [klein] 9B KV does not support - * seedImage — referenceImages serves the same purpose with the model.) + * space stays visually consistent across cuts. */ sceneKey?: string; /** - * Runware UUID of this Scene's generated image — once uploaded, subsequent - * Scenes that match sceneKey can reference it via `referenceImages` - * without resending base64. + * Runware UUID of this Scene's generated image. Cheapest form to send back + * to Runware's `referenceImages` in subsequent calls (UUID > URL > base64 + * in transport cost). Not shown to the client — `imageUrl` is what renders. */ imageUuid?: string; + /** + * Public CDN URL of this Scene's generated image. Returned to the client for + * `` rendering, and is what the client passes back to `/api/vision` + * as `prevImageUrl` so the server can re-fetch the bytes for click annotation. + * + * For MOCK_IMAGE=true this is a `data:image/png;base64,...` data URI, not a + * Runware URL — the client renders both forms transparently. + */ + imageUrl?: string; }; export type SceneExit = @@ -111,17 +118,17 @@ export type Character = { */ visualDescription?: string; /** - * Base portrait image generated by the CharacterDesigner once, then reused - * as a Runware `referenceImages` entry in every subsequent scene the - * character appears in. Stored as base64 for client display. - */ - basePortraitBase64?: string; - /** - * Runware UUID for the base portrait. Once uploaded via the image-upload - * endpoint, subsequent Painter calls reference this UUID instead of - * resending the full base64 payload. + * Runware UUID for the base portrait. 
Generated by the CharacterDesigner + * once, reused as a `referenceImages` entry on every subsequent scene the + * character appears in. UUID is the cheapest reference form for Runware. */ basePortraitUuid?: string; + /** + * Public CDN URL for the base portrait. Same image as `basePortraitUuid`; + * kept around for the client (if it ever wants to render character cards) + * and as the preferred `referenceImages` form (UUID is only the backstop). + */ + basePortraitUrl?: string; /** Xiaomi MiMo voice reference audio. */ voice?: CharacterVoice; }; @@ -196,7 +203,8 @@ export type StartRequest = { export type StartResponse = { sessionId: string; scene: Scene; - imageBase64: string; + /** Public CDN URL (or data URI in MOCK_IMAGE mode) for the rendered scene background. */ + imageUrl: string; /** Character registry with voice references + visual cards provisioned. */ characters: Character[]; }; @@ -210,7 +218,8 @@ export type SceneRequest = { export type SceneResponse = { scene: Scene; - imageBase64: string; + /** Public CDN URL (or data URI in MOCK_IMAGE mode) for the rendered scene background. */ + imageUrl: string; characters: Character[]; }; @@ -235,7 +244,12 @@ export type BeatAudioResponse = { // trigger a scene change. export type VisionRequest = { session: Session; - prevImageBase64: string; + /** + * Public CDN URL (or data URI in MOCK_IMAGE mode) of the scene the player + * just clicked. The server re-fetches the bytes to annotate the click and + * pass an OpenAI-compatible image_url to the vision LLM. 
+ */ + prevImageUrl: string; click: { x: number; y: number }; }; diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 483ebce..8607276 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -75,6 +75,9 @@ importers: '@yume/types': specifier: workspace:* version: link:../types + jsonrepair: + specifier: ^3.14.0 + version: 3.14.0 sharp: specifier: ^0.33.5 version: 0.33.5 @@ -594,6 +597,10 @@ packages: resolution: {integrity: sha512-/imKNG4EbWNrVjoNC/1H5/9GFy+tqjGBHCaSsN+P2RnPqjsLmv6UD3Ej+Kj8nBWaRAwyk7kK5ZUc+OEatnTR3A==} hasBin: true + jsonrepair@3.14.0: + resolution: {integrity: sha512-tWPGKMZf/8UPim+fcW2EfcQ/d/7aKUrP6IECz9G3Tu6Q5dX0orSleqJ9z6sSw7qrQkjF8/Edo4DvsWBZ8H+HNg==} + hasBin: true + lilconfig@3.1.3: resolution: {integrity: sha512-/vlFKAoH5Cgt3Ie+JLhRbwOsCQePABiU3tJ1egGvyQ+33R/vcwM2Zl2QR/LzjsBeItPt3oSVXapn+m4nQDvpzw==} engines: {node: '>=14'} @@ -1240,6 +1247,8 @@ snapshots: jiti@1.21.7: {} + jsonrepair@3.14.0: {} + lilconfig@3.1.3: {} lines-and-columns@1.2.4: {} diff --git a/vercel.json b/vercel.json deleted file mode 100644 index 5af8dcf..0000000 --- a/vercel.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "$schema": "https://openapi.vercel.sh/vercel.json", - "framework": "nextjs", - "buildCommand": "pnpm build", - "installCommand": "pnpm install", - "functions": { - "apps/web/app/api/interact/route.ts": { "maxDuration": 60 }, - "apps/web/app/api/vision/route.ts": { "maxDuration": 60 }, - "apps/web/app/api/start/route.ts": { "maxDuration": 60 } - } -}