addbede929
- Move vercel.json to apps/web/ with correct route paths; cap scene route
maxDuration 120→60s for Hobby. Root vercel.json removed. Vercel project's
Root Directory must be set to apps/web (Deploy button URL passes this).
- Switch image transport from base64-in-JSON to Runware-hosted URLs:
generateImage now uses outputType=URL and returns {imageUrl, imageUuid};
StartResponse/SceneResponse carry imageUrl; VisionRequest carries
prevImageUrl (server re-fetches the bytes for click annotation). This
eliminates the 4.5MB serverless body-size risk.
- Painter and director prefer URL over UUID for referenceImages — the UUID
returned by Runware imageInference isn't always recognized in the refs
pipeline (surfaces as `failedToTransferImage`).
- Client preloads scene images via `new Image().decode()` before committing
to React state, so URL transitions render instantly; prefetched scenes
also warm the HTTP cache.
- jsonParser uses the jsonrepair package (replaces hand-rolled repair) and
adds a targeted preRepair regex for the missing-key-close-quote pattern
that jsonrepair couldn't disambiguate. Full raw model output dumped on
failure for diagnostic visibility.
- Default text provider switched to DeepSeek v4-flash via direct API
(significantly more stable JSON than MiMo v2.5-pro). VISION/TTS stay on
MiMo (DeepSeek has no multimodal / TTS offerings).
- next.config: drop dead experimental.serverActions.bodySizeLimit (no
server actions used).
- README: real Deploy button URL (zonghaoyuan/yume + root-directory=apps/web
+ TTS/MOCK_IMAGE in env list); refreshed env vars table with optional
TTS section.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
164 lines
6.2 KiB
TypeScript
164 lines
6.2 KiB
TypeScript
import { generateImage } from "@yume/ai-client";
|
||
import type { GenerateImageOptions, GenerateImageResult } from "@yume/ai-client";
|
||
import type {
|
||
Beat,
|
||
Character,
|
||
EngineConfig,
|
||
ProviderConfig,
|
||
} from "@yume/types";
|
||
import { mockImageDataUri } from "../mockImage";
|
||
import { buildPainterPrompt } from "../prompts";
|
||
|
||
// ──────────────────────────────────────────────────────────────────────
|
||
// Painter — final image generation with multi-reference anchoring.
|
||
//
|
||
// FLUX.2 [klein] 9B KV does NOT support seedImage (img2img). Instead,
|
||
// visual continuity comes entirely from `referenceImages` (capped at 4),
|
||
// which the KV-optimized variant accelerates ~2.5× via key-value caching
|
||
// of reference latents.
|
||
//
|
||
// References are slotted in priority order (max 4):
|
||
// 1. Prior scene image — when sceneKey matched a previous scene, this
|
||
// anchors the same physical space (lighting/layout/style continuity)
|
||
// 2. Entry beat's speaker portrait — the NPC the player is talking with
|
||
// (most visually prominent)
|
||
// 3. Other on-stage NPCs' portraits — secondary characters in the frame
|
||
//
|
||
// References are sent as URLs (preferred — Runware's own CDN link, which the
// `referenceImages` pipeline can always fetch) or UUIDs (fallback — the UUID
// returned by `imageInference` isn't always recognized there and surfaces as
// `failedToTransferImage`). Base64 fallback was removed when generateImage
// switched to outputType=URL, which always returns both a UUID and a URL so
// we never lack a cheap reference handle.
|
||
//
|
||
// Failure handling — two-tier degradation:
|
||
// A. referenceImages call (preferred — full visual anchoring)
|
||
// B. pure text-to-image fallback (last resort if Runware refs API errors)
|
||
// ──────────────────────────────────────────────────────────────────────
|
||
|
||
/** Maximum number of entries `collectReferenceImages` will ever return. */
const MAX_REFERENCE_IMAGES = 4;

/** Everything the painter needs to render a single scene image. */
export type PainterInput = {
  /** Scene prompt text; forwarded to `buildPainterPrompt`. */
  integratedPrompt: string;
  /** Style guide text; forwarded to `buildPainterPrompt`. */
  styleGuide: string;
  /**
   * Characters present in the scene. Forwarded to `buildPainterPrompt` and
   * used as the lookup pool for portrait references.
   */
  onStageCharacters: Character[];
  /**
   * Prior scene's Runware UUID or URL. When set (= sceneKey hit a prior
   * scene), it takes the first reference slot for spatial continuity.
   * Capacity-wise this displaces ONE character portrait — the slot budget
   * is shared with character refs, capped at 4 total per Runware spec.
   */
  priorSceneImage?: string;
};

/**
 * Returns the reference handle to use for a character's portrait, or
 * `undefined` when the character is unknown or has no usable handle.
 *
 * URL is preferred over UUID: Runware's `imageInference` returns a UUID, but
 * that UUID isn't always recognized by the `referenceImages` pipeline (the
 * error surfaces as `failedToTransferImage`), whereas the URL is Runware's
 * own CDN link. The UUID stays as a backstop for when the URL is missing
 * (e.g., legacy session state).
 */
function pickPortraitRef(character: Character | undefined): string | undefined {
  // `||` rather than `??` on purpose: an empty-string URL is not a usable
  // handle and must still fall through to the UUID.
  return character?.basePortraitUrl || character?.basePortraitUuid || undefined;
}

/**
 * Picks the references sent to Runware as `referenceImages`.
 *
 * Priority order:
 *   1. `priorSceneImage`, if any — backdrop drift is the most jarring
 *      discontinuity across same-sceneKey scenes, so it goes first.
 *   2. The entry beat speaker's portrait (the NPC speaking to the player).
 *   3. Portraits of the other characters in the entry beat's
 *      `activeCharacters`, in listed order.
 *
 * @param characters Pool in which speaker / active character names are looked up.
 * @param entryBeat Beat supplying `speaker` and `activeCharacters`; may be undefined.
 * @param priorSceneImage Optional UUID or URL of the prior scene's image.
 * @returns At most `MAX_REFERENCE_IMAGES` handles, in priority order, with no
 *   empty strings and no handle repeated.
 */
export function collectReferenceImages(
  characters: Character[],
  entryBeat: Beat | undefined,
  priorSceneImage: string | undefined,
): string[] {
  const refs: string[] = [];
  const seenRefs = new Set<string>();

  // Single gate for every candidate: skips missing handles, handles already
  // queued (so a slot is never spent on a duplicate), and anything past the cap.
  const addRef = (ref: string | undefined): void => {
    if (!ref || seenRefs.has(ref) || refs.length >= MAX_REFERENCE_IMAGES) return;
    seenRefs.add(ref);
    refs.push(ref);
  };

  addRef(priorSceneImage);

  // Speaker first, then the remaining on-stage characters. A name that shows
  // up twice resolves to the same handle and is dropped by `addRef`.
  const speakerName = entryBeat?.speaker;
  const orderedNames = [
    ...(speakerName ? [speakerName] : []),
    ...(entryBeat?.activeCharacters ?? []).map((active) => active.name),
  ];

  for (const name of orderedNames) {
    if (refs.length >= MAX_REFERENCE_IMAGES) break;
    addRef(pickPortraitRef(characters.find((candidate) => candidate.name === name)));
  }

  return refs;
}
|
||
|
||
/**
 * Best-effort wrapper around `generateImage`.
 *
 * A failure is logged as a `[painter]` warning tagged with `label` and
 * reported to the caller as `null` instead of being thrown, so the caller
 * can move on to its next fallback tier.
 *
 * @param config Provider configuration handed straight to `generateImage`.
 * @param prompt Prompt text handed straight to `generateImage`.
 * @param options Generation options handed straight to `generateImage`.
 * @param label Short tag identifying the attempt in the warning message.
 * @returns The generation result, or `null` if `generateImage` threw.
 */
async function tryGenerate(
  config: ProviderConfig,
  prompt: string,
  options: GenerateImageOptions,
  label: string,
): Promise<GenerateImageResult | null> {
  let outcome: GenerateImageResult | null = null;
  try {
    outcome = await generateImage(config, prompt, options);
  } catch (err) {
    // Non-Error throwables are stringified so the log line is always readable.
    const reason = err instanceof Error ? err.message : String(err);
    console.warn(`[painter] ${label} failed: ${reason}`);
  }
  return outcome;
}
|
||
|
||
/**
 * Outcome of a painter run.
 *
 * - `real`: an image produced by `generateImage`, carrying both its URL and UUID.
 * - `mock`: the placeholder from `mockImageDataUri`, returned when
 *   `config.mockImage` is set; it has no UUID.
 */
export type PainterResult =
  | { kind: "real"; imageUrl: string; imageUuid: string }
  | { kind: "mock"; imageUrl: string };

/**
 * Produces the final image for a scene.
 *
 * Flow:
 *   - If `config.mockImage` is set, return the mock image without calling
 *     the image provider.
 *   - Tier A: when `collectReferenceImages` yields at least one reference,
 *     attempt generation with `referenceImages`. A failure here is logged by
 *     `tryGenerate` and does not abort the run.
 *   - Tier B: plain text-to-image. Used when Tier A failed or there were no
 *     references to send. Errors from this call propagate to the caller.
 *
 * @param config Engine configuration; `mockImage` and `image` are read here.
 * @param input Prompt ingredients plus the optional prior scene image.
 * @param entryBeat Beat used to choose which character portraits to reference.
 * @returns A `real` result on success, or a `mock` result in mock mode.
 */
export async function runPainter(
  config: EngineConfig,
  input: PainterInput,
  entryBeat: Beat | undefined,
): Promise<PainterResult> {
  if (config.mockImage) {
    const placeholder = await mockImageDataUri();
    return { kind: "mock", imageUrl: placeholder };
  }

  const { integratedPrompt, styleGuide, onStageCharacters, priorSceneImage } = input;
  const prompt = buildPainterPrompt(integratedPrompt, styleGuide, onStageCharacters);
  const references = collectReferenceImages(onStageCharacters, entryBeat, priorSceneImage);

  // Tier A — anchored generation, only attempted when there is something to anchor on.
  let generated: GenerateImageResult | null = null;
  if (references.length > 0) {
    generated = await tryGenerate(
      config.image,
      prompt,
      { referenceImages: references },
      `referenceImages (${references.length})`,
    );
  }

  // Tier B — last resort; deliberately not wrapped, so a failure reaches the caller.
  if (!generated) {
    generated = await generateImage(config.image, prompt);
  }

  return { kind: "real", imageUrl: generated.imageUrl, imageUuid: generated.imageUuid };
}
|