9fc83de276
Thread orientation (portrait|landscape) from client through API, engine, and image gen. Portrait devices render 1024x1792 (9:16) full-bleed scenes; desktop/landscape keeps 1792x1024 (16:9). Adds cover-aware click→image coordinate mapping, session-locked orientation, a shared coerceOrientation helper, and a choices overflow cap in portrait. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
189 lines
7.2 KiB
TypeScript
189 lines
7.2 KiB
TypeScript
import { generateImage } from "@infiplot/ai-client";
|
||
import type { GenerateImageOptions, GenerateImageResult } from "@infiplot/ai-client";
|
||
import type {
|
||
Beat,
|
||
Character,
|
||
EngineConfig,
|
||
Orientation,
|
||
ProviderConfig,
|
||
} from "@infiplot/types";
|
||
import { mockImageDataUri } from "../mockImage";
|
||
import { buildPainterPrompt } from "../prompts";
|
||
|
||
// ──────────────────────────────────────────────────────────────────────
|
||
// Painter — final image generation with multi-reference anchoring.
|
||
//
|
||
// FLUX.2 [klein] 9B KV does NOT support seedImage (img2img). Instead,
|
||
// visual continuity comes entirely from `referenceImages` (capped at 4),
|
||
// which the KV-optimized variant accelerates ~2.5× via key-value caching
|
||
// of reference latents.
|
||
//
|
||
// References are slotted in priority order (max 4):
|
||
// 1. Prior scene image — when sceneKey matched a previous scene, this
|
||
// anchors the same physical space (lighting/layout/style continuity)
|
||
// 2. Entry beat's speaker portrait — the NPC the player is talking with
|
||
// (most visually prominent)
|
||
// 3. Other on-stage NPCs' portraits — secondary characters in the frame
|
||
//
|
||
// References are sent as UUIDs (preferred — cheapest in transport) or URLs
|
||
// (fallback — still cheaper than base64). Base64 fallback was removed when
|
||
// generateImage switched to outputType=URL, which always returns both a UUID
|
||
// and a URL so we never lack a cheap reference handle.
|
||
//
|
||
// Failure handling — two-tier degradation:
|
||
// A. referenceImages call (preferred — full visual anchoring)
|
||
// B. pure text-to-image fallback (last resort if Runware refs API errors)
|
||
// ──────────────────────────────────────────────────────────────────────
|
||
|
||
const MAX_REFERENCE_IMAGES = 4;
|
||
|
||
export type PainterInput = {
|
||
integratedPrompt: string;
|
||
styleGuide: string;
|
||
onStageCharacters: Character[];
|
||
/**
|
||
* Prior scene's Runware UUID or URL. When set (= sceneKey hit a prior
|
||
* scene), it slots into referenceImages[0] for spatial continuity.
|
||
* Capacity-wise this displaces ONE character portrait — slot is shared
|
||
* with character refs, capped at 4 total per Runware spec.
|
||
*/
|
||
priorSceneImage?: string;
|
||
/**
|
||
* User-uploaded style reference (data URL base64). When set, it takes the
|
||
* highest-priority slot in referenceImages so the painting STYLE (brush /
|
||
* color / mood) of the user's image is anchored across every scene this
|
||
* session paints — even before any priorScene exists.
|
||
*/
|
||
styleReferenceImage?: string;
|
||
/**
|
||
* Session-locked output aspect. Drives both the Painter prompt's framing
|
||
* rules and the generated image's pixel dimensions. Default "landscape".
|
||
*/
|
||
orientation?: Orientation;
|
||
};
|
||
|
||
// Pick the references we send to Runware as `referenceImages`. Priority:
|
||
// slot 0: priorSceneImage (if any — sceneKey continuity)
|
||
// slot 1: entry beat's speaker portrait (the NPC speaking to the player)
|
||
// slot 2+: other on-stage NPCs from entry beat's activeCharacters
|
||
// Caps at 4 total. Returns the array exactly as it'll be sent — already
|
||
// truncated, already deduplicated.
|
||
export function collectReferenceImages(
|
||
characters: Character[],
|
||
entryBeat: Beat | undefined,
|
||
priorSceneImage: string | undefined,
|
||
styleReferenceImage?: string,
|
||
): string[] {
|
||
const refs: string[] = [];
|
||
const seen = new Set<string>();
|
||
|
||
// Slot 0 — user-uploaded style reference image, if any. Goes first because
|
||
// it anchors the whole-session painting STYLE (brush / color / mood) that
|
||
// the user explicitly chose. priorScene continuity comes second; character
|
||
// archetypes are partially covered by the prompt text anyway.
|
||
if (styleReferenceImage) {
|
||
refs.push(styleReferenceImage);
|
||
}
|
||
|
||
// Slot N — prior scene image for spatial continuity. Backdrop drift is the
|
||
// next-most jarring discontinuity across same-sceneKey scenes; character
|
||
// drift is partially masked by character archetype text in the prompt.
|
||
if (priorSceneImage) {
|
||
refs.push(priorSceneImage);
|
||
}
|
||
|
||
// Slot 1+ — character portraits, speaker-first.
|
||
//
|
||
// Prefer URL over UUID: Runware's `imageInference` returns a UUID, but that
|
||
// UUID isn't always recognized by the `referenceImages` pipeline (the error
|
||
// surfaces as `failedToTransferImage`). The URL is Runware's own CDN link —
|
||
// they can always fetch it from their own infra. UUID is kept as a backstop
|
||
// for any edge case where URL is missing (e.g., legacy session state).
|
||
const speakerName = entryBeat?.speaker;
|
||
if (speakerName) {
|
||
const speaker = characters.find((c) => c.name === speakerName);
|
||
const ref = speaker?.basePortraitUrl ?? speaker?.basePortraitUuid;
|
||
if (ref && refs.length < MAX_REFERENCE_IMAGES) {
|
||
refs.push(ref);
|
||
seen.add(speakerName);
|
||
}
|
||
}
|
||
|
||
for (const c of entryBeat?.activeCharacters ?? []) {
|
||
if (refs.length >= MAX_REFERENCE_IMAGES) break;
|
||
if (seen.has(c.name)) continue;
|
||
const char = characters.find((x) => x.name === c.name);
|
||
const ref = char?.basePortraitUrl ?? char?.basePortraitUuid;
|
||
if (ref) {
|
||
refs.push(ref);
|
||
seen.add(c.name);
|
||
}
|
||
}
|
||
|
||
return refs.slice(0, MAX_REFERENCE_IMAGES);
|
||
}
|
||
|
||
async function tryGenerate(
|
||
config: ProviderConfig,
|
||
prompt: string,
|
||
options: GenerateImageOptions,
|
||
label: string,
|
||
): Promise<GenerateImageResult | null> {
|
||
try {
|
||
return await generateImage(config, prompt, options);
|
||
} catch (err) {
|
||
const msg = err instanceof Error ? err.message : String(err);
|
||
console.warn(`[painter] ${label} failed: ${msg}`);
|
||
return null;
|
||
}
|
||
}
|
||
|
||
export type PainterResult =
|
||
| { kind: "real"; imageUrl: string; imageUuid: string }
|
||
| { kind: "mock"; imageUrl: string };
|
||
|
||
export async function runPainter(
|
||
config: EngineConfig,
|
||
input: PainterInput,
|
||
entryBeat: Beat | undefined,
|
||
): Promise<PainterResult> {
|
||
if (config.mockImage) {
|
||
return { kind: "mock", imageUrl: await mockImageDataUri(input.orientation) };
|
||
}
|
||
|
||
const prompt = buildPainterPrompt(
|
||
input.integratedPrompt,
|
||
input.styleGuide,
|
||
input.onStageCharacters,
|
||
input.orientation,
|
||
);
|
||
|
||
const refs = collectReferenceImages(
|
||
input.onStageCharacters,
|
||
entryBeat,
|
||
input.priorSceneImage,
|
||
input.styleReferenceImage,
|
||
);
|
||
|
||
// Tier A — with referenceImages (priorSceneImage + character portraits).
|
||
// FLUX.2 [klein] 9B KV's KV cache accelerates this multi-reference path
|
||
// ~2.5× compared to the non-KV variant.
|
||
if (refs.length > 0) {
|
||
const r = await tryGenerate(
|
||
config.image,
|
||
prompt,
|
||
{ referenceImages: refs, orientation: input.orientation },
|
||
`referenceImages (${refs.length})`,
|
||
);
|
||
if (r) return { kind: "real", imageUrl: r.imageUrl, imageUuid: r.imageUuid };
|
||
}
|
||
|
||
// Tier B — pure text-to-image. Last resort, used when Tier A failed OR
|
||
// there are no references to send (first scene with no characters yet).
|
||
// Errors here propagate to the caller.
|
||
const r = await generateImage(config.image, prompt, {
|
||
orientation: input.orientation,
|
||
});
|
||
return { kind: "real", imageUrl: r.imageUrl, imageUuid: r.imageUuid };
|
||
}
|