Files
infiplot-web/lib/engine/agents/painter.ts
T
DESKTOP-I1T6TF3\Q 347ab297d5 feat(web,engine): custom style — image upload, AI-extract prompt, painter ref
自定义画风入口里加上传按钮:客户端把图缩到 512px webp(base64),传到新
路由 /api/parse-style-image,vision LLM 解析成英文 style prompt 回填 textarea;
图本身随 sessionStorage → /api/start → Session.styleReferenceImage 透传,
painter.collectReferenceImages 把它置于 slot 0,整局每一幕都作为 reference
图锚定画风(brush / color / mood),比 priorScene 优先级更高。

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-03 19:15:19 +08:00

180 lines
6.9 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { generateImage } from "@infiplot/ai-client";
import type { GenerateImageOptions, GenerateImageResult } from "@infiplot/ai-client";
import type {
Beat,
Character,
EngineConfig,
ProviderConfig,
} from "@infiplot/types";
import { mockImageDataUri } from "../mockImage";
import { buildPainterPrompt } from "../prompts";
// ──────────────────────────────────────────────────────────────────────
// Painter — final image generation with multi-reference anchoring.
//
// FLUX.2 [klein] 9B KV does NOT support seedImage (img2img). Instead,
// visual continuity comes entirely from `referenceImages` (capped at 4),
// which the KV-optimized variant accelerates ~2.5× via key-value caching
// of reference latents.
//
// References are slotted in priority order (max 4):
// 1. User-uploaded style reference — when the session has one, it anchors
// the painting style (brush / color / mood) of every scene
// 2. Prior scene image — when sceneKey matched a previous scene, this
// anchors the same physical space (lighting/layout/style continuity)
// 3. Entry beat's speaker portrait — the NPC the player is talking with
// (most visually prominent)
// 4. Other on-stage NPCs' portraits — secondary characters in the frame
//
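// Character portraits are sent as URLs (preferred — see the note inside
// collectReferenceImages) or UUIDs (backstop when no URL is stored). The
// base64 fallback for generated images was removed when generateImage
// switched to outputType=URL, which always returns both a UUID and a URL so
// we never lack a cheap reference handle. The user-uploaded style reference
// is the one exception: it is passed through as a base64 data URL.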
//
// Failure handling — two-tier degradation:
// A. referenceImages call (preferred — full visual anchoring)
// B. pure text-to-image fallback (last resort if Runware refs API errors)
// ──────────────────────────────────────────────────────────────────────
// Upper bound on how many entries collectReferenceImages returns. The
// comments in this file attribute the limit of 4 to the Runware
// `referenceImages` spec.
const MAX_REFERENCE_IMAGES = 4;
/**
* Everything runPainter needs, besides the engine config and the entry beat,
* to paint one scene.
*/
export type PainterInput = {
/** Scene prompt text; first argument to buildPainterPrompt. */
integratedPrompt: string;
/** Style guide text; second argument to buildPainterPrompt. */
styleGuide: string;
/**
* Characters passed both to buildPainterPrompt and to
* collectReferenceImages, where their portraits are looked up by name.
*/
onStageCharacters: Character[];
/**
* Prior scene's Runware UUID or URL. When set (= sceneKey hit a prior
* scene), it is added to referenceImages for spatial continuity: at index 0
* when there is no styleReferenceImage, otherwise right after it at index 1.
* Capacity-wise this displaces ONE character portrait — slot is shared
* with character refs, capped at 4 total per Runware spec.
*/
priorSceneImage?: string;
/**
* User-uploaded style reference (data URL base64). When set, it takes the
* highest-priority slot in referenceImages so the painting STYLE (brush /
* color / mood) of the user's image is anchored across every scene this
* session paints — even before any priorScene exists.
*/
styleReferenceImage?: string;
};
/**
 * Pick the references that are sent to Runware as `referenceImages`.
 *
 * Priority, highest first:
 *   1. `styleReferenceImage` — user-uploaded style anchor (brush / color /
 *      mood) the user explicitly chose for the whole session.
 *   2. `priorSceneImage` — spatial continuity when sceneKey hit a prior scene.
 *   3. The entry beat's speaker portrait (the NPC speaking to the player).
 *   4. Portraits of the remaining entries in the entry beat's
 *      `activeCharacters`, in their listed order.
 *
 * The result is returned exactly as it will be sent: it holds at most
 * MAX_REFERENCE_IMAGES entries and never contains the same reference string
 * twice, so a repeated image (e.g. two characters sharing one portrait, or a
 * style reference identical to the prior scene image) cannot waste a slot.
 *
 * @param characters - Characters whose portraits may be used; matched by name.
 * @param entryBeat - Beat supplying `speaker` and `activeCharacters`. When
 *   undefined, no character portraits are added.
 * @param priorSceneImage - Prior scene reference, or undefined when none.
 * @param styleReferenceImage - User-uploaded style reference, if any.
 * @returns The ordered, capped, duplicate-free reference list (possibly empty).
 */
export function collectReferenceImages(
  characters: Character[],
  entryBeat: Beat | undefined,
  priorSceneImage: string | undefined,
  styleReferenceImage?: string,
): string[] {
  const refs: string[] = [];
  const usedRefs = new Set<string>();

  // Appends `ref` unless it is empty, already present, or the cap is reached.
  // Deduplicating on the reference string also covers a character that is
  // listed twice (e.g. the speaker reappearing in activeCharacters), because
  // the same name always resolves to the same portrait.
  const addRef = (ref: string | undefined): void => {
    if (!ref || usedRefs.has(ref) || refs.length >= MAX_REFERENCE_IMAGES) {
      return;
    }
    usedRefs.add(ref);
    refs.push(ref);
  };

  // Prefer URL over UUID: Runware's `imageInference` returns a UUID, but that
  // UUID isn't always recognized by the `referenceImages` pipeline (the error
  // surfaces as `failedToTransferImage`). The URL is Runware's own CDN link —
  // they can always fetch it from their own infra. UUID is kept as a backstop
  // for any edge case where URL is missing (e.g., legacy session state).
  const portraitOf = (name: string): string | undefined => {
    const match = characters.find((c) => c.name === name);
    return match?.basePortraitUrl ?? match?.basePortraitUuid;
  };

  // Style first, then backdrop continuity; character drift is partially
  // masked by the character archetype text in the prompt.
  addRef(styleReferenceImage);
  addRef(priorSceneImage);

  // Character portraits, speaker-first.
  const speakerName = entryBeat?.speaker;
  if (speakerName) {
    addRef(portraitOf(speakerName));
  }
  for (const active of entryBeat?.activeCharacters ?? []) {
    if (refs.length >= MAX_REFERENCE_IMAGES) break;
    addRef(portraitOf(active.name));
  }

  return refs;
}
/**
 * Best-effort wrapper around generateImage: a failure is logged and turned
 * into `null` instead of propagating, so the caller can degrade to its next
 * tier.
 *
 * @param config - Provider configuration forwarded to generateImage.
 * @param prompt - Prompt text forwarded to generateImage.
 * @param options - Generation options forwarded to generateImage.
 * @param label - Short tag naming this attempt in the warning log line.
 * @returns The generation result, or `null` when generateImage threw.
 */
async function tryGenerate(
  config: ProviderConfig,
  prompt: string,
  options: GenerateImageOptions,
  label: string,
): Promise<GenerateImageResult | null> {
  let outcome: GenerateImageResult | null = null;
  try {
    outcome = await generateImage(config, prompt, options);
  } catch (failure) {
    // Non-Error throwables are stringified so the log line is always readable.
    const reason = failure instanceof Error ? failure.message : String(failure);
    console.warn(`[painter] ${label} failed: ${reason}`);
  }
  return outcome;
}
/**
* Outcome of runPainter, discriminated by `kind`:
* - "real": an image produced by generateImage, carrying both the URL and
* the UUID taken from its result.
* - "mock": the placeholder from mockImageDataUri, returned when
* `config.mockImage` is set; it has no UUID.
*/
export type PainterResult =
| { kind: "real"; imageUrl: string; imageUuid: string }
| { kind: "mock"; imageUrl: string };
/**
 * Paint one scene.
 *
 * Order of attempts:
 *   - Mock mode (`config.mockImage`): return the mock data URI, no generation.
 *   - Tier A: generate with `referenceImages` (style reference, prior scene
 *     image and character portraits, as chosen by collectReferenceImages).
 *     Skipped when there are no references; a failure is logged by
 *     tryGenerate and falls through to Tier B.
 *   - Tier B: pure text-to-image, the last resort.
 *
 * @param config - Engine config; `mockImage` toggles mock mode and `image`
 *   is the provider config handed to the image generator.
 * @param input - Prompt parts and optional reference images for this scene.
 * @param entryBeat - Beat used to choose which character portraits to send.
 * @returns A "mock" result in mock mode, otherwise a "real" result.
 * @throws Whatever the Tier B generateImage call throws — it is not caught.
 */
export async function runPainter(
  config: EngineConfig,
  input: PainterInput,
  entryBeat: Beat | undefined,
): Promise<PainterResult> {
  if (config.mockImage) {
    const mockUrl = await mockImageDataUri();
    return { kind: "mock", imageUrl: mockUrl };
  }

  const {
    integratedPrompt,
    styleGuide,
    onStageCharacters,
    priorSceneImage,
    styleReferenceImage,
  } = input;

  const prompt = buildPainterPrompt(integratedPrompt, styleGuide, onStageCharacters);
  const refs = collectReferenceImages(
    onStageCharacters,
    entryBeat,
    priorSceneImage,
    styleReferenceImage,
  );

  // Both tiers report success in the same shape.
  const asReal = (generated: GenerateImageResult): PainterResult => ({
    kind: "real",
    imageUrl: generated.imageUrl,
    imageUuid: generated.imageUuid,
  });

  // Tier A — with referenceImages. FLUX.2 [klein] 9B KV's KV cache
  // accelerates this multi-reference path ~2.5× compared to the non-KV
  // variant.
  if (refs.length > 0) {
    const anchored = await tryGenerate(
      config.image,
      prompt,
      { referenceImages: refs },
      `referenceImages (${refs.length})`,
    );
    if (anchored) return asReal(anchored);
  }

  // Tier B — pure text-to-image. Reached when Tier A failed OR there were no
  // references to send (first scene with no characters yet). Errors here
  // propagate to the caller.
  return asReal(await generateImage(config.image, prompt));
}