feat(web,engine): custom style — image upload, AI-extract prompt, painter ref

自定义画风入口里加上传按钮:客户端把图缩到 512px webp(base64),传到新
路由 /api/parse-style-image,vision LLM 解析成英文 style prompt 回填 textarea;
图本身随 sessionStorage → /api/start → Session.styleReferenceImage 透传,
painter.collectReferenceImages 把它置于 slot 0,整局每一幕都作为 reference
图锚定画风(brush / color / mood),比 priorScene 优先级更高。

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
DESKTOP-I1T6TF3\Q
2026-06-03 19:15:19 +08:00
parent 298ecd4ec0
commit 347ab297d5
10 changed files with 396 additions and 15 deletions
+1 -1
View File
@@ -1,5 +1,5 @@
export { chat } from "./chat";
export { generateImage } from "./image";
export type { GenerateImageOptions, GenerateImageResult } from "./image";
export { interpretClick } from "./vision";
export { interpretClick, analyzeImageDataUrl } from "./vision";
export type { ChatMessage } from "./chat";
+26 -6
View File
@@ -5,26 +5,46 @@ export async function interpretClick(
config: ProviderConfig,
imageBase64: string,
prompt: string,
): Promise<string> {
// Thin wrapper: `imageBase64` is the raw base64 payload (no `data:` prefix).
// It is wrapped in a PNG data URL because the Canvas annotator on the client
// encodes as PNG. analyzeImageDataUrl handles the actual request; this call
// always asks for `response_format: json_object`, so the resolved string is
// expected to be JSON text (NOTE(review): parsing is left to the caller —
// confirm at call sites).
return analyzeImageDataUrl(
config,
`data:image/png;base64,${imageBase64}`,
prompt,
{ responseFormat: "json_object" },
);
}
/**
* General single-image vision call. Accepts a complete data URL (preserves
* the source mime type, e.g. webp/jpeg). `response_format: json_object` is
* opt-in: it is sent only when `opts.responseFormat` is "json_object"; by
* default the field is omitted so the model can answer in free-form text.
*/
export async function analyzeImageDataUrl(
config: ProviderConfig,
imageDataUrl: string,
prompt: string,
opts: { responseFormat?: "json_object" | "text" } = {},
): Promise<string> {
const url = `${config.baseUrl.replace(/\/$/, "")}/chat/completions`;
const body = {
const body: Record<string, unknown> = {
model: config.model,
messages: [
{
role: "user",
content: [
{ type: "text", text: prompt },
{
type: "image_url",
image_url: { url: `data:image/png;base64,${imageBase64}` },
},
{ type: "image_url", image_url: { url: imageDataUrl } },
],
},
],
temperature: 0.2,
response_format: { type: "json_object" },
};
if (opts.responseFormat === "json_object") {
body.response_format = { type: "json_object" };
}
const timeoutCtrl = new AbortController();
const timeoutId = setTimeout(() => timeoutCtrl.abort(), 60_000);
+20 -4
View File
@@ -47,6 +47,13 @@ export type PainterInput = {
* with character refs, capped at 4 total per Runware spec.
*/
priorSceneImage?: string;
/**
* User-uploaded style reference (data URL base64). When set, it takes the
* highest-priority slot in referenceImages so the painting STYLE (brush /
* color / mood) of the user's image is anchored across every scene this
* session paints — even before any priorScene exists.
*/
styleReferenceImage?: string;
};
// Pick the references we send to Runware as `referenceImages`. Priority:
@@ -59,14 +66,22 @@ export function collectReferenceImages(
characters: Character[],
entryBeat: Beat | undefined,
priorSceneImage: string | undefined,
styleReferenceImage?: string,
): string[] {
const refs: string[] = [];
const seen = new Set<string>();
// Slot 0 — prior scene image for spatial continuity. Goes first because
// backdrop drift is the most jarring discontinuity across same-sceneKey
// scenes; character drift is partially masked by character archetype text
// in the prompt anyway.
// Slot 0 — user-uploaded style reference image, if any. Goes first because
// it anchors the whole-session painting STYLE (brush / color / mood) that
// the user explicitly chose. priorScene continuity comes second; character
// archetypes are partially covered by the prompt text anyway.
if (styleReferenceImage) {
refs.push(styleReferenceImage);
}
// Slot N — prior scene image for spatial continuity. Backdrop drift is the
// next-most jarring discontinuity across same-sceneKey scenes; character
// drift is partially masked by character archetype text in the prompt.
if (priorSceneImage) {
refs.push(priorSceneImage);
}
@@ -140,6 +155,7 @@ export async function runPainter(
input.onStageCharacters,
entryBeat,
input.priorSceneImage,
input.styleReferenceImage,
);
// Tier A — with referenceImages (styleReferenceImage first when set, then
// priorSceneImage + character portraits).
+1
View File
@@ -327,6 +327,7 @@ export async function directScene(
styleGuide: session.styleGuide,
onStageCharacters,
priorSceneImage: priorSceneReference,
styleReferenceImage: session.styleReferenceImage,
},
entryBeat,
);
+1
View File
@@ -47,6 +47,7 @@ export async function startSession(
styleGuide: req.styleGuide.trim(),
history: [],
characters: [],
styleReferenceImage: req.styleReferenceImage?.trim() || undefined,
};
// Stage 0 — Architect: expand the terse world/style prompt into a story
+23
View File
@@ -206,6 +206,14 @@ export type Session = {
* session payload created before this field existed.
*/
storyState?: StoryState;
/**
* Optional user-uploaded style reference image (data URL — `data:image/...;base64,...`).
* When set, the Painter prepends it to `referenceImages` on every scene so the
* uploaded image anchors painting style (brush, color, mood) across the whole
* session. Resized client-side before upload (~512px max dim) to keep session
* payload small for /api/scene round-trips.
*/
styleReferenceImage?: string;
};
// ──────────────────────────────────────────────────────────────────────
@@ -253,6 +261,21 @@ export type EngineConfig = {
/**
 * Input used to start a new session.
 * NOTE(review): presumably the JSON body of /api/start — confirm against the
 * route handler.
 */
export type StartRequest = {
/** World/setting prompt text. */
worldSetting: string;
/** Painting-style prompt text. */
styleGuide: string;
/** Optional user-uploaded style reference image — see Session.styleReferenceImage. */
styleReferenceImage?: string;
};
// /api/parse-style-image — vision LLM extracts a textual painting-style
// prompt from a user-uploaded reference image. The response carries only the
// extracted prompt; the image is not echoed back, so the client must keep its
// own copy if it wants to pass it through to /api/start.
/** Request payload for /api/parse-style-image: the single image to analyze. */
export type ParseStyleImageRequest = {
/** Data URL: `data:image/...;base64,...`. */
imageDataUrl: string;
};
/** Response payload for /api/parse-style-image; its only field is the extracted prompt. */
export type ParseStyleImageResponse = {
/** English style prompt suitable as a styleGuide (FLUX-friendly attributes). */
stylePrompt: string;
};
export type StartResponse = {