feat(web,engine): custom style — image upload, AI-extract prompt, painter ref
自定义画风入口里加上传按钮:客户端把图缩到 512px webp(base64),传到新路由 /api/parse-style-image,vision LLM 解析成英文 style prompt 回填 textarea;图本身随 sessionStorage → /api/start → Session.styleReferenceImage 透传,painter.collectReferenceImages 把它置于 slot 0,整局每一幕都作为 reference 图锚定画风(brush / color / mood),比 priorScene 优先级更高。

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
export { chat } from "./chat";
|
||||
export { generateImage } from "./image";
|
||||
export type { GenerateImageOptions, GenerateImageResult } from "./image";
|
||||
export { interpretClick } from "./vision";
|
||||
export { interpretClick, analyzeImageDataUrl } from "./vision";
|
||||
export type { ChatMessage } from "./chat";
|
||||
|
||||
+26
-6
@@ -5,26 +5,46 @@ export async function interpretClick(
|
||||
config: ProviderConfig,
|
||||
imageBase64: string,
|
||||
prompt: string,
|
||||
): Promise<string> {
|
||||
// Wrap the raw base64 in a PNG data URL — the Canvas annotator on the
|
||||
// client encodes as PNG. analyzeImageDataUrl handles the actual request.
|
||||
return analyzeImageDataUrl(
|
||||
config,
|
||||
`data:image/png;base64,${imageBase64}`,
|
||||
prompt,
|
||||
{ responseFormat: "json_object" },
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* General single-image vision call. Accepts a complete data URL (preserves
|
||||
* the source mime type, e.g. webp/jpeg) and lets the caller opt out of
|
||||
* `response_format: json_object` for free-form text responses.
|
||||
*/
|
||||
export async function analyzeImageDataUrl(
|
||||
config: ProviderConfig,
|
||||
imageDataUrl: string,
|
||||
prompt: string,
|
||||
opts: { responseFormat?: "json_object" | "text" } = {},
|
||||
): Promise<string> {
|
||||
const url = `${config.baseUrl.replace(/\/$/, "")}/chat/completions`;
|
||||
|
||||
const body = {
|
||||
const body: Record<string, unknown> = {
|
||||
model: config.model,
|
||||
messages: [
|
||||
{
|
||||
role: "user",
|
||||
content: [
|
||||
{ type: "text", text: prompt },
|
||||
{
|
||||
type: "image_url",
|
||||
image_url: { url: `data:image/png;base64,${imageBase64}` },
|
||||
},
|
||||
{ type: "image_url", image_url: { url: imageDataUrl } },
|
||||
],
|
||||
},
|
||||
],
|
||||
temperature: 0.2,
|
||||
response_format: { type: "json_object" },
|
||||
};
|
||||
if (opts.responseFormat === "json_object") {
|
||||
body.response_format = { type: "json_object" };
|
||||
}
|
||||
|
||||
const timeoutCtrl = new AbortController();
|
||||
const timeoutId = setTimeout(() => timeoutCtrl.abort(), 60_000);
|
||||
|
||||
@@ -47,6 +47,13 @@ export type PainterInput = {
|
||||
* with character refs, capped at 4 total per Runware spec.
|
||||
*/
|
||||
priorSceneImage?: string;
|
||||
/**
|
||||
* User-uploaded style reference (data URL base64). When set, it takes the
|
||||
* highest-priority slot in referenceImages so the painting STYLE (brush /
|
||||
* color / mood) of the user's image is anchored across every scene this
|
||||
* session paints — even before any priorScene exists.
|
||||
*/
|
||||
styleReferenceImage?: string;
|
||||
};
|
||||
|
||||
// Pick the references we send to Runware as `referenceImages`. Priority:
|
||||
@@ -59,14 +66,22 @@ export function collectReferenceImages(
|
||||
characters: Character[],
|
||||
entryBeat: Beat | undefined,
|
||||
priorSceneImage: string | undefined,
|
||||
styleReferenceImage?: string,
|
||||
): string[] {
|
||||
const refs: string[] = [];
|
||||
const seen = new Set<string>();
|
||||
|
||||
// Slot 0 — prior scene image for spatial continuity. Goes first because
|
||||
// backdrop drift is the most jarring discontinuity across same-sceneKey
|
||||
// scenes; character drift is partially masked by character archetype text
|
||||
// in the prompt anyway.
|
||||
// Slot 0 — user-uploaded style reference image, if any. Goes first because
|
||||
// it anchors the whole-session painting STYLE (brush / color / mood) that
|
||||
// the user explicitly chose. priorScene continuity comes second; character
|
||||
// archetypes are partially covered by the prompt text anyway.
|
||||
if (styleReferenceImage) {
|
||||
refs.push(styleReferenceImage);
|
||||
}
|
||||
|
||||
// Slot N — prior scene image for spatial continuity. Backdrop drift is the
|
||||
// next-most jarring discontinuity across same-sceneKey scenes; character
|
||||
// drift is partially masked by character archetype text in the prompt.
|
||||
if (priorSceneImage) {
|
||||
refs.push(priorSceneImage);
|
||||
}
|
||||
@@ -140,6 +155,7 @@ export async function runPainter(
|
||||
input.onStageCharacters,
|
||||
entryBeat,
|
||||
input.priorSceneImage,
|
||||
input.styleReferenceImage,
|
||||
);
|
||||
|
||||
// Tier A — with referenceImages (styleReferenceImage + priorSceneImage + character portraits).
|
||||
|
||||
@@ -327,6 +327,7 @@ export async function directScene(
|
||||
styleGuide: session.styleGuide,
|
||||
onStageCharacters,
|
||||
priorSceneImage: priorSceneReference,
|
||||
styleReferenceImage: session.styleReferenceImage,
|
||||
},
|
||||
entryBeat,
|
||||
);
|
||||
|
||||
@@ -47,6 +47,7 @@ export async function startSession(
|
||||
styleGuide: req.styleGuide.trim(),
|
||||
history: [],
|
||||
characters: [],
|
||||
styleReferenceImage: req.styleReferenceImage?.trim() || undefined,
|
||||
};
|
||||
|
||||
// Stage 0 — Architect: expand the terse world/style prompt into a story
|
||||
|
||||
@@ -206,6 +206,14 @@ export type Session = {
|
||||
* session payload created before this field existed.
|
||||
*/
|
||||
storyState?: StoryState;
|
||||
/**
|
||||
* Optional user-uploaded style reference image (data URL — `data:image/...;base64,...`).
|
||||
* When set, the Painter prepends it to `referenceImages` on every scene so the
|
||||
* uploaded image anchors painting style (brush, color, mood) across the whole
|
||||
* session. Resized client-side before upload (~512px max dim) to keep session
|
||||
* payload small for /api/scene round-trips.
|
||||
*/
|
||||
styleReferenceImage?: string;
|
||||
};
|
||||
|
||||
// ──────────────────────────────────────────────────────────────────────
|
||||
@@ -253,6 +261,21 @@ export type EngineConfig = {
|
||||
export type StartRequest = {
|
||||
worldSetting: string;
|
||||
styleGuide: string;
|
||||
/** Optional user-uploaded style reference image — see Session.styleReferenceImage. */
|
||||
styleReferenceImage?: string;
|
||||
};
|
||||
|
||||
// /api/parse-style-image — vision LLM extracts a textual painting-style
|
||||
// prompt from a user-uploaded reference image. The response carries only the
|
||||
// prompt; the client keeps the image itself and later passes it to /api/start.
|
||||
export type ParseStyleImageRequest = {
|
||||
/** Data URL: `data:image/...;base64,...`. */
|
||||
imageDataUrl: string;
|
||||
};
|
||||
|
||||
export type ParseStyleImageResponse = {
|
||||
/** English style prompt suitable as a styleGuide (FLUX-friendly attributes). */
|
||||
stylePrompt: string;
|
||||
};
|
||||
|
||||
export type StartResponse = {
|
||||
|
||||
Reference in New Issue
Block a user