feat(web,engine): custom style — image upload, AI-extract prompt, painter ref

自定义画风入口里加上传按钮:客户端把图缩到 512px webp(base64),传到新
路由 /api/parse-style-image,vision LLM 解析成英文 style prompt 回填 textarea;
图本身随 sessionStorage → /api/start → Session.styleReferenceImage 透传,
painter.collectReferenceImages 把它置于 slot 0,整局每一幕都作为 reference
图锚定画风(brush / color / mood),比 priorScene 优先级更高。

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
DESKTOP-I1T6TF3\Q
2026-06-03 19:15:19 +08:00
parent 298ecd4ec0
commit 347ab297d5
10 changed files with 396 additions and 15 deletions
+83
View File
@@ -0,0 +1,83 @@
import { analyzeImageDataUrl } from "@infiplot/ai-client";
import type {
ParseStyleImageRequest,
ParseStyleImageResponse,
} from "@infiplot/types";
import { NextResponse } from "next/server";
import { loadEngineConfig } from "@/lib/config";
// Route configuration exported for the framework: run this handler on the
// Node.js runtime.
export const runtime = "nodejs";
// Upper bound on how long the handler may run — presumably seconds, per the
// Next.js `maxDuration` convention; confirm against the deployment target.
export const maxDuration = 60;
// Same rationale as /api/vision: the client resizes to 512px max-dim webp
// (~30-80KB base64 typical) before upload, so 3 MB is generous headroom
// against malformed / abusive direct-API payloads.
// NOTE: POST compares this against the data URL's string length, so it bounds
// the encoded payload (prefix + base64 text), not the decoded image size.
const MAX_IMAGE_BYTES = 3 * 1024 * 1024;
// Instruction sent to the vision model together with the uploaded image.
// It asks for a style-only description (no subject matter) and for the reply
// to be a single JSON object of the form {"stylePrompt": "..."}; POST parses
// that field out of the reply. The text is runtime input to the model, so
// edits here change the model's behavior.
const STYLE_EXTRACTION_PROMPT = `You are a senior concept artist helping describe an image's visual style so that a text-to-image diffusion model (FLUX) can reproduce the same aesthetic on different subjects.
Look at the attached image and produce a single English style-prompt string that captures ONLY its visual style — NOT its subject matter. Focus on:
- Medium / technique (e.g., watercolor, oil painting, cel-shaded anime, 3D render, pixel art)
- Line work and rendering (sharp ink outlines, soft shading, painterly brushstrokes, flat colors)
- Color palette and lighting (pastel, saturated, monochrome, warm golden-hour, cool neon, high contrast)
- Mood and atmosphere (dreamy, melancholic, cinematic, nostalgic, gritty)
- Any recognizable artistic influence (Ghibli, Makoto Shinkai, ukiyo-e, vaporwave, cyberpunk anime, etc.)
Do NOT describe the characters, objects, or scene contents. Output exactly one JSON object:
{"stylePrompt": "<comma-separated English visual-style attributes, ~30-60 words>"}`;
/**
 * Pulls the style prompt out of the vision model's raw reply.
 *
 * Accepted shapes, in order:
 *  - a JSON object with a string `stylePrompt` field (the requested format),
 *  - a bare JSON string,
 *  - non-JSON free text, which is used as the prompt as-is.
 * A Markdown code fence around the reply is stripped first, because some
 * models add one even when asked for raw JSON.
 *
 * @param raw - The model's reply text.
 * @returns The trimmed style prompt, or "" when the reply carries none.
 */
function extractStylePrompt(raw: string): string {
  const text = raw.trim();
  const fenced = /^```(?:json)?\s*([\s\S]*?)\s*```$/i.exec(text);
  const candidate = fenced?.[1] ?? text;

  let parsed: unknown;
  try {
    parsed = JSON.parse(candidate);
  } catch {
    // Fall back: treat the raw response as the style prompt directly.
    return candidate.trim();
  }

  if (typeof parsed === "string") {
    return parsed.trim();
  }
  if (typeof parsed === "object" && parsed !== null) {
    // JSON.parse output is untrusted: only accept a string-valued field.
    const value = (parsed as { stylePrompt?: unknown }).stylePrompt;
    return typeof value === "string" ? value.trim() : "";
  }
  return "";
}

/**
 * POST /api/parse-style-image — asks the vision model to describe the visual
 * style of an uploaded image and returns it as `{ stylePrompt }`.
 *
 * Responses:
 *  - 400 when the body is not JSON or `imageDataUrl` is not a `data:image/` URL,
 *  - 413 when `imageDataUrl` is longer than MAX_IMAGE_BYTES,
 *  - 502 when the model reply contains no usable style prompt,
 *  - 500 when config loading or the vision call throws.
 */
export async function POST(req: Request) {
  let body: unknown;
  try {
    body = await req.json();
  } catch {
    return NextResponse.json({ error: "Invalid JSON" }, { status: 400 });
  }

  // A valid JSON body may still be null, an array or a primitive; read the
  // field only after confirming we actually have an object.
  const imageDataUrl =
    typeof body === "object" && body !== null
      ? (body as Partial<ParseStyleImageRequest>).imageDataUrl
      : undefined;

  if (
    typeof imageDataUrl !== "string" ||
    !imageDataUrl.startsWith("data:image/")
  ) {
    return NextResponse.json(
      { error: "imageDataUrl must be a data:image/... base64 URL" },
      { status: 400 },
    );
  }
  if (imageDataUrl.length > MAX_IMAGE_BYTES) {
    return NextResponse.json(
      { error: `imageDataUrl exceeds ${MAX_IMAGE_BYTES} bytes` },
      { status: 413 },
    );
  }

  try {
    const config = loadEngineConfig();
    const raw = await analyzeImageDataUrl(
      config.vision,
      imageDataUrl,
      STYLE_EXTRACTION_PROMPT,
      { responseFormat: "json_object" },
    );

    const stylePrompt = extractStylePrompt(raw);
    if (!stylePrompt) {
      return NextResponse.json(
        { error: "Vision model returned an empty stylePrompt" },
        { status: 502 },
      );
    }

    const payload: ParseStyleImageResponse = { stylePrompt };
    return NextResponse.json(payload);
  } catch (err) {
    const message = err instanceof Error ? err.message : "Unknown error";
    return NextResponse.json({ error: message }, { status: 500 });
  }
}
+19
View File
@@ -6,6 +6,11 @@ import { loadEngineConfig } from "@/lib/config";
export const runtime = "nodejs";
export const maxDuration = 60;
// Matches /api/vision and /api/parse-style-image — the user's resized 512px
// webp is ~30-80 KB; this caps pathological direct-API payloads (which would
// then ride along in every subsequent /api/scene request body via session).
const MAX_STYLE_REF_BYTES = 3 * 1024 * 1024;
export async function POST(req: Request) {
let body: StartRequest;
try {
@@ -20,6 +25,20 @@ export async function POST(req: Request) {
{ status: 400 },
);
}
if (typeof body.styleReferenceImage === "string") {
if (!body.styleReferenceImage.startsWith("data:image/")) {
return NextResponse.json(
{ error: "styleReferenceImage must be a data:image/... base64 URL" },
{ status: 400 },
);
}
if (body.styleReferenceImage.length > MAX_STYLE_REF_BYTES) {
return NextResponse.json(
{ error: `styleReferenceImage exceeds ${MAX_STYLE_REF_BYTES} bytes` },
{ status: 413 },
);
}
}
try {
const config = loadEngineConfig();
+197 -1
View File
@@ -889,6 +889,8 @@ function StyleModal({
setCustomStyleGuide,
styleOverrides,
setStyleOverrides,
customStyleRefImage,
setCustomStyleRefImage,
}: {
items: string[];
value: number;
@@ -898,6 +900,8 @@ function StyleModal({
setCustomStyleGuide: (s: string) => void;
styleOverrides: Record<string, string>;
setStyleOverrides: (o: Record<string, string>) => void;
customStyleRefImage: string;
setCustomStyleRefImage: (s: string) => void;
}) {
const [q, setQ] = useState("");
const [shown, setShown] = useState(false);
@@ -905,6 +909,10 @@ function StyleModal({
// 列表保持原位(不跳新页面),其他卡片继续可见——用户随时可以取消并切到别处。
const [editingIdx, setEditingIdx] = useState<number | null>(null);
const [draft, setDraft] = useState("");
// 上传 / 解析参考图的瞬时状态——失败/进行中提示只在此次弹窗内可见。
const [parsing, setParsing] = useState(false);
const [parseError, setParseError] = useState<string | null>(null);
const fileInputRef = useRef<HTMLInputElement>(null);
useEffect(() => {
const id = requestAnimationFrame(() => setShown(true));
return () => cancelAnimationFrame(id);
@@ -942,6 +950,76 @@ function StyleModal({
setStyleOverrides(next);
setDraft(STYLE_MAP[name] ?? "");
};
// Downscale the uploaded image on the client so its longer side is at most
// 512px, then encode it as a webp (quality 0.85) data URL.
// This has to happen client-side because the same string is sent with the
// upload and carried along in later requests, so it must stay small.
//
// Throws when the file cannot be read or decoded, when the image has no usable
// pixel dimensions, or when the canvas fails to produce an image data URL.
const resizeImageToDataUrl = async (file: File): Promise<string> => {
  const dataUrl = await new Promise<string>((resolve, reject) => {
    const r = new FileReader();
    r.onload = () => resolve(String(r.result));
    r.onerror = () => reject(new Error("读取文件失败"));
    r.readAsDataURL(file);
  });
  const img = await new Promise<HTMLImageElement>((resolve, reject) => {
    const i = new Image();
    i.onload = () => resolve(i);
    i.onerror = () => reject(new Error("无法解码图片"));
    i.src = dataUrl;
  });

  // Some images (e.g. an SVG without intrinsic size) load successfully but
  // report zero dimensions; there is nothing to draw, so fail explicitly.
  if (img.width <= 0 || img.height <= 0) {
    throw new Error("无法解码图片");
  }

  const MAX_DIM = 512;
  const scale = Math.min(1, MAX_DIM / Math.max(img.width, img.height));
  // Clamp to at least 1px: for extreme aspect ratios the short side would
  // otherwise round to 0, and a zero-sized canvas encodes as "data:,".
  const w = Math.max(1, Math.round(img.width * scale));
  const h = Math.max(1, Math.round(img.height * scale));

  const canvas = document.createElement("canvas");
  canvas.width = w;
  canvas.height = h;
  const ctx = canvas.getContext("2d");
  if (!ctx) throw new Error("Canvas 2D context unavailable");
  ctx.drawImage(img, 0, 0, w, h);

  // Prefer webp for its smaller size. Browsers that cannot encode webp return
  // a different format from toDataURL, in which case we fall back to jpeg.
  let out = canvas.toDataURL("image/webp", 0.85);
  if (!out.startsWith("data:image/webp")) {
    out = canvas.toDataURL("image/jpeg", 0.85);
  }
  // Last line of defence: never hand back something the server would reject.
  if (!out.startsWith("data:image/")) {
    throw new Error("无法解码图片");
  }
  return out;
};
// Upload flow for a style reference image: shrink the file, ask the server to
// describe its style, then put the returned prompt into the draft and keep the
// shrunken image as the session's style reference. Any failure is surfaced
// through parseError; the parsing flag is always cleared at the end.
const handleUploadStyleImage = async (file: File) => {
  setParseError(null);

  const isImageFile = file.type.startsWith("image/");
  if (!isImageFile) {
    setParseError("只支持图片文件");
    return;
  }

  setParsing(true);
  try {
    const imageDataUrl = await resizeImageToDataUrl(file);
    const response = await fetch("/api/parse-style-image", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ imageDataUrl }),
    });

    if (!response.ok) {
      // The error body may be missing or not JSON; fall back to the status code.
      const failure = (await response.json().catch(() => ({}))) as {
        error?: string;
      };
      throw new Error(failure.error ?? `${response.status}`);
    }

    const reply = (await response.json()) as { stylePrompt: string };
    // The AI-generated prompt overwrites the draft being edited (the user can
    // still change it by hand afterwards) and the reference image is stored.
    setDraft(reply.stylePrompt);
    setCustomStyleRefImage(imageDataUrl);
  } catch (err) {
    setParseError(err instanceof Error ? err.message : "解析失败");
  } finally {
    setParsing(false);
  }
};
// Drop the stored style reference image and dismiss any leftover parse error.
const removeStyleRefImage = () => {
  setParseError(null);
  setCustomStyleRefImage("");
};
// 标题取去掉括号后缀的"主名"——括号里的英文 / 「Image N参考」之类的脚注
// 在标题位上显示噪声太大,挪到下方 prompt 行也已经覆盖到了。两种括号都
// 兼容(中文「()」和英文「()」)。
@@ -1069,6 +1147,15 @@ function StyleModal({
</span>
)}
{isCustom && customStyleRefImage && !isEditing && (
<span
className="inline-flex items-center gap-1 rounded-sm border border-ember-500/40 bg-ember-500/10 px-1.5 py-0.5 font-sans text-[10px] tracking-wide text-ember-500"
title="参考图已附带——每一幕画师都会参考这张图"
>
<i className="fa-regular fa-image text-[9px]" />
</span>
)}
</span>
{/* 「自动」语义就是「让 AI 自己判断画风」,没有 prompt 可显示也无从编辑;
@@ -1080,6 +1167,103 @@ function StyleModal({
) : /* prompt 区域:非编辑态是看起来像文本框的只读容器;编辑态是真的 textarea */
isEditing ? (
<div className="mt-1.5 flex flex-col gap-2">
{/* 自定义卡专属:上传画风参考图。上传后会:(1) 用 vision LLM
解析成 prompt 覆盖到下方 textarea(2) 图片本身随会话送到
画师,每幕都作为 reference 锚定画风。 */}
{isCustom && (
<div
onClick={(e) => e.stopPropagation()}
className="flex flex-col gap-2"
>
<input
ref={fileInputRef}
type="file"
accept="image/*"
className="hidden"
onChange={(e) => {
const f = e.target.files?.[0];
if (f) handleUploadStyleImage(f);
// reset 让同一文件重选能再次触发 onChange
if (fileInputRef.current) fileInputRef.current.value = "";
}}
/>
{customStyleRefImage ? (
<div className="flex items-center gap-3 rounded-sm border border-clay-900/12 bg-cream-100 px-3 py-2.5">
{/* eslint-disable-next-line @next/next/no-img-element */}
<img
src={customStyleRefImage}
alt="画风参考图"
className="h-14 w-14 shrink-0 rounded-sm border border-clay-900/10 object-cover"
/>
<div className="flex min-w-0 flex-1 flex-col">
<span className="font-sans text-[12px] text-clay-900">
<i className="fa-solid fa-check mr-1.5 text-ember-500" />
</span>
<span className="font-sans text-[11px] leading-snug text-clay-500">
AI prompt
</span>
</div>
<div className="flex flex-col items-end gap-1">
<button
type="button"
onClick={(e) => {
e.stopPropagation();
fileInputRef.current?.click();
}}
disabled={parsing}
className="font-sans text-[11px] text-clay-500 hover:text-ember-500 transition-colors disabled:opacity-50"
>
</button>
<button
type="button"
onClick={(e) => {
e.stopPropagation();
removeStyleRefImage();
}}
className="font-sans text-[11px] text-clay-400 hover:text-clay-900 transition-colors"
>
</button>
</div>
</div>
) : (
<button
type="button"
onClick={(e) => {
e.stopPropagation();
fileInputRef.current?.click();
}}
disabled={parsing}
className={
"flex items-center justify-center gap-2 rounded-sm border border-dashed px-3 py-2.5 font-sans text-[12px] transition-colors " +
(parsing
? "border-clay-900/15 bg-cream-100 text-clay-400 cursor-wait"
: "border-clay-900/25 text-clay-700 hover:border-ember-500 hover:bg-ember-500/5 hover:text-ember-500")
}
>
{parsing ? (
<>
<i className="fa-solid fa-circle-notch fa-spin text-[11px]" />
AI
</>
) : (
<>
<i className="fa-regular fa-image text-[13px]" />
· AI prompt
</>
)}
</button>
)}
{parseError && (
<span className="font-sans text-[11px] text-rose-500">
<i className="fa-solid fa-circle-exclamation mr-1" />
{parseError}
</span>
)}
</div>
)}
<textarea
value={draft}
onChange={(e) => setDraft(e.target.value)}
@@ -1215,6 +1399,10 @@ export default function HomePage() {
// 这个 source-of-truth。键是预设名(如 "京阿尼细腻日常"),值是 override prompt。
// 选中该预设 + 有 override → 把 override 当 styleGuide 喂给画师。
const [styleOverrides, setStyleOverrides] = useState<Record<string, string>>({});
// 用户在「自定义」里上传的参考图(已客户端缩到 512px、webp base64)。
// 同时随 sessionStorage 透传到 /play → /api/start → session → painter
// 每一幕的 painter 都会把它作为 reference slot 0,锚定整局画风。
const [customStyleRefImage, setCustomStyleRefImage] = useState<string>("");
const inputRef = useRef<HTMLTextAreaElement>(null);
// 顶部使用提示:默认展示,用户可点 × 永久关闭(localStorage:infiplot:hintClosed)。
@@ -1330,9 +1518,15 @@ export default function HomePage() {
}
const audioEnabled = voice === "开启";
// 只有「自定义」风格选中、且确实上传了参考图时才透传——其他预设没必要
// 占用 reference slot(也避免 styleGuide 已经是文本预设、画师收到不相关
// 参考图反而产生干扰)。
const styleReferenceImage =
artStyle === "自定义" && customStyleRefImage ? customStyleRefImage : undefined;
sessionStorage.setItem(
"infiplot:custom",
JSON.stringify({ worldSetting, styleGuide, audioEnabled }),
JSON.stringify({ worldSetting, styleGuide, audioEnabled, styleReferenceImage }),
);
router.push("/play?custom=1");
};
@@ -1606,6 +1800,8 @@ export default function HomePage() {
setCustomStyleGuide={setCustomStyleGuide}
styleOverrides={styleOverrides}
setStyleOverrides={setStyleOverrides}
customStyleRefImage={customStyleRefImage}
setCustomStyleRefImage={setCustomStyleRefImage}
/>
)}
</div>
+25 -3
View File
@@ -500,7 +500,11 @@ function PlayInner() {
const presetId = params.get("preset");
const isCustom = params.get("custom") === "1";
let livePayload: { worldSetting: string; styleGuide: string } | null = null;
let livePayload: {
worldSetting: string;
styleGuide: string;
styleReferenceImage?: string;
} | null = null;
if (!cardName) {
if (presetId) {
const p = PRESETS.find((x) => x.id === presetId);
@@ -513,8 +517,13 @@ function PlayInner() {
worldSetting: string;
styleGuide: string;
audioEnabled?: boolean;
styleReferenceImage?: string;
};
livePayload = {
worldSetting: parsed.worldSetting,
styleGuide: parsed.styleGuide,
styleReferenceImage: parsed.styleReferenceImage || undefined,
};
livePayload = { worldSetting: parsed.worldSetting, styleGuide: parsed.styleGuide };
// audioEnabled 已在 useState 初始化时反向投射到 muted;这里无需再额外存。
} catch {
livePayload = null;
@@ -531,6 +540,11 @@ function PlayInner() {
type PrebakedFirstAct = StartResponse & {
worldSetting: string;
styleGuide: string;
// Live /api/start path tags this on after the response (prebaked card
// JSONs never have one — they were rendered at build time without any
// user-uploaded reference). Carried into Session so /api/scene's painter
// anchors the same style image on every subsequent scene.
styleReferenceImage?: string;
cardName?: string;
cardTitle?: string;
cardGender?: string;
@@ -554,7 +568,14 @@ function PlayInner() {
}
const data = (await r.json()) as StartResponse;
// Live /api/start doesn't echo ws/sg back — splice in what we sent.
return { ...data, worldSetting: livePayload!.worldSetting, styleGuide: livePayload!.styleGuide };
// styleReferenceImage is similarly not in StartResponse; tag it on so
// the session we build below carries it for every /api/scene call.
return {
...data,
worldSetting: livePayload!.worldSetting,
styleGuide: livePayload!.styleGuide,
styleReferenceImage: livePayload!.styleReferenceImage,
};
});
fetchStart
@@ -577,6 +598,7 @@ function PlayInner() {
],
characters: data.characters,
storyState: data.storyState,
styleReferenceImage: data.styleReferenceImage,
};
visitedBeatsRef.current = [data.scene.entryBeatId];
setSession(initial);
+1 -1
View File
@@ -1,5 +1,5 @@
export { chat } from "./chat";
export { generateImage } from "./image";
export type { GenerateImageOptions, GenerateImageResult } from "./image";
export { interpretClick } from "./vision";
export { interpretClick, analyzeImageDataUrl } from "./vision";
export type { ChatMessage } from "./chat";
+26 -6
View File
@@ -5,26 +5,46 @@ export async function interpretClick(
config: ProviderConfig,
imageBase64: string,
prompt: string,
): Promise<string> {
// Wrap the raw base64 in a PNG data URL — the Canvas annotator on the
// client encodes as PNG. analyzeImageDataUrl handles the actual request.
return analyzeImageDataUrl(
config,
`data:image/png;base64,${imageBase64}`,
prompt,
{ responseFormat: "json_object" },
);
}
/**
* General single-image vision call. Accepts a complete data URL (preserves
* the source mime type, e.g. webp/jpeg) and lets the caller opt out of
* `response_format: json_object` for free-form text responses.
*/
export async function analyzeImageDataUrl(
config: ProviderConfig,
imageDataUrl: string,
prompt: string,
opts: { responseFormat?: "json_object" | "text" } = {},
): Promise<string> {
const url = `${config.baseUrl.replace(/\/$/, "")}/chat/completions`;
const body = {
const body: Record<string, unknown> = {
model: config.model,
messages: [
{
role: "user",
content: [
{ type: "text", text: prompt },
{
type: "image_url",
image_url: { url: `data:image/png;base64,${imageBase64}` },
},
{ type: "image_url", image_url: { url: imageDataUrl } },
],
},
],
temperature: 0.2,
response_format: { type: "json_object" },
};
if (opts.responseFormat === "json_object") {
body.response_format = { type: "json_object" };
}
const timeoutCtrl = new AbortController();
const timeoutId = setTimeout(() => timeoutCtrl.abort(), 60_000);
+20 -4
View File
@@ -47,6 +47,13 @@ export type PainterInput = {
* with character refs, capped at 4 total per Runware spec.
*/
priorSceneImage?: string;
/**
* User-uploaded style reference (data URL base64). When set, it takes the
* highest-priority slot in referenceImages so the painting STYLE (brush /
* color / mood) of the user's image is anchored across every scene this
* session paints — even before any priorScene exists.
*/
styleReferenceImage?: string;
};
// Pick the references we send to Runware as `referenceImages`. Priority:
@@ -59,14 +66,22 @@ export function collectReferenceImages(
characters: Character[],
entryBeat: Beat | undefined,
priorSceneImage: string | undefined,
styleReferenceImage?: string,
): string[] {
const refs: string[] = [];
const seen = new Set<string>();
// Slot 0 — prior scene image for spatial continuity. Goes first because
// backdrop drift is the most jarring discontinuity across same-sceneKey
// scenes; character drift is partially masked by character archetype text
// in the prompt anyway.
// Slot 0 — user-uploaded style reference image, if any. Goes first because
// it anchors the whole-session painting STYLE (brush / color / mood) that
// the user explicitly chose. priorScene continuity comes second; character
// archetypes are partially covered by the prompt text anyway.
if (styleReferenceImage) {
refs.push(styleReferenceImage);
}
// Slot N — prior scene image for spatial continuity. Backdrop drift is the
// next-most jarring discontinuity across same-sceneKey scenes; character
// drift is partially masked by character archetype text in the prompt.
if (priorSceneImage) {
refs.push(priorSceneImage);
}
@@ -140,6 +155,7 @@ export async function runPainter(
input.onStageCharacters,
entryBeat,
input.priorSceneImage,
input.styleReferenceImage,
);
// Tier A — with referenceImages (priorSceneImage + character portraits).
+1
View File
@@ -327,6 +327,7 @@ export async function directScene(
styleGuide: session.styleGuide,
onStageCharacters,
priorSceneImage: priorSceneReference,
styleReferenceImage: session.styleReferenceImage,
},
entryBeat,
);
+1
View File
@@ -47,6 +47,7 @@ export async function startSession(
styleGuide: req.styleGuide.trim(),
history: [],
characters: [],
styleReferenceImage: req.styleReferenceImage?.trim() || undefined,
};
// Stage 0 — Architect: expand the terse world/style prompt into a story
+23
View File
@@ -206,6 +206,14 @@ export type Session = {
* session payload created before this field existed.
*/
storyState?: StoryState;
/**
* Optional user-uploaded style reference image (data URL — `data:image/...;base64,...`).
* When set, the Painter prepends it to `referenceImages` on every scene so the
* uploaded image anchors painting style (brush, color, mood) across the whole
* session. Resized client-side before upload (~512px max dim) to keep session
* payload small for /api/scene round-trips.
*/
styleReferenceImage?: string;
};
// ──────────────────────────────────────────────────────────────────────
@@ -253,6 +261,21 @@ export type EngineConfig = {
/**
 * Request body for starting a new session: the world setting and style guide
 * text, plus an optional style reference image.
 */
export type StartRequest = {
worldSetting: string;
styleGuide: string;
/**
 * Optional user-uploaded style reference image as a data URL
 * (`data:image/...;base64,...`) — see Session.styleReferenceImage.
 */
styleReferenceImage?: string;
};
// /api/parse-style-image — vision LLM extracts a textual painting-style
// prompt from a user-uploaded reference image. The response carries only the
// extracted prompt; the image is not echoed back, so the client keeps its own
// copy of the data URL if it wants to pass it through to /api/start later.
export type ParseStyleImageRequest = {
/** Data URL: `data:image/...;base64,...`. */
imageDataUrl: string;
};
/** Successful response of /api/parse-style-image. */
export type ParseStyleImageResponse = {
/** English style prompt suitable as a styleGuide (FLUX-friendly attributes). */
stylePrompt: string;
};
export type StartResponse = {