diff --git a/app/api/parse-style-image/route.ts b/app/api/parse-style-image/route.ts
new file mode 100644
index 0000000..02d165e
--- /dev/null
+++ b/app/api/parse-style-image/route.ts
@@ -0,0 +1,122 @@
+import { analyzeImageDataUrl } from "@infiplot/ai-client";
+import type {
+  ParseStyleImageRequest,
+  ParseStyleImageResponse,
+} from "@infiplot/types";
+import { NextResponse } from "next/server";
+import { loadEngineConfig } from "@/lib/config";
+
+export const runtime = "nodejs";
+export const maxDuration = 60;
+
+// Same rationale as /api/vision: the client resizes to 512px max-dim webp
+// (~30-80KB base64 typical) before upload, so 3 MB is generous headroom
+// against malformed / abusive direct-API payloads.
+const MAX_IMAGE_BYTES = 3 * 1024 * 1024;
+
+// Instruction for the vision model: describe the image's style (not its
+// subject) and answer with a JSON object carrying a `stylePrompt` string.
+// NOTE(review): the JSON example at the end shows an empty value — confirm
+// that no placeholder text was lost there.
+const STYLE_EXTRACTION_PROMPT = `You are a senior concept artist helping describe an image's visual style so that a text-to-image diffusion model (FLUX) can reproduce the same aesthetic on different subjects.
+
+Look at the attached image and produce a single English style-prompt string that captures ONLY its visual style — NOT its subject matter. Focus on:
+- Medium / technique (e.g., watercolor, oil painting, cel-shaded anime, 3D render, pixel art)
+- Line work and rendering (sharp ink outlines, soft shading, painterly brushstrokes, flat colors)
+- Color palette and lighting (pastel, saturated, monochrome, warm golden-hour, cool neon, high contrast)
+- Mood and atmosphere (dreamy, melancholic, cinematic, nostalgic, gritty)
+- Any recognizable artistic influence (Ghibli, Makoto Shinkai, ukiyo-e, vaporwave, cyberpunk anime, etc.)
+
+Do NOT describe the characters, objects, or scene contents. Output exactly one JSON object:
+{"stylePrompt": ""}`;
+
+/**
+ * Extracts the style prompt from the vision model's raw reply.
+ *
+ * Accepts a JSON object with a string `stylePrompt`, a bare JSON string, or
+ * non-JSON text (used verbatim). Any other JSON shape yields "" so the caller
+ * reports an empty result instead of throwing on `.trim()`.
+ *
+ * @param raw - Text returned by the vision model.
+ * @returns The trimmed style prompt, or "" when none could be extracted.
+ */
+function extractStylePrompt(raw: string): string {
+  let parsed: unknown;
+  try {
+    parsed = JSON.parse(raw);
+  } catch {
+    // Fall back: treat the raw response as the style prompt directly.
+    return raw.trim();
+  }
+  if (typeof parsed === "string") return parsed.trim();
+  if (typeof parsed === "object" && parsed !== null) {
+    const candidate = (parsed as { stylePrompt?: unknown }).stylePrompt;
+    if (typeof candidate === "string") return candidate.trim();
+  }
+  // null, numbers, arrays, or a non-string stylePrompt carry no usable prompt.
+  return "";
+}
+
+/**
+ * POST /api/parse-style-image: asks the vision model to describe the visual
+ * style of an uploaded image and returns it as `{ stylePrompt }`.
+ *
+ * Responds 400 for invalid JSON or a missing / non-`data:image/` URL, 413 when
+ * the data URL is longer than MAX_IMAGE_BYTES, 502 when no style prompt could
+ * be extracted from the model reply, and 500 for any other failure.
+ */
+export async function POST(req: Request) {
+  let body: unknown;
+  try {
+    body = await req.json();
+  } catch {
+    return NextResponse.json({ error: "Invalid JSON" }, { status: 400 });
+  }
+
+  // Valid JSON may still be null or a primitive; narrow before reading fields
+  // so such bodies get the 400 below instead of an uncaught TypeError.
+  const imageDataUrl =
+    typeof body === "object" && body !== null
+      ? (body as Partial<Record<keyof ParseStyleImageRequest, unknown>>)
+          .imageDataUrl
+      : undefined;
+  if (
+    typeof imageDataUrl !== "string" ||
+    !imageDataUrl.startsWith("data:image/")
+  ) {
+    return NextResponse.json(
+      { error: "imageDataUrl must be a data:image/... base64 URL" },
+      { status: 400 },
+    );
+  }
+  if (imageDataUrl.length > MAX_IMAGE_BYTES) {
+    return NextResponse.json(
+      { error: `imageDataUrl exceeds ${MAX_IMAGE_BYTES} bytes` },
+      { status: 413 },
+    );
+  }
+
+  try {
+    const config = loadEngineConfig();
+    const raw = await analyzeImageDataUrl(
+      config.vision,
+      imageDataUrl,
+      STYLE_EXTRACTION_PROMPT,
+      { responseFormat: "json_object" },
+    );
+
+    const stylePrompt = extractStylePrompt(raw);
+    if (!stylePrompt) {
+      return NextResponse.json(
+        { error: "Vision model returned an empty stylePrompt" },
+        { status: 502 },
+      );
+    }
+
+    const payload: ParseStyleImageResponse = { stylePrompt };
+    return NextResponse.json(payload);
+  } catch (err) {
+    const message = err instanceof Error ? err.message : "Unknown error";
+    return NextResponse.json({ error: message }, { status: 500 });
+  }
+}
diff --git a/app/api/start/route.ts b/app/api/start/route.ts
index 5c78760..ecd5312 100644
--- a/app/api/start/route.ts
+++ b/app/api/start/route.ts
@@ -6,6 +6,11 @@ import { loadEngineConfig } from "@/lib/config";
 export const runtime = "nodejs";
 export const maxDuration = 60;
 
+// Matches /api/vision and /api/parse-style-image — the user's resized 512px
+// webp is ~30-80 KB; this caps pathological direct-API payloads (which would
+// then ride along in every subsequent /api/scene request body via session).
+const MAX_STYLE_REF_BYTES = 3 * 1024 * 1024;
+
 export async function POST(req: Request) {
   let body: StartRequest;
   try {
@@ -20,6 +25,20 @@ export async function POST(req: Request) {
       { status: 400 },
     );
   }
+  if (typeof body.styleReferenceImage === "string") {
+    if (!body.styleReferenceImage.startsWith("data:image/")) {
+      return NextResponse.json(
+        { error: "styleReferenceImage must be a data:image/... 
base64 URL" }, + { status: 400 }, + ); + } + if (body.styleReferenceImage.length > MAX_STYLE_REF_BYTES) { + return NextResponse.json( + { error: `styleReferenceImage exceeds ${MAX_STYLE_REF_BYTES} bytes` }, + { status: 413 }, + ); + } + } try { const config = loadEngineConfig(); diff --git a/app/page.tsx b/app/page.tsx index 20827c5..24ca848 100644 --- a/app/page.tsx +++ b/app/page.tsx @@ -889,6 +889,8 @@ function StyleModal({ setCustomStyleGuide, styleOverrides, setStyleOverrides, + customStyleRefImage, + setCustomStyleRefImage, }: { items: string[]; value: number; @@ -898,6 +900,8 @@ function StyleModal({ setCustomStyleGuide: (s: string) => void; styleOverrides: Record; setStyleOverrides: (o: Record) => void; + customStyleRefImage: string; + setCustomStyleRefImage: (s: string) => void; }) { const [q, setQ] = useState(""); const [shown, setShown] = useState(false); @@ -905,6 +909,10 @@ function StyleModal({ // 列表保持原位(不跳新页面),其他卡片继续可见——用户随时可以取消并切到别处。 const [editingIdx, setEditingIdx] = useState(null); const [draft, setDraft] = useState(""); + // 上传 / 解析参考图的瞬时状态——失败/进行中提示只在此次弹窗内可见。 + const [parsing, setParsing] = useState(false); + const [parseError, setParseError] = useState(null); + const fileInputRef = useRef(null); useEffect(() => { const id = requestAnimationFrame(() => setShown(true)); return () => cancelAnimationFrame(id); @@ -942,6 +950,76 @@ function StyleModal({ setStyleOverrides(next); setDraft(STYLE_MAP[name] ?? 
""); }; + + // 客户端把上传的图片缩到 512px 长边 + webp(0.85),base64 通常落在 30-80KB。 + // 必须客户端做:(1) 上传 / 后续 /api/scene 都会带这串,包不能太大; + // (2) Runware referenceImages 支持 base64,无需另外加 upload 端点。 + const resizeImageToDataUrl = async (file: File): Promise => { + const dataUrl = await new Promise((resolve, reject) => { + const r = new FileReader(); + r.onload = () => resolve(String(r.result)); + r.onerror = () => reject(new Error("读取文件失败")); + r.readAsDataURL(file); + }); + const img = await new Promise((resolve, reject) => { + const i = new Image(); + i.onload = () => resolve(i); + i.onerror = () => reject(new Error("无法解码图片")); + i.src = dataUrl; + }); + const MAX_DIM = 512; + const scale = Math.min(1, MAX_DIM / Math.max(img.width, img.height)); + const w = Math.round(img.width * scale); + const h = Math.round(img.height * scale); + const canvas = document.createElement("canvas"); + canvas.width = w; + canvas.height = h; + const ctx = canvas.getContext("2d"); + if (!ctx) throw new Error("Canvas 2D context unavailable"); + ctx.drawImage(img, 0, 0, w, h); + // webp 比 jpeg 体积更小一些;浏览器全支持。降级到 jpeg 作为兜底。 + let out = canvas.toDataURL("image/webp", 0.85); + if (!out.startsWith("data:image/webp")) { + out = canvas.toDataURL("image/jpeg", 0.85); + } + return out; + }; + + const handleUploadStyleImage = async (file: File) => { + setParseError(null); + if (!file.type.startsWith("image/")) { + setParseError("只支持图片文件"); + return; + } + setParsing(true); + try { + const resized = await resizeImageToDataUrl(file); + const res = await fetch("/api/parse-style-image", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ imageDataUrl: resized }), + }); + if (!res.ok) { + const j = (await res.json().catch(() => ({}))) as { error?: string }; + throw new Error(j.error ?? 
`${res.status}`); + } + const data = (await res.json()) as { stylePrompt: string }; + // 收到 AI 解析后的 prompt → 覆盖正在编辑的 draft + 持久化参考图。 + // 用户事后还可以手动改 draft(仍是 textarea)。 + setDraft(data.stylePrompt); + setCustomStyleRefImage(resized); + } catch (err) { + const msg = err instanceof Error ? err.message : "解析失败"; + setParseError(msg); + } finally { + setParsing(false); + } + }; + + const removeStyleRefImage = () => { + setCustomStyleRefImage(""); + setParseError(null); + }; // 标题取去掉括号后缀的"主名"——括号里的英文 / 「Image N参考」之类的脚注 // 在标题位上显示噪声太大,挪到下方 prompt 行也已经覆盖到了。两种括号都 // 兼容(中文「()」和英文「()」)。 @@ -1069,6 +1147,15 @@ function StyleModal({ 已改 )} + {isCustom && customStyleRefImage && !isEditing && ( + + + 附参考图 + + )} {/* 「自动」语义就是「让 AI 自己判断画风」,没有 prompt 可显示也无从编辑; @@ -1080,6 +1167,103 @@ function StyleModal({ ) : /* prompt 区域:非编辑态是看起来像文本框的只读容器;编辑态是真的 textarea */ isEditing ? (
+ {/* 自定义卡专属:上传画风参考图。上传后会:(1) 用 vision LLM + 解析成 prompt 覆盖到下方 textarea;(2) 图片本身随会话送到 + 画师,每幕都作为 reference 锚定画风。 */} + {isCustom && ( +
e.stopPropagation()} + className="flex flex-col gap-2" + > + { + const f = e.target.files?.[0]; + if (f) handleUploadStyleImage(f); + // reset 让同一文件重选能再次触发 onChange + if (fileInputRef.current) fileInputRef.current.value = ""; + }} + /> + {customStyleRefImage ? ( +
+ {/* eslint-disable-next-line @next/next/no-img-element */} + 画风参考图 +
+ + + 参考图已上传 + + + AI 已解析为下方 prompt;每一幕画师都会参考这张图 + +
+
+ + +
+
+ ) : ( + + )} + {parseError && ( + + + {parseError} + + )} +
+ )}