feat(web,engine): custom style — image upload, AI-extract prompt, painter ref

自定义画风入口里加上传按钮：客户端把图缩到 512px webp (base64)，传到新路由 /api/parse-style-image，vision LLM 解析成英文 style prompt 回填 textarea；图本身随 sessionStorage → /api/start → Session.styleReferenceImage 透传，painter.collectReferenceImages 把它置于 slot 0，整局每一幕都作为 reference 图锚定画风（brush / color / mood），比 priorScene 优先级更高。

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-03 19:15:19 +08:00
parent 298ecd4ec0
commit 347ab297d5
10 changed files with 396 additions and 15 deletions
@@ -1,5 +1,5 @@
 export { chat } from "./chat";
 export { generateImage } from "./image";
 export type { GenerateImageOptions, GenerateImageResult } from "./image";
-export { interpretClick } from "./vision";
+export { interpretClick, analyzeImageDataUrl } from "./vision";
 export type { ChatMessage } from "./chat";
@@ -5,26 +5,46 @@ export async function interpretClick(
  config: ProviderConfig,
  imageBase64: string,
  prompt: string,
+): Promise<string> {
+  // Wrap the raw base64 in a PNG data URL — the Canvas annotator on the
+  // client encodes as PNG. analyzeImageDataUrl handles the actual request.
+  return analyzeImageDataUrl(
+    config,
+    `data:image/png;base64,${imageBase64}`,
+    prompt,
+    { responseFormat: "json_object" },
+  );
+}
+
+/**
+ * General single-image vision call. Accepts a complete data URL (preserves
+ * the source mime type, e.g. webp/jpeg) and lets the caller opt out of
+ * `response_format: json_object` for free-form text responses.
+ */
+export async function analyzeImageDataUrl(
+  config: ProviderConfig,
+  imageDataUrl: string,
+  prompt: string,
+  opts: { responseFormat?: "json_object" | "text" } = {},
 ): Promise<string> {
  const url = `${config.baseUrl.replace(/\/$/, "")}/chat/completions`;

-  const body = {
+  const body: Record<string, unknown> = {
    model: config.model,
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: prompt },
-          {
-            type: "image_url",
-            image_url: { url: `data:image/png;base64,${imageBase64}` },
-          },
+          { type: "image_url", image_url: { url: imageDataUrl } },
        ],
      },
    ],
    temperature: 0.2,
-    response_format: { type: "json_object" },
  };
+  if (opts.responseFormat === "json_object") {
+    body.response_format = { type: "json_object" };
+  }

  const timeoutCtrl = new AbortController();
  const timeoutId = setTimeout(() => timeoutCtrl.abort(), 60_000);
@@ -47,6 +47,13 @@ export type PainterInput = {
   * with character refs, capped at 4 total per Runware spec.
   */
  priorSceneImage?: string;
+  /**
+   * User-uploaded style reference (data URL base64). When set, it takes the
+   * highest-priority slot in referenceImages so the painting STYLE (brush /
+   * color / mood) of the user's image is anchored across every scene this
+   * session paints — even before any priorScene exists.
+   */
+  styleReferenceImage?: string;
 };

 // Pick the references we send to Runware as `referenceImages`. Priority:
@@ -59,14 +66,22 @@ export function collectReferenceImages(
  characters: Character[],
  entryBeat: Beat | undefined,
  priorSceneImage: string | undefined,
+  styleReferenceImage?: string,
 ): string[] {
  const refs: string[] = [];
  const seen = new Set<string>();

-  // Slot 0 — prior scene image for spatial continuity. Goes first because
-  // backdrop drift is the most jarring discontinuity across same-sceneKey
-  // scenes; character drift is partially masked by character archetype text
-  // in the prompt anyway.
+  // Slot 0 — user-uploaded style reference image, if any. Goes first because
+  // it anchors the whole-session painting STYLE (brush / color / mood) that
+  // the user explicitly chose. priorScene continuity comes second; character
+  // archetypes are partially covered by the prompt text anyway.
+  if (styleReferenceImage) {
+    refs.push(styleReferenceImage);
+  }
+
+  // Slot N — prior scene image for spatial continuity. Backdrop drift is the
+  // next-most jarring discontinuity across same-sceneKey scenes; character
+  // drift is partially masked by character archetype text in the prompt.
  if (priorSceneImage) {
    refs.push(priorSceneImage);
  }
@@ -140,6 +155,7 @@ export async function runPainter(
    input.onStageCharacters,
    entryBeat,
    input.priorSceneImage,
+    input.styleReferenceImage,
  );

  // Tier A — with referenceImages (priorSceneImage + character portraits).
@@ -327,6 +327,7 @@ export async function directScene(
      styleGuide: session.styleGuide,
      onStageCharacters,
      priorSceneImage: priorSceneReference,
+      styleReferenceImage: session.styleReferenceImage,
    },
    entryBeat,
  );
@@ -47,6 +47,7 @@ export async function startSession(
    styleGuide: req.styleGuide.trim(),
    history: [],
    characters: [],
+    styleReferenceImage: req.styleReferenceImage?.trim() || undefined,
  };

  // Stage 0 — Architect: expand the terse world/style prompt into a story
@@ -206,6 +206,14 @@ export type Session = {
   * session payload created before this field existed.
   */
  storyState?: StoryState;
+  /**
+   * Optional user-uploaded style reference image (data URL — `data:image/...;base64,...`).
+   * When set, the Painter prepends it to `referenceImages` on every scene so the
+   * uploaded image anchors painting style (brush, color, mood) across the whole
+   * session. Resized client-side before upload (~512px max dim) to keep session
+   * payload small for /api/scene round-trips.
+   */
+  styleReferenceImage?: string;
 };

 // ──────────────────────────────────────────────────────────────────────
@@ -253,6 +261,21 @@ export type EngineConfig = {
 export type StartRequest = {
  worldSetting: string;
  styleGuide: string;
+  /** Optional user-uploaded style reference image — see Session.styleReferenceImage. */
+  styleReferenceImage?: string;
+};
+
+// /api/parse-style-image — vision LLM extracts a textual painting-style
+// prompt from a user-uploaded reference image. The response carries only the
+// prompt; the client keeps the image itself to pass through to /api/start.
+export type ParseStyleImageRequest = {
+  /** Data URL: `data:image/...;base64,...`. */
+  imageDataUrl: string;
+};
+
+export type ParseStyleImageResponse = {
+  /** English style prompt suitable as a styleGuide (FLUX-friendly attributes). */
+  stylePrompt: string;
 };

 export type StartResponse = {