feat: Runware FLUX.2 image + lazy per-beat TTS (#5)

Reduce median scene-load latency from ~30-80s to ~17-25s by switching image generation to Runware FLUX.2 [klein] 9B KV and moving per-beat TTS synthesis off the scene response into a new lazy /api/beat-audio endpoint with hard timeout + abort support.

- feat(image): migrate to Runware FLUX.2 [klein] 9B KV — task-array API, $0.001/image, sub-second inference.
- feat(tts): split /api/scene into directScene + image + voicedesign-provisioning; lazily synthesize each beat via /api/beat-audio with a 15s hard timeout + AbortSignal threaded to MiMo so timed-out calls don't keep burning sockets/quota; the client fans out per-beat fetches on scene-id change with abort + an identity-checked `finally` to prevent cross-scene beat-id collisions.
- refactor(tts): slim BeatAudioRequest to { beat, voice } — the ~800KB per-beat upload drops to ~160KB by sending only the speaker's voice instead of the full session.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
2026-05-28 23:43:51 +08:00
parent fcd4e6c1ab
commit e261f4a346
10 changed files with 431 additions and 214 deletions
@@ -1,28 +1,43 @@
 import type { ProviderConfig } from "@yume/types";
 import { fetchWithRetry } from "./fetchWithRetry";

-type ImageUrlPart = { type: string; image_url?: { url?: string } };
-type ChatResponse = {
-  choices: {
-    message: {
-      content: string | ImageUrlPart[];
-      images?: ImageUrlPart[];
-    };
-  }[];
+// Runware uses its own task-array protocol (not OpenAI-compatible).
+// POST <baseUrl> with [{ taskType: "imageInference", ... }]; errors come
+// back as a 200 with `errors[]`, so the body must be parsed and inspected regardless of HTTP status.
+type RunwareImageResult = {
+  imageBase64Data?: string;
+};
+type RunwareError = {
+  code?: string;
+  message?: string;
+  parameter?: string;
+};
+type RunwareResponse = {
+  data?: RunwareImageResult[];
+  errors?: RunwareError[];
 };

 export async function generateImage(
  config: ProviderConfig,
  prompt: string,
 ): Promise<string> {
-  const url = `${config.baseUrl.replace(/\/$/, "")}/chat/completions`;
+  const url = config.baseUrl.replace(/\/$/, "");

-  const body = {
-    model: config.model,
-    modalities: ["image", "text"],
-    size: "1792x1024",
-    messages: [{ role: "user", content: prompt }],
-  };
+  const body = [
+    {
+      taskType: "imageInference",
+      taskUUID: crypto.randomUUID(),
+      model: config.model,
+      positivePrompt: prompt,
+      width: 1792,
+      height: 1024,
+      steps: 4,
+      CFGScale: 3.5,
+      numberResults: 1,
+      outputType: "base64Data",
+      outputFormat: "PNG",
+    },
+  ];

  const res = await fetchWithRetry(url, {
    method: "POST",
@@ -33,47 +48,27 @@ export async function generateImage(
    body: JSON.stringify(body),
  });

-  if (!res.ok) {
-    const text = await res.text();
+  const text = await res.text();
+  let json: RunwareResponse;
+  try {
+    json = JSON.parse(text) as RunwareResponse;
+  } catch {
    throw new Error(`Image API error ${res.status}: ${text.slice(0, 500)}`);
  }

-  const json = (await res.json()) as ChatResponse;
-  const msg = json.choices[0]?.message;
-  if (!msg) throw new Error("Image API returned no message");
-
-  // 1) OpenRouter-style: msg.images = [{ image_url: { url } }]
-  // 2) OpenAI multimodal: msg.content = [{ type: "image_url", image_url: { url } }]
-  const structured: ImageUrlPart[] = [];
-  if (msg.images) structured.push(...msg.images);
-  if (Array.isArray(msg.content)) structured.push(...msg.content);
-  for (const part of structured) {
-    const u = part.image_url?.url;
-    if (u) return await urlToBase64(u);
+  if (json.errors?.length) {
+    const e = json.errors[0]!;
+    throw new Error(
+      `Runware error [${e.code ?? "unknown"}]: ${e.message ?? "no message"}` +
+        (e.parameter ? ` (parameter: ${e.parameter})` : ""),
+    );
  }

-  // 3) provider-style: content is a string with markdown image ![alt](url)
-  //    or a bare URL fragment
-  if (typeof msg.content === "string") {
-    const md = msg.content.match(/!\[[^\]]*\]\((https?:\/\/[^\s)]+)\)/);
-    if (md?.[1]) return await urlToBase64(md[1]);
-    const bare = msg.content.match(/https?:\/\/\S+?\.(?:png|jpg|jpeg|webp)/i);
-    if (bare?.[0]) return await urlToBase64(bare[0]);
+  const b64 = json.data?.[0]?.imageBase64Data;
+  if (!b64) {
+    throw new Error(
+      `No image in Runware response: ${text.slice(0, 300)}`,
+    );
  }
-
-  throw new Error(
-    `No image found in response: ${JSON.stringify(msg).slice(0, 300)}`,
-  );
-}
-
-async function urlToBase64(url: string): Promise<string> {
-  if (url.startsWith("data:")) {
-    const idx = url.indexOf("base64,");
-    if (idx === -1) throw new Error("data URL is not base64-encoded");
-    return url.slice(idx + "base64,".length);
-  }
-  const res = await fetch(url);
-  if (!res.ok) throw new Error(`Failed to fetch image url: ${res.status}`);
-  const buf = await res.arrayBuffer();
-  return Buffer.from(buf).toString("base64");
+  return b64;
 }