feat(ai-client): multi-provider compat — native Anthropic/Google + URL tolerance

- TEXT/VISION: add native Anthropic & Google Gemini paths via Vercel AI SDK, selectable through TEXT_PROVIDER / VISION_PROVIDER (default openai_compatible) - IMAGE: expand to openai (gpt-image) / google (Nano Banana) via AI SDK alongside the existing Runware task-array and OpenAI-compatible REST paths - normalizeBaseUrl: tolerate URLs with/without /v1 (or /chat/completions); append the per-protocol version segment only for bare hosts - config: readProvider() reads *_PROVIDER; types: ProviderProtocol + provider? - deps: @ai-sdk/anthropic, @ai-sdk/google; docs in .env.example + README Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-04 15:51:53 +08:00
parent a4dc57a1b6
commit 83fd5717e7
10 changed files with 614 additions and 67 deletions
@@ -1,5 +1,9 @@
-import type { ProviderConfig } from "@infiplot/types";
+import { generateImage as generateImageSdk } from "ai";
+import { createOpenAI } from "@ai-sdk/openai";
+import { createGoogleGenerativeAI } from "@ai-sdk/google";
+import type { ProviderConfig, ProviderProtocol } from "@infiplot/types";
 import { fetchWithRetry } from "./fetchWithRetry";
+import { normalizeBaseUrl } from "./normalizeUrl";

 // Runware uses its own task-array protocol (not OpenAI-compatible).
 // POST <baseUrl> with [{ taskType: "imageInference", ... }]; errors come
@@ -38,30 +42,52 @@ export type GenerateImageOptions = {
   * Reference image (UUID, public URL, or base64) for img2img. When set,
   * FLUX preserves the seed image's composition and applies `strength` to
   * deviate. NOTE: FLUX.2 [klein] 9B KV does NOT support seedImage — use
-   * `referenceImages` for visual continuity instead.
+   * `referenceImages` for visual continuity instead. Runware-only.
   */
  seedImage?: string;
  /**
   * Reference images (UUIDs, URLs, or base64) to condition generation on —
   * typically character portraits + the prior scene image. Runware caps at 4;
-   * we silently truncate beyond that.
+   * we silently truncate beyond that. On the OpenAI/Gemini AI SDK paths these
+   * map to `prompt.images` (the SDK accepts public URLs or data URLs).
   */
  referenceImages?: string[];
-  /** 0–1, FLUX needs ≥ 0.8 to actually have an effect. */
+  /** 0–1, FLUX needs ≥ 0.8 to actually have an effect. Runware-only. */
  strength?: number;
 };

 export type GenerateImageResult = {
-  /** Public CDN URL of the generated image (Runware-hosted). */
+  /**
+   * Image the client can render directly. A Runware CDN URL on the Runware
+   * path; a `data:<mime>;base64,...` URI on the AI SDK paths (OpenAI/Gemini
+   * return raw bytes, not a hosted URL).
+   */
  imageUrl: string;
-  /** Stable UUID for cheap re-reference in later `referenceImages`. */
+  /**
+   * Stable handle for cheap re-reference in later `referenceImages`. A real
+   * Runware UUID on the Runware path; a synthetic UUID on other paths (those
+   * re-reference via the URL/data-URL form instead).
+   */
  imageUuid: string;
 };

+// Image roles support more protocols than text/vision. When IMAGE_PROVIDER is
+// unset we keep the historical URL-based inference so existing deployments
+// (Runware, or an OpenAI-compatible gateway) behave exactly as before.
+function inferImageProtocol(config: ProviderConfig): ProviderProtocol {
+  const isOpenAiCompat =
+    !config.baseUrl.includes("runware.ai") || config.model === "image-2-vip";
+  return isOpenAiCompat ? "openai_compatible" : "runware";
+}
+
+function resolveImageProtocol(config: ProviderConfig): ProviderProtocol {
+  return config.provider ?? inferImageProtocol(config);
+}
+
 // ──────────────────────────────────────────────────────────────────────
 //  generateImage — text-to-image (default) or referenceImages-conditioned.
-//  Returns both the public URL (for client display + future references)
-//  and the UUID (cheapest reference form for subsequent calls).
+//  Returns both a renderable image URL and a re-reference handle (see
+//  GenerateImageResult). Dispatches on the resolved wire protocol.
 // ──────────────────────────────────────────────────────────────────────

 export async function generateImage(
@@ -69,51 +95,120 @@ export async function generateImage(
  prompt: string,
  options?: GenerateImageOptions,
 ): Promise<GenerateImageResult> {
-  const url = config.baseUrl.replace(/\/$/, "");
+  const protocol = resolveImageProtocol(config);
+  switch (protocol) {
+    case "openai":
+    case "google":
+      return generateImageViaAiSdk(config, prompt, options, protocol);
+    case "runware":
+      return generateImageRunware(config, prompt, options);
+    case "anthropic":
+      throw new Error(
+        'IMAGE_PROVIDER "anthropic" does not generate images. Use "openai", "google", "runware", or "openai_compatible".',
+      );
+    case "openai_compatible":
+    default:
+      return generateImageOpenAiCompatible(config, prompt);
+  }
+}

-  // 1. OpenAI-compatible route (GPTGod, DALL-E, etc.)
-  const isOpenAi = !url.includes("runware.ai") || config.model === "image-2-vip";
-  if (isOpenAi) {
-    const endpoint = url.endsWith("/images/generations") ? url : `${url}/images/generations`;
-    console.log(`[ai-client] Calling OpenAI-compatible image generations at: ${endpoint} with model: ${config.model}`);
-    
-    const res = await fetchWithRetry(endpoint, {
-      method: "POST",
-      headers: {
-        "Content-Type": "application/json",
-        Authorization: `Bearer ${config.apiKey}`,
-      },
-      body: JSON.stringify({
-        model: config.model,
-        prompt: prompt,
-        n: 1,
-        size: "1792x1024", // Use horizontal size (16:9)
-      }),
-    });
+// Native OpenAI (gpt-image) / Gemini (Nano Banana) via the Vercel AI SDK.
+// Unlike the fetch path, this supports reference-image editing via
+// `prompt.images`. The SDK returns raw bytes (no hosted URL), so we hand the
+// client a data URI and synthesize a UUID; continuity references reuse the
+// data URI rather than a provider UUID.
+async function generateImageViaAiSdk(
+  config: ProviderConfig,
+  prompt: string,
+  options: GenerateImageOptions | undefined,
+  protocol: "openai" | "google",
+): Promise<GenerateImageResult> {
+  const baseURL = normalizeBaseUrl(config.baseUrl, protocol);
+  const imageModel =
+    protocol === "openai"
+      ? createOpenAI({ apiKey: config.apiKey, baseURL }).image(config.model)
+      : createGoogleGenerativeAI({ apiKey: config.apiKey, baseURL }).image(
+          config.model,
+        );

-    const text = await res.text();
-    let json: any;
-    try {
-      json = JSON.parse(text);
-    } catch {
-      throw new Error(`OpenAI Image API error ${res.status}: ${text.slice(0, 500)}`);
-    }
+  const refs = (options?.referenceImages ?? []).slice(0, MAX_REFERENCE_IMAGES);
+  const promptArg =
+    refs.length > 0 ? { text: prompt, images: refs } : prompt;

-    if (json.error) {
-      throw new Error(`OpenAI Image API error: ${json.error.message || JSON.stringify(json.error)}`);
-    }
+  // OpenAI's image models take an explicit `size`; gpt-image's widest landscape
+  // option is 1536x1024. Gemini takes an `aspectRatio` instead.
+  const { image } = await generateImageSdk({
+    model: imageModel,
+    prompt: promptArg,
+    ...(protocol === "openai"
+      ? { size: "1536x1024" as `${number}x${number}` }
+      : { aspectRatio: "16:9" as `${number}:${number}` }),
+  });

-    const data = json.data?.[0];
-    const imageUrl = data?.url;
-    if (!imageUrl) {
-      throw new Error(`No image URL in OpenAI response: ${text.slice(0, 300)}`);
-    }
-    // Generate a mock UUID since OpenAI compatible endpoint doesn't have UUIDs
-    const imageUuid = crypto.randomUUID();
-    return { imageUrl, imageUuid };
+  return {
+    imageUrl: `data:${image.mediaType};base64,${image.base64}`,
+    imageUuid: crypto.randomUUID(),
+  };
+}
+
+// OpenAI-compatible REST route (GPTGod, DALL-E proxies, etc.). Basic
+// text-to-image only — no reference images on this path; for editing/anchoring
+// set IMAGE_PROVIDER=openai (or google) to take the AI SDK path above.
+async function generateImageOpenAiCompatible(
+  config: ProviderConfig,
+  prompt: string,
+): Promise<GenerateImageResult> {
+  const base = normalizeBaseUrl(config.baseUrl, "openai_compatible");
+  const endpoint = `${base}/images/generations`;
+  console.log(
+    `[ai-client] Calling OpenAI-compatible image generations at: ${endpoint} with model: ${config.model}`,
+  );
+
+  const res = await fetchWithRetry(endpoint, {
+    method: "POST",
+    headers: {
+      "Content-Type": "application/json",
+      Authorization: `Bearer ${config.apiKey}`,
+    },
+    body: JSON.stringify({
+      model: config.model,
+      prompt: prompt,
+      n: 1,
+      size: "1792x1024", // Use horizontal size (16:9)
+    }),
+  });
+
+  const text = await res.text();
+  let json: any;
+  try {
+    json = JSON.parse(text);
+  } catch {
+    throw new Error(`OpenAI Image API error ${res.status}: ${text.slice(0, 500)}`);
  }

-  // 2. Runware task-array route
+  if (json.error) {
+    throw new Error(`OpenAI Image API error: ${json.error.message || JSON.stringify(json.error)}`);
+  }
+
+  const data = json.data?.[0];
+  const imageUrl = data?.url;
+  if (!imageUrl) {
+    throw new Error(`No image URL in OpenAI response: ${text.slice(0, 300)}`);
+  }
+  // Generate a mock UUID since OpenAI compatible endpoint doesn't have UUIDs
+  const imageUuid = crypto.randomUUID();
+  return { imageUrl, imageUuid };
+}
+
+// Runware task-array route — self-implemented to preserve the UUID/URL closed
+// loop (the official @runware/ai-sdk-provider drops both).
+async function generateImageRunware(
+  config: ProviderConfig,
+  prompt: string,
+  options?: GenerateImageOptions,
+): Promise<GenerateImageResult> {
+  const url = normalizeBaseUrl(config.baseUrl, "runware");
+
  const task: Record<string, unknown> = {
    taskType: "imageInference",
    taskUUID: crypto.randomUUID(),