refactor(ai-client): replace AI SDK adapters with OpenAI SDK

2026-06-11 16:11:44 +08:00
parent 6cd7d88326
commit ef3b57953b
11 changed files with 191 additions and 275 deletions
@@ -1,29 +1,24 @@
-import { generateText } from "ai";
-import type { LanguageModelUsage, ModelMessage } from "ai";
+import OpenAI from "openai";
 import type { ProviderConfig } from "@infiplot/types";
-import { createLanguageModel, resolveProtocol } from "./model";
+import { normalizeBaseUrl } from "./normalizeUrl";

 export type ChatMessage = {
  role: "system" | "user" | "assistant";
  content: string;
 };

-// AI SDK 6 unifies cache stats across providers into usage.inputTokenDetails,
-// so a single shape covers Anthropic, Gemini, and OpenAI-compatible providers.
 function summarizeSdkUsage(
  tag: string,
-  usage: LanguageModelUsage | undefined,
+  usage: OpenAI.Completions.CompletionUsage | undefined,
 ): string {
  if (!usage) return `[cache] ${tag} no-usage`;
-  const input = usage.inputTokens ?? 0;
-  const output = usage.outputTokens ?? 0;
-  const read = usage.inputTokenDetails?.cacheReadTokens;
-  const write = usage.inputTokenDetails?.cacheWriteTokens;
-  if (typeof read === "number" || typeof write === "number") {
-    const hit = read ?? 0;
-    const create = write ?? 0;
-    const rate = input > 0 ? ((hit / input) * 100).toFixed(1) : "n/a";
-    return `[cache] ${tag} hit=${hit} create=${create} input=${input} rate=${rate}% completion=${output}`;
+  const input = usage.prompt_tokens ?? 0;
+  const output = usage.completion_tokens ?? 0;
+  const details = (usage as { prompt_tokens_details?: { cached_tokens?: number } }).prompt_tokens_details;
+  const cached = details?.cached_tokens;
+  if (typeof cached === "number") {
+    const rate = input > 0 ? ((cached / input) * 100).toFixed(1) : "n/a";
+    return `[cache] ${tag} hit=${cached} input=${input} rate=${rate}% completion=${output}`;
  }
  return `[cache] ${tag} input=${input} completion=${output} (provider didn't report cache stats)`;
 }
@@ -36,28 +31,28 @@ export async function chat(
    tag?: string;
  },
 ): Promise<string> {
-  const protocol = resolveProtocol(config);
-  const model = createLanguageModel(config, protocol);
-
-  const system = messages.find((m) => m.role === "system")?.content;
-  const convo: ModelMessage[] = messages
-    .filter((m) => m.role !== "system")
-    .map((m) => ({
-      role: m.role as "user" | "assistant",
-      content: m.content,
-    }));
-
-  const { text, usage } = await generateText({
-    model,
-    system,
-    messages: convo,
-    temperature: opts?.temperature ?? 0.9,
+  const client = new OpenAI({
+    apiKey: config.apiKey,
+    baseURL: normalizeBaseUrl(config.baseUrl, "openai_compatible"),
+    maxRetries: 0,
+    dangerouslyAllowBrowser: true,
  });

-  console.log(summarizeSdkUsage(opts?.tag ?? "chat", usage));
+  const completion = await client.chat.completions.create({
+    model: config.model,
+    messages: messages.map((m) => ({
+      role: m.role as "system" | "user" | "assistant",
+      content: m.content,
+    })),
+    temperature: opts?.temperature ?? 0.9,
+    stream: false,
+  });

-  if (typeof text !== "string" || text.length === 0) {
-    throw new Error(`Chat API (AI SDK ${protocol}) returned no content.`);
+  const text = completion.choices[0]?.message?.content ?? "";
+  console.log(summarizeSdkUsage(opts?.tag ?? "chat", completion.usage ?? undefined));
+
+  if (text.length === 0) {
+    throw new Error(`Chat API returned no content.`);
  }
  return text;
 }
@@ -1,6 +1,4 @@
-import { generateImage as generateImageSdk } from "ai";
-import { createOpenAI } from "@ai-sdk/openai";
-import { createGoogleGenerativeAI } from "@ai-sdk/google";
+import OpenAI, { toFile, type Uploadable } from "openai";
 import type { Orientation, ProviderConfig, ProviderProtocol } from "@infiplot/types";
 import { fetchWithRetry } from "./fetchWithRetry";
 import { normalizeBaseUrl } from "./normalizeUrl";
@@ -48,8 +46,8 @@ export type GenerateImageOptions = {
  /**
   * Reference images (UUIDs, URLs, or base64) to condition generation on —
   * typically character portraits + the prior scene image. Runware caps at 4;
-   * we silently truncate beyond that. On the OpenAI/Gemini AI SDK paths these
-   * map to `prompt.images` (the SDK accepts public URLs or data URLs).
+   * we silently truncate beyond that. The native OpenAI path accepts only
+   * http(s) URLs and data URLs, which are fetched and sent to `images.edit`.
   */
  referenceImages?: string[];
  /** 0–1, FLUX needs ≥ 0.8 to actually have an effect. Runware-only. */
@@ -58,7 +56,7 @@ export type GenerateImageOptions = {
   * Output aspect, locked per session. "portrait" → 9:16 vertical for mobile;
   * default/"landscape" → 16:9 widescreen. Mapped to each provider's nearest
   * supported size: Runware 1024×1792, OpenAI-compatible REST 1024x1792,
-   * native gpt-image 1024x1536, Gemini aspectRatio 9:16.
+   * native gpt-image 1024x1536.
   */
  orientation?: Orientation;
 };
@@ -66,8 +64,8 @@ export type GenerateImageOptions = {
 export type GenerateImageResult = {
  /**
   * Image the client can render directly. A Runware CDN URL on the Runware
-   * path; a `data:<mime>;base64,...` URI on the AI SDK paths (OpenAI/Gemini
-   * return raw bytes, not a hosted URL).
+   * path; a `data:<mime>;base64,...` URI on the native OpenAI path when GPT
+   * image models return raw bytes instead of a hosted URL.
   */
  imageUrl: string;
  /**
@@ -117,63 +115,124 @@ export async function generateImage(
  const protocol = resolveImageProtocol(config);
  switch (protocol) {
    case "openai":
-    case "google":
-      return generateImageViaAiSdk(config, prompt, options, protocol);
+      return generateImageOpenAi(config, prompt, options);
    case "runware":
      return generateImageRunware(config, prompt, options);
-    case "anthropic":
-      throw new Error(
-        'IMAGE_PROVIDER "anthropic" does not generate images. Use "openai", "google", "runware", or "openai_compatible".',
-      );
    case "openai_compatible":
    default:
      return generateImageOpenAiCompatible(config, prompt, options);
  }
 }

-// Native OpenAI (gpt-image) / Gemini (Nano Banana) via the Vercel AI SDK.
-// Unlike the fetch path, this supports reference-image editing via
-// `prompt.images`. The SDK returns raw bytes (no hosted URL), so we hand the
-// client a data URI and synthesize a UUID; continuity references reuse the
-// data URI rather than a provider UUID.
-async function generateImageViaAiSdk(
+// Native OpenAI (gpt-image) via the official OpenAI SDK. Unlike the compatible
+// fetch path, this supports reference-image editing through `images.edit`.
+// GPT image models return base64 bytes, so we hand the client a data URI (a
+// hosted URL is passed through if one is returned instead) and synthesize a
+// UUID; continuity references reuse that value rather than a provider UUID.
+async function generateImageOpenAi(
  config: ProviderConfig,
  prompt: string,
-  options: GenerateImageOptions | undefined,
-  protocol: "openai" | "google",
+  options?: GenerateImageOptions,
 ): Promise<GenerateImageResult> {
-  const baseURL = normalizeBaseUrl(config.baseUrl, protocol);
-  const imageModel =
-    protocol === "openai"
-      ? createOpenAI({ apiKey: config.apiKey, baseURL }).image(config.model)
-      : createGoogleGenerativeAI({ apiKey: config.apiKey, baseURL }).image(
-          config.model,
-        );
-
-  const refs = (options?.referenceImages ?? []).slice(0, MAX_REFERENCE_IMAGES);
-  const promptArg =
-    refs.length > 0 ? { text: prompt, images: refs } : prompt;
-
-  // Session-locked aspect. gpt-image takes an explicit `size` (portrait /
-  // landscape options are 1024x1536 / 1536x1024); Gemini takes an `aspectRatio`.
-  const portrait = options?.orientation === "portrait";
-  const { image } = await generateImageSdk({
-    model: imageModel,
-    prompt: promptArg,
-    ...(protocol === "openai"
-      ? { size: (portrait ? "1024x1536" : "1536x1024") as `${number}x${number}` }
-      : { aspectRatio: (portrait ? "9:16" : "16:9") as `${number}:${number}` }),
+  const client = new OpenAI({
+    apiKey: config.apiKey,
+    baseURL: normalizeBaseUrl(config.baseUrl, "openai"),
+    maxRetries: 2,
+    dangerouslyAllowBrowser: true,
  });
+  const refs = (options?.referenceImages ?? []).slice(0, MAX_REFERENCE_IMAGES);
+  const portrait = options?.orientation === "portrait";
+  const size = portrait ? "1024x1536" : "1536x1024";

-  return {
-    imageUrl: `data:${image.mediaType};base64,${image.base64}`,
-    imageUuid: crypto.randomUUID(),
-  };
+  const response =
+    refs.length > 0
+      ? await client.images.edit({
+          model: config.model,
+          prompt,
+          image: await Promise.all(refs.map(referenceImageToUploadable)),
+          n: 1,
+          size,
+        })
+      : await client.images.generate({
+          model: config.model,
+          prompt,
+          n: 1,
+          size,
+        });
+
+  return imageResponseToResult(response);
+}
+
+async function referenceImageToUploadable(ref: string): Promise<Uploadable> {
+  if (ref.startsWith("data:")) {
+    const response = await fetch(ref);
+    if (!response.ok) {
+      throw new Error(`Failed to read data URL reference image.`);
+    }
+    const mediaType = response.headers.get("content-type") ?? "image/png";
+    return toFile(response, `reference.${extensionFromMediaType(mediaType)}`, {
+      type: mediaType,
+    });
+  }
+
+  if (/^https?:\/\//i.test(ref)) {
+    const response = await fetch(ref);
+    if (!response.ok) {
+      throw new Error(
+        `Failed to fetch reference image ${ref}: HTTP ${response.status}`,
+      );
+    }
+    const mediaType = response.headers.get("content-type") ?? "image/png";
+    return toFile(response, filenameFromUrl(ref, mediaType), {
+      type: mediaType,
+    });
+  }
+
+  throw new Error(
+    `Native OpenAI image editing requires reference image URLs or data URLs; got "${ref.slice(0, 32)}...".`,
+  );
+}
+
+function imageResponseToResult(
+  response: OpenAI.Images.ImagesResponse,
+): GenerateImageResult {
+  const data = response.data?.[0];
+  const b64 = data?.b64_json;
+  if (b64) {
+    const format = response.output_format ?? "png";
+    return {
+      imageUrl: `data:image/${format};base64,${b64}`,
+      imageUuid: crypto.randomUUID(),
+    };
+  }
+
+  const imageUrl = data?.url;
+  if (imageUrl) {
+    return { imageUrl, imageUuid: crypto.randomUUID() };
+  }
+
+  throw new Error(`No image data in OpenAI response.`);
+}
+
+function filenameFromUrl(url: string, mediaType: string): string {
+  try {
+    const name = new URL(url).pathname.split("/").filter(Boolean).at(-1);
+    if (name && /\.[a-z0-9]+$/i.test(name)) return name;
+  } catch {
+    // Fall back to the media type below.
+  }
+  return `reference.${extensionFromMediaType(mediaType)}`;
+}
+
+function extensionFromMediaType(mediaType: string): string {
+  if (mediaType.includes("jpeg") || mediaType.includes("jpg")) return "jpg";
+  if (mediaType.includes("webp")) return "webp";
+  return "png";
 }

 // OpenAI-compatible REST route (GPTGod, DALL-E proxies, etc.). Basic
 // text-to-image only — no reference images on this path; for editing/anchoring
-// set IMAGE_PROVIDER=openai (or google) to take the AI SDK path above.
+// set IMAGE_PROVIDER=openai to take the native OpenAI path above.
 async function generateImageOpenAiCompatible(
  config: ProviderConfig,
  prompt: string,
@@ -1,23 +0,0 @@
-import { createAnthropic } from "@ai-sdk/anthropic";
-import { createGoogleGenerativeAI } from "@ai-sdk/google";
-import { createOpenAI } from "@ai-sdk/openai";
-import type { ProviderConfig, ProviderProtocol } from "@infiplot/types";
-import { normalizeBaseUrl } from "./normalizeUrl";
-
-export function resolveProtocol(config: ProviderConfig): ProviderProtocol {
-  return config.provider ?? "openai_compatible";
-}
-
-export function createLanguageModel(config: ProviderConfig, protocol: ProviderProtocol) {
-  const baseURL = normalizeBaseUrl(config.baseUrl, protocol);
-  switch (protocol) {
-    case "anthropic":
-      return createAnthropic({ apiKey: config.apiKey, baseURL })(config.model);
-    case "google":
-      return createGoogleGenerativeAI({ apiKey: config.apiKey, baseURL })(config.model);
-    case "openai_compatible":
-    case "openai":
-    default:
-      return createOpenAI({ apiKey: config.apiKey, baseURL }).chat(config.model);
-  }
-}
@@ -31,8 +31,6 @@ const ENDPOINT_SUFFIX =
 const DEFAULT_VERSION_SEGMENT: Record<ProviderProtocol, string | null> = {
  openai_compatible: "v1",
  openai: "v1",
-  anthropic: "v1",
-  google: "v1beta",
  // Runware posts to the bare base URL with no version-pathed sub-resource,
  // so never inject a segment for it.
  runware: null,
@@ -1,7 +1,6 @@
-import { generateText } from "ai";
-import type { ModelMessage } from "ai";
+import OpenAI from "openai";
 import type { ProviderConfig } from "@infiplot/types";
-import { createLanguageModel, resolveProtocol } from "./model";
+import { normalizeBaseUrl } from "./normalizeUrl";

 const VISION_TIMEOUT_MS = 60_000;

@@ -22,34 +21,32 @@ export async function analyzeImageDataUrl(
  imageDataUrl: string,
  prompt: string,
 ): Promise<string> {
-  const protocol = resolveProtocol(config);
-  const model = createLanguageModel(config, protocol);
+  const client = new OpenAI({
+    apiKey: config.apiKey,
+    baseURL: normalizeBaseUrl(config.baseUrl, "openai_compatible"),
+    maxRetries: 0,
+    timeout: VISION_TIMEOUT_MS,
+    dangerouslyAllowBrowser: true,
+  });

-  const messages: ModelMessage[] = [
-    {
-      role: "user",
-      content: [
-        { type: "text", text: prompt },
-        { type: "image", image: imageDataUrl },
-      ],
-    },
-  ];
+  const completion = await client.chat.completions.create({
+    model: config.model,
+    messages: [
+      {
+        role: "user",
+        content: [
+          { type: "text", text: prompt },
+          { type: "image_url", image_url: { url: imageDataUrl } },
+        ],
+      },
+    ],
+    temperature: 0.2,
+    stream: false,
+  });

-  const timeoutCtrl = new AbortController();
-  const timeoutId = setTimeout(() => timeoutCtrl.abort(), VISION_TIMEOUT_MS);
-  try {
-    const { text } = await generateText({
-      model,
-      messages,
-      temperature: 0.2,
-      maxRetries: 0,
-      abortSignal: timeoutCtrl.signal,
-    });
-    if (typeof text !== "string" || text.length === 0) {
-      throw new Error(`Vision API (AI SDK ${protocol}) returned no content.`);
-    }
-    return text;
-  } finally {
-    clearTimeout(timeoutId);
+  const text = completion.choices[0]?.message?.content ?? "";
+  if (text.length === 0) {
+    throw new Error(`Vision API returned no content.`);
  }
+  return text;
 }
@@ -10,8 +10,6 @@ const STORAGE_KEY = "infiplot:model";

 const VALID_PROTOCOLS: ProviderProtocol[] = [
  "openai_compatible",
-  "anthropic",
-  "google",
  "openai",
  "runware",
 ];
@@ -6,8 +6,6 @@ import type {

 const VALID_PROTOCOLS = [
  "openai_compatible",
-  "anthropic",
-  "google",
  "openai",
  "runware",
 ] as const;
@@ -327,19 +327,15 @@ export type VisionClassify = "insert-beat" | "change-scene";
 *   openai_compatible  text / vision / image  — OpenAI Chat Completions +
 *                      `/images/generations` (self-implemented fetch; the
 *                      default for text/vision when unset)
- *   anthropic          text / vision          — native Anthropic Messages (AI SDK)
- *   google             text / vision / image  — native Gemini (AI SDK); image
- *                      uses the Nano Banana family
- *   openai             image only             — OpenAI gpt-image via AI SDK,
- *                      unlocks reference-image editing (for text/vision use
- *                      openai_compatible, which already speaks OpenAI's format)
+ *   openai             image only             — OpenAI gpt-image via the
+ *                      official OpenAI SDK, unlocks reference-image editing
+ *                      (for text/vision use openai_compatible, which already
+ *                      speaks OpenAI's format)
 *   runware            image only             — Runware task-array protocol
 *                      (self-implemented; the default for runware.ai URLs)
 */
 export type ProviderProtocol =
  | "openai_compatible"
-  | "anthropic"
-  | "google"
  | "openai"
  | "runware";