refactor(ai-client): replace AI SDK adapters with OpenAI SDK

This commit is contained in:
baizhi958216
2026-06-11 16:11:44 +08:00
parent 6cd7d88326
commit ef3b57953b
11 changed files with 191 additions and 275 deletions
+29 -34
View File
@@ -1,29 +1,24 @@
import { generateText } from "ai";
import type { LanguageModelUsage, ModelMessage } from "ai";
import OpenAI from "openai";
import type { ProviderConfig } from "@infiplot/types";
import { createLanguageModel, resolveProtocol } from "./model";
import { normalizeBaseUrl } from "./normalizeUrl";
/** One turn of a chat transcript: the speaker's role plus its plain-text content. */
export type ChatMessage = {
// Who produced this turn.
role: "system" | "user" | "assistant";
// Text of the turn.
content: string;
};
// The OpenAI SDK reports prompt-cache hits in usage.prompt_tokens_details.cached_tokens;
// providers that omit that field get the "didn't report cache stats" line instead.
function summarizeSdkUsage(
tag: string,
usage: LanguageModelUsage | undefined,
usage: OpenAI.Completions.CompletionUsage | undefined,
): string {
if (!usage) return `[cache] ${tag} no-usage`;
const input = usage.inputTokens ?? 0;
const output = usage.outputTokens ?? 0;
const read = usage.inputTokenDetails?.cacheReadTokens;
const write = usage.inputTokenDetails?.cacheWriteTokens;
if (typeof read === "number" || typeof write === "number") {
const hit = read ?? 0;
const create = write ?? 0;
const rate = input > 0 ? ((hit / input) * 100).toFixed(1) : "n/a";
return `[cache] ${tag} hit=${hit} create=${create} input=${input} rate=${rate}% completion=${output}`;
const input = usage.prompt_tokens ?? 0;
const output = usage.completion_tokens ?? 0;
const details = (usage as { prompt_tokens_details?: { cached_tokens?: number } }).prompt_tokens_details;
const cached = details?.cached_tokens;
if (typeof cached === "number") {
const rate = input > 0 ? ((cached / input) * 100).toFixed(1) : "n/a";
return `[cache] ${tag} hit=${cached} input=${input} rate=${rate}% completion=${output}`;
}
return `[cache] ${tag} input=${input} completion=${output} (provider didn't report cache stats)`;
}
@@ -36,28 +31,28 @@ export async function chat(
tag?: string;
},
): Promise<string> {
const protocol = resolveProtocol(config);
const model = createLanguageModel(config, protocol);
const system = messages.find((m) => m.role === "system")?.content;
const convo: ModelMessage[] = messages
.filter((m) => m.role !== "system")
.map((m) => ({
role: m.role as "user" | "assistant",
content: m.content,
}));
const { text, usage } = await generateText({
model,
system,
messages: convo,
temperature: opts?.temperature ?? 0.9,
const client = new OpenAI({
apiKey: config.apiKey,
baseURL: normalizeBaseUrl(config.baseUrl, "openai_compatible"),
maxRetries: 0,
dangerouslyAllowBrowser: true,
});
console.log(summarizeSdkUsage(opts?.tag ?? "chat", usage));
const completion = await client.chat.completions.create({
model: config.model,
messages: messages.map((m) => ({
role: m.role as "system" | "user" | "assistant",
content: m.content,
})),
temperature: opts?.temperature ?? 0.9,
stream: false,
});
if (typeof text !== "string" || text.length === 0) {
throw new Error(`Chat API (AI SDK ${protocol}) returned no content.`);
const text = completion.choices[0]?.message?.content ?? "";
console.log(summarizeSdkUsage(opts?.tag ?? "chat", completion.usage ?? undefined));
if (text.length === 0) {
throw new Error(`Chat API returned no content.`);
}
return text;
}
+107 -48
View File
@@ -1,6 +1,4 @@
import { generateImage as generateImageSdk } from "ai";
import { createOpenAI } from "@ai-sdk/openai";
import { createGoogleGenerativeAI } from "@ai-sdk/google";
import OpenAI, { toFile, type Uploadable } from "openai";
import type { Orientation, ProviderConfig, ProviderProtocol } from "@infiplot/types";
import { fetchWithRetry } from "./fetchWithRetry";
import { normalizeBaseUrl } from "./normalizeUrl";
@@ -48,8 +46,8 @@ export type GenerateImageOptions = {
/**
* Reference images (UUIDs, URLs, or base64) to condition generation on —
* typically character portraits + the prior scene image. Runware caps at 4;
* we silently truncate beyond that. On the OpenAI/Gemini AI SDK paths these
* map to `prompt.images` (the SDK accepts public URLs or data URLs).
* we silently truncate beyond that. On the native OpenAI path these are
* fetched/decoded and sent to `images.edit`.
*/
referenceImages?: string[];
/** 0–1; FLUX needs ≥ 0.8 to actually have an effect. Runware-only. */
@@ -58,7 +56,7 @@ export type GenerateImageOptions = {
* Output aspect, locked per session. "portrait" → 9:16 vertical for mobile;
* default/"landscape" → 16:9 widescreen. Mapped to each provider's nearest
* supported size: Runware 1024×1792, OpenAI-compatible REST 1024x1792,
* native gpt-image 1024x1536, Gemini aspectRatio 9:16.
* native gpt-image 1024x1536.
*/
orientation?: Orientation;
};
@@ -66,8 +64,8 @@ export type GenerateImageOptions = {
export type GenerateImageResult = {
/**
* Image the client can render directly. A Runware CDN URL on the Runware
* path; a `data:<mime>;base64,...` URI on the AI SDK paths (OpenAI/Gemini
* return raw bytes, not a hosted URL).
* path; a `data:<mime>;base64,...` URI on the native OpenAI path when GPT
* image models return raw bytes instead of a hosted URL.
*/
imageUrl: string;
/**
@@ -117,63 +115,124 @@ export async function generateImage(
const protocol = resolveImageProtocol(config);
switch (protocol) {
case "openai":
case "google":
return generateImageViaAiSdk(config, prompt, options, protocol);
return generateImageOpenAi(config, prompt, options);
case "runware":
return generateImageRunware(config, prompt, options);
case "anthropic":
throw new Error(
'IMAGE_PROVIDER "anthropic" does not generate images. Use "openai", "google", "runware", or "openai_compatible".',
);
case "openai_compatible":
default:
return generateImageOpenAiCompatible(config, prompt, options);
}
}
// Native OpenAI (gpt-image) / Gemini (Nano Banana) via the Vercel AI SDK.
// Unlike the fetch path, this supports reference-image editing via
// `prompt.images`. The SDK returns raw bytes (no hosted URL), so we hand the
// client a data URI and synthesize a UUID; continuity references reuse the
// data URI rather than a provider UUID.
async function generateImageViaAiSdk(
// Native OpenAI (gpt-image) via the official OpenAI SDK. Unlike the compatible
// fetch path, this supports reference-image editing through `images.edit`.
// GPT image models return raw bytes, so we hand the client a data URI and
// synthesize a UUID; continuity references reuse the data URI rather than a
// provider UUID.
async function generateImageOpenAi(
config: ProviderConfig,
prompt: string,
options: GenerateImageOptions | undefined,
protocol: "openai" | "google",
options?: GenerateImageOptions,
): Promise<GenerateImageResult> {
const baseURL = normalizeBaseUrl(config.baseUrl, protocol);
const imageModel =
protocol === "openai"
? createOpenAI({ apiKey: config.apiKey, baseURL }).image(config.model)
: createGoogleGenerativeAI({ apiKey: config.apiKey, baseURL }).image(
config.model,
);
const refs = (options?.referenceImages ?? []).slice(0, MAX_REFERENCE_IMAGES);
const promptArg =
refs.length > 0 ? { text: prompt, images: refs } : prompt;
// Session-locked aspect. gpt-image takes an explicit `size` (portrait /
// landscape options are 1024x1536 / 1536x1024); Gemini takes an `aspectRatio`.
const portrait = options?.orientation === "portrait";
const { image } = await generateImageSdk({
model: imageModel,
prompt: promptArg,
...(protocol === "openai"
? { size: (portrait ? "1024x1536" : "1536x1024") as `${number}x${number}` }
: { aspectRatio: (portrait ? "9:16" : "16:9") as `${number}:${number}` }),
const client = new OpenAI({
apiKey: config.apiKey,
baseURL: normalizeBaseUrl(config.baseUrl, "openai"),
maxRetries: 2,
dangerouslyAllowBrowser: true,
});
const refs = (options?.referenceImages ?? []).slice(0, MAX_REFERENCE_IMAGES);
const portrait = options?.orientation === "portrait";
const size = portrait ? "1024x1536" : "1536x1024";
return {
imageUrl: `data:${image.mediaType};base64,${image.base64}`,
imageUuid: crypto.randomUUID(),
};
const response =
refs.length > 0
? await client.images.edit({
model: config.model,
prompt,
image: await Promise.all(refs.map(referenceImageToUploadable)),
n: 1,
size,
})
: await client.images.generate({
model: config.model,
prompt,
n: 1,
size,
});
return imageResponseToResult(response);
}
/**
 * Turns a reference image string into an upload the OpenAI SDK can send.
 *
 * Accepts `data:` URLs and http(s) URLs; both are read with `fetch` and handed
 * to `toFile` together with the response's content type (defaulting to
 * `image/png` when the header is absent). Any other string is rejected.
 *
 * @param ref - A data URL or an http(s) URL pointing at the reference image.
 * @returns The fetched image wrapped as an `Uploadable`.
 * @throws Error when the reference is neither a data URL nor an http(s) URL,
 *   or when the fetch response is not OK.
 */
async function referenceImageToUploadable(ref: string): Promise<Uploadable> {
  const isDataUrl = ref.startsWith("data:");
  const isHttpUrl = !isDataUrl && /^https?:\/\//i.test(ref);

  // Reject unsupported references up front, before any I/O.
  if (!isDataUrl && !isHttpUrl) {
    throw new Error(
      `Native OpenAI image editing requires reference image URLs or data URLs; got "${ref.slice(0, 32)}...".`,
    );
  }

  const fetched = await fetch(ref);
  if (!fetched.ok) {
    throw new Error(
      isDataUrl
        ? `Failed to read data URL reference image.`
        : `Failed to fetch reference image ${ref}: HTTP ${fetched.status}`,
    );
  }

  const contentType = fetched.headers.get("content-type") ?? "image/png";
  // Data URLs carry no path, so their filename is derived from the media type alone.
  const filename = isDataUrl
    ? `reference.${extensionFromMediaType(contentType)}`
    : filenameFromUrl(ref, contentType);

  return toFile(fetched, filename, { type: contentType });
}
/**
 * Maps an OpenAI images response onto the client-facing result.
 *
 * Only the first entry of `response.data` is considered. Inline base64 data
 * wins and is wrapped in a `data:image/<format>;base64,...` URI, where the
 * format comes from `response.output_format` (falling back to `png`). If no
 * base64 payload is present, a hosted `url` is returned as-is. A fresh random
 * UUID is attached in both cases.
 *
 * @throws Error when the first entry has neither base64 data nor a URL.
 */
function imageResponseToResult(
  response: OpenAI.Images.ImagesResponse,
): GenerateImageResult {
  const first = response.data?.[0];

  const inlineBase64 = first?.b64_json;
  if (inlineBase64) {
    const mime = `image/${response.output_format ?? "png"}`;
    return {
      imageUrl: `data:${mime};base64,${inlineBase64}`,
      imageUuid: crypto.randomUUID(),
    };
  }

  const hostedUrl = first?.url;
  if (!hostedUrl) {
    throw new Error(`No image data in OpenAI response.`);
  }
  return { imageUrl: hostedUrl, imageUuid: crypto.randomUUID() };
}
/**
 * Picks an upload filename for a reference image fetched from `url`.
 *
 * Uses the last non-empty path segment when it ends in a dot followed by
 * letters or digits; otherwise (including when `url` cannot be parsed) falls
 * back to `reference.<ext>` with the extension derived from `mediaType`.
 */
function filenameFromUrl(url: string, mediaType: string): string {
  let lastSegment: string | undefined;
  try {
    const segments = new URL(url).pathname
      .split("/")
      .filter((segment) => segment.length > 0);
    lastSegment = segments[segments.length - 1];
  } catch {
    // Unparseable URL: leave lastSegment unset and use the media type below.
    lastSegment = undefined;
  }

  if (lastSegment !== undefined && /\.[a-z0-9]+$/i.test(lastSegment)) {
    return lastSegment;
  }
  return `reference.${extensionFromMediaType(mediaType)}`;
}
/**
 * Maps an image media type to the file extension used for upload filenames.
 *
 * Matching is by substring, so parameters such as `; charset=...` are
 * tolerated. Media type names are case-insensitive, so the value is
 * lower-cased first — otherwise a header like `image/JPEG` would fall through
 * to the `png` default.
 *
 * @param mediaType - A Content-Type value, e.g. `image/jpeg`.
 * @returns `"jpg"`, `"webp"`, or `"png"` (the default for anything else).
 */
function extensionFromMediaType(mediaType: string): string {
  const normalized = mediaType.toLowerCase();
  if (normalized.includes("jpeg") || normalized.includes("jpg")) return "jpg";
  if (normalized.includes("webp")) return "webp";
  return "png";
}
// OpenAI-compatible REST route (GPTGod, DALL-E proxies, etc.). Basic
// text-to-image only — no reference images on this path; for editing/anchoring
// set IMAGE_PROVIDER=openai (or google) to take the AI SDK path above.
// set IMAGE_PROVIDER=openai to take the native OpenAI path above.
async function generateImageOpenAiCompatible(
config: ProviderConfig,
prompt: string,
-23
View File
@@ -1,23 +0,0 @@
import { createAnthropic } from "@ai-sdk/anthropic";
import { createGoogleGenerativeAI } from "@ai-sdk/google";
import { createOpenAI } from "@ai-sdk/openai";
import type { ProviderConfig, ProviderProtocol } from "@infiplot/types";
import { normalizeBaseUrl } from "./normalizeUrl";
/**
 * Returns the protocol configured on `config.provider`, defaulting to
 * `"openai_compatible"` when none is set.
 */
export function resolveProtocol(config: ProviderConfig): ProviderProtocol {
  const { provider } = config;
  if (provider === undefined || provider === null) {
    return "openai_compatible";
  }
  return provider;
}
/**
 * Builds the language model handle for `config.model` under the given protocol.
 *
 * `"anthropic"` and `"google"` use their own provider factories; every other
 * protocol (including `"openai"` and `"openai_compatible"`) goes through the
 * OpenAI factory's `.chat(...)` model. The base URL is normalized for the
 * protocol before being handed to the factory.
 */
export function createLanguageModel(config: ProviderConfig, protocol: ProviderProtocol) {
  const settings = {
    apiKey: config.apiKey,
    baseURL: normalizeBaseUrl(config.baseUrl, protocol),
  };

  if (protocol === "anthropic") {
    return createAnthropic(settings)(config.model);
  }
  if (protocol === "google") {
    return createGoogleGenerativeAI(settings)(config.model);
  }
  // "openai", "openai_compatible", and anything unrecognized share the OpenAI chat route.
  return createOpenAI(settings).chat(config.model);
}
-2
View File
@@ -31,8 +31,6 @@ const ENDPOINT_SUFFIX =
const DEFAULT_VERSION_SEGMENT: Record<ProviderProtocol, string | null> = {
openai_compatible: "v1",
openai: "v1",
anthropic: "v1",
google: "v1beta",
// Runware posts to the bare base URL with no version-pathed sub-resource,
// so never inject a segment for it.
runware: null,
+27 -30
View File
@@ -1,7 +1,6 @@
import { generateText } from "ai";
import type { ModelMessage } from "ai";
import OpenAI from "openai";
import type { ProviderConfig } from "@infiplot/types";
import { createLanguageModel, resolveProtocol } from "./model";
import { normalizeBaseUrl } from "./normalizeUrl";
const VISION_TIMEOUT_MS = 60_000;
@@ -22,34 +21,32 @@ export async function analyzeImageDataUrl(
imageDataUrl: string,
prompt: string,
): Promise<string> {
const protocol = resolveProtocol(config);
const model = createLanguageModel(config, protocol);
const client = new OpenAI({
apiKey: config.apiKey,
baseURL: normalizeBaseUrl(config.baseUrl, "openai_compatible"),
maxRetries: 0,
timeout: VISION_TIMEOUT_MS,
dangerouslyAllowBrowser: true,
});
const messages: ModelMessage[] = [
{
role: "user",
content: [
{ type: "text", text: prompt },
{ type: "image", image: imageDataUrl },
],
},
];
const completion = await client.chat.completions.create({
model: config.model,
messages: [
{
role: "user",
content: [
{ type: "text", text: prompt },
{ type: "image_url", image_url: { url: imageDataUrl } },
],
},
],
temperature: 0.2,
stream: false,
});
const timeoutCtrl = new AbortController();
const timeoutId = setTimeout(() => timeoutCtrl.abort(), VISION_TIMEOUT_MS);
try {
const { text } = await generateText({
model,
messages,
temperature: 0.2,
maxRetries: 0,
abortSignal: timeoutCtrl.signal,
});
if (typeof text !== "string" || text.length === 0) {
throw new Error(`Vision API (AI SDK ${protocol}) returned no content.`);
}
return text;
} finally {
clearTimeout(timeoutId);
const text = completion.choices[0]?.message?.content ?? "";
if (text.length === 0) {
throw new Error(`Vision API returned no content.`);
}
return text;
}
-2
View File
@@ -10,8 +10,6 @@ const STORAGE_KEY = "infiplot:model";
const VALID_PROTOCOLS: ProviderProtocol[] = [
"openai_compatible",
"anthropic",
"google",
"openai",
"runware",
];
-2
View File
@@ -6,8 +6,6 @@ import type {
const VALID_PROTOCOLS = [
"openai_compatible",
"anthropic",
"google",
"openai",
"runware",
] as const;
+4 -8
View File
@@ -327,19 +327,15 @@ export type VisionClassify = "insert-beat" | "change-scene";
* openai_compatible text / vision / image — OpenAI Chat Completions +
* `/images/generations` (self-implemented fetch; the
* default for text/vision when unset)
* anthropic text / vision — native Anthropic Messages (AI SDK)
* google text / vision / image — native Gemini (AI SDK); image
* uses the Nano Banana family
* openai image only — OpenAI gpt-image via AI SDK,
* unlocks reference-image editing (for text/vision use
* openai_compatible, which already speaks OpenAI's format)
* openai image only — OpenAI gpt-image via the
* official OpenAI SDK, unlocks reference-image editing
* (for text/vision use openai_compatible, which already
* speaks OpenAI's format)
* runware image only — Runware task-array protocol
* (self-implemented; the default for runware.ai URLs)
*/
export type ProviderProtocol =
| "openai_compatible"
| "anthropic"
| "google"
| "openai"
| "runware";