From 83fd5717e7876f06134220de5b348e7001d9a62b Mon Sep 17 00:00:00 2001 From: yuanzonghao Date: Thu, 4 Jun 2026 15:51:53 +0800 Subject: [PATCH 1/2] =?UTF-8?q?feat(ai-client):=20multi-provider=20compat?= =?UTF-8?q?=20=E2=80=94=20native=20Anthropic/Google=20+=20URL=20tolerance?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - TEXT/VISION: add native Anthropic & Google Gemini paths via Vercel AI SDK, selectable through TEXT_PROVIDER / VISION_PROVIDER (default openai_compatible) - IMAGE: expand to openai (gpt-image) / google (Nano Banana) via AI SDK alongside the existing Runware task-array and OpenAI-compatible REST paths - normalizeBaseUrl: tolerate URLs with/without /v1 (or /chat/completions); append the per-protocol version segment only for bare hosts - config: readProvider() reads *_PROVIDER; types: ProviderProtocol + provider? - deps: @ai-sdk/anthropic, @ai-sdk/google; docs in .env.example + README Co-Authored-By: Claude Opus 4.7 --- .env.example | 33 +++++- README.md | 14 ++- lib/ai-client/chat.ts | 90 +++++++++++++++- lib/ai-client/image.ts | 189 +++++++++++++++++++++++++--------- lib/ai-client/normalizeUrl.ts | 66 ++++++++++++ lib/ai-client/vision.ts | 76 +++++++++++++- lib/config.ts | 32 +++++- lib/types/index.ts | 31 ++++++ package.json | 4 + pnpm-lock.yaml | 146 ++++++++++++++++++++++++-- 10 files changed, 614 insertions(+), 67 deletions(-) create mode 100644 lib/ai-client/normalizeUrl.ts diff --git a/.env.example b/.env.example index ae1980e..6d04fa6 100644 --- a/.env.example +++ b/.env.example @@ -3,14 +3,18 @@ # Recommended setup: Xiaomi MiMo Token Plan for TEXT / VISION / TTS # (one API key covers all three) + Runware for IMAGE (FLUX.2 [klein]). # -# TEXT / VISION use any OpenAI-compatible endpoint (any OpenAI- -# compatible host works: OpenRouter, OpenAI, Anthropic via proxy, -# Gemini, DeepSeek, Ollama, ...). 
+# TEXT / VISION default to any OpenAI-compatible endpoint, and can switch to +# native Anthropic or Google Gemini via TEXT_PROVIDER / VISION_PROVIDER. # TTS uses Xiaomi MiMo's own voice design / clone protocol # (not OpenAI-compatible; appends -voicedesign / -voiceclone). # -# IMAGE uses Runware's own task-array protocol (not OpenAI-compatible); -# the adapter posts an `imageInference` task to IMAGE_BASE_URL. +# IMAGE supports Runware (its own task-array protocol), OpenAI (gpt-image), +# and Google Gemini (Nano Banana) via IMAGE_PROVIDER. +# +# *_PROVIDER (optional) selects the wire protocol; leave unset for the +# OpenAI-compatible default (image is auto-detected from the URL). Base URLs +# tolerate a missing or extra /v1 (or a trailing /chat/completions) — the +# engine normalizes them. # ============================================================= # ---- 1. Text LLM · scene director ---------------------------------- @@ -26,6 +30,10 @@ TEXT_BASE_URL=https://api.deepseek.com/v1 TEXT_API_KEY=sk-xxx TEXT_MODEL=deepseek-v4-flash +# TEXT_PROVIDER: openai_compatible (default) | anthropic | google +# anthropic → TEXT_BASE_URL=https://api.anthropic.com TEXT_MODEL=claude-sonnet-4-6 +# google → TEXT_BASE_URL=https://generativelanguage.googleapis.com TEXT_MODEL=gemini-3.5-flash +# TEXT_PROVIDER=openai_compatible # ---- 2. Image generator (renders the scene background) ------------- # Recommended: Runware + FLUX.2 [klein] 9B KV — distilled 4-step model, @@ -36,12 +44,27 @@ TEXT_MODEL=deepseek-v4-flash IMAGE_BASE_URL=https://api.runware.ai/v1 IMAGE_API_KEY=runware-xxx IMAGE_MODEL=runware:400@6 +# IMAGE_PROVIDER: runware (auto-detected for runware.ai) | openai_compatible +# | openai | google +# openai → gpt-image, supports referenceImages (character/scene continuity). +# IMAGE_BASE_URL=https://api.openai.com IMAGE_MODEL=gpt-image-1 +# google → Gemini "Nano Banana" (Imagen is EOL 2026-06-24, do not use it). 
+# IMAGE_BASE_URL=https://generativelanguage.googleapis.com +# IMAGE_MODEL=gemini-2.5-flash-image +# NOTE: openai/google return raw bytes → inlined as a data: URI for the session +# (heavier per-call transport than Runware's UUID re-reference loop). Runware +# stays fastest + cheapest for the scene-by-scene flow. +# IMAGE_PROVIDER=runware # ---- 3. Vision model · multimodal click interpretation ------------- # Recommended: MiMo V2.5 — multimodal, accepts image_url content parts. VISION_BASE_URL=https://token-plan-sgp.xiaomimimo.com/v1 VISION_API_KEY=tp-xxx VISION_MODEL=mimo-v2.5 +# VISION_PROVIDER: openai_compatible (default) | anthropic | google +# anthropic → VISION_BASE_URL=https://api.anthropic.com VISION_MODEL=claude-sonnet-4-6 +# google → VISION_BASE_URL=https://generativelanguage.googleapis.com VISION_MODEL=gemini-3.5-flash +# VISION_PROVIDER=openai_compatible # ---- 4. TTS · Xiaomi MiMo (optional — leave blank to disable) ------ # Per-character voice design → clone, with per-line delivery direction. diff --git a/README.md b/README.md index 80c38ce..09c2220 100644 --- a/README.md +++ b/README.md @@ -125,7 +125,7 @@ InfiPlot 同时支持部署到 Vercel 与 Cloudflare Workers。Cloudflare 部署 ## 配置教程 -InfiPlot 会与四类模型供应商通信。**文本(Text)和视觉(Vision)都使用 OpenAI 兼容的接口**,可以自由搭配。**图像(Image)**目前接入 **Runware**(其自有的 task-array 协议,并非 OpenAI 兼容)。**语音(TTS)**使用**小米 MiMo** 自有的音色设计/克隆协议——支持角色级音色设计、克隆与逐行演绎指导。 +InfiPlot 会与四类模型供应商通信。**文本(Text)和视觉(Vision)** 默认使用 OpenAI 兼容接口,也可原生切换到 **Anthropic** 或 **Google Gemini**。**图像(Image)** 支持 **Runware**(其自有 task-array 协议)、**OpenAI**(`gpt-image`)与 **Google Gemini**(Nano Banana)。**语音(TTS)**使用**小米 MiMo** 自有的音色设计/克隆协议——支持角色级音色设计、克隆与逐行演绎指导。 **1. 
选择你的供应商** @@ -136,6 +136,18 @@ InfiPlot 会与四类模型供应商通信。**文本(Text)和视觉(Visio | Vision · 点击解读 | `VISION_BASE_URL` `VISION_API_KEY` `VISION_MODEL` | ✅ | Google 的 `gemini-3.5-flash` | | TTS · 角色配音 | `TTS_BASE_URL` `TTS_API_KEY` `TTS_SPEECH_MODEL` | 可选 —— 留空则静音运行 | 小米 MiMo 的 `mimo-v2.5-tts` | +> **可选 · 指定接口协议**:每类模型都可加一个 `*_PROVIDER` 变量(`TEXT_PROVIDER` / `VISION_PROVIDER` / `IMAGE_PROVIDER`)显式选择接口协议。**不设则保持向后兼容**——文本/视觉默认走 OpenAI 兼容接口,图像按 `*_BASE_URL` 自动判断(`runware.ai` → Runware,否则 OpenAI 兼容)。 +> +> | 取值 | 适用 | 说明 | +> |---|---|---| +> | `openai_compatible`(默认) | Text · Vision · Image | OpenAI Chat Completions / `/images/generations` | +> | `anthropic` | Text · Vision | 原生 Anthropic Messages 接口 | +> | `google` | Text · Vision · Image | 原生 Gemini;图像用 Nano Banana 系(如 `gemini-2.5-flash-image`,**勿用已停服的 Imagen**) | +> | `openai` | Image | OpenAI `gpt-image`,支持参考图编辑 | +> | `runware` | Image | Runware task-array 协议 | +> +> 此外,`*_BASE_URL` 带不带 `/v1`(甚至末尾多写了 `/chat/completions`)都能正常工作——引擎会自动规范化。 + **2. 填写环境变量** 九个变量为必填;TTS 可选(留空则静音运行)。此外还有一个用于低成本测试的开关: diff --git a/lib/ai-client/chat.ts b/lib/ai-client/chat.ts index 4480dbb..f28a280 100644 --- a/lib/ai-client/chat.ts +++ b/lib/ai-client/chat.ts @@ -1,5 +1,10 @@ -import type { ProviderConfig } from "@infiplot/types"; +import { generateText } from "ai"; +import type { LanguageModelUsage, ModelMessage } from "ai"; +import { createAnthropic } from "@ai-sdk/anthropic"; +import { createGoogleGenerativeAI } from "@ai-sdk/google"; +import type { ProviderConfig, ProviderProtocol } from "@infiplot/types"; import { fetchWithRetry } from "./fetchWithRetry"; +import { normalizeBaseUrl } from "./normalizeUrl"; export type ChatMessage = { role: "system" | "user" | "assistant"; @@ -57,6 +62,31 @@ function summarizeUsage(tag: string, usage: Usage | undefined): string { return `[cache] ${tag} prompt=${prompt} completion=${completion} (provider didn't report cache stats)`; } +// AI SDK 6 unifies cache stats across providers into 
usage.inputTokenDetails, +// so a single shape covers Anthropic + Gemini (no per-provider probing). +function summarizeSdkUsage( + tag: string, + usage: LanguageModelUsage | undefined, +): string { + if (!usage) return `[cache] ${tag} no-usage`; + const input = usage.inputTokens ?? 0; + const output = usage.outputTokens ?? 0; + const read = usage.inputTokenDetails?.cacheReadTokens; + const write = usage.inputTokenDetails?.cacheWriteTokens; + if (typeof read === "number" || typeof write === "number") { + const hit = read ?? 0; + const create = write ?? 0; + const rate = input > 0 ? ((hit / input) * 100).toFixed(1) : "n/a"; + return `[cache] ${tag} hit=${hit} create=${create} input=${input} rate=${rate}% completion=${output}`; + } + return `[cache] ${tag} input=${input} completion=${output} (provider didn't report cache stats)`; +} + +// text/vision default to the OpenAI-compatible wire protocol when unset. +function resolveTextProtocol(config: ProviderConfig): ProviderProtocol { + return config.provider ?? "openai_compatible"; +} + export async function chat( config: ProviderConfig, messages: ChatMessage[], @@ -66,7 +96,63 @@ export async function chat( tag?: string; }, ): Promise { - const url = `${config.baseUrl.replace(/\/$/, "")}/chat/completions`; + const protocol = resolveTextProtocol(config); + if (protocol === "anthropic" || protocol === "google") { + return chatViaAiSdk(config, messages, opts, protocol); + } + return chatOpenAiCompatible(config, messages, opts); +} + +// Native Anthropic / Gemini via the Vercel AI SDK. response_format is not sent +// (Anthropic has no JSON mode); the engine relies on parseJsonLoose downstream, +// matching how it already tolerates loose JSON from every provider. 
+async function chatViaAiSdk( + config: ProviderConfig, + messages: ChatMessage[], + opts: { temperature?: number; tag?: string } | undefined, + protocol: "anthropic" | "google", +): Promise { + const baseURL = normalizeBaseUrl(config.baseUrl, protocol); + const model = + protocol === "anthropic" + ? createAnthropic({ apiKey: config.apiKey, baseURL })(config.model) + : createGoogleGenerativeAI({ apiKey: config.apiKey, baseURL })( + config.model, + ); + + const system = messages.find((m) => m.role === "system")?.content; + const convo: ModelMessage[] = messages + .filter((m) => m.role !== "system") + .map((m) => ({ + role: m.role as "user" | "assistant", + content: m.content, + })); + + const { text, usage } = await generateText({ + model, + system, + messages: convo, + temperature: opts?.temperature ?? 0.9, + }); + + console.log(summarizeSdkUsage(opts?.tag ?? "chat", usage)); + + if (typeof text !== "string" || text.length === 0) { + throw new Error(`Chat API (AI SDK ${protocol}) returned no content.`); + } + return text; +} + +async function chatOpenAiCompatible( + config: ProviderConfig, + messages: ChatMessage[], + opts?: { + temperature?: number; + responseFormat?: "json_object" | "text"; + tag?: string; + }, +): Promise { + const url = `${normalizeBaseUrl(config.baseUrl, "openai_compatible")}/chat/completions`; const body: Record = { model: config.model, messages, diff --git a/lib/ai-client/image.ts b/lib/ai-client/image.ts index 218de21..bf11a0c 100644 --- a/lib/ai-client/image.ts +++ b/lib/ai-client/image.ts @@ -1,5 +1,9 @@ -import type { ProviderConfig } from "@infiplot/types"; +import { generateImage as generateImageSdk } from "ai"; +import { createOpenAI } from "@ai-sdk/openai"; +import { createGoogleGenerativeAI } from "@ai-sdk/google"; +import type { ProviderConfig, ProviderProtocol } from "@infiplot/types"; import { fetchWithRetry } from "./fetchWithRetry"; +import { normalizeBaseUrl } from "./normalizeUrl"; // Runware uses its own task-array protocol 
(not OpenAI-compatible). // POST with [{ taskType: "imageInference", ... }]; errors come @@ -38,30 +42,52 @@ export type GenerateImageOptions = { * Reference image (UUID, public URL, or base64) for img2img. When set, * FLUX preserves the seed image's composition and applies `strength` to * deviate. NOTE: FLUX.2 [klein] 9B KV does NOT support seedImage — use - * `referenceImages` for visual continuity instead. + * `referenceImages` for visual continuity instead. Runware-only. */ seedImage?: string; /** * Reference images (UUIDs, URLs, or base64) to condition generation on — * typically character portraits + the prior scene image. Runware caps at 4; - * we silently truncate beyond that. + * we silently truncate beyond that. On the OpenAI/Gemini AI SDK paths these + * map to `prompt.images` (the SDK accepts public URLs or data URLs). */ referenceImages?: string[]; - /** 0–1, FLUX needs ≥ 0.8 to actually have an effect. */ + /** 0–1, FLUX needs ≥ 0.8 to actually have an effect. Runware-only. */ strength?: number; }; export type GenerateImageResult = { - /** Public CDN URL of the generated image (Runware-hosted). */ + /** + * Image the client can render directly. A Runware CDN URL on the Runware + * path; a `data:{mediaType};base64,...` URI on the AI SDK paths (OpenAI/Gemini + * return raw bytes, not a hosted URL). + */ imageUrl: string; - /** Stable UUID for cheap re-reference in later `referenceImages`. */ + /** + * Stable handle for cheap re-reference in later `referenceImages`. A real + * Runware UUID on the Runware path; a synthetic UUID on other paths (those + * re-reference via the URL/data-URL form instead). + */ imageUuid: string; }; +// Image roles support more protocols than text/vision. When IMAGE_PROVIDER is +// unset we keep the historical URL-based inference so existing deployments +// (Runware, or an OpenAI-compatible gateway) behave exactly as before.
+function inferImageProtocol(config: ProviderConfig): ProviderProtocol { + const isOpenAiCompat = + !config.baseUrl.includes("runware.ai") || config.model === "image-2-vip"; + return isOpenAiCompat ? "openai_compatible" : "runware"; +} + +function resolveImageProtocol(config: ProviderConfig): ProviderProtocol { + return config.provider ?? inferImageProtocol(config); +} + // ────────────────────────────────────────────────────────────────────── // generateImage — text-to-image (default) or referenceImages-conditioned. -// Returns both the public URL (for client display + future references) -// and the UUID (cheapest reference form for subsequent calls). +// Returns both a renderable image URL and a re-reference handle (see +// GenerateImageResult). Dispatches on the resolved wire protocol. // ────────────────────────────────────────────────────────────────────── export async function generateImage( @@ -69,51 +95,120 @@ export async function generateImage( prompt: string, options?: GenerateImageOptions, ): Promise { - const url = config.baseUrl.replace(/\/$/, ""); + const protocol = resolveImageProtocol(config); + switch (protocol) { + case "openai": + case "google": + return generateImageViaAiSdk(config, prompt, options, protocol); + case "runware": + return generateImageRunware(config, prompt, options); + case "anthropic": + throw new Error( + 'IMAGE_PROVIDER "anthropic" does not generate images. Use "openai", "google", "runware", or "openai_compatible".', + ); + case "openai_compatible": + default: + return generateImageOpenAiCompatible(config, prompt); + } +} - // 1. OpenAI-compatible route (GPTGod, DALL-E, etc.) - const isOpenAi = !url.includes("runware.ai") || config.model === "image-2-vip"; - if (isOpenAi) { - const endpoint = url.endsWith("/images/generations") ? 
url : `${url}/images/generations`; - console.log(`[ai-client] Calling OpenAI-compatible image generations at: ${endpoint} with model: ${config.model}`); - - const res = await fetchWithRetry(endpoint, { - method: "POST", - headers: { - "Content-Type": "application/json", - Authorization: `Bearer ${config.apiKey}`, - }, - body: JSON.stringify({ - model: config.model, - prompt: prompt, - n: 1, - size: "1792x1024", // Use horizontal size (16:9) - }), - }); +// Native OpenAI (gpt-image) / Gemini (Nano Banana) via the Vercel AI SDK. +// Unlike the fetch path, this supports reference-image editing via +// `prompt.images`. The SDK returns raw bytes (no hosted URL), so we hand the +// client a data URI and synthesize a UUID; continuity references reuse the +// data URI rather than a provider UUID. +async function generateImageViaAiSdk( + config: ProviderConfig, + prompt: string, + options: GenerateImageOptions | undefined, + protocol: "openai" | "google", +): Promise { + const baseURL = normalizeBaseUrl(config.baseUrl, protocol); + const imageModel = + protocol === "openai" + ? createOpenAI({ apiKey: config.apiKey, baseURL }).image(config.model) + : createGoogleGenerativeAI({ apiKey: config.apiKey, baseURL }).image( + config.model, + ); - const text = await res.text(); - let json: any; - try { - json = JSON.parse(text); - } catch { - throw new Error(`OpenAI Image API error ${res.status}: ${text.slice(0, 500)}`); - } + const refs = (options?.referenceImages ?? []).slice(0, MAX_REFERENCE_IMAGES); + const promptArg = + refs.length > 0 ? { text: prompt, images: refs } : prompt; - if (json.error) { - throw new Error(`OpenAI Image API error: ${json.error.message || JSON.stringify(json.error)}`); - } + // OpenAI's image models take an explicit `size`; gpt-image's widest landscape + // option is 1536x1024. Gemini takes an `aspectRatio` instead. + const { image } = await generateImageSdk({ + model: imageModel, + prompt: promptArg, + ...(protocol === "openai" + ? 
{ size: "1536x1024" as `${number}x${number}` } + : { aspectRatio: "16:9" as `${number}:${number}` }), + }); - const data = json.data?.[0]; - const imageUrl = data?.url; - if (!imageUrl) { - throw new Error(`No image URL in OpenAI response: ${text.slice(0, 300)}`); - } - // Generate a mock UUID since OpenAI compatible endpoint doesn't have UUIDs - const imageUuid = crypto.randomUUID(); - return { imageUrl, imageUuid }; + return { + imageUrl: `data:${image.mediaType};base64,${image.base64}`, + imageUuid: crypto.randomUUID(), + }; +} + +// OpenAI-compatible REST route (GPTGod, DALL-E proxies, etc.). Basic +// text-to-image only — no reference images on this path; for editing/anchoring +// set IMAGE_PROVIDER=openai (or google) to take the AI SDK path above. +async function generateImageOpenAiCompatible( + config: ProviderConfig, + prompt: string, +): Promise { + const base = normalizeBaseUrl(config.baseUrl, "openai_compatible"); + const endpoint = `${base}/images/generations`; + console.log( + `[ai-client] Calling OpenAI-compatible image generations at: ${endpoint} with model: ${config.model}`, + ); + + const res = await fetchWithRetry(endpoint, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${config.apiKey}`, + }, + body: JSON.stringify({ + model: config.model, + prompt: prompt, + n: 1, + size: "1792x1024", // Use horizontal size (16:9) + }), + }); + + const text = await res.text(); + let json: any; + try { + json = JSON.parse(text); + } catch { + throw new Error(`OpenAI Image API error ${res.status}: ${text.slice(0, 500)}`); } - // 2. 
Runware task-array route + if (json.error) { + throw new Error(`OpenAI Image API error: ${json.error.message || JSON.stringify(json.error)}`); + } + + const data = json.data?.[0]; + const imageUrl = data?.url; + if (!imageUrl) { + throw new Error(`No image URL in OpenAI response: ${text.slice(0, 300)}`); + } + // Generate a mock UUID since OpenAI compatible endpoint doesn't have UUIDs + const imageUuid = crypto.randomUUID(); + return { imageUrl, imageUuid }; +} + +// Runware task-array route — self-implemented to preserve the UUID/URL closed +// loop (the official @runware/ai-sdk-provider drops both). +async function generateImageRunware( + config: ProviderConfig, + prompt: string, + options?: GenerateImageOptions, +): Promise { + const url = normalizeBaseUrl(config.baseUrl, "runware"); + const task: Record = { taskType: "imageInference", taskUUID: crypto.randomUUID(), diff --git a/lib/ai-client/normalizeUrl.ts b/lib/ai-client/normalizeUrl.ts new file mode 100644 index 0000000..10de5f3 --- /dev/null +++ b/lib/ai-client/normalizeUrl.ts @@ -0,0 +1,66 @@ +import type { ProviderProtocol } from "@infiplot/types"; + +// ────────────────────────────────────────────────────────────────────── +// Base-URL normalization — tolerate whatever shape the user pastes. +// +// The README never specified whether the base URL needs a `/v1` suffix, +// so users provide all of these for the same endpoint: +// https://api.deepseek.com +// https://api.deepseek.com/v1 +// https://api.deepseek.com/v1/chat/completions +// We normalize to a canonical base the adapter can safely append its own +// endpoint path to. This also fixes the pre-existing double-suffix bug +// where a pasted `.../chat/completions` became `.../chat/completions/chat/completions`. +// +// Strategy (bare-host-only version append): +// 1. strip trailing slashes +// 2. strip a trailing known endpoint suffix (chat/completions, messages, …) +// 3. 
only when the URL the user gave is a BARE host (scheme://host[:port] +// with no path) do we append the protocol's default version segment. +// Any path the user wrote (/v1, /beta, /zen/go, /chat/completions, …) is +// treated as an explicit location and left intact — so we never turn +// `/beta` into `/beta/v1`, and a version-less `/chat/completions` +// endpoint is preserved. +// ────────────────────────────────────────────────────────────────────── + +// Endpoint paths an adapter appends itself — stripped so we keep only the base. +const ENDPOINT_SUFFIX = + /\/(chat\/completions|completions|responses|messages|images\/(generations|edits))\/?$/i; + +// Default version segment to append per protocol for a bare host. +const DEFAULT_VERSION_SEGMENT: Record = { + openai_compatible: "v1", + openai: "v1", + anthropic: "v1", + google: "v1beta", + // Runware posts to the bare base URL with no version-pathed sub-resource, + // so never inject a segment for it. + runware: null, +}; + +// True when `raw` is just scheme://host[:port] with no meaningful path — the +// only shape where we infer a default version segment. A lone "/" counts as +// bare. Falls back to a scheme-anchored regex if the URL can't be parsed. 
+function isBareHost(raw: string): boolean { + try { + const { pathname } = new URL(raw); + return pathname === "" || pathname === "/"; + } catch { + return !/^[a-z][a-z0-9+.-]*:\/\/[^/]+\/.+/i.test(raw); + } +} + +export function normalizeBaseUrl( + raw: string, + protocol: ProviderProtocol, +): string { + const trimmed = raw.trim(); + let u = trimmed.replace(/\/+$/, ""); + u = u.replace(ENDPOINT_SUFFIX, "").replace(/\/+$/, ""); + + const seg = DEFAULT_VERSION_SEGMENT[protocol]; + if (seg && isBareHost(trimmed)) { + u = `${u}/${seg}`; + } + return u; +} diff --git a/lib/ai-client/vision.ts b/lib/ai-client/vision.ts index ade15d6..b43429a 100644 --- a/lib/ai-client/vision.ts +++ b/lib/ai-client/vision.ts @@ -1,5 +1,12 @@ -import type { ProviderConfig } from "@infiplot/types"; +import { generateText } from "ai"; +import type { ModelMessage } from "ai"; +import { createAnthropic } from "@ai-sdk/anthropic"; +import { createGoogleGenerativeAI } from "@ai-sdk/google"; +import type { ProviderConfig, ProviderProtocol } from "@infiplot/types"; import { fetchWithRetry } from "./fetchWithRetry"; +import { normalizeBaseUrl } from "./normalizeUrl"; + +const VISION_TIMEOUT_MS = 60_000; export async function interpretClick( config: ProviderConfig, @@ -16,6 +23,11 @@ export async function interpretClick( ); } +// text/vision default to the OpenAI-compatible wire protocol when unset. +function resolveVisionProtocol(config: ProviderConfig): ProviderProtocol { + return config.provider ?? "openai_compatible"; +} + /** * General single-image vision call. Accepts a complete data URL (preserves * the source mime type, e.g. 
webp/jpeg) and lets the caller opt out of @@ -27,7 +39,65 @@ export async function analyzeImageDataUrl( prompt: string, opts: { responseFormat?: "json_object" | "text" } = {}, ): Promise { - const url = `${config.baseUrl.replace(/\/$/, "")}/chat/completions`; + const protocol = resolveVisionProtocol(config); + if (protocol === "anthropic" || protocol === "google") { + return analyzeViaAiSdk(config, imageDataUrl, prompt, protocol); + } + return analyzeOpenAiCompatible(config, imageDataUrl, prompt, opts); +} + +// Native Anthropic / Gemini multimodal via the AI SDK. The image part takes +// the full data URL directly; the SDK decodes it. response_format is not sent +// (no JSON mode on Anthropic) — the engine's parseJsonLoose handles output. +async function analyzeViaAiSdk( + config: ProviderConfig, + imageDataUrl: string, + prompt: string, + protocol: "anthropic" | "google", +): Promise { + const baseURL = normalizeBaseUrl(config.baseUrl, protocol); + const model = + protocol === "anthropic" + ? 
createAnthropic({ apiKey: config.apiKey, baseURL })(config.model) + : createGoogleGenerativeAI({ apiKey: config.apiKey, baseURL })( + config.model, + ); + + const messages: ModelMessage[] = [ + { + role: "user", + content: [ + { type: "text", text: prompt }, + { type: "image", image: imageDataUrl }, + ], + }, + ]; + + const timeoutCtrl = new AbortController(); + const timeoutId = setTimeout(() => timeoutCtrl.abort(), VISION_TIMEOUT_MS); + try { + const { text } = await generateText({ + model, + messages, + temperature: 0.2, + abortSignal: timeoutCtrl.signal, + }); + if (typeof text !== "string" || text.length === 0) { + throw new Error(`Vision API (AI SDK ${protocol}) returned no content.`); + } + return text; + } finally { + clearTimeout(timeoutId); + } +} + +async function analyzeOpenAiCompatible( + config: ProviderConfig, + imageDataUrl: string, + prompt: string, + opts: { responseFormat?: "json_object" | "text" } = {}, +): Promise { + const url = `${normalizeBaseUrl(config.baseUrl, "openai_compatible")}/chat/completions`; const body: Record = { model: config.model, @@ -47,7 +117,7 @@ export async function analyzeImageDataUrl( } const timeoutCtrl = new AbortController(); - const timeoutId = setTimeout(() => timeoutCtrl.abort(), 60_000); + const timeoutId = setTimeout(() => timeoutCtrl.abort(), VISION_TIMEOUT_MS); let res: Response; try { diff --git a/lib/config.ts b/lib/config.ts index 576199b..10def17 100644 --- a/lib/config.ts +++ b/lib/config.ts @@ -1,4 +1,16 @@ -import type { EngineConfig, TtsConfig } from "@infiplot/types"; +import type { + EngineConfig, + ProviderProtocol, + TtsConfig, +} from "@infiplot/types"; + +const VALID_PROTOCOLS = [ + "openai_compatible", + "anthropic", + "google", + "openai", + "runware", +] as const; function readVar(name: string): string { const v = process.env[name]; @@ -11,6 +23,21 @@ function readOptionalVar(name: string): string | undefined { return v && v.length > 0 ? v : undefined; } +// Optional *_PROVIDER selector. 
Unset → undefined, and each ai-client adapter +// applies its own default (text/vision → openai_compatible; image → inferred +// from the base URL). Validated eagerly so a typo fails fast at boot rather +// than mid-request. +function readProvider(name: string): ProviderProtocol | undefined { + const v = readOptionalVar(name)?.trim().toLowerCase(); + if (!v) return undefined; + if ((VALID_PROTOCOLS as readonly string[]).includes(v)) { + return v as ProviderProtocol; + } + throw new Error( + `Invalid ${name}: "${v}". Must be one of: ${VALID_PROTOCOLS.join(", ")}`, + ); +} + function loadTtsConfig(): TtsConfig | undefined { const baseUrl = readOptionalVar("TTS_BASE_URL"); const apiKey = readOptionalVar("TTS_API_KEY"); @@ -28,16 +55,19 @@ export function loadEngineConfig(headers?: Headers): EngineConfig { baseUrl: readVar("TEXT_BASE_URL"), apiKey: readVar("TEXT_API_KEY"), model: readVar("TEXT_MODEL"), + provider: readProvider("TEXT_PROVIDER"), }, image: { baseUrl: readVar("IMAGE_BASE_URL"), apiKey: readVar("IMAGE_API_KEY"), model: readVar("IMAGE_MODEL"), + provider: readProvider("IMAGE_PROVIDER"), }, vision: { baseUrl: readVar("VISION_BASE_URL"), apiKey: readVar("VISION_API_KEY"), model: readVar("VISION_MODEL"), + provider: readProvider("VISION_PROVIDER"), }, tts: loadTtsConfig(), mockImage: readOptionalVar("MOCK_IMAGE") === "true", diff --git a/lib/types/index.ts b/lib/types/index.ts index c5e6a35..43b3859 100644 --- a/lib/types/index.ts +++ b/lib/types/index.ts @@ -268,10 +268,41 @@ export type VisionClassify = "insert-beat" | "change-scene"; // Provider config // ────────────────────────────────────────────────────────────────────── +/** + * Wire protocol used to talk to a model provider. 
Which values are valid + * depends on the model role — each ai-client adapter accepts its own subset + * (text/vision treat any other value as openai_compatible; image rejects anthropic): + * + * openai_compatible text / vision / image — OpenAI Chat Completions + + * `/images/generations` (self-implemented fetch; the + * default for text/vision when unset) + * anthropic text / vision — native Anthropic Messages (AI SDK) + * google text / vision / image — native Gemini (AI SDK); image + * uses the Nano Banana family + * openai image only — OpenAI gpt-image via AI SDK, + * unlocks reference-image editing (for text/vision use + * openai_compatible, which already speaks OpenAI's format) + * runware image only — Runware task-array protocol + * (self-implemented; the default for runware.ai URLs) + */ +export type ProviderProtocol = + | "openai_compatible" + | "anthropic" + | "google" + | "openai" + | "runware"; + export type ProviderConfig = { baseUrl: string; apiKey: string; model: string; + /** + * Wire protocol. When unset, callers apply a role-specific default: + * text/vision → "openai_compatible"; image → inferred from baseUrl + * (runware.ai → "runware", otherwise "openai_compatible") so existing + * deployments keep working without setting *_PROVIDER.
+ */ + provider?: ProviderProtocol; }; export type TtsConfig = { diff --git a/package.json b/package.json index c5d3a40..f02121b 100644 --- a/package.json +++ b/package.json @@ -20,6 +20,10 @@ "deploy:cf": "opennextjs-cloudflare deploy" }, "dependencies": { + "@ai-sdk/anthropic": "^3.0.81", + "@ai-sdk/google": "^3.0.80", + "@ai-sdk/openai": "^3.0.67", + "ai": "^6.0.196", "jsonrepair": "^3.14.0", "next": "^16.0.0", "react": "^19.0.0", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index ce3df06..e70280e 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -8,12 +8,24 @@ importers: .: dependencies: + '@ai-sdk/anthropic': + specifier: ^3.0.81 + version: 3.0.81(zod@4.4.3) + '@ai-sdk/google': + specifier: ^3.0.80 + version: 3.0.80(zod@4.4.3) + '@ai-sdk/openai': + specifier: ^3.0.67 + version: 3.0.67(zod@4.4.3) + ai: + specifier: ^6.0.196 + version: 6.0.196(zod@4.4.3) jsonrepair: specifier: ^3.14.0 version: 3.14.0 next: specifier: ^16.0.0 - version: 16.2.7(react-dom@19.2.7(react@19.2.7))(react@19.2.7) + version: 16.2.7(@opentelemetry/api@1.9.1)(react-dom@19.2.7(react@19.2.7))(react@19.2.7) react: specifier: ^19.0.0 version: 19.2.7 @@ -23,7 +35,7 @@ importers: devDependencies: '@opennextjs/cloudflare': specifier: ^1.19.11 - version: 1.19.11(next@16.2.7(react-dom@19.2.7(react@19.2.7))(react@19.2.7))(wrangler@4.97.0) + version: 1.19.11(next@16.2.7(@opentelemetry/api@1.9.1)(react-dom@19.2.7(react@19.2.7))(react@19.2.7))(wrangler@4.97.0) '@types/node': specifier: ^22.9.0 version: 22.19.19 @@ -54,6 +66,40 @@ importers: packages: + '@ai-sdk/anthropic@3.0.81': + resolution: {integrity: sha512-B1JDd9Ugq9R5AgIaW3674lhGCMMYJcPUxnrZh8fzbGojgg4QvHFRv6eZahGQAUsmGHbcf74G9bdSBDLWQGY2GA==} + engines: {node: '>=18'} + peerDependencies: + zod: ^3.25.76 || ^4.1.8 + + '@ai-sdk/gateway@3.0.124': + resolution: {integrity: sha512-h8CrmbSG+8X0C+M/E1M4oiDHYevqwbzAPN+uLRHS0eJaatF2MZ+juNtOHXNOjk7Bsk9mD2RjYMjJO9dFkb9I7Q==} + engines: {node: '>=18'} + peerDependencies: + zod: ^3.25.76 || ^4.1.8 + + 
'@ai-sdk/google@3.0.80': + resolution: {integrity: sha512-5ORbm/yFUPO0MEvZsxBMN0cdKw2+lwU/wVn5KN3KF8Dmk1LughuDuUohMh/7iU/XFTiyB0OvmTW/tdV/J7O9zg==} + engines: {node: '>=18'} + peerDependencies: + zod: ^3.25.76 || ^4.1.8 + + '@ai-sdk/openai@3.0.67': + resolution: {integrity: sha512-oAiGC9eWG7IgtdsdS74bOCnAAHarAfTJhWN9x5INwnWPekL802AvF+0I5DvLzIF1MIRmNw4N8mPSL/GUVbX9Mw==} + engines: {node: '>=18'} + peerDependencies: + zod: ^3.25.76 || ^4.1.8 + + '@ai-sdk/provider-utils@4.0.27': + resolution: {integrity: sha512-ubkAJ+xODouwtmN1tYlvTPphH1hPOBfZaEQe8U7skGvFAnIRs9PPpsq57bC2+Ky/MB4yzhd6YOsxTAx9sGpazw==} + engines: {node: '>=18'} + peerDependencies: + zod: ^3.25.76 || ^4.1.8 + + '@ai-sdk/provider@3.0.10': + resolution: {integrity: sha512-Q3BZ27qfpYqnCYGvE3vt+Qi6LGOF9R5Nmzn+9JoM1lCRsD9mYaIhfJLkSunN48nfGXJ6n+XNV0J/XVpqGQl7Dw==} + engines: {node: '>=18'} + '@alloc/quick-lru@5.2.0': resolution: {integrity: sha512-UrcABB+4bUrFABwbluTIBErXwvbsU/V7TZWfmbgJfbkwiBuziS9gxdODUyuiecfdGQ85jglMW6juS3+z5TsKLw==} engines: {node: '>=10'} @@ -1036,6 +1082,10 @@ packages: next: '>=15.5.18 <16 || >=16.2.6' wrangler: ^4.86.0 + '@opentelemetry/api@1.9.1': + resolution: {integrity: sha512-gLyJlPHPZYdAk1JENA9LeHejZe1Ti77/pTeFm/nMXmQH/HFZlcS/O2XJB+L8fkbrNSqhdtlvjBVjxwUYanNH5Q==} + engines: {node: '>=8.0.0'} + '@poppinss/colors@4.1.6': resolution: {integrity: sha512-H9xkIdFswbS8n1d6vmRd8+c10t2Qe+rZITbbDHHkQixH5+2x1FDGmi/0K+WgWiqQFKPSlIYB7jlH6Kpfn6Fleg==} @@ -1204,6 +1254,9 @@ packages: '@speed-highlight/core@1.2.15': resolution: {integrity: sha512-BMq1K3DsElxDWawkX6eLg9+CKJrTVGCBAWVuHXVUV2u0s2711qiChLSId6ikYPfxhdYocLNt3wWwSvDiTvFabw==} + '@standard-schema/spec@1.1.0': + resolution: {integrity: sha512-l2aFy5jALhniG5HgqrD6jXLi/rUWrKvqN/qJx6yoJsgKhblVd+iqqU4RCXavm/jPityDo5TCvKMnpjKnOriy0w==} + '@swc/helpers@0.5.15': resolution: {integrity: sha512-JQ5TuMi45Owi4/BIMAJBoSQoOJu12oOk/gADqlcUL9JEdHB8vyjUSsxqeNXnmXHjYKMi2WcYtezGEEhqUI/E2g==} @@ -1227,6 +1280,10 @@ packages: '@types/react@19.2.16': 
resolution: {integrity: sha512-esJiCAnl0kfpNdE69f3So4WJUXy95dLZydX0KwK46riIHDzHM7O9Vtf9xCHW0PXIqvgqNrswl522kA/5yx+F4w==} + '@vercel/oidc@3.2.0': + resolution: {integrity: sha512-UycprH3T6n3jH0k44NHMa7pnFHGu/N05MjojYr+Mc6I7obkoLIJujSWwin1pCvdy/eOxrI/l3uDLQsmcrOb4ug==} + engines: {node: '>= 20'} + abort-controller@3.0.0: resolution: {integrity: sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==} engines: {node: '>=6.5'} @@ -1244,6 +1301,12 @@ packages: resolution: {integrity: sha512-kja8j7PjmncONqaTsB8fQ+wE2mSU2DJ9D4XKoJ5PFWIdRMa6SLSN1ff4mOr4jCbfRSsxR4keIiySJU0N9T5hIQ==} engines: {node: '>= 8.0.0'} + ai@6.0.196: + resolution: {integrity: sha512-2T45UeqKL4a11KQ14I5i1YYHOvCFrMF478E1k6PVjlQSGUvXSv4xrxIaQbUL4qgv91DADSbddwv3oR49pPAK3g==} + engines: {node: '>=18'} + peerDependencies: + zod: ^3.25.76 || ^4.1.8 + ansi-colors@4.1.3: resolution: {integrity: sha512-/6w/C21Pm1A7aZitlI5Ni/2J6FFQN8i1Cvz3kHABAAbw93v/NlvKdVOqz7CCWz/3iv/JplRSEEZ83XION15ovw==} engines: {node: '>=6'} @@ -1549,6 +1612,10 @@ packages: resolution: {integrity: sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==} engines: {node: '>=6'} + eventsource-parser@3.1.0: + resolution: {integrity: sha512-kJezFj9YFAMLeORyi7aCLxLbD5/qWMQnoMVlVPyHIll7lgRJCc3JVln9Vgl9nwQi0YkMnhdGTMNn7CkRRAptMg==} + engines: {node: '>=18.0.0'} + execa@5.1.1: resolution: {integrity: sha512-8uSpZZocAZRBAPIEINJj3Lo9HyGitllczc27Eh5YYojjMFMn8yHMDMaUHE2Jqfq05D/wucwI4JGURyXt1vchyg==} engines: {node: '>=10'} @@ -1754,6 +1821,9 @@ packages: resolution: {integrity: sha512-/imKNG4EbWNrVjoNC/1H5/9GFy+tqjGBHCaSsN+P2RnPqjsLmv6UD3Ej+Kj8nBWaRAwyk7kK5ZUc+OEatnTR3A==} hasBin: true + json-schema@0.4.0: + resolution: {integrity: sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA==} + jsonrepair@3.14.0: resolution: {integrity: sha512-tWPGKMZf/8UPim+fcW2EfcQ/d/7aKUrP6IECz9G3Tu6Q5dX0orSleqJ9z6sSw7qrQkjF8/Edo4DvsWBZ8H+HNg==} 
hasBin: true @@ -2384,8 +2454,47 @@ packages: youch@4.1.0-beta.10: resolution: {integrity: sha512-rLfVLB4FgQneDr0dv1oddCVZmKjcJ6yX6mS4pU82Mq/Dt9a3cLZQ62pDBL4AUO+uVrCvtWz3ZFUL2HFAFJ/BXQ==} + zod@4.4.3: + resolution: {integrity: sha512-ytENFjIJFl2UwYglde2jchW2Hwm4GJFLDiSXWdTrJQBIN9Fcyp7n4DhxJEiWNAJMV1/BqWfW/kkg71UDcHJyTQ==} + snapshots: + '@ai-sdk/anthropic@3.0.81(zod@4.4.3)': + dependencies: + '@ai-sdk/provider': 3.0.10 + '@ai-sdk/provider-utils': 4.0.27(zod@4.4.3) + zod: 4.4.3 + + '@ai-sdk/gateway@3.0.124(zod@4.4.3)': + dependencies: + '@ai-sdk/provider': 3.0.10 + '@ai-sdk/provider-utils': 4.0.27(zod@4.4.3) + '@vercel/oidc': 3.2.0 + zod: 4.4.3 + + '@ai-sdk/google@3.0.80(zod@4.4.3)': + dependencies: + '@ai-sdk/provider': 3.0.10 + '@ai-sdk/provider-utils': 4.0.27(zod@4.4.3) + zod: 4.4.3 + + '@ai-sdk/openai@3.0.67(zod@4.4.3)': + dependencies: + '@ai-sdk/provider': 3.0.10 + '@ai-sdk/provider-utils': 4.0.27(zod@4.4.3) + zod: 4.4.3 + + '@ai-sdk/provider-utils@4.0.27(zod@4.4.3)': + dependencies: + '@ai-sdk/provider': 3.0.10 + '@standard-schema/spec': 1.1.0 + eventsource-parser: 3.1.0 + zod: 4.4.3 + + '@ai-sdk/provider@3.0.10': + dependencies: + json-schema: 0.4.0 + '@alloc/quick-lru@5.2.0': {} '@ast-grep/napi-darwin-arm64@0.40.5': @@ -3446,7 +3555,7 @@ snapshots: '@nodelib/fs.scandir': 2.1.5 fastq: 1.20.1 - '@opennextjs/aws@4.0.2(next@16.2.7(react-dom@19.2.7(react@19.2.7))(react@19.2.7))': + '@opennextjs/aws@4.0.2(next@16.2.7(@opentelemetry/api@1.9.1)(react-dom@19.2.7(react@19.2.7))(react@19.2.7))': dependencies: '@ast-grep/napi': 0.40.5 '@aws-sdk/client-cloudfront': 3.984.0 @@ -3462,24 +3571,24 @@ snapshots: cookie: 1.1.1 esbuild: 0.25.4 express: 5.2.1 - next: 16.2.7(react-dom@19.2.7(react@19.2.7))(react@19.2.7) + next: 16.2.7(@opentelemetry/api@1.9.1)(react-dom@19.2.7(react@19.2.7))(react@19.2.7) path-to-regexp: 6.3.0 urlpattern-polyfill: 10.1.0 yaml: 2.9.0 transitivePeerDependencies: - supports-color - 
'@opennextjs/cloudflare@1.19.11(next@16.2.7(react-dom@19.2.7(react@19.2.7))(react@19.2.7))(wrangler@4.97.0)': + '@opennextjs/cloudflare@1.19.11(next@16.2.7(@opentelemetry/api@1.9.1)(react-dom@19.2.7(react@19.2.7))(react@19.2.7))(wrangler@4.97.0)': dependencies: '@ast-grep/napi': 0.40.5 '@dotenvx/dotenvx': 1.31.0 - '@opennextjs/aws': 4.0.2(next@16.2.7(react-dom@19.2.7(react@19.2.7))(react@19.2.7)) + '@opennextjs/aws': 4.0.2(next@16.2.7(@opentelemetry/api@1.9.1)(react-dom@19.2.7(react@19.2.7))(react@19.2.7)) ci-info: 4.4.0 cloudflare: 4.5.0 comment-json: 4.6.2 enquirer: 2.4.1 glob: 12.0.0 - next: 16.2.7(react-dom@19.2.7(react@19.2.7))(react@19.2.7) + next: 16.2.7(@opentelemetry/api@1.9.1)(react-dom@19.2.7(react@19.2.7))(react@19.2.7) ts-tqdm: 0.8.6 wrangler: 4.97.0 yargs: 18.0.0 @@ -3487,6 +3596,8 @@ snapshots: - encoding - supports-color + '@opentelemetry/api@1.9.1': {} + '@poppinss/colors@4.1.6': dependencies: kleur: 4.1.5 @@ -3697,6 +3808,8 @@ snapshots: '@speed-highlight/core@1.2.15': {} + '@standard-schema/spec@1.1.0': {} + '@swc/helpers@0.5.15': dependencies: tslib: 2.8.1 @@ -3724,6 +3837,8 @@ snapshots: dependencies: csstype: 3.2.3 + '@vercel/oidc@3.2.0': {} + abort-controller@3.0.0: dependencies: event-target-shim: 5.0.1 @@ -3739,6 +3854,14 @@ snapshots: dependencies: humanize-ms: 1.2.1 + ai@6.0.196(zod@4.4.3): + dependencies: + '@ai-sdk/gateway': 3.0.124(zod@4.4.3) + '@ai-sdk/provider': 3.0.10 + '@ai-sdk/provider-utils': 4.0.27(zod@4.4.3) + '@opentelemetry/api': 1.9.1 + zod: 4.4.3 + ansi-colors@4.1.3: {} ansi-regex@5.0.1: {} @@ -4052,6 +4175,8 @@ snapshots: event-target-shim@5.0.1: {} + eventsource-parser@3.1.0: {} + execa@5.1.1: dependencies: cross-spawn: 7.0.6 @@ -4293,6 +4418,8 @@ snapshots: jiti@1.21.7: {} + json-schema@0.4.0: {} + jsonrepair@3.14.0: {} kleur@4.1.5: {} @@ -4376,7 +4503,7 @@ snapshots: negotiator@1.0.0: {} - next@16.2.7(react-dom@19.2.7(react@19.2.7))(react@19.2.7): + 
next@16.2.7(@opentelemetry/api@1.9.1)(react-dom@19.2.7(react@19.2.7))(react@19.2.7): dependencies: '@next/env': 16.2.7 '@swc/helpers': 0.5.15 @@ -4395,6 +4522,7 @@ snapshots: '@next/swc-linux-x64-musl': 16.2.7 '@next/swc-win32-arm64-msvc': 16.2.7 '@next/swc-win32-x64-msvc': 16.2.7 + '@opentelemetry/api': 1.9.1 sharp: 0.34.5 transitivePeerDependencies: - '@babel/core' @@ -4928,3 +5056,5 @@ snapshots: '@speed-highlight/core': 1.2.15 cookie: 1.1.1 youch-core: 0.3.3 + + zod@4.4.3: {} From 865bf322e99487fdb5d189b314a173c5f80176a6 Mon Sep 17 00:00:00 2001 From: yuanzonghao Date: Thu, 4 Jun 2026 16:47:56 +0800 Subject: [PATCH 2/2] fix(ai-client): parse Runware host by hostname; doc nits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - inferImageProtocol: match runware.ai by parsed hostname (exact match or subdomain) instead of a bare substring, so notrunware.ai / runware.ai.evil.com no longer misroute to the Runware protocol - README: document the image-2-vip → OpenAI-compatible exception; correct the Imagen wording (deprecated, EOL 2026-06-24 — not yet discontinued) Addresses Copilot review on #30. 
Co-Authored-By: Claude Opus 4.7 --- README.md | 4 ++-- lib/ai-client/image.ts | 14 +++++++++++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 09c2220..edafdea 100644 --- a/README.md +++ b/README.md @@ -136,13 +136,13 @@ InfiPlot 会与四类模型供应商通信。**文本(Text)和视觉(Visio | Vision · 点击解读 | `VISION_BASE_URL` `VISION_API_KEY` `VISION_MODEL` | ✅ | Google 的 `gemini-3.5-flash` | | TTS · 角色配音 | `TTS_BASE_URL` `TTS_API_KEY` `TTS_SPEECH_MODEL` | 可选 —— 留空则静音运行 | 小米 MiMo 的 `mimo-v2.5-tts` | -> **可选 · 指定接口协议**:每类模型都可加一个 `*_PROVIDER` 变量(`TEXT_PROVIDER` / `VISION_PROVIDER` / `IMAGE_PROVIDER`)显式选择接口协议。**不设则保持向后兼容**——文本/视觉默认走 OpenAI 兼容接口,图像按 `*_BASE_URL` 自动判断(`runware.ai` → Runware,否则 OpenAI 兼容)。 +> **可选 · 指定接口协议**:每类模型都可加一个 `*_PROVIDER` 变量(`TEXT_PROVIDER` / `VISION_PROVIDER` / `IMAGE_PROVIDER`)显式选择接口协议。**不设则保持向后兼容**——文本/视觉默认走 OpenAI 兼容接口,图像按 `*_BASE_URL` 自动判断(`runware.ai` → Runware,否则 OpenAI 兼容;个别在 `runware.ai` 上以 OpenAI 协议提供的模型——如 `image-2-vip`——会按 OpenAI 兼容处理,需要时用 `IMAGE_PROVIDER` 显式覆盖即可)。 > > | 取值 | 适用 | 说明 | > |---|---|---| > | `openai_compatible`(默认) | Text · Vision · Image | OpenAI Chat Completions / `/images/generations` | > | `anthropic` | Text · Vision | 原生 Anthropic Messages 接口 | -> | `google` | Text · Vision · Image | 原生 Gemini;图像用 Nano Banana 系(如 `gemini-2.5-flash-image`,**勿用已停服的 Imagen**) | +> | `google` | Text · Vision · Image | 原生 Gemini;图像用 Nano Banana 系(如 `gemini-2.5-flash-image`,**勿用 Imagen(已废弃,2026-06-24 停服)**) | > | `openai` | Image | OpenAI `gpt-image`,支持参考图编辑 | > | `runware` | Image | Runware task-array 协议 | > diff --git a/lib/ai-client/image.ts b/lib/ai-client/image.ts index bf11a0c..f7c03f8 100644 --- a/lib/ai-client/image.ts +++ b/lib/ai-client/image.ts @@ -71,12 +71,24 @@ export type GenerateImageResult = { imageUuid: string; }; +// Match the Runware host by parsed hostname (exact match or subdomain), not a +// bare substring — otherwise `notrunware.ai` or `api.runware.ai.evil.com` would +// misroute to the Runware 
protocol. Falls back to false on an unparseable URL. +function isRunwareHost(baseUrl: string): boolean { + try { + const host = new URL(baseUrl).hostname.toLowerCase(); + return host === "runware.ai" || host.endsWith(".runware.ai"); + } catch { + return false; + } +} + // Image roles support more protocols than text/vision. When IMAGE_PROVIDER is // unset we keep the historical URL-based inference so existing deployments // (Runware, or an OpenAI-compatible gateway) behave exactly as before. function inferImageProtocol(config: ProviderConfig): ProviderProtocol { const isOpenAiCompat = - !config.baseUrl.includes("runware.ai") || config.model === "image-2-vip"; + !isRunwareHost(config.baseUrl) || config.model === "image-2-vip"; return isOpenAiCompat ? "openai_compatible" : "runware"; }