Merge pull request #43 from zonghaoyuan/worktree-ai-sdk-migration

refactor(ai-client): unify OpenAI-compatible path to AI SDK generateText
This commit is contained in:
Zonghao Yuan
2026-06-07 12:04:47 +08:00
committed by GitHub
9 changed files with 39 additions and 258 deletions
-1
View File
@@ -55,7 +55,6 @@ export async function POST(req: Request) {
config.vision, config.vision,
body.imageDataUrl, body.imageDataUrl,
STYLE_EXTRACTION_PROMPT, STYLE_EXTRACTION_PROMPT,
{ responseFormat: "json_object" },
); );
let parsed: { stylePrompt?: string }; let parsed: { stylePrompt?: string };
+5 -142
View File
@@ -1,69 +1,15 @@
import { generateText } from "ai"; import { generateText } from "ai";
import type { LanguageModelUsage, ModelMessage } from "ai"; import type { LanguageModelUsage, ModelMessage } from "ai";
import { createAnthropic } from "@ai-sdk/anthropic"; import type { ProviderConfig } from "@infiplot/types";
import { createGoogleGenerativeAI } from "@ai-sdk/google"; import { createLanguageModel, resolveProtocol } from "./model";
import type { ProviderConfig, ProviderProtocol } from "@infiplot/types";
import { fetchWithRetry } from "./fetchWithRetry";
import { normalizeBaseUrl } from "./normalizeUrl";
export type ChatMessage = { export type ChatMessage = {
role: "system" | "user" | "assistant"; role: "system" | "user" | "assistant";
content: string; content: string;
}; };
// Different providers expose prompt-cache stats under different keys. We probe
// for the three forms we've seen in the wild and fall back to plain
// prompt/completion token counts when no cache field exists.
//
//   DeepSeek (v3+)                    usage.prompt_cache_hit_tokens / prompt_cache_miss_tokens
//   OpenAI / o-series                 usage.prompt_tokens_details.cached_tokens
//   Anthropic / others                usage.cache_read_input_tokens / cache_creation_input_tokens
//   No-cache (MiMo, local Ollama, …)  only prompt_tokens / completion_tokens — print those
//                                     so we still get a rough cost baseline.
/**
 * Token-usage fields read from a chat response's `usage` object. Every field
 * is optional because each provider populates a different subset.
 */
type Usage = {
  prompt_tokens?: number;
  completion_tokens?: number;
  // DeepSeek-style cache counters.
  prompt_cache_hit_tokens?: number;
  prompt_cache_miss_tokens?: number;
  // OpenAI-style cache counter.
  prompt_tokens_details?: { cached_tokens?: number };
  // Anthropic-style cache counters.
  cache_read_input_tokens?: number;
  cache_creation_input_tokens?: number;
};

/**
 * Formats `hit / total` as a percentage with one decimal place (e.g. "75.0%").
 * Returns a bare "n/a" — without a percent sign — when `total` is not positive,
 * so the log line never contains the malformed token "n/a%".
 */
function formatCacheRate(hit: number, total: number): string {
  return total > 0 ? `${((hit / total) * 100).toFixed(1)}%` : "n/a";
}

/**
 * Builds a one-line `[cache] …` summary of prompt-cache statistics for logging.
 *
 * The provider style is detected from whichever cache field is a number,
 * checked in this order: DeepSeek-style, OpenAI-style, Anthropic-style. When
 * none is present, only the prompt/completion token counts are reported.
 *
 * @param tag   Label identifying the call site in the log line.
 * @param usage The `usage` object from the response, or `undefined` if absent.
 * @returns The formatted summary line.
 */
function summarizeUsage(tag: string, usage: Usage | undefined): string {
  if (!usage) return `[cache] ${tag} no-usage`;

  const prompt = usage.prompt_tokens ?? 0;
  const completion = usage.completion_tokens ?? 0;

  // DeepSeek-style: explicit hit counter; miss is derived from the prompt
  // total when the provider omits it.
  if (typeof usage.prompt_cache_hit_tokens === "number") {
    const hit = usage.prompt_cache_hit_tokens;
    const miss = usage.prompt_cache_miss_tokens ?? Math.max(0, prompt - hit);
    const rate = formatCacheRate(hit, hit + miss);
    return `[cache] ${tag} hit=${hit} miss=${miss} rate=${rate} completion=${completion}`;
  }

  // OpenAI-style: cached tokens are reported as a part of prompt_tokens.
  const oaiCached = usage.prompt_tokens_details?.cached_tokens;
  if (typeof oaiCached === "number") {
    const miss = Math.max(0, prompt - oaiCached);
    const rate = formatCacheRate(oaiCached, prompt);
    return `[cache] ${tag} hit=${oaiCached} miss=${miss} rate=${rate} completion=${completion}`;
  }

  // Anthropic-style: cache reads and cache writes are added to prompt tokens
  // to form the denominator.
  if (typeof usage.cache_read_input_tokens === "number") {
    const hit = usage.cache_read_input_tokens;
    const create = usage.cache_creation_input_tokens ?? 0;
    const rate = formatCacheRate(hit, hit + create + prompt);
    return `[cache] ${tag} hit=${hit} create=${create} miss=${prompt} rate=${rate} completion=${completion}`;
  }

  // No cache field at all
  return `[cache] ${tag} prompt=${prompt} completion=${completion} (provider didn't report cache stats)`;
}
// AI SDK 6 unifies cache stats across providers into usage.inputTokenDetails, // AI SDK 6 unifies cache stats across providers into usage.inputTokenDetails,
// so a single shape covers Anthropic + Gemini (no per-provider probing). // so a single shape covers Anthropic, Gemini, and OpenAI-compatible providers.
function summarizeSdkUsage( function summarizeSdkUsage(
tag: string, tag: string,
usage: LanguageModelUsage | undefined, usage: LanguageModelUsage | undefined,
@@ -82,43 +28,16 @@ function summarizeSdkUsage(
return `[cache] ${tag} input=${input} completion=${output} (provider didn't report cache stats)`; return `[cache] ${tag} input=${input} completion=${output} (provider didn't report cache stats)`;
} }
// Picks the wire protocol for a text call: the config's explicit `provider`
// when one is set, otherwise the OpenAI-compatible protocol.
function resolveTextProtocol(config: ProviderConfig): ProviderProtocol {
  const { provider } = config;
  return provider == null ? "openai_compatible" : provider;
}
export async function chat( export async function chat(
config: ProviderConfig, config: ProviderConfig,
messages: ChatMessage[], messages: ChatMessage[],
opts?: { opts?: {
temperature?: number; temperature?: number;
responseFormat?: "json_object" | "text";
tag?: string; tag?: string;
}, },
): Promise<string> { ): Promise<string> {
const protocol = resolveTextProtocol(config); const protocol = resolveProtocol(config);
if (protocol === "anthropic" || protocol === "google") { const model = createLanguageModel(config, protocol);
return chatViaAiSdk(config, messages, opts, protocol);
}
return chatOpenAiCompatible(config, messages, opts);
}
// Native Anthropic / Gemini via the Vercel AI SDK. response_format is not sent
// (Anthropic has no JSON mode); the engine relies on parseJsonLoose downstream,
// matching how it already tolerates loose JSON from every provider.
async function chatViaAiSdk(
config: ProviderConfig,
messages: ChatMessage[],
opts: { temperature?: number; tag?: string } | undefined,
protocol: "anthropic" | "google",
): Promise<string> {
const baseURL = normalizeBaseUrl(config.baseUrl, protocol);
const model =
protocol === "anthropic"
? createAnthropic({ apiKey: config.apiKey, baseURL })(config.model)
: createGoogleGenerativeAI({ apiKey: config.apiKey, baseURL })(
config.model,
);
const system = messages.find((m) => m.role === "system")?.content; const system = messages.find((m) => m.role === "system")?.content;
const convo: ModelMessage[] = messages const convo: ModelMessage[] = messages
@@ -142,59 +61,3 @@ async function chatViaAiSdk(
} }
return text; return text;
} }
/**
 * Sends a chat request to `<baseUrl>/chat/completions` using the
 * OpenAI-compatible wire format and returns the first choice's message content.
 *
 * @param config   Provider settings; `baseUrl`, `apiKey` and `model` are read.
 * @param messages Conversation to send, passed through unchanged.
 * @param opts     Optional `temperature` (defaults to 0.9), `responseFormat`
 *                 (only "json_object" adds a `response_format` field to the
 *                 request) and `tag` (label for the usage log line, defaults
 *                 to "chat").
 * @returns The assistant message content of the first choice.
 * @throws Error when the HTTP status is not OK, the body is not valid JSON,
 *         or the parsed response carries no string content.
 */
async function chatOpenAiCompatible(
  config: ProviderConfig,
  messages: ChatMessage[],
  opts?: {
    temperature?: number;
    responseFormat?: "json_object" | "text";
    tag?: string;
  },
): Promise<string> {
  const url = `${normalizeBaseUrl(config.baseUrl, "openai_compatible")}/chat/completions`;

  const body: Record<string, unknown> = {
    model: config.model,
    messages,
    temperature: opts?.temperature ?? 0.9,
  };
  if (opts?.responseFormat === "json_object") {
    body.response_format = { type: "json_object" };
  }

  const res = await fetchWithRetry(url, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${config.apiKey}`,
    },
    body: JSON.stringify(body),
  });

  // Read the body as text first so error messages can quote the raw payload.
  const text = await res.text();
  if (!res.ok) {
    throw new Error(`Chat API error ${res.status}: ${text}`);
  }

  // `null` is valid JSON, so the parsed value itself may be null; the type
  // reflects that and every access below is optional-chained.
  let json: {
    choices?: { message?: { content?: unknown } }[];
    usage?: Usage;
  } | null;
  try {
    json = JSON.parse(text);
  } catch {
    throw new Error(`Chat API returned invalid JSON: ${text.slice(0, 500)}`);
  }

  // Guard against a null body, an empty choices array, or missing
  // message/content fields — all surface as the same descriptive error.
  const content = json?.choices?.[0]?.message?.content;
  if (typeof content !== "string") {
    throw new Error(
      `Chat API returned no content. Response: ${text.slice(0, 500)}`
    );
  }

  console.log(summarizeUsage(opts?.tag ?? "chat", json?.usage));
  return content;
}
+23
View File
@@ -0,0 +1,23 @@
import { createAnthropic } from "@ai-sdk/anthropic";
import { createGoogleGenerativeAI } from "@ai-sdk/google";
import { createOpenAI } from "@ai-sdk/openai";
import type { ProviderConfig, ProviderProtocol } from "@infiplot/types";
import { normalizeBaseUrl } from "./normalizeUrl";
/**
 * Returns the wire protocol to use for a provider config: the explicit
 * `provider` when set, otherwise "openai_compatible".
 */
export function resolveProtocol(config: ProviderConfig): ProviderProtocol {
  const { provider } = config;
  return provider == null ? "openai_compatible" : provider;
}
/**
 * Builds an AI SDK language model for the given config and protocol.
 *
 * "anthropic" and "google" use their dedicated provider factories; every
 * other protocol value (including "openai_compatible" and "openai") goes
 * through `createOpenAI(...).chat(...)`.
 *
 * @param config   Provider settings; `baseUrl`, `apiKey` and `model` are read.
 * @param protocol Wire protocol, also passed to `normalizeBaseUrl`.
 * @returns The model instance produced by the selected provider factory.
 */
export function createLanguageModel(config: ProviderConfig, protocol: ProviderProtocol) {
  const baseURL = normalizeBaseUrl(config.baseUrl, protocol);
  const settings = { apiKey: config.apiKey, baseURL };
  const modelId = config.model;

  if (protocol === "anthropic") {
    return createAnthropic(settings)(modelId);
  }
  if (protocol === "google") {
    return createGoogleGenerativeAI(settings)(modelId);
  }
  // "openai_compatible", "openai", and anything unrecognised.
  return createOpenAI(settings).chat(modelId);
}
+5 -109
View File
@@ -1,10 +1,7 @@
import { generateText } from "ai"; import { generateText } from "ai";
import type { ModelMessage } from "ai"; import type { ModelMessage } from "ai";
import { createAnthropic } from "@ai-sdk/anthropic"; import type { ProviderConfig } from "@infiplot/types";
import { createGoogleGenerativeAI } from "@ai-sdk/google"; import { createLanguageModel, resolveProtocol } from "./model";
import type { ProviderConfig, ProviderProtocol } from "@infiplot/types";
import { fetchWithRetry } from "./fetchWithRetry";
import { normalizeBaseUrl } from "./normalizeUrl";
const VISION_TIMEOUT_MS = 60_000; const VISION_TIMEOUT_MS = 60_000;
@@ -13,55 +10,20 @@ export async function interpretClick(
imageBase64: string, imageBase64: string,
prompt: string, prompt: string,
): Promise<string> { ): Promise<string> {
// Wrap the raw base64 in a PNG data URL — the Canvas annotator on the
// client encodes as PNG. analyzeImageDataUrl handles the actual request.
return analyzeImageDataUrl( return analyzeImageDataUrl(
config, config,
`data:image/png;base64,${imageBase64}`, `data:image/png;base64,${imageBase64}`,
prompt, prompt,
{ responseFormat: "json_object" },
); );
} }
// Picks the wire protocol for a vision call: the config's explicit `provider`
// when one is set, otherwise the OpenAI-compatible protocol.
function resolveVisionProtocol(config: ProviderConfig): ProviderProtocol {
  const { provider } = config;
  return provider == null ? "openai_compatible" : provider;
}
/**
* General single-image vision call. Accepts a complete data URL (preserves
* the source mime type, e.g. webp/jpeg) and lets the caller opt out of
* `response_format: json_object` for free-form text responses.
*/
export async function analyzeImageDataUrl( export async function analyzeImageDataUrl(
config: ProviderConfig, config: ProviderConfig,
imageDataUrl: string, imageDataUrl: string,
prompt: string, prompt: string,
opts: { responseFormat?: "json_object" | "text" } = {},
): Promise<string> { ): Promise<string> {
const protocol = resolveVisionProtocol(config); const protocol = resolveProtocol(config);
if (protocol === "anthropic" || protocol === "google") { const model = createLanguageModel(config, protocol);
return analyzeViaAiSdk(config, imageDataUrl, prompt, protocol);
}
return analyzeOpenAiCompatible(config, imageDataUrl, prompt, opts);
}
// Native Anthropic / Gemini multimodal via the AI SDK. The image part takes
// the full data URL directly; the SDK decodes it. response_format is not sent
// (no JSON mode on Anthropic) — the engine's parseJsonLoose handles output.
async function analyzeViaAiSdk(
config: ProviderConfig,
imageDataUrl: string,
prompt: string,
protocol: "anthropic" | "google",
): Promise<string> {
const baseURL = normalizeBaseUrl(config.baseUrl, protocol);
const model =
protocol === "anthropic"
? createAnthropic({ apiKey: config.apiKey, baseURL })(config.model)
: createGoogleGenerativeAI({ apiKey: config.apiKey, baseURL })(
config.model,
);
const messages: ModelMessage[] = [ const messages: ModelMessage[] = [
{ {
@@ -80,6 +42,7 @@ async function analyzeViaAiSdk(
model, model,
messages, messages,
temperature: 0.2, temperature: 0.2,
maxRetries: 0,
abortSignal: timeoutCtrl.signal, abortSignal: timeoutCtrl.signal,
}); });
if (typeof text !== "string" || text.length === 0) { if (typeof text !== "string" || text.length === 0) {
@@ -90,70 +53,3 @@ async function analyzeViaAiSdk(
clearTimeout(timeoutId); clearTimeout(timeoutId);
} }
} }
/**
 * Sends a single-image vision request to `<baseUrl>/chat/completions` using
 * the OpenAI-compatible wire format and returns the first choice's content.
 *
 * The user message carries the prompt as a text part and the image as an
 * `image_url` part holding the full data URL. The request is made with
 * `retries: 0` and is aborted after `VISION_TIMEOUT_MS`.
 *
 * @param config       Provider settings; `baseUrl`, `apiKey` and `model` are read.
 * @param imageDataUrl Complete data URL of the image to analyse.
 * @param prompt       Instruction text sent alongside the image.
 * @param opts         `responseFormat: "json_object"` adds a `response_format`
 *                     field to the request; otherwise none is sent.
 * @returns The assistant message content of the first choice.
 * @throws Error when the HTTP status is not OK, the body is not valid JSON,
 *         or the parsed response carries no string content. An abort caused
 *         by the timeout propagates as whatever error the fetch layer raises.
 */
async function analyzeOpenAiCompatible(
  config: ProviderConfig,
  imageDataUrl: string,
  prompt: string,
  opts: { responseFormat?: "json_object" | "text" } = {},
): Promise<string> {
  const url = `${normalizeBaseUrl(config.baseUrl, "openai_compatible")}/chat/completions`;

  const body: Record<string, unknown> = {
    model: config.model,
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: prompt },
          { type: "image_url", image_url: { url: imageDataUrl } },
        ],
      },
    ],
    temperature: 0.2,
  };
  if (opts.responseFormat === "json_object") {
    body.response_format = { type: "json_object" };
  }

  const timeoutCtrl = new AbortController();
  const timeoutId = setTimeout(() => timeoutCtrl.abort(), VISION_TIMEOUT_MS);

  let res: Response;
  let text: string;
  try {
    res = await fetchWithRetry(url, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Bearer ${config.apiKey}`,
      },
      body: JSON.stringify(body),
      signal: timeoutCtrl.signal,
      retries: 0,
    });
    // Read the body while the timeout is still armed: headers can arrive
    // quickly while the body stalls, and clearing the timer before this
    // point would leave the read unbounded.
    // NOTE(review): assumes fetchWithRetry forwards `signal` to fetch — confirm.
    text = await res.text();
  } finally {
    clearTimeout(timeoutId);
  }

  if (!res.ok) {
    throw new Error(`Vision API error ${res.status}: ${text}`);
  }

  // `null` is valid JSON, so the parsed value itself may be null; the type
  // reflects that and the access below is optional-chained.
  let json: { choices?: { message?: { content?: unknown } }[] } | null;
  try {
    json = JSON.parse(text);
  } catch {
    throw new Error(`Vision API returned invalid JSON: ${text.slice(0, 500)}`);
  }

  // Guard against a null body, an empty choices array, or missing
  // message/content fields — all surface as the same descriptive error.
  const content = json?.choices?.[0]?.message?.content;
  if (typeof content !== "string") {
    throw new Error(
      `Vision API returned no content. Response: ${text.slice(0, 500)}`
    );
  }
  return content;
}
+1 -1
View File
@@ -53,7 +53,7 @@ export async function runArchitect(
{ role: "system", content: ARCHITECT_SYSTEM }, { role: "system", content: ARCHITECT_SYSTEM },
{ role: "user", content: buildArchitectUserMessage(session) }, { role: "user", content: buildArchitectUserMessage(session) },
], ],
{ temperature: 0.85, responseFormat: "json_object", tag: "architect" }, { temperature: 0.85, tag: "architect" },
); );
const parsed = parseJsonLoose<RawStoryState>(raw); const parsed = parseJsonLoose<RawStoryState>(raw);
+1 -1
View File
@@ -56,7 +56,7 @@ async function runDesignLLM(
content: buildCharacterDesignerUserMessage(charName, session), content: buildCharacterDesignerUserMessage(charName, session),
}, },
], ],
{ temperature: 0.7, responseFormat: "json_object", tag: "character-designer" }, { temperature: 0.7, tag: "character-designer" },
); );
return parseJsonLoose<CharacterDesignOutput>(raw); return parseJsonLoose<CharacterDesignOutput>(raw);
} }
+1 -1
View File
@@ -67,7 +67,7 @@ export async function runCinematographer(
), ),
}, },
], ],
{ temperature: 0.6, responseFormat: "json_object", tag: "cinematographer" }, { temperature: 0.6, tag: "cinematographer" },
); );
const parsed = parseJsonLoose<RawCinematographerOutput>(raw); const parsed = parseJsonLoose<RawCinematographerOutput>(raw);
+2 -2
View File
@@ -423,7 +423,7 @@ export async function runWriterPlan(
{ role: "system", content: WRITER_PLAN_SYSTEM }, { role: "system", content: WRITER_PLAN_SYSTEM },
{ role: "user", content: buildWriterPlanUserMessage(session) }, { role: "user", content: buildWriterPlanUserMessage(session) },
], ],
{ temperature: 0.9, responseFormat: "json_object", tag: "writer-plan" }, { temperature: 0.9, tag: "writer-plan" },
); );
const parsed = parseJsonLoose<RawPlan>(raw); const parsed = parseJsonLoose<RawPlan>(raw);
@@ -473,7 +473,7 @@ export async function runWriterBeats(
{ role: "system", content: WRITER_BEATS_SYSTEM }, { role: "system", content: WRITER_BEATS_SYSTEM },
{ role: "user", content: buildWriterBeatsUserMessage(session, plan) }, { role: "user", content: buildWriterBeatsUserMessage(session, plan) },
], ],
{ temperature: 0.9, responseFormat: "json_object", tag: "writer-beats" }, { temperature: 0.9, tag: "writer-beats" },
); );
const parsed = parseJsonLoose<RawBeats>(raw); const parsed = parseJsonLoose<RawBeats>(raw);
+1 -1
View File
@@ -446,7 +446,7 @@ export async function directInsertBeat(
content: buildInsertBeatUserMessage(session, freeformAction), content: buildInsertBeatUserMessage(session, freeformAction),
}, },
], ],
{ temperature: 0.9, responseFormat: "json_object", tag: "insert-beat" }, { temperature: 0.9, tag: "insert-beat" },
); );
const parsed = parseJsonLoose<InsertBeatPartial>(raw); const parsed = parseJsonLoose<InsertBeatPartial>(raw);