Merge pull request #43 from zonghaoyuan/worktree-ai-sdk-migration

refactor(ai-client): unify OpenAI-compatible path to AI SDK generateText
2026-06-07 12:04:47 +08:00
parent 57b3ac78cd f4aca0b59c
commit 5acffb6f85
9 changed files with 39 additions and 258 deletions
@@ -55,7 +55,6 @@ export async function POST(req: Request) {
      config.vision,
      body.imageDataUrl,
      STYLE_EXTRACTION_PROMPT,
      { responseFormat: "json_object" },
    );
    let parsed: { stylePrompt?: string };
@@ -1,69 +1,15 @@
 import { generateText } from "ai";
 import type { LanguageModelUsage, ModelMessage } from "ai";
-import { createAnthropic } from "@ai-sdk/anthropic";
+import type { ProviderConfig } from "@infiplot/types";
-import { createGoogleGenerativeAI } from "@ai-sdk/google";
+import { createLanguageModel, resolveProtocol } from "./model";
 import type { ProviderConfig, ProviderProtocol } from "@infiplot/types";
 import { fetchWithRetry } from "./fetchWithRetry";
 import { normalizeBaseUrl } from "./normalizeUrl";
 /** One conversation turn passed to `chat`: the speaker's role plus its plain-text content. */
 export type ChatMessage = {
  role: "system" | "user" | "assistant";
  content: string;
 };
 // Different providers expose prompt-cache stats under different keys. We probe
 // for the three forms we've seen in the wild and fall back to the plain
 // prompt/completion token counts when no cache field exists.
 //
 //   DeepSeek (v3+):      usage.prompt_cache_hit_tokens / prompt_cache_miss_tokens
 //   OpenAI / o-series:   usage.prompt_tokens_details.cached_tokens
 //   Anthropic / others:  usage.cache_read_input_tokens / cache_creation_input_tokens
 //   No cache fields (MiMo, local Ollama, …): only prompt_tokens /
 //     completion_tokens — print those so we still get a rough cost baseline.
 /**
  * Subset of the `usage` object read from a chat-completions response. Every
  * field is optional because each provider reports a different combination.
  */
 type Usage = {
  prompt_tokens?: number;
  completion_tokens?: number;
  // DeepSeek-style cache counters.
  prompt_cache_hit_tokens?: number;
  prompt_cache_miss_tokens?: number;
  // OpenAI-style cache counter.
  prompt_tokens_details?: { cached_tokens?: number };
  // Anthropic-style cache counters.
  cache_read_input_tokens?: number;
  cache_creation_input_tokens?: number;
 };

 /**
  * Formats `hit / total` as a percentage with one decimal place (e.g. "75.0%"),
  * or as a bare "n/a" when `total` is not positive, so the unit sign is never
  * glued onto a non-numeric value.
  */
 function formatCacheRate(hit: number, total: number): string {
  return total > 0 ? `${((hit / total) * 100).toFixed(1)}%` : "n/a";
 }

 /**
  * Builds a one-line `[cache] …` log summary of prompt-cache effectiveness.
  *
  * The provider flavour is detected from which cache field is a number, in
  * this order: DeepSeek, OpenAI, Anthropic. When none is present only the
  * prompt and completion token counts are reported.
  *
  * @param tag - Label identifying the call site in the log line.
  * @param usage - The provider's usage object, or `undefined` if it sent none.
  * @returns The formatted log line.
  */
 function summarizeUsage(tag: string, usage: Usage | undefined): string {
  if (!usage) return `[cache] ${tag} no-usage`;
  const prompt = usage.prompt_tokens ?? 0;
  const completion = usage.completion_tokens ?? 0;
  // DeepSeek-style
  if (typeof usage.prompt_cache_hit_tokens === "number") {
    const hit = usage.prompt_cache_hit_tokens;
    // Derive the miss count from the prompt total when the provider omits it.
    const miss = usage.prompt_cache_miss_tokens ?? Math.max(0, prompt - hit);
    const rate = formatCacheRate(hit, hit + miss);
    return `[cache] ${tag} hit=${hit} miss=${miss} rate=${rate} completion=${completion}`;
  }
  // OpenAI-style
  const oaiCached = usage.prompt_tokens_details?.cached_tokens;
  if (typeof oaiCached === "number") {
    const miss = Math.max(0, prompt - oaiCached);
    const rate = formatCacheRate(oaiCached, prompt);
    return `[cache] ${tag} hit=${oaiCached} miss=${miss} rate=${rate} completion=${completion}`;
  }
  // Anthropic-style
  if (typeof usage.cache_read_input_tokens === "number") {
    const hit = usage.cache_read_input_tokens;
    const create = usage.cache_creation_input_tokens ?? 0;
    // The rate is taken over cache reads + cache writes + prompt_tokens.
    const rate = formatCacheRate(hit, hit + create + prompt);
    return `[cache] ${tag} hit=${hit} create=${create} miss=${prompt} rate=${rate} completion=${completion}`;
  }
  // No cache field at all
  return `[cache] ${tag} prompt=${prompt} completion=${completion} (provider didn't report cache stats)`;
 }
 // AI SDK 6 unifies cache stats across providers into usage.inputTokenDetails,
-// so a single shape covers Anthropic + Gemini (no per-provider probing).
+// so a single shape covers Anthropic, Gemini, and OpenAI-compatible providers.
 function summarizeSdkUsage(
  tag: string,
  usage: LanguageModelUsage | undefined,
@@ -82,43 +28,16 @@ function summarizeSdkUsage(
  return `[cache] ${tag} input=${input} completion=${output} (provider didn't report cache stats)`;
 }
 // When the config names no provider, text/vision calls speak the
 // OpenAI-compatible wire protocol.
 function resolveTextProtocol(config: ProviderConfig): ProviderProtocol {
  const { provider } = config;
  return provider == null ? "openai_compatible" : provider;
 }
 /**
  * Text chat entry point: derives the wire protocol from `config`, hands the
  * conversation to a protocol-specific helper, and resolves to a string.
  *
  * NOTE(review): this block is shown mid-diff. The lines starting with `-` and
  * `+` are the removed and added versions of the same statements, so the body
  * is not valid TypeScript as written. The `-` side routes "anthropic" and
  * "google" to `chatViaAiSdk` and everything else to `chatOpenAiCompatible`;
  * the `+` side builds a model via `createLanguageModel` instead — confirm the
  * final shape against the real file.
  *
  * @param config - Provider configuration the protocol is resolved from.
  * @param messages - Conversation turns to send.
  * @param opts - Optional temperature, response format, and tag.
  * @returns A promise resolving to a string taken from the delegated helper.
  */
 export async function chat(
  config: ProviderConfig,
  messages: ChatMessage[],
  opts?: {
    temperature?: number;
    responseFormat?: "json_object" | "text";
    tag?: string;
  },
 ): Promise<string> {
-  const protocol = resolveTextProtocol(config);
+  const protocol = resolveProtocol(config);
-  if (protocol === "anthropic" || protocol === "google") {
+  const model = createLanguageModel(config, protocol);
    return chatViaAiSdk(config, messages, opts, protocol);
  }
  return chatOpenAiCompatible(config, messages, opts);
 }
 // Native Anthropic / Gemini via the Vercel AI SDK. response_format is not sent
 // (Anthropic has no JSON mode); the engine relies on parseJsonLoose downstream,
 // matching how it already tolerates loose JSON from every provider.
 async function chatViaAiSdk(
  config: ProviderConfig,
  messages: ChatMessage[],
  opts: { temperature?: number; tag?: string } | undefined,
  protocol: "anthropic" | "google",
 ): Promise<string> {
  const baseURL = normalizeBaseUrl(config.baseUrl, protocol);
  const model =
    protocol === "anthropic"
      ? createAnthropic({ apiKey: config.apiKey, baseURL })(config.model)
      : createGoogleGenerativeAI({ apiKey: config.apiKey, baseURL })(
          config.model,
        );
  const system = messages.find((m) => m.role === "system")?.content;
  const convo: ModelMessage[] = messages
@@ -142,59 +61,3 @@ async function chatViaAiSdk(
  }
  return text;
 }
 /**
  * Sends a chat request to an OpenAI-compatible `/chat/completions` endpoint
  * and returns the first choice's message content.
  *
  * The request carries `opts.temperature` (0.9 when unset) and asks for
  * `response_format: { type: "json_object" }` only when `opts.responseFormat`
  * is "json_object". Token usage is logged through `summarizeUsage` under
  * `opts.tag` ("chat" when unset).
  *
  * @param config - Supplies the base URL, model name, and API key.
  * @param messages - Conversation turns, sent as-is in the request body.
  * @param opts - Optional temperature, response format, and log tag.
  * @returns The `content` string of the first choice.
  * @throws Error when the HTTP status is not ok, when the body is not valid
  *   JSON, or when the first choice carries no string content.
  */
 async function chatOpenAiCompatible(
  config: ProviderConfig,
  messages: ChatMessage[],
  opts?: {
    temperature?: number;
    responseFormat?: "json_object" | "text";
    tag?: string;
  },
 ): Promise<string> {
  const url = `${normalizeBaseUrl(config.baseUrl, "openai_compatible")}/chat/completions`;
  const body: Record<string, unknown> = {
    model: config.model,
    messages,
    temperature: opts?.temperature ?? 0.9,
  };
  if (opts?.responseFormat === "json_object") {
    body.response_format = { type: "json_object" };
  }
  const res = await fetchWithRetry(url, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${config.apiKey}`,
    },
    body: JSON.stringify(body),
  });
  // Read the body before checking the status so error responses can be quoted.
  const text = await res.text();
  if (!res.ok) {
    throw new Error(`Chat API error ${res.status}: ${text}`);
  }
  // Every level is optional and the root may be null: the payload comes from
  // an external service and is only trusted after the checks below.
  let json: {
    choices?: { message?: { content?: unknown } }[];
    usage?: Usage;
  } | null;
  try {
    json = JSON.parse(text);
  } catch {
    throw new Error(`Chat API returned invalid JSON: ${text.slice(0, 500)}`);
  }
  // Guard against a null body, an empty choices array, or missing
  // message/content fields. The chain starts at `json` because a literal
  // `null` is valid JSON and would otherwise raise a raw TypeError here.
  const content = json?.choices?.[0]?.message?.content;
  if (typeof content !== "string") {
    throw new Error(
      `Chat API returned no content. Response: ${text.slice(0, 500)}`
    );
  }
  console.log(summarizeUsage(opts?.tag ?? "chat", json?.usage));
  return content;
 }
@@ -0,0 +1,23 @@
 import { createAnthropic } from "@ai-sdk/anthropic";
 import { createGoogleGenerativeAI } from "@ai-sdk/google";
 import { createOpenAI } from "@ai-sdk/openai";
 import type { ProviderConfig, ProviderProtocol } from "@infiplot/types";
 import { normalizeBaseUrl } from "./normalizeUrl";
 /**
  * Returns the provider protocol named by `config.provider`, or
  * "openai_compatible" when the config leaves it unset.
  */
 export function resolveProtocol(config: ProviderConfig): ProviderProtocol {
  const { provider } = config;
  if (provider == null) {
    return "openai_compatible";
  }
  return provider;
 }
 /**
  * Builds the AI SDK model handle for `config.model` using the provider
  * factory that matches `protocol`.
  *
  * The base URL is normalized for the given protocol first; the API key and
  * that URL are then passed to the chosen factory. "anthropic" and "google"
  * use their dedicated factories; every other protocol value goes through
  * the OpenAI factory's `.chat(...)` model.
  */
 export function createLanguageModel(config: ProviderConfig, protocol: ProviderProtocol) {
  const baseURL = normalizeBaseUrl(config.baseUrl, protocol);
  const settings = { apiKey: config.apiKey, baseURL };
  if (protocol === "anthropic") {
    return createAnthropic(settings)(config.model);
  }
  if (protocol === "google") {
    return createGoogleGenerativeAI(settings)(config.model);
  }
  // "openai_compatible", "openai", and any unrecognized value land here.
  return createOpenAI(settings).chat(config.model);
 }
@@ -1,10 +1,7 @@
 import { generateText } from "ai";
 import type { ModelMessage } from "ai";
-import { createAnthropic } from "@ai-sdk/anthropic";
+import type { ProviderConfig } from "@infiplot/types";
-import { createGoogleGenerativeAI } from "@ai-sdk/google";
+import { createLanguageModel, resolveProtocol } from "./model";
 import type { ProviderConfig, ProviderProtocol } from "@infiplot/types";
 import { fetchWithRetry } from "./fetchWithRetry";
 import { normalizeBaseUrl } from "./normalizeUrl";
 const VISION_TIMEOUT_MS = 60_000;
@@ -13,55 +10,20 @@ export async function interpretClick(
  imageBase64: string,
  prompt: string,
 ): Promise<string> {
  // Wrap the raw base64 in a PNG data URL — the Canvas annotator on the
  // client encodes as PNG. analyzeImageDataUrl handles the actual request.
  return analyzeImageDataUrl(
    config,
    `data:image/png;base64,${imageBase64}`,
    prompt,
    { responseFormat: "json_object" },
  );
 }
 // When the config names no provider, text/vision calls speak the
 // OpenAI-compatible wire protocol.
 function resolveVisionProtocol(config: ProviderConfig): ProviderProtocol {
  const { provider } = config;
  return provider == null ? "openai_compatible" : provider;
 }
 /**
  * General single-image vision call. Accepts a complete data URL (preserves
  * the source mime type, e.g. webp/jpeg) and lets the caller opt out of
  * `response_format: json_object` for free-form text responses.
  *
  * NOTE(review): this block is shown mid-diff. The lines starting with `-` and
  * `+` are the removed and added versions of the same statements, so the body
  * is not valid TypeScript as written. The `-` side routes "anthropic" and
  * "google" to `analyzeViaAiSdk` and everything else to
  * `analyzeOpenAiCompatible`; the `+` side builds a model via
  * `createLanguageModel` instead — confirm the final shape against the real
  * file.
  *
  * @param config - Provider configuration the protocol is resolved from.
  * @param imageDataUrl - Full data URL of the image to analyze.
  * @param prompt - Text instruction sent along with the image.
  * @param opts - Optional response format; defaults to an empty object.
  * @returns A promise resolving to a string taken from the delegated helper.
  */
 export async function analyzeImageDataUrl(
  config: ProviderConfig,
  imageDataUrl: string,
  prompt: string,
  opts: { responseFormat?: "json_object" | "text" } = {},
 ): Promise<string> {
-  const protocol = resolveVisionProtocol(config);
+  const protocol = resolveProtocol(config);
-  if (protocol === "anthropic" || protocol === "google") {
+  const model = createLanguageModel(config, protocol);
    return analyzeViaAiSdk(config, imageDataUrl, prompt, protocol);
  }
  return analyzeOpenAiCompatible(config, imageDataUrl, prompt, opts);
 }
 // Native Anthropic / Gemini multimodal via the AI SDK. The image part takes
 // the full data URL directly; the SDK decodes it. response_format is not sent
 // (no JSON mode on Anthropic) — the engine's parseJsonLoose handles output.
 async function analyzeViaAiSdk(
  config: ProviderConfig,
  imageDataUrl: string,
  prompt: string,
  protocol: "anthropic" | "google",
 ): Promise<string> {
  const baseURL = normalizeBaseUrl(config.baseUrl, protocol);
  const model =
    protocol === "anthropic"
      ? createAnthropic({ apiKey: config.apiKey, baseURL })(config.model)
      : createGoogleGenerativeAI({ apiKey: config.apiKey, baseURL })(
          config.model,
        );
  const messages: ModelMessage[] = [
    {
@@ -80,6 +42,7 @@ async function analyzeViaAiSdk(
      model,
      messages,
      temperature: 0.2,
      maxRetries: 0,
      abortSignal: timeoutCtrl.signal,
    });
    if (typeof text !== "string" || text.length === 0) {
@@ -90,70 +53,3 @@ async function analyzeViaAiSdk(
    clearTimeout(timeoutId);
  }
 }
 /**
  * Sends one image plus a text prompt to an OpenAI-compatible
  * `/chat/completions` endpoint and returns the first choice's message content.
  *
  * The request uses a fixed temperature of 0.2, asks for
  * `response_format: { type: "json_object" }` only when `opts.responseFormat`
  * is "json_object", passes `retries: 0` to `fetchWithRetry`, and is aborted
  * after `VISION_TIMEOUT_MS`.
  *
  * @param config - Supplies the base URL, model name, and API key.
  * @param imageDataUrl - Image data URL, sent as the `image_url` part.
  * @param prompt - Text instruction, sent as the `text` part.
  * @param opts - Optional response format; defaults to an empty object.
  * @returns The `content` string of the first choice.
  * @throws Error when the HTTP status is not ok, when the body is not valid
  *   JSON, or when the first choice carries no string content.
  */
 async function analyzeOpenAiCompatible(
  config: ProviderConfig,
  imageDataUrl: string,
  prompt: string,
  opts: { responseFormat?: "json_object" | "text" } = {},
 ): Promise<string> {
  const url = `${normalizeBaseUrl(config.baseUrl, "openai_compatible")}/chat/completions`;
  const body: Record<string, unknown> = {
    model: config.model,
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: prompt },
          { type: "image_url", image_url: { url: imageDataUrl } },
        ],
      },
    ],
    temperature: 0.2,
  };
  if (opts.responseFormat === "json_object") {
    body.response_format = { type: "json_object" };
  }
  const timeoutCtrl = new AbortController();
  const timeoutId = setTimeout(() => timeoutCtrl.abort(), VISION_TIMEOUT_MS);
  let res: Response;
  try {
    res = await fetchWithRetry(url, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Bearer ${config.apiKey}`,
      },
      body: JSON.stringify(body),
      signal: timeoutCtrl.signal,
      retries: 0,
    });
  } finally {
    // The timer is cleared as soon as fetchWithRetry settles, so the body
    // read below is no longer bounded by the timeout.
    clearTimeout(timeoutId);
  }
  const text = await res.text();
  if (!res.ok) {
    throw new Error(`Vision API error ${res.status}: ${text}`);
  }
  // Every level is optional and the root may be null: the payload comes from
  // an external service and is only trusted after the checks below.
  let json: { choices?: { message?: { content?: unknown } }[] } | null;
  try {
    json = JSON.parse(text);
  } catch {
    throw new Error(`Vision API returned invalid JSON: ${text.slice(0, 500)}`);
  }
  // Guard against a null body, an empty choices array, or missing
  // message/content fields. The chain starts at `json` because a literal
  // `null` is valid JSON and would otherwise raise a raw TypeError here.
  const content = json?.choices?.[0]?.message?.content;
  if (typeof content !== "string") {
    throw new Error(
      `Vision API returned no content. Response: ${text.slice(0, 500)}`
    );
  }
  return content;
 }
@@ -53,7 +53,7 @@ export async function runArchitect(
        { role: "system", content: ARCHITECT_SYSTEM },
        { role: "user", content: buildArchitectUserMessage(session) },
      ],
-      { temperature: 0.85, responseFormat: "json_object", tag: "architect" },
+      { temperature: 0.85, tag: "architect" },
    );
    const parsed = parseJsonLoose<RawStoryState>(raw);
@@ -56,7 +56,7 @@ async function runDesignLLM(
        content: buildCharacterDesignerUserMessage(charName, session),
      },
    ],
-    { temperature: 0.7, responseFormat: "json_object", tag: "character-designer" },
+    { temperature: 0.7, tag: "character-designer" },
  );
  return parseJsonLoose<CharacterDesignOutput>(raw);
 }
@@ -67,7 +67,7 @@ export async function runCinematographer(
        ),
      },
    ],
-    { temperature: 0.6, responseFormat: "json_object", tag: "cinematographer" },
+    { temperature: 0.6, tag: "cinematographer" },
  );
  const parsed = parseJsonLoose<RawCinematographerOutput>(raw);
@@ -423,7 +423,7 @@ export async function runWriterPlan(
      { role: "system", content: WRITER_PLAN_SYSTEM },
      { role: "user", content: buildWriterPlanUserMessage(session) },
    ],
-    { temperature: 0.9, responseFormat: "json_object", tag: "writer-plan" },
+    { temperature: 0.9, tag: "writer-plan" },
  );
  const parsed = parseJsonLoose<RawPlan>(raw);
@@ -473,7 +473,7 @@ export async function runWriterBeats(
      { role: "system", content: WRITER_BEATS_SYSTEM },
      { role: "user", content: buildWriterBeatsUserMessage(session, plan) },
    ],
-    { temperature: 0.9, responseFormat: "json_object", tag: "writer-beats" },
+    { temperature: 0.9, tag: "writer-beats" },
  );
  const parsed = parseJsonLoose<RawBeats>(raw);
@@ -446,7 +446,7 @@ export async function directInsertBeat(
        content: buildInsertBeatUserMessage(session, freeformAction),
      },
    ],
-    { temperature: 0.9, responseFormat: "json_object", tag: "insert-beat" },
+    { temperature: 0.9, tag: "insert-beat" },
  );
  const parsed = parseJsonLoose<InsertBeatPartial>(raw);