diff --git a/app/api/parse-style-image/route.ts b/app/api/parse-style-image/route.ts index 6fb2d4d..1fca6e1 100644 --- a/app/api/parse-style-image/route.ts +++ b/app/api/parse-style-image/route.ts @@ -55,7 +55,6 @@ export async function POST(req: Request) { config.vision, body.imageDataUrl, STYLE_EXTRACTION_PROMPT, - { responseFormat: "json_object" }, ); let parsed: { stylePrompt?: string }; diff --git a/lib/ai-client/chat.ts b/lib/ai-client/chat.ts index f28a280..f869c8f 100644 --- a/lib/ai-client/chat.ts +++ b/lib/ai-client/chat.ts @@ -1,69 +1,15 @@ import { generateText } from "ai"; import type { LanguageModelUsage, ModelMessage } from "ai"; -import { createAnthropic } from "@ai-sdk/anthropic"; -import { createGoogleGenerativeAI } from "@ai-sdk/google"; -import type { ProviderConfig, ProviderProtocol } from "@infiplot/types"; -import { fetchWithRetry } from "./fetchWithRetry"; -import { normalizeBaseUrl } from "./normalizeUrl"; +import type { ProviderConfig } from "@infiplot/types"; +import { createLanguageModel, resolveProtocol } from "./model"; export type ChatMessage = { role: "system" | "user" | "assistant"; content: string; }; -// Different providers expose prompt-cache stats under different keys. We probe -// for the three forms we've seen in the wild and fall back to total tokens -// when no cache field exists. -// -// DeepSeek (v3+) usage.prompt_cache_hit_tokens / prompt_cache_miss_tokens -// OpenAI / o-series usage.prompt_tokens_details.cached_tokens -// Anthropic / others usage.cache_read_input_tokens / cache_creation_input_tokens -// No-cache (MiMo, -// local Ollama, …) only prompt_tokens / completion_tokens — print those -// so we still get a rough cost baseline. -type Usage = { - prompt_tokens?: number; - completion_tokens?: number; - prompt_cache_hit_tokens?: number; - prompt_cache_miss_tokens?: number; - prompt_tokens_details?: { cached_tokens?: number }; - cache_read_input_tokens?: number; - cache_creation_input_tokens?: number; -}; - -function summarizeUsage(tag: string, usage: Usage | undefined): string { - if (!usage) return `[cache] ${tag} no-usage`; - const prompt = usage.prompt_tokens ?? 0; - const completion = usage.completion_tokens ?? 0; - // DeepSeek-style - if (typeof usage.prompt_cache_hit_tokens === "number") { - const hit = usage.prompt_cache_hit_tokens; - const miss = usage.prompt_cache_miss_tokens ?? Math.max(0, prompt - hit); - const denom = hit + miss; - const rate = denom > 0 ? ((hit / denom) * 100).toFixed(1) : "n/a"; - return `[cache] ${tag} hit=${hit} miss=${miss} rate=${rate}% completion=${completion}`; - } - // OpenAI-style - const oaiCached = usage.prompt_tokens_details?.cached_tokens; - if (typeof oaiCached === "number") { - const miss = Math.max(0, prompt - oaiCached); - const rate = prompt > 0 ? ((oaiCached / prompt) * 100).toFixed(1) : "n/a"; - return `[cache] ${tag} hit=${oaiCached} miss=${miss} rate=${rate}% completion=${completion}`; - } - // Anthropic-style - if (typeof usage.cache_read_input_tokens === "number") { - const hit = usage.cache_read_input_tokens; - const create = usage.cache_creation_input_tokens ?? 0; - const denom = hit + create + prompt; - const rate = denom > 0 ? ((hit / denom) * 100).toFixed(1) : "n/a"; - return `[cache] ${tag} hit=${hit} create=${create} miss=${prompt} rate=${rate}% completion=${completion}`; - } - // No cache field at all - return `[cache] ${tag} prompt=${prompt} completion=${completion} (provider didn't report cache stats)`; -} - // AI SDK 6 unifies cache stats across providers into usage.inputTokenDetails, -// so a single shape covers Anthropic + Gemini (no per-provider probing). +// so a single shape covers Anthropic, Gemini, and OpenAI-compatible providers. function summarizeSdkUsage( tag: string, usage: LanguageModelUsage | undefined, @@ -82,43 +28,16 @@ function summarizeSdkUsage( return `[cache] ${tag} input=${input} completion=${output} (provider didn't report cache stats)`; } -// text/vision default to the OpenAI-compatible wire protocol when unset. -function resolveTextProtocol(config: ProviderConfig): ProviderProtocol { - return config.provider ?? "openai_compatible"; -} - export async function chat( config: ProviderConfig, messages: ChatMessage[], opts?: { temperature?: number; - responseFormat?: "json_object" | "text"; tag?: string; }, ): Promise { - const protocol = resolveTextProtocol(config); - if (protocol === "anthropic" || protocol === "google") { - return chatViaAiSdk(config, messages, opts, protocol); - } - return chatOpenAiCompatible(config, messages, opts); -} - -// Native Anthropic / Gemini via the Vercel AI SDK. response_format is not sent -// (Anthropic has no JSON mode); the engine relies on parseJsonLoose downstream, -// matching how it already tolerates loose JSON from every provider. -async function chatViaAiSdk( - config: ProviderConfig, - messages: ChatMessage[], - opts: { temperature?: number; tag?: string } | undefined, - protocol: "anthropic" | "google", -): Promise { - const baseURL = normalizeBaseUrl(config.baseUrl, protocol); - const model = - protocol === "anthropic" - ? createAnthropic({ apiKey: config.apiKey, baseURL })(config.model) - : createGoogleGenerativeAI({ apiKey: config.apiKey, baseURL })( - config.model, - ); + const protocol = resolveProtocol(config); + const model = createLanguageModel(config, protocol); const system = messages.find((m) => m.role === "system")?.content; const convo: ModelMessage[] = messages @@ -142,59 +61,3 @@ async function chatViaAiSdk( } return text; } - -async function chatOpenAiCompatible( - config: ProviderConfig, - messages: ChatMessage[], - opts?: { - temperature?: number; - responseFormat?: "json_object" | "text"; - tag?: string; - }, -): Promise { - const url = `${normalizeBaseUrl(config.baseUrl, "openai_compatible")}/chat/completions`; - const body: Record = { - model: config.model, - messages, - temperature: opts?.temperature ?? 0.9, - }; - if (opts?.responseFormat === "json_object") { - body.response_format = { type: "json_object" }; - } - - const res = await fetchWithRetry(url, { - method: "POST", - headers: { - "Content-Type": "application/json", - Authorization: `Bearer ${config.apiKey}`, - }, - body: JSON.stringify(body), - }); - - const text = await res.text(); - if (!res.ok) { - throw new Error(`Chat API error ${res.status}: ${text}`); - } - - let json: { - choices: { message: { content: string } }[]; - usage?: Usage; - }; - try { - json = JSON.parse(text); - } catch { - throw new Error(`Chat API returned invalid JSON: ${text.slice(0, 500)}`); - } - - // Guard against empty choices array or missing message/content fields - const content = json.choices?.[0]?.message?.content; - if (typeof content !== "string") { - throw new Error( - `Chat API returned no content. Response: ${text.slice(0, 500)}` - ); - } - - console.log(summarizeUsage(opts?.tag ?? "chat", json.usage)); - - return content; -} diff --git a/lib/ai-client/model.ts b/lib/ai-client/model.ts new file mode 100644 index 0000000..155e424 --- /dev/null +++ b/lib/ai-client/model.ts @@ -0,0 +1,23 @@ +import { createAnthropic } from "@ai-sdk/anthropic"; +import { createGoogleGenerativeAI } from "@ai-sdk/google"; +import { createOpenAI } from "@ai-sdk/openai"; +import type { ProviderConfig, ProviderProtocol } from "@infiplot/types"; +import { normalizeBaseUrl } from "./normalizeUrl"; + +export function resolveProtocol(config: ProviderConfig): ProviderProtocol { + return config.provider ?? "openai_compatible"; +} + +export function createLanguageModel(config: ProviderConfig, protocol: ProviderProtocol) { + const baseURL = normalizeBaseUrl(config.baseUrl, protocol); + switch (protocol) { + case "anthropic": + return createAnthropic({ apiKey: config.apiKey, baseURL })(config.model); + case "google": + return createGoogleGenerativeAI({ apiKey: config.apiKey, baseURL })(config.model); + case "openai_compatible": + case "openai": + default: + return createOpenAI({ apiKey: config.apiKey, baseURL }).chat(config.model); + } +} diff --git a/lib/ai-client/vision.ts b/lib/ai-client/vision.ts index b43429a..12df0fa 100644 --- a/lib/ai-client/vision.ts +++ b/lib/ai-client/vision.ts @@ -1,10 +1,7 @@ import { generateText } from "ai"; import type { ModelMessage } from "ai"; -import { createAnthropic } from "@ai-sdk/anthropic"; -import { createGoogleGenerativeAI } from "@ai-sdk/google"; -import type { ProviderConfig, ProviderProtocol } from "@infiplot/types"; -import { fetchWithRetry } from "./fetchWithRetry"; -import { normalizeBaseUrl } from "./normalizeUrl"; +import type { ProviderConfig } from "@infiplot/types"; +import { createLanguageModel, resolveProtocol } from "./model"; const VISION_TIMEOUT_MS = 60_000; @@ -13,55 +10,20 @@ export async function interpretClick( imageBase64: string, prompt: string, ): Promise { - // Wrap the raw base64 in a PNG data URL — the Canvas annotator on the - // client encodes as PNG. analyzeImageDataUrl handles the actual request. return analyzeImageDataUrl( config, `data:image/png;base64,${imageBase64}`, prompt, - { responseFormat: "json_object" }, ); } -// text/vision default to the OpenAI-compatible wire protocol when unset. -function resolveVisionProtocol(config: ProviderConfig): ProviderProtocol { - return config.provider ?? "openai_compatible"; -} - -/** - * General single-image vision call. Accepts a complete data URL (preserves - * the source mime type, e.g. webp/jpeg) and lets the caller opt out of - * `response_format: json_object` for free-form text responses. - */ export async function analyzeImageDataUrl( config: ProviderConfig, imageDataUrl: string, prompt: string, - opts: { responseFormat?: "json_object" | "text" } = {}, ): Promise { - const protocol = resolveVisionProtocol(config); - if (protocol === "anthropic" || protocol === "google") { - return analyzeViaAiSdk(config, imageDataUrl, prompt, protocol); - } - return analyzeOpenAiCompatible(config, imageDataUrl, prompt, opts); -} - -// Native Anthropic / Gemini multimodal via the AI SDK. The image part takes -// the full data URL directly; the SDK decodes it. response_format is not sent -// (no JSON mode on Anthropic) — the engine's parseJsonLoose handles output. -async function analyzeViaAiSdk( - config: ProviderConfig, - imageDataUrl: string, - prompt: string, - protocol: "anthropic" | "google", -): Promise { - const baseURL = normalizeBaseUrl(config.baseUrl, protocol); - const model = - protocol === "anthropic" - ? createAnthropic({ apiKey: config.apiKey, baseURL })(config.model) - : createGoogleGenerativeAI({ apiKey: config.apiKey, baseURL })( - config.model, - ); + const protocol = resolveProtocol(config); + const model = createLanguageModel(config, protocol); const messages: ModelMessage[] = [ { @@ -80,6 +42,7 @@ async function analyzeViaAiSdk( model, messages, temperature: 0.2, + maxRetries: 0, abortSignal: timeoutCtrl.signal, }); if (typeof text !== "string" || text.length === 0) { @@ -90,70 +53,3 @@ async function analyzeViaAiSdk( clearTimeout(timeoutId); } } - -async function analyzeOpenAiCompatible( - config: ProviderConfig, - imageDataUrl: string, - prompt: string, - opts: { responseFormat?: "json_object" | "text" } = {}, -): Promise { - const url = `${normalizeBaseUrl(config.baseUrl, "openai_compatible")}/chat/completions`; - - const body: Record = { - model: config.model, - messages: [ - { - role: "user", - content: [ - { type: "text", text: prompt }, - { type: "image_url", image_url: { url: imageDataUrl } }, - ], - }, - ], - temperature: 0.2, - }; - if (opts.responseFormat === "json_object") { - body.response_format = { type: "json_object" }; - } - - const timeoutCtrl = new AbortController(); - const timeoutId = setTimeout(() => timeoutCtrl.abort(), VISION_TIMEOUT_MS); - - let res: Response; - try { - res = await fetchWithRetry(url, { - method: "POST", - headers: { - "Content-Type": "application/json", - Authorization: `Bearer ${config.apiKey}`, - }, - body: JSON.stringify(body), - signal: timeoutCtrl.signal, - retries: 0, - }); - } finally { - clearTimeout(timeoutId); - } - - const text = await res.text(); - if (!res.ok) { - throw new Error(`Vision API error ${res.status}: ${text}`); - } - - let json: { choices: { message: { content: string } }[] }; - try { - json = JSON.parse(text); - } catch { - throw new Error(`Vision API returned invalid JSON: ${text.slice(0, 500)}`); - } - - // Guard against empty choices array or missing message/content fields - const content = json.choices?.[0]?.message?.content; - if (typeof content !== "string") { - throw new Error( - `Vision API returned no content. Response: ${text.slice(0, 500)}` - ); - } - - return content; -} diff --git a/lib/engine/agents/architect.ts b/lib/engine/agents/architect.ts index a53d469..6c9cf75 100644 --- a/lib/engine/agents/architect.ts +++ b/lib/engine/agents/architect.ts @@ -53,7 +53,7 @@ export async function runArchitect( { role: "system", content: ARCHITECT_SYSTEM }, { role: "user", content: buildArchitectUserMessage(session) }, ], - { temperature: 0.85, responseFormat: "json_object", tag: "architect" }, + { temperature: 0.85, tag: "architect" }, ); const parsed = parseJsonLoose(raw); diff --git a/lib/engine/agents/characterDesigner.ts b/lib/engine/agents/characterDesigner.ts index e407c10..60835c0 100644 --- a/lib/engine/agents/characterDesigner.ts +++ b/lib/engine/agents/characterDesigner.ts @@ -56,7 +56,7 @@ async function runDesignLLM( content: buildCharacterDesignerUserMessage(charName, session), }, ], - { temperature: 0.7, responseFormat: "json_object", tag: "character-designer" }, + { temperature: 0.7, tag: "character-designer" }, ); return parseJsonLoose(raw); } diff --git a/lib/engine/agents/cinematographer.ts b/lib/engine/agents/cinematographer.ts index e2c3d22..7b994ce 100644 --- a/lib/engine/agents/cinematographer.ts +++ b/lib/engine/agents/cinematographer.ts @@ -67,7 +67,7 @@ export async function runCinematographer( ), }, ], - { temperature: 0.6, responseFormat: "json_object", tag: "cinematographer" }, + { temperature: 0.6, tag: "cinematographer" }, ); const parsed = parseJsonLoose(raw); diff --git a/lib/engine/agents/writer.ts b/lib/engine/agents/writer.ts index b560d56..935d2e8 100644 --- a/lib/engine/agents/writer.ts +++ b/lib/engine/agents/writer.ts @@ -423,7 +423,7 @@ export async function runWriterPlan( { role: "system", content: WRITER_PLAN_SYSTEM }, { role: "user", content: buildWriterPlanUserMessage(session) }, ], - { temperature: 0.9, responseFormat: "json_object", tag: "writer-plan" }, + { temperature: 0.9, tag: "writer-plan" }, ); const parsed = parseJsonLoose(raw); @@ -473,7 +473,7 @@ export async function runWriterBeats( { role: "system", content: WRITER_BEATS_SYSTEM }, { role: "user", content: buildWriterBeatsUserMessage(session, plan) }, ], - { temperature: 0.9, responseFormat: "json_object", tag: "writer-beats" }, + { temperature: 0.9, tag: "writer-beats" }, ); const parsed = parseJsonLoose(raw); diff --git a/lib/engine/director.ts b/lib/engine/director.ts index 28114a7..24c92a9 100644 --- a/lib/engine/director.ts +++ b/lib/engine/director.ts @@ -446,7 +446,7 @@ export async function directInsertBeat( content: buildInsertBeatUserMessage(session, freeformAction), }, ], - { temperature: 0.9, responseFormat: "json_object", tag: "insert-beat" }, + { temperature: 0.9, tag: "insert-beat" }, ); const parsed = parseJsonLoose(raw);