Merge pull request #64 from zonghaoyuan/refactor/settings-modal

feat: add client-side model configuration and server fallback
This commit is contained in:
baizhi958216
2026-06-12 22:09:43 +08:00
committed by GitHub
18 changed files with 1167 additions and 780 deletions
+29 -34
View File
@@ -1,29 +1,24 @@
import { generateText } from "ai";
import type { LanguageModelUsage, ModelMessage } from "ai";
import OpenAI from "openai";
import type { ProviderConfig } from "@infiplot/types";
import { createLanguageModel, resolveProtocol } from "./model";
import { normalizeBaseUrl } from "./normalizeUrl";
export type ChatMessage = {
role: "system" | "user" | "assistant";
content: string;
};
// AI SDK 6 unifies cache stats across providers into usage.inputTokenDetails,
// so a single shape covers Anthropic, Gemini, and OpenAI-compatible providers.
function summarizeSdkUsage(
tag: string,
usage: LanguageModelUsage | undefined,
usage: OpenAI.Completions.CompletionUsage | undefined,
): string {
if (!usage) return `[cache] ${tag} no-usage`;
const input = usage.inputTokens ?? 0;
const output = usage.outputTokens ?? 0;
const read = usage.inputTokenDetails?.cacheReadTokens;
const write = usage.inputTokenDetails?.cacheWriteTokens;
if (typeof read === "number" || typeof write === "number") {
const hit = read ?? 0;
const create = write ?? 0;
const rate = input > 0 ? ((hit / input) * 100).toFixed(1) : "n/a";
return `[cache] ${tag} hit=${hit} create=${create} input=${input} rate=${rate}% completion=${output}`;
const input = usage.prompt_tokens ?? 0;
const output = usage.completion_tokens ?? 0;
const details = (usage as { prompt_tokens_details?: { cached_tokens?: number } }).prompt_tokens_details;
const cached = details?.cached_tokens;
if (typeof cached === "number") {
const rate = input > 0 ? ((cached / input) * 100).toFixed(1) : "n/a";
return `[cache] ${tag} hit=${cached} input=${input} rate=${rate}% completion=${output}`;
}
return `[cache] ${tag} input=${input} completion=${output} (provider didn't report cache stats)`;
}
@@ -36,28 +31,28 @@ export async function chat(
tag?: string;
},
): Promise<string> {
const protocol = resolveProtocol(config);
const model = createLanguageModel(config, protocol);
const system = messages.find((m) => m.role === "system")?.content;
const convo: ModelMessage[] = messages
.filter((m) => m.role !== "system")
.map((m) => ({
role: m.role as "user" | "assistant",
content: m.content,
}));
const { text, usage } = await generateText({
model,
system,
messages: convo,
temperature: opts?.temperature ?? 0.9,
const client = new OpenAI({
apiKey: config.apiKey,
baseURL: normalizeBaseUrl(config.baseUrl, "openai_compatible"),
maxRetries: 0,
dangerouslyAllowBrowser: true,
});
console.log(summarizeSdkUsage(opts?.tag ?? "chat", usage));
const completion = await client.chat.completions.create({
model: config.model,
messages: messages.map((m) => ({
role: m.role as "system" | "user" | "assistant",
content: m.content,
})),
temperature: opts?.temperature ?? 0.9,
stream: false,
});
if (typeof text !== "string" || text.length === 0) {
throw new Error(`Chat API (AI SDK ${protocol}) returned no content.`);
const text = completion.choices[0]?.message?.content ?? "";
console.log(summarizeSdkUsage(opts?.tag ?? "chat", completion.usage ?? undefined));
if (text.length === 0) {
throw new Error(`Chat API returned no content.`);
}
return text;
}
+107 -48
View File
@@ -1,6 +1,4 @@
import { generateImage as generateImageSdk } from "ai";
import { createOpenAI } from "@ai-sdk/openai";
import { createGoogleGenerativeAI } from "@ai-sdk/google";
import OpenAI, { toFile, type Uploadable } from "openai";
import type { Orientation, ProviderConfig, ProviderProtocol } from "@infiplot/types";
import { fetchWithRetry } from "./fetchWithRetry";
import { normalizeBaseUrl } from "./normalizeUrl";
@@ -48,8 +46,8 @@ export type GenerateImageOptions = {
/**
* Reference images (UUIDs, URLs, or base64) to condition generation on —
* typically character portraits + the prior scene image. Runware caps at 4;
* we silently truncate beyond that. On the OpenAI/Gemini AI SDK paths these
* map to `prompt.images` (the SDK accepts public URLs or data URLs).
* we silently truncate beyond that. On the native OpenAI path these are
* fetched/decoded and sent to `images.edit`.
*/
referenceImages?: string[];
/** 01, FLUX needs ≥ 0.8 to actually have an effect. Runware-only. */
@@ -58,7 +56,7 @@ export type GenerateImageOptions = {
* Output aspect, locked per session. "portrait" → 9:16 vertical for mobile;
* default/"landscape" → 16:9 widescreen. Mapped to each provider's nearest
* supported size: Runware 1024×1792, OpenAI-compatible REST 1024x1792,
* native gpt-image 1024x1536, Gemini aspectRatio 9:16.
* native gpt-image 1024x1536.
*/
orientation?: Orientation;
};
@@ -66,8 +64,8 @@ export type GenerateImageOptions = {
export type GenerateImageResult = {
/**
* Image the client can render directly. A Runware CDN URL on the Runware
* path; a `data:<mime>;base64,...` URI on the AI SDK paths (OpenAI/Gemini
* return raw bytes, not a hosted URL).
* path; a `data:<mime>;base64,...` URI on the native OpenAI path when GPT
* image models return raw bytes instead of a hosted URL.
*/
imageUrl: string;
/**
@@ -117,63 +115,124 @@ export async function generateImage(
const protocol = resolveImageProtocol(config);
switch (protocol) {
case "openai":
case "google":
return generateImageViaAiSdk(config, prompt, options, protocol);
return generateImageOpenAi(config, prompt, options);
case "runware":
return generateImageRunware(config, prompt, options);
case "anthropic":
throw new Error(
'IMAGE_PROVIDER "anthropic" does not generate images. Use "openai", "google", "runware", or "openai_compatible".',
);
case "openai_compatible":
default:
return generateImageOpenAiCompatible(config, prompt, options);
}
}
// Native OpenAI (gpt-image) / Gemini (Nano Banana) via the Vercel AI SDK.
// Unlike the fetch path, this supports reference-image editing via
// `prompt.images`. The SDK returns raw bytes (no hosted URL), so we hand the
// client a data URI and synthesize a UUID; continuity references reuse the
// data URI rather than a provider UUID.
async function generateImageViaAiSdk(
// Native OpenAI (gpt-image) via the official OpenAI SDK. Unlike the compatible
// fetch path, this supports reference-image editing through `images.edit`.
// GPT image models return raw bytes, so we hand the client a data URI and
// synthesize a UUID; continuity references reuse the data URI rather than a
// provider UUID.
async function generateImageOpenAi(
config: ProviderConfig,
prompt: string,
options: GenerateImageOptions | undefined,
protocol: "openai" | "google",
options?: GenerateImageOptions,
): Promise<GenerateImageResult> {
const baseURL = normalizeBaseUrl(config.baseUrl, protocol);
const imageModel =
protocol === "openai"
? createOpenAI({ apiKey: config.apiKey, baseURL }).image(config.model)
: createGoogleGenerativeAI({ apiKey: config.apiKey, baseURL }).image(
config.model,
);
const refs = (options?.referenceImages ?? []).slice(0, MAX_REFERENCE_IMAGES);
const promptArg =
refs.length > 0 ? { text: prompt, images: refs } : prompt;
// Session-locked aspect. gpt-image takes an explicit `size` (portrait /
// landscape options are 1024x1536 / 1536x1024); Gemini takes an `aspectRatio`.
const portrait = options?.orientation === "portrait";
const { image } = await generateImageSdk({
model: imageModel,
prompt: promptArg,
...(protocol === "openai"
? { size: (portrait ? "1024x1536" : "1536x1024") as `${number}x${number}` }
: { aspectRatio: (portrait ? "9:16" : "16:9") as `${number}:${number}` }),
const client = new OpenAI({
apiKey: config.apiKey,
baseURL: normalizeBaseUrl(config.baseUrl, "openai"),
maxRetries: 2,
dangerouslyAllowBrowser: true,
});
const refs = (options?.referenceImages ?? []).slice(0, MAX_REFERENCE_IMAGES);
const portrait = options?.orientation === "portrait";
const size = portrait ? "1024x1536" : "1536x1024";
return {
imageUrl: `data:${image.mediaType};base64,${image.base64}`,
imageUuid: crypto.randomUUID(),
};
const response =
refs.length > 0
? await client.images.edit({
model: config.model,
prompt,
image: await Promise.all(refs.map(referenceImageToUploadable)),
n: 1,
size,
})
: await client.images.generate({
model: config.model,
prompt,
n: 1,
size,
});
return imageResponseToResult(response);
}
/**
 * Turn a reference image string into something the OpenAI SDK can upload.
 *
 * Accepts `data:` URLs and `http(s)` URLs; both are read through `fetch` and
 * wrapped with `toFile`. Data URLs get a synthetic `reference.<ext>` name,
 * remote URLs get a name derived from the URL path. The uploaded type is the
 * response's `content-type` header, defaulting to `image/png`.
 *
 * @throws Error when the fetch response is not OK, or when `ref` is neither a
 *   data URL nor an http(s) URL.
 */
async function referenceImageToUploadable(ref: string): Promise<Uploadable> {
  const isDataUrl = ref.startsWith("data:");
  const isRemoteUrl = !isDataUrl && /^https?:\/\//i.test(ref);

  if (!isDataUrl && !isRemoteUrl) {
    throw new Error(
      `Native OpenAI image editing requires reference image URLs or data URLs; got "${ref.slice(0, 32)}...".`,
    );
  }

  const fetched = await fetch(ref);
  if (!fetched.ok) {
    throw new Error(
      isDataUrl
        ? `Failed to read data URL reference image.`
        : `Failed to fetch reference image ${ref}: HTTP ${fetched.status}`,
    );
  }

  const contentType = fetched.headers.get("content-type") ?? "image/png";
  const uploadName = isDataUrl
    ? `reference.${extensionFromMediaType(contentType)}`
    : filenameFromUrl(ref, contentType);

  return toFile(fetched, uploadName, { type: contentType });
}
/**
 * Convert an OpenAI images response into the engine's result shape.
 *
 * Only the first entry of `response.data` is considered. Inline base64 data
 * wins and is wrapped in a `data:image/<format>` URI (format taken from
 * `response.output_format`, defaulting to "png"); otherwise a hosted `url` is
 * passed through. A fresh random UUID is synthesized in both cases.
 *
 * @throws Error when the first entry carries neither `b64_json` nor `url`.
 */
function imageResponseToResult(
  response: OpenAI.Images.ImagesResponse,
): GenerateImageResult {
  const first = response.data?.[0];

  const encoded = first?.b64_json;
  if (encoded) {
    const outputFormat = response.output_format ?? "png";
    const dataUri = `data:image/${outputFormat};base64,${encoded}`;
    return { imageUrl: dataUri, imageUuid: crypto.randomUUID() };
  }

  const hostedUrl = first?.url;
  if (!hostedUrl) {
    throw new Error(`No image data in OpenAI response.`);
  }
  return { imageUrl: hostedUrl, imageUuid: crypto.randomUUID() };
}
/**
 * Pick an upload filename for a remote reference image.
 *
 * Uses the last non-empty path segment of `url` when it ends in a
 * `.<alphanumeric>` extension; otherwise (including when `url` cannot be
 * parsed) falls back to `reference.<ext>` derived from `mediaType`.
 */
function filenameFromUrl(url: string, mediaType: string): string {
  const fallbackName = (): string =>
    `reference.${extensionFromMediaType(mediaType)}`;

  let segments: string[];
  try {
    segments = new URL(url).pathname.split("/").filter((part) => part !== "");
  } catch {
    // Unparseable URL: fall back to the media type.
    return fallbackName();
  }

  const lastSegment = segments[segments.length - 1];
  if (lastSegment !== undefined && /\.[a-z0-9]+$/i.test(lastSegment)) {
    return lastSegment;
  }
  return fallbackName();
}
/**
 * Map a media type (e.g. a `content-type` header value) to a file extension.
 *
 * Matching is by substring, so parameters such as `; charset=...` are
 * tolerated. Media types are case-insensitive, so the value is lowercased
 * first; otherwise `IMAGE/JPEG` would be mislabelled as png.
 *
 * @param mediaType Media type string; may include parameters.
 * @returns "jpg", "webp", or "png" (the default for anything unrecognised).
 */
function extensionFromMediaType(mediaType: string): string {
  const normalized = mediaType.toLowerCase();
  if (normalized.includes("jpeg") || normalized.includes("jpg")) return "jpg";
  if (normalized.includes("webp")) return "webp";
  return "png";
}
// OpenAI-compatible REST route (GPTGod, DALL-E proxies, etc.). Basic
// text-to-image only — no reference images on this path; for editing/anchoring
// set IMAGE_PROVIDER=openai (or google) to take the AI SDK path above.
// set IMAGE_PROVIDER=openai to take the native OpenAI path above.
async function generateImageOpenAiCompatible(
config: ProviderConfig,
prompt: string,
-23
View File
@@ -1,23 +0,0 @@
import { createAnthropic } from "@ai-sdk/anthropic";
import { createGoogleGenerativeAI } from "@ai-sdk/google";
import { createOpenAI } from "@ai-sdk/openai";
import type { ProviderConfig, ProviderProtocol } from "@infiplot/types";
import { normalizeBaseUrl } from "./normalizeUrl";
/** Return the configured provider protocol, defaulting to "openai_compatible" when unset. */
export function resolveProtocol(config: ProviderConfig): ProviderProtocol {
  const { provider } = config;
  return provider ?? "openai_compatible";
}
/**
 * Build an AI SDK language model for the given protocol.
 *
 * "anthropic" and "google" use their own provider factories; every other
 * protocol (including "openai" and "openai_compatible") uses the OpenAI
 * provider's chat model. The base URL is normalized for the protocol first.
 */
export function createLanguageModel(config: ProviderConfig, protocol: ProviderProtocol) {
  const { apiKey, model } = config;
  const baseURL = normalizeBaseUrl(config.baseUrl, protocol);

  if (protocol === "anthropic") {
    return createAnthropic({ apiKey, baseURL })(model);
  }
  if (protocol === "google") {
    return createGoogleGenerativeAI({ apiKey, baseURL })(model);
  }
  return createOpenAI({ apiKey, baseURL }).chat(model);
}
-2
View File
@@ -31,8 +31,6 @@ const ENDPOINT_SUFFIX =
const DEFAULT_VERSION_SEGMENT: Record<ProviderProtocol, string | null> = {
openai_compatible: "v1",
openai: "v1",
anthropic: "v1",
google: "v1beta",
// Runware posts to the bare base URL with no version-pathed sub-resource,
// so never inject a segment for it.
runware: null,
+27 -30
View File
@@ -1,7 +1,6 @@
import { generateText } from "ai";
import type { ModelMessage } from "ai";
import OpenAI from "openai";
import type { ProviderConfig } from "@infiplot/types";
import { createLanguageModel, resolveProtocol } from "./model";
import { normalizeBaseUrl } from "./normalizeUrl";
const VISION_TIMEOUT_MS = 60_000;
@@ -22,34 +21,32 @@ export async function analyzeImageDataUrl(
imageDataUrl: string,
prompt: string,
): Promise<string> {
const protocol = resolveProtocol(config);
const model = createLanguageModel(config, protocol);
const client = new OpenAI({
apiKey: config.apiKey,
baseURL: normalizeBaseUrl(config.baseUrl, "openai_compatible"),
maxRetries: 0,
timeout: VISION_TIMEOUT_MS,
dangerouslyAllowBrowser: true,
});
const messages: ModelMessage[] = [
{
role: "user",
content: [
{ type: "text", text: prompt },
{ type: "image", image: imageDataUrl },
],
},
];
const completion = await client.chat.completions.create({
model: config.model,
messages: [
{
role: "user",
content: [
{ type: "text", text: prompt },
{ type: "image_url", image_url: { url: imageDataUrl } },
],
},
],
temperature: 0.2,
stream: false,
});
const timeoutCtrl = new AbortController();
const timeoutId = setTimeout(() => timeoutCtrl.abort(), VISION_TIMEOUT_MS);
try {
const { text } = await generateText({
model,
messages,
temperature: 0.2,
maxRetries: 0,
abortSignal: timeoutCtrl.signal,
});
if (typeof text !== "string" || text.length === 0) {
throw new Error(`Vision API (AI SDK ${protocol}) returned no content.`);
}
return text;
} finally {
clearTimeout(timeoutId);
const text = completion.choices[0]?.message?.content ?? "";
if (text.length === 0) {
throw new Error(`Vision API returned no content.`);
}
return text;
}
+160
View File
@@ -0,0 +1,160 @@
import type { EngineConfig, ProviderProtocol } from "@infiplot/types";
// Bring-your-own model keys — stored CLIENT-SIDE ONLY.
//
// When a user supplies their own text/image/vision API credentials, we persist
// them in localStorage and the browser talks to providers directly. The keys
// are therefore never sent to our server: no request body, no header, no log.
// localStorage key under which the serialized model config is kept.
const STORAGE_KEY = "infiplot:model";
// Provider values accepted when reading a stored config; any other value is
// treated as "not set" (see readProtocol).
const VALID_PROTOCOLS: ProviderProtocol[] = [
"openai_compatible",
"openai",
"runware",
];
/**
 * Shape of the model config persisted in localStorage: one
 * base URL / API key / model triple per capability (text, image, vision),
 * each with an optional provider protocol.
 */
export type StoredModelConfig = {
// Text model credentials.
textBaseUrl: string;
textApiKey: string;
textModel: string;
textProvider?: ProviderProtocol;
// Image model credentials.
imageBaseUrl: string;
imageApiKey: string;
imageModel: string;
imageProvider?: ProviderProtocol;
// Vision model credentials.
visionBaseUrl: string;
visionApiKey: string;
visionModel: string;
visionProvider?: ProviderProtocol;
};
/** Type guard: true when `p` is one of the entries in VALID_PROTOCOLS. */
function isValidProtocol(p: string): p is ProviderProtocol {
  return VALID_PROTOCOLS.some((known) => known === p);
}
/** Narrow an untrusted value to a known provider protocol, or `undefined` if it is not one. */
function readProtocol(raw: unknown): ProviderProtocol | undefined {
  return typeof raw === "string" && isValidProtocol(raw) ? raw : undefined;
}
/**
 * Load the persisted model config from localStorage and validate it.
 *
 * Every base URL / API key / model field must be a non-empty string after
 * trimming; provider fields are optional and become `undefined` when absent
 * or unrecognised.
 *
 * @returns The validated config, or `null` when there is no `window`, nothing
 *   is stored, the stored value cannot be read or parsed, or any required
 *   field is missing.
 */
export function readStoredModelConfig(): StoredModelConfig | null {
  if (typeof window === "undefined") return null;

  try {
    const serialized = window.localStorage.getItem(STORAGE_KEY);
    if (!serialized) return null;

    const parsed = JSON.parse(serialized) as Partial<StoredModelConfig>;

    // Non-string values collapse to "" so they fail the required check below.
    const clean = (value: unknown): string =>
      typeof value === "string" ? value.trim() : "";

    const textBaseUrl = clean(parsed.textBaseUrl);
    const textApiKey = clean(parsed.textApiKey);
    const textModel = clean(parsed.textModel);
    const imageBaseUrl = clean(parsed.imageBaseUrl);
    const imageApiKey = clean(parsed.imageApiKey);
    const imageModel = clean(parsed.imageModel);
    const visionBaseUrl = clean(parsed.visionBaseUrl);
    const visionApiKey = clean(parsed.visionApiKey);
    const visionModel = clean(parsed.visionModel);

    const required = [
      textBaseUrl,
      textApiKey,
      textModel,
      imageBaseUrl,
      imageApiKey,
      imageModel,
      visionBaseUrl,
      visionApiKey,
      visionModel,
    ];
    if (required.some((value) => value === "")) return null;

    return {
      textBaseUrl,
      textApiKey,
      textModel,
      textProvider: readProtocol(parsed.textProvider),
      imageBaseUrl,
      imageApiKey,
      imageModel,
      imageProvider: readProtocol(parsed.imageProvider),
      visionBaseUrl,
      visionApiKey,
      visionModel,
      visionProvider: readProtocol(parsed.visionProvider),
    };
  } catch {
    return null;
  }
}
/**
 * Save the model config to localStorage under STORAGE_KEY.
 *
 * String fields are trimmed before serialization; provider fields are stored
 * as given. Does nothing when there is no `window`, and silently gives up if
 * storage throws.
 */
export function writeStoredModelConfig(config: StoredModelConfig): void {
  if (typeof window === "undefined") return;

  try {
    const { textProvider, imageProvider, visionProvider } = config;
    const payload: StoredModelConfig = {
      textBaseUrl: config.textBaseUrl.trim(),
      textApiKey: config.textApiKey.trim(),
      textModel: config.textModel.trim(),
      textProvider,
      imageBaseUrl: config.imageBaseUrl.trim(),
      imageApiKey: config.imageApiKey.trim(),
      imageModel: config.imageModel.trim(),
      imageProvider,
      visionBaseUrl: config.visionBaseUrl.trim(),
      visionApiKey: config.visionApiKey.trim(),
      visionModel: config.visionModel.trim(),
      visionProvider,
    };
    const serialized = JSON.stringify(payload);
    window.localStorage.setItem(STORAGE_KEY, serialized);
  } catch {
    // Storage disabled / quota / private mode: the config simply is not saved.
  }
}
/** Remove the persisted model config, if any. No-op without a `window`; storage errors are ignored. */
export function clearStoredModelConfig(): void {
  if (typeof window !== "undefined") {
    try {
      window.localStorage.removeItem(STORAGE_KEY);
    } catch {
      // Best effort: nothing to do if storage is unavailable.
    }
  }
}
/**
 * Assemble an EngineConfig from a stored model config and an optional TTS
 * config.
 *
 * @param model Stored model config; `null` means nothing has been configured.
 * @param tts TTS config, or `null` to leave `tts` undefined in the result.
 * @returns Engine config with `mockImage` set to `false`.
 * @throws Error with a user-facing "please configure" message when `model`
 *   is `null`.
 */
export function resolveEngineConfig(
  model: StoredModelConfig | null,
  tts: import("@infiplot/types").TtsConfig | null,
): EngineConfig {
  if (model === null) {
    throw new Error("模型配置未设置。请返回首页,点击「模型设置」配置 API 参数。");
  }

  const text = {
    baseUrl: model.textBaseUrl,
    apiKey: model.textApiKey,
    model: model.textModel,
    provider: model.textProvider,
  };
  const image = {
    baseUrl: model.imageBaseUrl,
    apiKey: model.imageApiKey,
    model: model.imageModel,
    provider: model.imageProvider,
  };
  const vision = {
    baseUrl: model.visionBaseUrl,
    apiKey: model.visionApiKey,
    model: model.visionModel,
    provider: model.visionProvider,
  };

  return { text, image, vision, tts: tts ?? undefined, mockImage: false };
}
-2
View File
@@ -6,8 +6,6 @@ import type {
const VALID_PROTOCOLS = [
"openai_compatible",
"anthropic",
"google",
"openai",
"runware",
] as const;
+77 -3
View File
@@ -3,8 +3,9 @@ import { jsonrepair, JSONRepairError } from "jsonrepair";
// Strict-then-forgiving JSON parser for LLM output. Tries in order:
// 1. Direct JSON.parse on the trimmed text.
// 2. Extract from ```json``` fenced block.
// 3. Slice between first { and last } and parse.
// 4. Apply targeted regex pre-repairs (see preRepair) and try jsonrepair.
// 3. Parse the first complete JSON value prefix (handles duplicated objects).
// 4. Slice between first { and last } and parse.
// 5. Apply targeted regex pre-repairs (see preRepair) and try jsonrepair.
//
// On final failure, logs the first 800 chars of the raw model output so we
// can diagnose the actual syntax error without flooding logs or leaking
@@ -40,6 +41,67 @@ function preRepair(s: string): string {
return s.replace(/"([^"\n:]+):(\s+)"/g, '"$1":$2"');
}
/** Index of the first `{` or `[` in `s`, whichever comes first, or -1 when neither occurs. */
function firstJsonStart(s: string): number {
  const found = [s.indexOf("{"), s.indexOf("[")].filter((idx) => idx !== -1);
  return found.length === 0 ? -1 : Math.min(...found);
}
/**
 * Extract the first bracket-balanced object/array substring from `s`.
 *
 * Scanning starts at the first `{` or `[` and tracks nesting while skipping
 * string literals and their backslash escapes. The substring is returned as
 * soon as the opening bracket is closed, so trailing text (for example a
 * duplicated object) is ignored. The result is only structurally balanced; it
 * is not validated as JSON here.
 *
 * @returns The balanced substring, or `undefined` when there is no opening
 *   bracket, a closer does not match the innermost opener, or the input ends
 *   before the value is closed.
 */
function firstCompleteJsonValue(s: string): string | undefined {
  const begin = firstJsonStart(s);
  if (begin === -1) return undefined;

  // Closers we still expect, innermost last.
  const pendingClosers: string[] = [];
  let insideString = false;
  let afterBackslash = false;

  for (let pos = begin; pos < s.length; pos += 1) {
    const c = s[pos]!;

    if (insideString) {
      if (afterBackslash) {
        afterBackslash = false;
      } else if (c === "\\") {
        afterBackslash = true;
      } else if (c === '"') {
        insideString = false;
      }
      continue;
    }

    switch (c) {
      case '"':
        insideString = true;
        break;
      case "{":
        pendingClosers.push("}");
        break;
      case "[":
        pendingClosers.push("]");
        break;
      case "}":
      case "]":
        // A closer that does not match the innermost opener means the text
        // is not a well-nested value.
        if (pendingClosers.pop() !== c) return undefined;
        if (pendingClosers.length === 0) return s.slice(begin, pos + 1);
        break;
      default:
        break;
    }
  }

  return undefined;
}
/**
 * Parse the first bracket-balanced value found in `s`.
 *
 * @returns The parsed value, or `undefined` when no balanced value is found.
 * @throws SyntaxError (from `JSON.parse`) when the balanced substring is not
 *   valid JSON.
 */
function parseFirstCompleteJsonValue<T>(s: string): T | undefined {
  const candidate = firstCompleteJsonValue(s);
  return candidate ? (JSON.parse(candidate) as T) : undefined;
}
export function parseJsonLoose<T>(raw: string): T {
const trimmed = raw.trim();
@@ -54,10 +116,22 @@ export function parseJsonLoose<T>(raw: string): T {
try {
return JSON.parse(fenced[1]) as T;
} catch {
// fall through
try {
const parsed = parseFirstCompleteJsonValue<T>(fenced[1]);
if (parsed !== undefined) return parsed;
} catch {
// fall through
}
}
}
try {
const parsed = parseFirstCompleteJsonValue<T>(trimmed);
if (parsed !== undefined) return parsed;
} catch {
// fall through
}
const first = trimmed.indexOf("{");
const last = trimmed.lastIndexOf("}");
const slice =
+101
View File
@@ -0,0 +1,101 @@
import {
startSession as startSessionClient,
requestScene as requestSceneClient,
visionDecide as visionDecideClient,
classifyFreeform as classifyFreeformClient,
requestInsertBeat as requestInsertBeatClient,
} from "@infiplot/engine";
import {
readStoredModelConfig,
resolveEngineConfig,
} from "@/lib/clientModelConfig";
import { loadClientTtsConfig } from "@/lib/clientTtsConfig";
import type {
FreeformClassifyRequest,
FreeformClassifyResponse,
EngineConfig,
InsertBeatRequest,
InsertBeatResponse,
SceneRequest,
SceneResponse,
StartRequest,
StartResponse,
VisionRequest,
VisionResponse,
} from "@infiplot/types";
/**
 * Build the engine config from browser-stored settings.
 *
 * @returns The resolved config, or `null` when no model config is stored.
 */
function getClientConfig(): EngineConfig | null {
  const storedModel = readStoredModelConfig();
  const storedTts = loadClientTtsConfig();
  return storedModel ? resolveEngineConfig(storedModel, storedTts) : null;
}
/**
 * POST `body` as JSON to `path` and return the parsed JSON response.
 *
 * @throws Error on a non-OK response. The message is the response body's
 *   `error` field when it parses as JSON and has one, otherwise
 *   `HTTP <status>`.
 */
async function postJson<T>(path: string, body: unknown): Promise<T> {
  const response = await fetch(path, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(body),
  });

  if (response.ok) {
    return response.json() as Promise<T>;
  }

  let detail = `HTTP ${response.status}`;
  try {
    const payload = (await response.json()) as { error?: string };
    if (payload.error) detail = payload.error;
  } catch {
    // Body was not JSON; keep the status-based message.
  }
  throw new Error(detail);
}
// ── Unified entry points ───────────────────────────────────────────────
// When the browser has a BYO model config in localStorage, these call the
// client-side engine directly (talking to providers from the browser).
// Otherwise they fall back to the server-side API routes, which read
// environment variables — useful for Vercel deploys that already supply keys.
/** Start a session: via the client-side engine when a stored model config exists, else via POST /api/start. */
export async function startSession(req: StartRequest): Promise<StartResponse> {
  const byoConfig = getClientConfig();
  return byoConfig
    ? startSessionClient(byoConfig, req)
    : postJson<StartResponse>("/api/start", req);
}
/** Request a scene: via the client-side engine when a stored model config exists, else via POST /api/scene. */
export async function requestScene(req: SceneRequest): Promise<SceneResponse> {
  const byoConfig = getClientConfig();
  return byoConfig
    ? requestSceneClient(byoConfig, req)
    : postJson<SceneResponse>("/api/scene", req);
}
/** Run the vision decision: via the client-side engine when a stored model config exists, else via POST /api/vision. */
export async function visionDecide(req: VisionRequest): Promise<VisionResponse> {
  const byoConfig = getClientConfig();
  return byoConfig
    ? visionDecideClient(byoConfig, req)
    : postJson<VisionResponse>("/api/vision", req);
}
/** Classify freeform input: via the client-side engine when a stored model config exists, else via POST /api/classify-freeform. */
export async function classifyFreeform(
  req: FreeformClassifyRequest,
): Promise<FreeformClassifyResponse> {
  const byoConfig = getClientConfig();
  return byoConfig
    ? classifyFreeformClient(byoConfig, req)
    : postJson<FreeformClassifyResponse>("/api/classify-freeform", req);
}
/** Request an insert beat: via the client-side engine when a stored model config exists, else via POST /api/insert-beat. */
export async function requestInsertBeat(
  req: InsertBeatRequest,
): Promise<InsertBeatResponse> {
  const byoConfig = getClientConfig();
  return byoConfig
    ? requestInsertBeatClient(byoConfig, req)
    : postJson<InsertBeatResponse>("/api/insert-beat", req);
}
+11
View File
@@ -0,0 +1,11 @@
/**
 * Instruction text sent alongside an image to obtain a style-only description.
 * It asks the model to ignore subject matter and to answer with exactly one
 * JSON object of the form {"stylePrompt": "..."}.
 */
export const STYLE_EXTRACTION_PROMPT = `You are a senior concept artist helping describe an image's visual style so that a text-to-image diffusion model (FLUX) can reproduce the same aesthetic on different subjects.
Look at the attached image and produce a single English style-prompt string that captures ONLY its visual style — NOT its subject matter. Focus on:
- Medium / technique (e.g., watercolor, oil painting, cel-shaded anime, 3D render, pixel art)
- Line work and rendering (sharp ink outlines, soft shading, painterly brushstrokes, flat colors)
- Color palette and lighting (pastel, saturated, monochrome, warm golden-hour, cool neon, high contrast)
- Mood and atmosphere (dreamy, melancholic, cinematic, nostalgic, gritty)
- Any recognizable artistic influence (Ghibli, Makoto Shinkai, ukiyo-e, vaporwave, cyberpunk anime, etc.)
Do NOT describe the characters, objects, or scene contents. Output exactly one JSON object:
{"stylePrompt": "<comma-separated English visual-style attributes, ~30-60 words>"}`;
+11 -3
View File
@@ -8,6 +8,16 @@ import type { CharacterVoice, TtsConfig } from "@infiplot/types";
// top-N candidates so multiple similar characters don't collapse onto the
// same voice. Provision is a pure function — no network call needed.
/**
 * Base64-encode the bytes of an ArrayBuffer.
 *
 * Each byte becomes one character of a binary string, which is then passed
 * to `btoa`; no `Buffer` is used.
 */
function arrayBufferToBase64(buffer: ArrayBuffer): string {
  const chars: string[] = [];
  for (const byte of new Uint8Array(buffer)) {
    chars.push(String.fromCharCode(byte));
  }
  return btoa(chars.join(""));
}
const OUTPUT_FORMAT = "mp3";
const OUTPUT_MIME = "audio/mpeg";
@@ -183,8 +193,6 @@ export async function stepfunSynthesize(
}
const ab = await res.arrayBuffer();
// Buffer is fine here — TTS routes run on runtime="nodejs". Falls back to
// btoa+chunks if we ever target Edge.
const audioBase64 = Buffer.from(ab).toString("base64");
const audioBase64 = arrayBufferToBase64(ab);
return { audioBase64, mimeType: OUTPUT_MIME };
}
+4 -8
View File
@@ -327,19 +327,15 @@ export type VisionClassify = "insert-beat" | "change-scene";
* openai_compatible text / vision / image — OpenAI Chat Completions +
* `/images/generations` (self-implemented fetch; the
* default for text/vision when unset)
* anthropic text / vision — native Anthropic Messages (AI SDK)
* google text / vision / image — native Gemini (AI SDK); image
* uses the Nano Banana family
* openai image only — OpenAI gpt-image via AI SDK,
* unlocks reference-image editing (for text/vision use
* openai_compatible, which already speaks OpenAI's format)
* openai image only — OpenAI gpt-image via the
* official OpenAI SDK, unlocks reference-image editing
* (for text/vision use openai_compatible, which already
* speaks OpenAI's format)
* runware image only — Runware task-array protocol
* (self-implemented; the default for runware.ai URLs)
*/
export type ProviderProtocol =
| "openai_compatible"
| "anthropic"
| "google"
| "openai"
| "runware";