import type { CharacterVoice, TtsConfig } from "@infiplot/types"; import { stepfunProvision, stepfunSynthesize } from "./stepfun"; import { xiaomiProvision, xiaomiSynthesize } from "./xiaomi"; // Provider auto-detection by base URL — mirrors the image client convention // of inferring Runware from *.runware.ai and falling back otherwise. Keeps // the BYO client flow unchanged: TTS_PROVIDER env var stays unused, and // browser-side keys (Xiaomi only today) keep working through the xiaomi path. function isStepfun(cfg: TtsConfig): boolean { return /(^|[./])stepfun\.com\b/i.test(cfg.baseUrl); } export async function provisionVoice( cfg: TtsConfig, description: string, // Optional per-character salt (typically the character name). Only // StepFun's preset-picker uses it — Xiaomi voicedesign mints a unique // clip per call regardless. Threading it through keeps the API uniform // and prevents archetype collisions on the StepFun path. salt?: string, ): Promise { return isStepfun(cfg) ? stepfunProvision(cfg, description, salt) : xiaomiProvision(cfg, description); } // Dispatch by the voice's own provider tag, not by the current config. A // session can outlive a provider switch (e.g. .env.local flip mid-game), and // each voice must be synthesized via the protocol that minted it. The cfg // still needs to point at the matching provider's endpoint; mismatch surfaces // as a transparent network error, which `synthesizeBeat` already swallows. export async function synthesize( cfg: TtsConfig, voice: CharacterVoice, text: string, delivery?: string, signal?: AbortSignal, ): Promise<{ audioBase64: string; mimeType: string }> { if (voice.provider === "stepfun") { return stepfunSynthesize(cfg, voice, text, delivery, signal); } return xiaomiSynthesize(cfg, voice, text, delivery, signal); }