19bbee16fe
Add StepFun step-tts-mini / step-tts-2 / stepaudio-2.5-tts as an alternate
TTS provider alongside Xiaomi MiMo. Auto-detected from TTS_BASE_URL host
(contains `stepfun.com` → StepFun; otherwise → MiMo), mirroring how the
image client infers Runware from `*.runware.ai`.
CharacterVoice becomes a discriminated union on `provider`:
- xiaomi: { referenceAudioBase64, mimeType } — unchanged
- stepfun: { voiceId, model, mimeType } — preset voice ID + chosen model
Provision dispatches on the current cfg's base URL; synthesis dispatches
on the voice's own `provider` tag so a session with mixed voices (e.g. a
provider switch mid-development) routes each beat through the correct
protocol. xiaomiSynthesize now guards against being called with a non-
xiaomi voice, surfacing the bug as a clear runtime error instead of a
TypeScript type-narrowing violation at the access site.
StepFun has no voicedesign equivalent — only preset voices + voice
cloning from a reference audio upload. Cloning would require an extra
asset per character, so v1 maps the LLM's Chinese voiceDescription to one
of the 32 published preset IDs via gender + age + tone keyword scoring,
with a deterministic hash spread across the top-3 candidates so multiple
characters with similar descriptions don't collapse onto the identical
preset. lineDelivery is accepted but not yet propagated to StepFun's
voice_label.emotion / .style fields — left as a follow-up.
beat-audio route validation relaxed from `voice.referenceAudioBase64`
(xiaomi-shaped) to `voice.provider` (shape-agnostic), so stepfun voices
pass the gate; provider-specific shape errors still surface from the
synth function.
Observed latency on InfiPlot's dev loop: StepFun step-tts-mini median
~2.3s per beat with 0% timeouts across the test session, vs MiMo's
median ~8s with the long tail tripping the existing 15s synth budget
on roughly 2 of 3 beats. Pricing: step-tts-mini ¥0.9/万字符 (~¥0.14
per typical 50-beat session) vs MiMo TTS currently free under the
Token Plan creator incentive.
AGENTS.md provider matrix updated to describe both providers and the
discriminated-union dispatch.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
121 lines
3.4 KiB
TypeScript
121 lines
3.4 KiB
TypeScript
import type { CharacterVoice, TtsConfig } from "@infiplot/types";
|
|
|
|
// Xiaomi MiMo currently outputs wav / pcm16 only (mp3 not supported for output).
|
|
// The reference clip we persist is therefore wav. Kept as a single switch so we
|
|
// can flip to mp3 the day the API supports it.
|
|
/** Audio container requested from the API; sent as `audio.format` in every request body built in this file. */
const OUTPUT_FORMAT = "wav";
|
|
/** MIME type reported to callers for both the provisioned reference clip and synthesized audio. */
const OUTPUT_MIME = "audio/wav";
|
|
|
|
/**
 * Assemble the HTTP headers attached to each request made by this module.
 *
 * @param cfg - TTS configuration; only `apiKey` is read here.
 * @returns Headers declaring a JSON body and carrying the key in `api-key`.
 */
function buildHeaders(cfg: TtsConfig): HeadersInit {
  const { apiKey } = cfg;
  const headers: HeadersInit = {
    "Content-Type": "application/json",
    "api-key": apiKey,
  };
  return headers;
}
|
|
|
|
/**
 * Join an API base URL and an endpoint path with exactly one `/` between them.
 *
 * Every trailing slash on `baseUrl` is dropped (not just the last one), and a
 * non-empty `path` that lacks a leading slash gets one, so a sloppily written
 * base URL or path can no longer produce `//chat` or `v1chat`.
 *
 * @param baseUrl - Base URL, with or without trailing slashes.
 * @param path - Endpoint path; an empty string yields the trimmed base URL.
 * @returns The joined URL.
 */
function joinUrl(baseUrl: string, path: string): string {
  const base = baseUrl.replace(/\/+$/, "");
  // An empty path means "the base itself"; do not append a dangling slash.
  const suffix = path === "" || path.startsWith("/") ? path : `/${path}`;
  return `${base}${suffix}`;
}
|
|
|
|
/**
 * Name of the model used for voice-design requests.
 *
 * @param cfg - TTS configuration; only `speechModel` is read.
 * @returns The configured speech model name with `-voicedesign` appended.
 */
function designModel(cfg: TtsConfig): string {
  const baseModel = cfg.speechModel;
  return `${baseModel}-voicedesign`;
}
|
|
|
|
/**
 * Name of the model used for voice-clone (synthesis) requests.
 *
 * @param cfg - TTS configuration; only `speechModel` is read.
 * @returns The configured speech model name with `-voiceclone` appended.
 */
function cloneModel(cfg: TtsConfig): string {
  const baseModel = cfg.speechModel;
  return `${baseModel}-voiceclone`;
}
|
|
|
|
/**
 * Shape of the JSON body this module reads from a `/chat/completions` reply.
 *
 * Every field is optional, so each access has to be null-checked: the audio
 * payload sits at `choices[0].message.audio.data`, while `error.message` and
 * the top-level `message` are the places an error description is looked for.
 */
interface ChatAudioResponse {
  choices?: { message?: { audio?: { data?: string } } }[];
  error?: { message?: string };
  message?: string;
}
|
|
|
|
/**
 * Pull the base64 audio payload out of a chat-completions reply.
 *
 * The argument originates from an unchecked cast of parsed JSON, so it is
 * treated defensively: a `null`/non-object body or a non-string error message
 * still produces the descriptive error below instead of a raw `TypeError`.
 *
 * @param json - Parsed response body.
 * @param where - Label for the calling operation, embedded in the error text.
 * @returns The audio data found at `choices[0].message.audio.data`.
 * @throws Error when no audio is present; the message carries the first 300
 *   characters of the API's error text, or of the serialized body if none.
 */
function extractAudio(json: ChatAudioResponse, where: string): string {
  // The static type cannot be trusted at runtime (e.g. a literal `null` body).
  const payload: ChatAudioResponse | null | undefined = json;
  const data = payload?.choices?.[0]?.message?.audio?.data;
  if (!data) {
    const detail: unknown =
      payload?.error?.message ?? payload?.message ?? JSON.stringify(payload);
    // Coerce so `.slice` is always safe, even for numeric/object error messages.
    const err = typeof detail === "string" ? detail : String(JSON.stringify(detail));
    throw new Error(`Xiaomi ${where} returned no audio: ${err.slice(0, 300)}`);
  }
  return data;
}
|
|
|
|
/**
 * Provision a character voice by having the voice-design model render a short
 * fixed sample line in the described voice; the returned audio becomes the
 * reference clip stored on the voice.
 *
 * @param cfg - TTS configuration (base URL, API key, speech model name).
 * @param description - Voice description, sent as the `user` message.
 * @param signal - Optional abort signal forwarded to `fetch`, so callers can
 *   cancel or time-box provisioning the same way they can for synthesis.
 * @returns A `xiaomi`-tagged voice holding the base64 sample and its MIME type.
 * @throws Error when the HTTP status is not OK or the reply carries no audio.
 */
export async function xiaomiProvision(
  cfg: TtsConfig,
  description: string,
  signal?: AbortSignal,
): Promise<CharacterVoice> {
  const url = joinUrl(cfg.baseUrl, "/chat/completions");

  const body = {
    model: designModel(cfg),
    messages: [
      { role: "user", content: description },
      // Fixed sample line (roughly "Hello, this is a voice audition sample.").
      { role: "assistant", content: "你好,这是音色试听样本。" },
    ],
    audio: { format: OUTPUT_FORMAT },
  };

  const res = await fetch(url, {
    method: "POST",
    headers: buildHeaders(cfg),
    body: JSON.stringify(body),
    signal,
  });

  if (!res.ok) {
    const text = await res.text();
    throw new Error(`Xiaomi voicedesign ${res.status}: ${text.slice(0, 300)}`);
  }

  const json = (await res.json()) as ChatAudioResponse;
  const referenceAudioBase64 = extractAudio(json, "voicedesign");

  return { provider: "xiaomi", referenceAudioBase64, mimeType: OUTPUT_MIME };
}
|
|
|
|
/**
 * Synthesize one line of speech in a previously provisioned Xiaomi voice.
 *
 * @param cfg - TTS configuration (base URL, API key, speech model name).
 * @param voice - Character voice; must carry the `xiaomi` provider tag.
 * @param text - The words to be spoken.
 * @param delivery - Optional free-form performance direction.
 * @param signal - Optional abort signal forwarded to `fetch`.
 * @returns The synthesized audio as base64 together with its MIME type.
 * @throws Error when the voice is not a Xiaomi voice, when the HTTP status is
 *   not OK, or when the reply carries no audio.
 */
export async function xiaomiSynthesize(
  cfg: TtsConfig,
  voice: CharacterVoice,
  text: string,
  delivery?: string,
  signal?: AbortSignal,
): Promise<{ audioBase64: string; mimeType: string }> {
  // Reject voices provisioned by another provider before touching their fields.
  if (voice.provider !== "xiaomi") {
    const complaint = `xiaomiSynthesize received non-xiaomi voice (provider="${voice.provider}")`;
    throw new Error(complaint);
  }

  const endpoint = joinUrl(cfg.baseUrl, "/chat/completions");

  // Direction goes in the `user` (director) message so it steers the
  // performance without being read aloud; the `assistant` message holds only
  // the text to speak.
  const direction = delivery?.trim() ?? "";
  const referenceClip = `data:${voice.mimeType};base64,${voice.referenceAudioBase64}`;

  const payload = {
    model: cloneModel(cfg),
    messages: [
      { role: "user", content: direction },
      { role: "assistant", content: text },
    ],
    audio: { format: OUTPUT_FORMAT, voice: referenceClip },
  };

  const response = await fetch(endpoint, {
    method: "POST",
    headers: buildHeaders(cfg),
    body: JSON.stringify(payload),
    signal,
  });

  if (response.ok) {
    const parsed = (await response.json()) as ChatAudioResponse;
    return { audioBase64: extractAudio(parsed, "voiceclone"), mimeType: OUTPUT_MIME };
  }

  const failure = await response.text();
  throw new Error(`Xiaomi voiceclone ${response.status}: ${failure.slice(0, 300)}`);
}
|