ca73a41a0b
Make homepage cards and live sessions produce sound when the server is configured for StepFun TTS, instead of silently failing (the prebaked Xiaomi voice was useless on a StepFun server, and wasted ~220KB/beat in Fast Origin Transfer). Three coordinated changes: 1. CharacterDesigner now picks a StepFun preset voice id directly from the 32-entry catalog in the SAME LLM call that designs the character — zero extra latency, LLM-grade match quality. The Xiaomi prompt path is byte-identical to history (verified programmatically) so cache hit rate and voice quality are preserved. pickStepfunVoiceId (keyword scorer) remains the fallback for orphan speakers / invalid LLM picks. 2. The 32-preset catalog moves to lib/tts-client/stepfun-voices.json as the single source of truth, shared by the scorer, the CharacterDesigner prompt, /api/tts-provider, and the offline enrich script. 3. A new GET /api/tts-provider endpoint lets the client probe the server's TTS provider at /play mount. fetchBeatAudio then shapes its request body: on a StepFun server it sends the lightweight stepfunVoiceId / voiceDescription and omits the ~220KB Xiaomi reference audio (FOT saving ~13MB per protagonist per session on prebaked cards). requestBeatAudio re-provisions on a provider mismatch before synth, so audio never goes silent on a cross-provider replay or mid-session provider flip. New type fields are all optional and backward-compatible: Character.stepfunVoiceId, BeatAudioRequest.voiceDescription/characterName/stepfunVoiceId, voice made optional. AGENTS.md updated for the new route, type fields, dependency map, and StepFun voice-selection flow.
62 lines
2.6 KiB
TypeScript
62 lines
2.6 KiB
TypeScript
import type { CharacterVoice, TtsConfig, TtsProvider } from "@infiplot/types";
|
|
import {
|
|
formatStepfunCatalogForPrompt,
|
|
isStepfun,
|
|
isValidStepfunVoiceId,
|
|
stepfunProvision,
|
|
type StepfunProvisionOptions,
|
|
stepfunSynthesize,
|
|
} from "./stepfun";
|
|
import { xiaomiProvision, xiaomiSynthesize } from "./xiaomi";
|
|
|
|
// Re-export so /api/tts-provider, the orchestrator, the CharacterDesigner
// prompt, and the client all share ONE provider-detection rule, ONE catalog
// rendering, and ONE validity check with the synth path.
|
|
export { isStepfun, isValidStepfunVoiceId, formatStepfunCatalogForPrompt };
|
|
|
|
/**
 * Resolve the provider tag for a configured TTS backend.
 *
 * This is the single home of the inference rule (host contains stepfun.com →
 * "stepfun", otherwise "xiaomi"), so /api/tts-provider and resolveVoice cannot
 * drift apart when a third provider is added.
 *
 * @param cfg - A PRESENT TTS configuration. The "no TTS configured" case
 *   (`null`) is the caller's responsibility to handle separately.
 * @returns The concrete provider tag; never `null`.
 */
export function inferTtsProvider(cfg: TtsConfig): Exclude<TtsProvider, null> {
  if (isStepfun(cfg)) {
    return "stepfun";
  }
  return "xiaomi";
}
|
|
|
|
/**
 * Extra options accepted by {@link provisionVoice}.
 *
 * `stepfunVoiceId` carries the CharacterDesigner's LLM-selected preset down to
 * stepfunProvision; the Xiaomi path ignores it. See StepfunProvisionOptions
 * for the field definitions.
 */
export type ProvisionVoiceOptions = StepfunProvisionOptions;
|
|
|
|
/**
 * Provision a voice for a character using whichever provider `cfg` points at.
 *
 * @param cfg - TTS configuration; `isStepfun(cfg)` decides the provider.
 * @param description - Voice description forwarded to the provider.
 * @param salt - Optional per-character salt (typically the character name).
 *   Only StepFun's preset-picker uses it; Xiaomi voicedesign mints a unique
 *   clip per call regardless. Accepting it here keeps the API uniform and
 *   prevents archetype collisions on the StepFun path.
 * @param opts - Optional StepFun-only provisioning options; not forwarded to
 *   the Xiaomi path.
 * @returns The provisioned character voice.
 */
export async function provisionVoice(
  cfg: TtsConfig,
  description: string,
  salt?: string,
  opts?: ProvisionVoiceOptions,
): Promise<CharacterVoice> {
  // Xiaomi takes only the config and description; salt/opts are StepFun-only.
  if (!isStepfun(cfg)) {
    return xiaomiProvision(cfg, description);
  }
  return stepfunProvision(cfg, description, salt, opts);
}
|
|
|
|
/**
 * Synthesize speech for `text` with the given character voice.
 *
 * Dispatch is keyed on the voice's own `provider` tag, not on the current
 * config: a session can outlive a provider switch (e.g. an .env.local flip
 * mid-game), and each voice must be synthesized via the protocol that minted
 * it. `cfg` still has to point at the matching provider's endpoint; a mismatch
 * surfaces as a plain network error, which `synthesizeBeat` already swallows.
 *
 * @param cfg - TTS configuration forwarded to the provider call.
 * @param voice - The character voice; `voice.provider` selects the backend.
 * @param text - The text to speak.
 * @param delivery - Optional delivery hint, forwarded unchanged.
 * @param signal - Optional abort signal, forwarded unchanged.
 * @returns Base64-encoded audio together with its MIME type.
 */
export async function synthesize(
  cfg: TtsConfig,
  voice: CharacterVoice,
  text: string,
  delivery?: string,
  signal?: AbortSignal,
): Promise<{ audioBase64: string; mimeType: string }> {
  switch (voice.provider) {
    case "stepfun":
      return stepfunSynthesize(cfg, voice, text, delivery, signal);
    default:
      // Every voice not tagged "stepfun" goes through the Xiaomi protocol.
      return xiaomiSynthesize(cfg, voice, text, delivery, signal);
  }
}
|