diff --git a/app/play/page.tsx b/app/play/page.tsx index 1ac9708..4790d3a 100644 --- a/app/play/page.tsx +++ b/app/play/page.tsx @@ -525,7 +525,7 @@ async function resolveByoVoice( return ready; } if (!speaker.voiceDescription) return null; - const p = provisionVoice(cfg, speaker.voiceDescription); + const p = provisionVoice(cfg, speaker.voiceDescription, speaker.name); cache.set(speaker.name, p); try { return await p; diff --git a/lib/engine/agents/characterDesigner.ts b/lib/engine/agents/characterDesigner.ts index 60835c0..6b2a7b7 100644 --- a/lib/engine/agents/characterDesigner.ts +++ b/lib/engine/agents/characterDesigner.ts @@ -103,7 +103,7 @@ export async function provisionCharacterVoice( ): Promise { if (!config.tts) return undefined; try { - return await provisionVoice(config.tts, voiceDescription); + return await provisionVoice(config.tts, voiceDescription, charName); } catch (err) { const msg = err instanceof Error ? err.message : String(err); console.error(`[characterDesigner] voice provision failed for ${charName}: ${msg}`); diff --git a/lib/tts-client/index.ts b/lib/tts-client/index.ts index c4c6515..5ea2ae5 100644 --- a/lib/tts-client/index.ts +++ b/lib/tts-client/index.ts @@ -13,9 +13,14 @@ function isStepfun(cfg: TtsConfig): boolean { export async function provisionVoice( cfg: TtsConfig, description: string, + // Optional per-character salt (typically the character name). Only + // StepFun's preset-picker uses it — Xiaomi voicedesign mints a unique + // clip per call regardless. Threading it through keeps the API uniform + // and prevents archetype collisions on the StepFun path. + salt?: string, ): Promise { return isStepfun(cfg) - ? stepfunProvision(cfg, description) + ? stepfunProvision(cfg, description, salt) : xiaomiProvision(cfg, description); } diff --git a/lib/tts-client/stepfun.ts b/lib/tts-client/stepfun.ts index 00238ac..123056d 100644 --- a/lib/tts-client/stepfun.ts +++ b/lib/tts-client/stepfun.ts @@ -115,19 +115,24 @@ export function pickStepfunVoiceId(description: string, salt = ""): string { // Pick from the top 3 (or fewer) deterministically by hashing the // description + an optional salt (charName) so two characters that share - // archetype keywords don't collapse onto the identical preset. + // archetype keywords don't collapse onto the identical preset. Hash the + // lowercased desc so case differences in the same description don't pick + // different presets (scoring above is already case-insensitive). const top = scored.slice(0, Math.min(3, scored.length)); - const idx = hashStr(description + "|" + salt) % top.length; + const idx = hashStr(desc + "|" + salt.toLowerCase()) % top.length; return top[idx]!.v.id; } // Provision is synchronous / no network — StepFun has no voicedesign equivalent. // We mirror xiaomiProvision's async signature so the router stays uniform. +// The optional `salt` (character name) spreads two characters that share +// archetype keywords across the top-N candidate presets. export async function stepfunProvision( cfg: TtsConfig, description: string, + salt?: string, ): Promise { - const voiceId = pickStepfunVoiceId(description); + const voiceId = pickStepfunVoiceId(description, salt); return { provider: "stepfun", voiceId,