From 375f401c8fd2ac3592c78861bbc7976e75a9c5eb Mon Sep 17 00:00:00 2001 From: yuanzonghao Date: Mon, 15 Jun 2026 13:05:36 +0800 Subject: [PATCH] fix(tts): persist stepfunVoiceId on Character + harden probe race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two follow-ups from pr-agent review of #79: 1. director.ts voicePromises built a Character WITHOUT stepfunVoiceId, so on a StepFun server the client (which omits the voice payload to save FOT) echoed back only voiceDescription — and the server re-scored via pickStepfunVoiceId every beat instead of honoring the LLM pick. The whole "CharacterDesigner picks a preset id" mechanism was effectively bypassed on live StepFun sessions (it only worked for prebaked cards, which carry stepfunVoiceId in their JSON). Persist stepfunVoiceId onto the Character so the client→server round-trip keeps the LLM selection. 2. fetchBeatAudio's null-provider branch (probe pending) required speaker.voice and silently dropped a stepfun-only speaker. Accept any synthesizable source (voice | stepfunVoiceId | voiceDescription) so a slow getTtsProvider probe can't drop audio during the first scene's fetch window. The server resolveVoice normalizes regardless of which fields arrive. --- app/play/page.tsx | 9 ++++++--- lib/engine/director.ts | 4 ++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/app/play/page.tsx b/app/play/page.tsx index acc9d20..bbcf259 100644 --- a/app/play/page.tsx +++ b/app/play/page.tsx @@ -879,14 +879,17 @@ function PlayInner() { // - BYO (xiaomi): baked voice OR voiceDescription to provision locally. // - Server stepfun: stepfunVoiceId or voiceDescription — no Xiaomi // `voice` needed (saves the ~220KB reference-audio FOT). - // - Server xiaomi / unknown: rely on speaker.voice (the server will - // normalize if provider mismatch — but we still need *something*). + // - Server xiaomi / unknown (probe pending): accept ANY synthesizable + // source. The null case covers the race where getTtsProvider hasn't + // resolved before the first beat fetch fires — without this widening + // a stepfun-only speaker (no Xiaomi voice) would be silently dropped. + // The server resolves + normalizes regardless of which fields arrive. if (byo) { if (!speaker.voice && !speaker.voiceDescription) return; } else if (serverProvider === "stepfun") { if (!speaker.stepfunVoiceId && !speaker.voiceDescription) return; } else { - if (!speaker.voice) return; + if (!speaker.voice && !speaker.stepfunVoiceId && !speaker.voiceDescription) return; } if (beatAudioAbortRef.current.has(beat.id)) return; diff --git a/lib/engine/director.ts b/lib/engine/director.ts index ab997f3..b2a89a1 100644 --- a/lib/engine/director.ts +++ b/lib/engine/director.ts @@ -308,6 +308,9 @@ export async function directScene( // On the StepFun path, thread the LLM-selected stepfunVoiceId from the card // into provision — it lets stepfunProvision honor the catalog pick instead // of falling back to the keyword scorer (same network cost: still zero). + // ALSO persist it onto the Character so the client can echo it back on a + // StepFun server (where it skips the ~220KB voice payload) and the server + // resolveVoice honors the LLM pick at synth time instead of re-scoring. const voicePromises = cards.map((card) => provisionCharacterVoice(config, card.voiceDescription, card.name, { stepfunVoiceId: card.stepfunVoiceId, @@ -316,6 +319,7 @@ export async function directScene( name: card.name, voiceDescription: card.voiceDescription, voice, + stepfunVoiceId: card.stepfunVoiceId, }), ), );