From 375f401c8fd2ac3592c78861bbc7976e75a9c5eb Mon Sep 17 00:00:00 2001
From: yuanzonghao <yuanzonghao123@gmail.com>
Date: Mon, 15 Jun 2026 13:05:36 +0800
Subject: [PATCH] fix(tts): persist stepfunVoiceId on Character + harden probe
 race
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two follow-ups from pr-agent review of #79:

1. director.ts voicePromises built a Character WITHOUT stepfunVoiceId, so
   on a StepFun server the client (which omits the voice payload to save
   FOT) echoed back only voiceDescription — and the server re-scored via
   pickStepfunVoiceId on every beat instead of honoring the LLM pick. The
   whole "CharacterDesigner picks a preset id" mechanism was effectively
   bypassed on live StepFun sessions (it only worked for prebaked cards,
   which carry stepfunVoiceId in their JSON). Persist stepfunVoiceId onto
   the Character so the client→server round-trip keeps the LLM selection.

2. fetchBeatAudio's null-provider branch (probe pending) required
   speaker.voice and silently dropped a stepfun-only speaker. Accept any
   synthesizable source (voice | stepfunVoiceId | voiceDescription) so a
   slow getTtsProvider probe can't drop audio during the first scene's
   fetch window. The server's resolveVoice normalizes regardless of which
   fields arrive.
---
 app/play/page.tsx      | 9 ++++++---
 lib/engine/director.ts | 4 ++++
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/app/play/page.tsx b/app/play/page.tsx
index acc9d20..bbcf259 100644
--- a/app/play/page.tsx
+++ b/app/play/page.tsx
@@ -879,14 +879,17 @@ function PlayInner() {
       //   - BYO (xiaomi): baked voice OR voiceDescription to provision locally.
       //   - Server stepfun: stepfunVoiceId or voiceDescription — no Xiaomi
       //     `voice` needed (saves the ~220KB reference-audio FOT).
-      //   - Server xiaomi / unknown: rely on speaker.voice (the server will
-      //     normalize if provider mismatch — but we still need *something*).
+      //   - Server xiaomi / unknown (probe pending): accept ANY synthesizable
+      //     source. The null case covers the race where getTtsProvider hasn't
+      //     resolved before the first beat fetch fires — without this widening
+      //     a stepfun-only speaker (no Xiaomi voice) would be silently dropped.
+      //     The server resolves + normalizes regardless of which fields arrive.
       if (byo) {
         if (!speaker.voice && !speaker.voiceDescription) return;
       } else if (serverProvider === "stepfun") {
         if (!speaker.stepfunVoiceId && !speaker.voiceDescription) return;
       } else {
-        if (!speaker.voice) return;
+        if (!speaker.voice && !speaker.stepfunVoiceId && !speaker.voiceDescription) return;
       }
 
       if (beatAudioAbortRef.current.has(beat.id)) return;
diff --git a/lib/engine/director.ts b/lib/engine/director.ts
index ab997f3..b2a89a1 100644
--- a/lib/engine/director.ts
+++ b/lib/engine/director.ts
@@ -308,6 +308,9 @@ export async function directScene(
   // On the StepFun path, thread the LLM-selected stepfunVoiceId from the card
   // into provision — it lets stepfunProvision honor the catalog pick instead
   // of falling back to the keyword scorer (same network cost: still zero).
+  // ALSO persist it onto the Character so the client can echo it back on a
+  // StepFun server (where it skips the ~220KB voice payload) and the server
+  // resolveVoice honors the LLM pick at synth time instead of re-scoring.
   const voicePromises = cards.map((card) =>
     provisionCharacterVoice(config, card.voiceDescription, card.name, {
       stepfunVoiceId: card.stepfunVoiceId,
@@ -316,6 +319,7 @@ export async function directScene(
         name: card.name,
         voiceDescription: card.voiceDescription,
         voice,
+        stepfunVoiceId: card.stepfunVoiceId,
       }),
     ),
   );