fix(play): restore server TTS, FOT strip/merge, nudge, and blob cleanup
Reverts the regressions from b63b694 on the server-fallback path:
P0 — fetchBeatAudio non-BYO branch was a bare return; every non-BYO
user got silent playback regardless of server TTS config. Re-connect
to /api/beat-audio with the beatAudioAbortRef signal; count 204, !ok,
an empty 200 body, and non-abort throws on the server path as silence
strikes; on success create a blob URL and reset the strike counter.
P1 — stripVoicesForTransport + mergeCharactersPreserveVoice were
deleted, so the server-fallback path re-sent ~160KB
referenceAudioBase64 per character on every request AND lost voices
for already-known characters after scene 1. Re-add both, applied
ONLY on the server-fallback branches in engineClient.ts (BYO
client-direct path untouched).
P3 — the aborted-before-store blob URL race had no revoke, leaking
one blob URL per cancelled synth. Re-add the else-if revoke.
P2 — handleSettingsSaved ignored ttsConfigured, so a BYO key entered
mid-session only took effect after a page reload. Re-add the ref/state
refresh + audio re-prefetch. Also restore the silence-nudge UI
(silenceStrikes counter, SILENCE_NUDGE_THRESHOLD, dismissible pill
beside the mute toggle) that surfaces BYO-key guidance when the
shared server key is being rate-limited.
Verified live: /api/beat-audio now returns 200 (was 0 calls under
the bug); audio plays after synth completes.
This commit is contained in:
+118
-4
@@ -57,6 +57,11 @@ import { UserChip } from "@/components/UserChip";
|
|||||||
|
|
||||||
const MUTED_STORAGE_KEY = "infiplot:muted";
|
const MUTED_STORAGE_KEY = "infiplot:muted";
|
||||||
|
|
||||||
|
// Consecutive silent (no-audio) beats before we surface the BYO-key nudge to a
|
||||||
|
// non-BYO, unmuted player. Set high enough that one transient miss won't trip
|
||||||
|
// it, low enough to catch a scene that's clearly being rate-limited.
|
||||||
|
const SILENCE_NUDGE_THRESHOLD = 3;
|
||||||
|
|
||||||
// Mobile-portrait users get a 9:16 scene image painted for them; everyone else
|
// Mobile-portrait users get a 9:16 scene image painted for them; everyone else
|
||||||
// (desktop, tablet, mobile-landscape) keeps the 16:9 landscape image. Only a
|
// (desktop, tablet, mobile-landscape) keeps the 16:9 landscape image. Only a
|
||||||
// touch device (coarse pointer) held upright counts as "portrait" — a mouse
|
// touch device (coarse pointer) held upright counts as "portrait" — a mouse
|
||||||
@@ -603,6 +608,12 @@ function PlayInner() {
|
|||||||
const [orientation, setOrientation] = useState<Orientation>("landscape");
|
const [orientation, setOrientation] = useState<Orientation>("landscape");
|
||||||
const [lastExitLabel, setLastExitLabel] = useState<string | null>(null);
|
const [lastExitLabel, setLastExitLabel] = useState<string | null>(null);
|
||||||
// Consecutive server-side TTS misses (null audio / failed /api/beat-audio).
|
// Consecutive server-side TTS misses (null audio / failed /api/beat-audio).
|
||||||
|
// Climbs when the shared server key is rate-limited by MiMo — the exact pain
|
||||||
|
// BYO fixes — so the play page can nudge non-BYO users to add their own key.
|
||||||
|
// Reset to 0 on any successful server synth, and when the player switches to
// a BYO key in settings. Only the server path ever increments it.
|
||||||
|
const [silenceStrikes, setSilenceStrikes] = useState(0);
|
||||||
|
// Once the player dismisses the silence nudge, keep it gone for this session.
|
||||||
|
const [nudgeDismissed, setNudgeDismissed] = useState(false);
|
||||||
const [settingsOpen, setSettingsOpen] = useState(false);
|
const [settingsOpen, setSettingsOpen] = useState(false);
|
||||||
const [visionClickEnabled, setVisionClickEnabled] = useState(true);
|
const [visionClickEnabled, setVisionClickEnabled] = useState(true);
|
||||||
const [authModalOpen, setAuthModalOpen] = useState(false);
|
const [authModalOpen, setAuthModalOpen] = useState(false);
|
||||||
@@ -855,8 +866,39 @@ function PlayInner() {
|
|||||||
);
|
);
|
||||||
audioUrl = `data:${out.mimeType};base64,${out.audioBase64}`;
|
audioUrl = `data:${out.mimeType};base64,${out.audioBase64}`;
|
||||||
} else {
|
} else {
|
||||||
// No TTS configured — silent.
|
// Server-side synth: POST just this beat + the speaker's voice (not
|
||||||
return;
|
// the whole session) to /api/beat-audio. Returns 204 when the engine
|
||||||
|
// had nothing to say (no TTS configured / empty synth) and binary
|
||||||
|
// audio otherwise. Both 204 and !ok count as a silence strike so the
|
||||||
|
// nudge surfaces when the shared server key is being rate-limited.
|
||||||
|
const res = await fetch("/api/beat-audio", {
|
||||||
|
method: "POST",
|
||||||
|
headers: { "Content-Type": "application/json" },
|
||||||
|
body: JSON.stringify({
|
||||||
|
beat: { id: beat.id, line: beat.line, lineDelivery: beat.lineDelivery },
|
||||||
|
voice: speaker.voice,
|
||||||
|
}),
|
||||||
|
signal: abort.signal,
|
||||||
|
});
|
||||||
|
if (res.status === 204) {
|
||||||
|
setSilenceStrikes((n) => Math.min(n + 1, 99));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (!res.ok) {
|
||||||
|
setSilenceStrikes((n) => Math.min(n + 1, 99));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const blob = await res.blob();
|
||||||
|
// Defensive: a 200 with an empty body (proxy/CDN truncation,
|
||||||
|
// framework edge cases) would create a silent blob URL and wrongly
|
||||||
|
// reset the silence counter. Treat empty as a miss so the nudge
|
||||||
|
// still surfaces when the shared key is being rate-limited.
|
||||||
|
if (blob.size === 0) {
|
||||||
|
setSilenceStrikes((n) => Math.min(n + 1, 99));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
audioUrl = URL.createObjectURL(blob);
|
||||||
|
setSilenceStrikes(0);
|
||||||
}
|
}
|
||||||
// Skip the state write if we've been aborted between the await and
|
// Skip the state write if we've been aborted between the await and
|
||||||
// here — beat ids are scene-local, so a late arrival from a prior
|
// here — beat ids are scene-local, so a late arrival from a prior
|
||||||
@@ -864,9 +906,23 @@ function PlayInner() {
|
|||||||
// same id.
|
// same id.
|
||||||
if (audioUrl && !abort.signal.aborted) {
|
if (audioUrl && !abort.signal.aborted) {
|
||||||
setBeatAudioMap((m) => ({ ...m, [beat.id]: audioUrl }));
|
setBeatAudioMap((m) => ({ ...m, [beat.id]: audioUrl }));
|
||||||
|
} else if (audioUrl?.startsWith("blob:")) {
|
||||||
|
// Aborted between synth and store — revoke the blob URL we just
|
||||||
|
// created so it doesn't leak. (Scene-change and mute transitions
|
||||||
|
// revoke stored URLs separately; this only covers this race.)
|
||||||
|
URL.revokeObjectURL(audioUrl);
|
||||||
}
|
}
|
||||||
} catch {
|
} catch {
|
||||||
// aborted / network / Xiaomi rate-limit — silent fallback (no audio)
|
// aborted (scene change / mute) — silent fallback, NOT a strike.
|
||||||
|
// Network failure / server 5xx / shared-key rate-limit that surfaces
|
||||||
|
// as a thrown error on the server path DOES count — otherwise the
|
||||||
|
// silence nudge would never fire for those cases (the explicit 204/
|
||||||
|
// !ok/empty-blob branches above only cover responses, not throws).
|
||||||
|
// BYO throws are the user's own key quota, not the shared-key pain
|
||||||
|
// the nudge addresses, so they don't count.
|
||||||
|
if (!abort.signal.aborted && !byo) {
|
||||||
|
setSilenceStrikes((n) => Math.min(n + 1, 99));
|
||||||
|
}
|
||||||
} finally {
|
} finally {
|
||||||
// Only clear the slot if it's still ours. An aborted prior fetch
|
// Only clear the slot if it's still ours. An aborted prior fetch
|
||||||
// running its finally late could otherwise delete the controller of a
|
// running its finally late could otherwise delete the controller of a
|
||||||
@@ -956,8 +1012,27 @@ function PlayInner() {
|
|||||||
setVisionClickEnabled(settings.visionClickEnabled);
|
setVisionClickEnabled(settings.visionClickEnabled);
|
||||||
const nextPlayerName = settings.playerName || undefined;
|
const nextPlayerName = settings.playerName || undefined;
|
||||||
setSession((prev) => prev ? { ...prev, playerName: nextPlayerName } : prev);
|
setSession((prev) => prev ? { ...prev, playerName: nextPlayerName } : prev);
|
||||||
|
// Refresh the BYO TTS config so a key entered mid-session takes effect
|
||||||
|
// immediately — byoTtsRef is otherwise only read once at mount.
|
||||||
|
const cfg = settings.ttsConfigured ? loadClientTtsConfig() : null;
|
||||||
|
byoTtsRef.current = cfg;
|
||||||
|
setByoTtsConfig(cfg);
|
||||||
|
if (cfg) {
|
||||||
|
// Switching to BYO: any server-path audio in flight is now stale,
|
||||||
|
// and the silence nudge is no longer relevant. Abort + clear, then
|
||||||
|
// re-synth the current scene with the user's own key.
|
||||||
|
setSilenceStrikes(0);
|
||||||
|
cancelBeatAudioFetches();
|
||||||
|
setBeatAudioMap((prev) => {
|
||||||
|
for (const url of Object.values(prev)) {
|
||||||
|
if (url.startsWith("blob:")) URL.revokeObjectURL(url);
|
||||||
|
}
|
||||||
|
return {};
|
||||||
|
});
|
||||||
|
prefetchSceneAudio();
|
||||||
|
}
|
||||||
},
|
},
|
||||||
[],
|
[prefetchSceneAudio],
|
||||||
);
|
);
|
||||||
|
|
||||||
function detachRecordedReplay(): void {
|
function detachRecordedReplay(): void {
|
||||||
@@ -2185,6 +2260,16 @@ function PlayInner() {
|
|||||||
const sceneCount = session?.history.length ?? 0;
|
const sceneCount = session?.history.length ?? 0;
|
||||||
const beatCount = visitedBeatsRef.current.length;
|
const beatCount = visitedBeatsRef.current.length;
|
||||||
|
|
||||||
|
// Surface the BYO-key nudge only to an unmuted, non-BYO player whose last few
|
||||||
|
// beats came back silent (shared key rate-limited) — the exact pain BYO fixes.
|
||||||
|
// Dismissible for the session.
|
||||||
|
const showSilenceNudge =
|
||||||
|
phase === "ready" &&
|
||||||
|
!muted &&
|
||||||
|
!byoTtsConfig &&
|
||||||
|
!nudgeDismissed &&
|
||||||
|
silenceStrikes >= SILENCE_NUDGE_THRESHOLD;
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<div className="min-h-screen flex flex-col">
|
<div className="min-h-screen flex flex-col">
|
||||||
{exportProgress && (
|
{exportProgress && (
|
||||||
@@ -2294,6 +2379,35 @@ function PlayInner() {
|
|||||||
/>
|
/>
|
||||||
{muted ? "静 · 音" : "有 · 声"}
|
{muted ? "静 · 音" : "有 · 声"}
|
||||||
</button>
|
</button>
|
||||||
|
|
||||||
|
{/* Silence nudge — a compact pill right beside the mute toggle.
|
||||||
|
Triggers when the shared server key keeps coming back silent,
|
||||||
|
which usually means it's rate-limited; nudges the player to
|
||||||
|
enter their own API Key for a more stable experience.
|
||||||
|
Clicking opens the settings modal in place; the × dismisses
|
||||||
|
it for the session. */}
|
||||||
|
{showSilenceNudge && (
|
||||||
|
<span className="flex items-center gap-1 animate-fade-in">
|
||||||
|
<button
|
||||||
|
type="button"
|
||||||
|
onClick={() => setSettingsOpen(true)}
|
||||||
|
className="inline-flex items-center gap-1.5 rounded-full border border-ember-500/40 bg-ember-500/10 px-2.5 py-1 text-[10px] text-ember-500 hover:bg-ember-500/20 transition-colors"
|
||||||
|
title="效果不满意/经常没声音?填入自己的 API Key 试试"
|
||||||
|
>
|
||||||
|
<i className="fa-solid fa-volume-xmark text-[9px]" />
|
||||||
|
效果不满意/经常没声音?填入自己的 API Key 试试
|
||||||
|
</button>
|
||||||
|
<button
|
||||||
|
type="button"
|
||||||
|
onClick={() => setNudgeDismissed(true)}
|
||||||
|
aria-label="关闭提示"
|
||||||
|
title="关闭"
|
||||||
|
className="text-clay-400 hover:text-clay-700 transition-colors"
|
||||||
|
>
|
||||||
|
<i className="fa-solid fa-xmark text-[10px]" />
|
||||||
|
</button>
|
||||||
|
</span>
|
||||||
|
)}
|
||||||
</>
|
</>
|
||||||
}
|
}
|
||||||
/>
|
/>
|
||||||
|
|||||||
+59
-4
@@ -11,6 +11,7 @@ import {
|
|||||||
} from "@/lib/clientModelConfig";
|
} from "@/lib/clientModelConfig";
|
||||||
import { loadClientTtsConfig } from "@/lib/clientTtsConfig";
|
import { loadClientTtsConfig } from "@/lib/clientTtsConfig";
|
||||||
import type {
|
import type {
|
||||||
|
Character,
|
||||||
FreeformClassifyRequest,
|
FreeformClassifyRequest,
|
||||||
FreeformClassifyResponse,
|
FreeformClassifyResponse,
|
||||||
EngineConfig,
|
EngineConfig,
|
||||||
@@ -18,6 +19,7 @@ import type {
|
|||||||
InsertBeatResponse,
|
InsertBeatResponse,
|
||||||
SceneRequest,
|
SceneRequest,
|
||||||
SceneResponse,
|
SceneResponse,
|
||||||
|
Session,
|
||||||
StartRequest,
|
StartRequest,
|
||||||
StartResponse,
|
StartResponse,
|
||||||
VisionRequest,
|
VisionRequest,
|
||||||
@@ -58,6 +60,39 @@ async function postJson<T>(path: string, body: unknown): Promise<T> {
|
|||||||
return res.json() as Promise<T>;
|
return res.json() as Promise<T>;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── FOT reduction helpers (server-fallback path only) ─────────────────
|
||||||
|
// The server-fallback POSTs send the whole Session over the wire. Voice
|
||||||
|
// data is bulky (~160KB/character via referenceAudioBase64) and the
|
||||||
|
// scene-generation / vision / classify pipelines never need it — voices
|
||||||
|
// are only consumed by /api/beat-audio, which receives them directly, not
|
||||||
|
// via the session. So strip voices before transport.
|
||||||
|
/**
 * Builds the copy of a session that is sent over the wire on the
 * server-fallback path, with every character's `voice` removed.
 *
 * Voice data is bulky (roughly 160KB per character via
 * `referenceAudioBase64`), and the callers in this file only use this helper
 * for session-carrying POSTs whose responses do not depend on it.
 *
 * @param session - The client-side session. It is not mutated.
 * @returns A shallow copy of `session` whose `characters` entries are shallow
 *   copies without a `voice` key.
 */
function stripVoicesForTransport(session: Session): Session {
  // Destructure `voice` out so the key is omitted from the serialized payload
  // entirely (it is optional on Character), rather than being sent as
  // undefined/null.
  const characters = session.characters.map(({ voice: _voice, ...rest }) => rest);
  return { ...session, characters };
}
|
||||||
|
|
||||||
|
// The server strips voice from already-known characters before responding
|
||||||
|
// (see /api/scene stripKnownVoices and /api/insert-beat's blanket strip) to
|
||||||
|
// save bandwidth, so only NEW characters carry voice in the response. For
|
||||||
|
// existing characters, re-attach the voice the client already holds locally.
|
||||||
|
/**
 * Re-attaches locally held voices to the characters returned by the server.
 *
 * Server responses on the fallback path may omit `voice` for characters the
 * client already knows, so only new characters are guaranteed to carry one.
 * For each remote character that also exists locally (matched by `name`),
 * the remote voice wins if present; otherwise the local voice is reused.
 *
 * @param local - Characters the client currently holds. Not mutated.
 * @param remote - Characters from the server response. Not mutated.
 * @returns A new array in `remote` order. Characters unknown locally are
 *   returned as-is; known ones are shallow copies with `voice` filled in.
 */
function mergeCharactersPreserveVoice(
  local: readonly Character[],
  remote: readonly Character[],
): Character[] {
  // Index local characters by name; on duplicate names the last entry wins.
  const localByName = new Map(local.map((c) => [c.name, c]));
  return remote.map((c) => {
    const prev = localByName.get(c.name);
    // A character the client has never seen: keep whatever the server sent.
    if (!prev) return c;
    // Prefer the server's voice; fall back to the one already held locally.
    return { ...c, voice: c.voice ?? prev.voice };
  });
}
|
||||||
|
|
||||||
// ── Unified entry points ───────────────────────────────────────────────
|
// ── Unified entry points ───────────────────────────────────────────────
|
||||||
// When the browser has a BYO model config in localStorage, these call the
|
// When the browser has a BYO model config in localStorage, these call the
|
||||||
// client-side engine directly (talking to providers from the browser).
|
// client-side engine directly (talking to providers from the browser).
|
||||||
@@ -77,7 +112,14 @@ export async function requestScene(req: SceneRequest): Promise<SceneResponse> {
|
|||||||
if (config) {
|
if (config) {
|
||||||
return requestSceneClient(config, req);
|
return requestSceneClient(config, req);
|
||||||
}
|
}
|
||||||
return postJson<SceneResponse>("/api/scene", req);
|
const data = await postJson<SceneResponse>("/api/scene", {
|
||||||
|
...req,
|
||||||
|
session: stripVoicesForTransport(req.session),
|
||||||
|
});
|
||||||
|
// Server stripped known-character voices for bandwidth — re-attach the
|
||||||
|
// voices we already hold so fetchBeatAudio can synth them.
|
||||||
|
data.characters = mergeCharactersPreserveVoice(req.session.characters, data.characters);
|
||||||
|
return data;
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function visionDecide(req: VisionRequest): Promise<VisionResponse> {
|
export async function visionDecide(req: VisionRequest): Promise<VisionResponse> {
|
||||||
@@ -85,7 +127,10 @@ export async function visionDecide(req: VisionRequest): Promise<VisionResponse>
|
|||||||
if (config) {
|
if (config) {
|
||||||
return visionDecideClient(config, req);
|
return visionDecideClient(config, req);
|
||||||
}
|
}
|
||||||
return postJson<VisionResponse>("/api/vision", req);
|
return postJson<VisionResponse>("/api/vision", {
|
||||||
|
...req,
|
||||||
|
session: stripVoicesForTransport(req.session),
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function classifyFreeform(
|
export async function classifyFreeform(
|
||||||
@@ -95,7 +140,10 @@ export async function classifyFreeform(
|
|||||||
if (config) {
|
if (config) {
|
||||||
return classifyFreeformClient(config, req);
|
return classifyFreeformClient(config, req);
|
||||||
}
|
}
|
||||||
return postJson<FreeformClassifyResponse>("/api/classify-freeform", req);
|
return postJson<FreeformClassifyResponse>("/api/classify-freeform", {
|
||||||
|
...req,
|
||||||
|
session: stripVoicesForTransport(req.session),
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function requestInsertBeat(
|
export async function requestInsertBeat(
|
||||||
@@ -105,5 +153,12 @@ export async function requestInsertBeat(
|
|||||||
if (config) {
|
if (config) {
|
||||||
return requestInsertBeatClient(config, req);
|
return requestInsertBeatClient(config, req);
|
||||||
}
|
}
|
||||||
return postJson<InsertBeatResponse>("/api/insert-beat", req);
|
const data = await postJson<InsertBeatResponse>("/api/insert-beat", {
|
||||||
|
...req,
|
||||||
|
session: stripVoicesForTransport(req.session),
|
||||||
|
});
|
||||||
|
// /api/insert-beat strips voice from ALL characters before responding —
|
||||||
|
// re-attach every voice the client already holds so audio keeps working.
|
||||||
|
data.characters = mergeCharactersPreserveVoice(req.session.characters, data.characters);
|
||||||
|
return data;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user