From 1a6238f8b8132056cb7d23fc893af4d11cb392ea Mon Sep 17 00:00:00 2001
From: yuanzonghao
Date: Tue, 9 Jun 2026 14:24:27 +0800
Subject: [PATCH] fix(tts): harden StepFun provider integration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Validate voice.provider against known whitelist (xiaomi|stepfun) in
  beat-audio route to return a clear 400 instead of falling through
- Move single-char pronouns (他/她) to weak-signal fallback in detectGender
  so explicit gender keywords win, and ignore the 他 inside 其他/吉他 to
  avoid false positives on such compounds
- Update .env.example with StepFun configuration examples

Co-Authored-By: Claude Opus 4.6
---
NOTE(review): line breaks, indentation and blank context lines in this patch
were reconstructed from a whitespace-collapsed copy. Verify the context lines
against the target files before applying, or use `git apply --ignore-whitespace`.

 .env.example                | 18 ++++++++++++++----
 app/api/beat-audio/route.ts | 10 ++++++++--
 lib/tts-client/stepfun.ts   | 10 ++++++----
 3 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/.env.example b/.env.example
index 35ae802..bf96f47 100644
--- a/.env.example
+++ b/.env.example
@@ -66,10 +66,20 @@ VISION_MODEL=mimo-v2.5
 # google → VISION_BASE_URL=https://generativelanguage.googleapis.com VISION_MODEL=gemini-3.5-flash
 # VISION_PROVIDER=openai_compatible
 
-# ---- 4. TTS · Xiaomi MiMo (optional — leave blank to disable) ------
-# Per-character voice design → clone, with per-line delivery direction.
-# Voice identity = the reference audio kept in the session (no server expiry).
-# The adapter appends -voicedesign / -voiceclone to TTS_SPEECH_MODEL.
+# ---- 4. TTS (optional — leave blank to disable) --------------------
+# Provider is auto-detected from TTS_BASE_URL host:
+#   *stepfun.com → StepFun (preset voices, keyword-scored selection)
+#   otherwise    → Xiaomi MiMo (voicedesign + voiceclone)
+#
+# Xiaomi MiMo — per-character voice design → clone, with per-line delivery.
+# TTS_BASE_URL=https://token-plan-sgp.xiaomimimo.com/v1
+# TTS_API_KEY=tp-xxx
+# TTS_SPEECH_MODEL=mimo-v2.5-tts
+#
+# StepFun — 32 preset voices, auto-selected by gender + age + tone scoring.
+# TTS_BASE_URL=https://api.stepfun.com/v1
+# TTS_API_KEY=sk-xxx
+# TTS_SPEECH_MODEL=step-tts-mini # or step-tts-2 / stepaudio-2.5-tts
 TTS_BASE_URL=https://token-plan-sgp.xiaomimimo.com/v1
 TTS_API_KEY=tp-xxx
 TTS_SPEECH_MODEL=mimo-v2.5-tts
diff --git a/app/api/beat-audio/route.ts b/app/api/beat-audio/route.ts
index fe881f2..7ef2484 100644
--- a/app/api/beat-audio/route.ts
+++ b/app/api/beat-audio/route.ts
@@ -16,9 +16,15 @@ export async function POST(req: Request) {
   // Accept either provider's voice shape — xiaomi carries referenceAudioBase64,
   // stepfun carries voiceId. We only check the discriminator + the line text;
   // shape-specific validation lives in each provider's synth function.
-  if (!body.beat?.id || !body.beat?.line || !body.voice?.provider) {
+  const VALID_TTS_PROVIDERS = ["xiaomi", "stepfun"];
+  if (
+    !body.beat?.id ||
+    !body.beat?.line ||
+    !body.voice?.provider ||
+    !VALID_TTS_PROVIDERS.includes(body.voice.provider)
+  ) {
     return NextResponse.json(
-      { error: "beat.id, beat.line and voice.provider are required" },
+      { error: "beat.id, beat.line and voice.provider (xiaomi|stepfun) are required" },
       { status: 400 },
     );
   }
diff --git a/lib/tts-client/stepfun.ts b/lib/tts-client/stepfun.ts
index 123056d..dac5882 100644
--- a/lib/tts-client/stepfun.ts
+++ b/lib/tts-client/stepfun.ts
@@ -68,14 +68,16 @@ function hashStr(s: string): number {
 }
 
 function detectGender(desc: string): "male" | "female" {
-  // Female signals (broader cast — galgame skews toward female NPCs).
-  if (/女性|女声|少女|姐姐|妹妹|熟女|御姐|阿姨|奶奶|女孩|姑娘|大妈|女子|女生|女士|她|小姐/.test(desc)) {
+  if (/女性|女声|少女|姐姐|妹妹|熟女|御姐|阿姨|奶奶|女孩|姑娘|大妈|女子|女生|女士|小姐/.test(desc)) {
     return "female";
   }
-  if (/男性|男声|少年|青年|大叔|哥哥|弟弟|男人|男孩|大爷|爷爷|男子|男生|先生|他|公子|师傅/.test(desc)) {
+  if (/男性|男声|少年|青年|大叔|哥哥|弟弟|男人|男孩|大爷|爷爷|男子|男生|先生|公子|师傅/.test(desc)) {
     return "male";
   }
-  // No strong signal: default female (matches the catalog's center of mass).
+  // Weak signals: bare pronouns are consulted only when no keyword above matched.
+  // 其他 (other) and 吉他 (guitar) are stripped first so their 他 is not read as "he".
+  if (/她/.test(desc)) return "female";
+  if (/他/.test(desc.replace(/其他|吉他/g, ""))) return "male";
   return "female";
 }
 