feat(tts): StepFun voice selection via CharacterDesigner + provider-aware beat-audio

Make homepage cards and live sessions produce sound when the server is configured for StepFun TTS, instead of silently failing (the prebaked Xiaomi voice was useless on a StepFun server, and wasted ~220KB/beat in Fast Origin Transfer). Three coordinated changes: 1. CharacterDesigner now picks a StepFun preset voice id directly from the 32-entry catalog in the SAME LLM call that designs the character — zero extra latency, LLM-grade match quality. The Xiaomi prompt path is byte-identical to history (verified programmatically) so cache hit rate and voice quality are preserved. pickStepfunVoiceId (keyword scorer) remains the fallback for orphan speakers / invalid LLM picks. 2. The 32-preset catalog moves to lib/tts-client/stepfun-voices.json as the single source of truth, shared by the scorer, the CharacterDesigner prompt, /api/tts-provider, and the offline enrich script. 3. A new GET /api/tts-provider endpoint lets the client probe the server's TTS provider at /play mount. fetchBeatAudio then shapes its request body: on a StepFun server it sends the lightweight stepfunVoiceId / voiceDescription and omits the ~220KB Xiaomi reference audio (FOT saving ~13MB per protagonist per session on prebaked cards). requestBeatAudio re-provisions on a provider mismatch before synth, so audio never goes silent on a cross-provider replay or mid-session provider flip. New type fields are all optional and backward-compatible: Character.stepfunVoiceId, BeatAudioRequest.voiceDescription/characterName/stepfunVoiceId, voice made optional. AGENTS.md updated for the new route, type fields, dependency map, and StepFun voice-selection flow.
2026-06-15 12:49:25 +08:00
parent da191dd7a2
commit ca73a41a0b
15 changed files with 754 additions and 90 deletions
@@ -0,0 +1,265 @@
+#!/usr/bin/env node
+/**
+ * Enrich the prebaked homepage first-act JSONs with a StepFun preset voice
+ * id for each character, so that when the server is configured for StepFun
+ * TTS, players entering via a homepage card (?card=...) still get a voice
+ * instead of silence.
+ *
+ * The prebaked JSONs ship with a Xiaomi voice (baked at build time). When the
+ * server runs StepFun, that Xiaomi voice is useless on the synth path AND
+ * costs ~220KB per beat-audio request in wasted Fast Origin Transfer. So we
+ * additionally write `characters[i].stepfunVoiceId`, picked by ONE LLM call
+ * per character using the same 32-preset catalog the CharacterDesigner uses.
+ *
+ * Idempotent: skips any character that already has a stepfunVoiceId.
+ * Pass --force to re-pick every character. --only=f0,f1 to filter.
+ * --portrait targets firstact-portrait/ instead of firstact/.
+ *
+ * Reads .env.local for TEXT_BASE_URL / TEXT_API_KEY / TEXT_MODEL (same env
+ * convention as scripts/generate-presets.mjs). Does NOT touch voice / imageUrl
+ * / scene / storyState — only appends stepfunVoiceId.
+ *
+ * Usage:
+ *   node scripts/enrich-firstacts-stepfun.mjs
+ *   node scripts/enrich-firstacts-stepfun.mjs --force
+ *   node scripts/enrich-firstacts-stepfun.mjs --only=f0,f1 --portrait
+ */
+
+import { fileURLToPath } from "node:url";
+import { dirname, resolve } from "node:path";
+import { existsSync, readFileSync, writeFileSync, readdirSync, statSync } from "node:fs";
+
+const __dirname = dirname(fileURLToPath(import.meta.url));
+const WEB_ROOT = resolve(__dirname, "..");
+const ENV_FILE = resolve(WEB_ROOT, ".env.local");
+const OUT_DIR_NAME = process.argv.includes("--portrait")
+  ? "firstact-portrait"
+  : "firstact";
+const OUT_DIR = resolve(WEB_ROOT, "public", "home", OUT_DIR_NAME);
+const CATALOG_FILE = resolve(WEB_ROOT, "lib", "tts-client", "stepfun-voices.json");
+
+const FORCE = process.argv.includes("--force");
+const ONLY_ARG = process.argv.find((a) => a.startsWith("--only="));
+const ONLY = ONLY_ARG ? ONLY_ARG.split("=")[1].split(",") : null;
+const CONCURRENCY = 4;
+const MAX_ATTEMPTS = 4;
+
+// ── env ───────────────────────────────────────────────────────────────
+function loadEnv(path) {
+  if (!existsSync(path)) return {};
+  const txt = readFileSync(path, "utf8");
+  const env = {};
+  for (const raw of txt.split(/\r?\n/)) {
+    const line = raw.trim();
+    if (!line || line.startsWith("#")) continue;
+    const eq = line.indexOf("=");
+    if (eq < 0) continue;
+    const k = line.slice(0, eq).trim();
+    let v = line.slice(eq + 1).trim();
+    if ((v.startsWith('"') && v.endsWith('"')) || (v.startsWith("'") && v.endsWith("'"))) {
+      v = v.slice(1, -1);
+    }
+    env[k] = v;
+  }
+  return env;
+}
+
+const env = loadEnv(ENV_FILE);
+const BASE_URL = env.TEXT_BASE_URL;
+const API_KEY = env.TEXT_API_KEY;
+const MODEL = env.TEXT_MODEL;
+if (!BASE_URL || !API_KEY || !MODEL) {
+  console.error(`Missing TEXT_BASE_URL / TEXT_API_KEY / TEXT_MODEL in ${ENV_FILE}`);
+  process.exit(2);
+}
+
+// ── catalog ───────────────────────────────────────────────────────────
+const CATALOG = JSON.parse(readFileSync(CATALOG_FILE, "utf8"));
+const VALID_IDS = new Set(CATALOG.map((v) => v.id));
+const CATALOG_TEXT = CATALOG.map(
+  (v) => `- ${v.id}：${v.desc}（${v.gender}/${v.age}）`,
+).join("\n");
+
+// ── LLM pick ──────────────────────────────────────────────────────────
+const SYSTEM_PROMPT = `你是一个 TTS 音色匹配助手。给你一个角色的名字、世界观背景、以及中文音色设定描述，你要从下面的 StepFun 预设音色清单里挑选「最贴合该角色」的一个预设 id。
+
+挑选原则：
+- 性别必须一致（男声只能选 male 行，女声只能选 female 行）。
+- 年龄段尽量一致；拿不准时优先气质匹配。
+- id 必须**原样复制**清单里的某个值（拼写、大小写、连字符都不能变），不允许编造清单外的 id。
+
+StepFun 预设音色清单：
+${CATALOG_TEXT}
+
+只输出一个 JSON 对象，不要输出任何其它文本：
+{ "stepfunVoiceId": "清单内某个 id" }`;
+
+function buildUser(charName, voiceDescription, worldSetting) {
+  return [
+    `角色名：${charName}`,
+    `世界观：${worldSetting}`,
+    `音色设定描述：${voiceDescription}`,
+    "",
+    "请挑选最贴合该角色的 StepFun 预设 id，严格以 JSON 返回。",
+  ].join("\n");
+}
+
+// Cheap permissive JSON extraction — the model sometimes wraps in ```json
+// fences or adds stray prose. Mirrors parseJsonLoose's first-cut logic
+// without pulling in the TS engine code.
+function extractJson(raw) {
+  const trimmed = raw.trim();
+  // Strip ```json ... ``` fences if present.
+  const fence = trimmed.match(/```(?:json)?\s*([\s\S]*?)```/i);
+  const candidate = (fence ? fence[1] : trimmed).trim();
+  // Try direct parse first.
+  try {
+    return JSON.parse(candidate);
+  } catch {
+    // Fall back to the first {...} slice.
+    const start = candidate.indexOf("{");
+    const end = candidate.lastIndexOf("}");
+    if (start >= 0 && end > start) {
+      try {
+        return JSON.parse(candidate.slice(start, end + 1));
+      } catch {
+        // give up below
+      }
+    }
+  }
+  return null;
+}
+
+const sleep = (ms) => new Promise((r) => setTimeout(r, ms));
+
+async function pickVoiceId(charName, voiceDescription, worldSetting) {
+  const url = BASE_URL.replace(/\/$/, "") + "/chat/completions";
+  let lastErr = "";
+  for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
+    try {
+      const res = await fetch(url, {
+        method: "POST",
+        headers: {
+          "Content-Type": "application/json",
+          Authorization: `Bearer ${API_KEY}`,
+        },
+        body: JSON.stringify({
+          model: MODEL,
+          messages: [
+            { role: "system", content: SYSTEM_PROMPT },
+            { role: "user", content: buildUser(charName, voiceDescription, worldSetting) },
+          ],
+          temperature: 0.3,
+        }),
+      });
+      if (!res.ok) {
+        lastErr = `HTTP ${res.status}: ${(await res.text()).slice(0, 200)}`;
+      } else {
+        const data = await res.json();
+        const raw = data?.choices?.[0]?.message?.content ?? "";
+        const parsed = extractJson(raw);
+        const id = parsed?.stepfunVoiceId;
+        if (typeof id === "string" && VALID_IDS.has(id)) {
+          return id;
+        }
+        lastErr = `invalid id in response: ${JSON.stringify(id ?? parsed).slice(0, 120)}`;
+      }
+    } catch (e) {
+      lastErr = e instanceof Error ? e.message : String(e);
+    }
+    if (attempt < MAX_ATTEMPTS) {
+      const delay = 2 ** attempt * 2000;
+      console.warn(`    [retry ${attempt}/${MAX_ATTEMPTS}] ${lastErr} — waiting ${delay}ms`);
+      await sleep(delay);
+    }
+  }
+  throw new Error(lastErr || "unknown error");
+}
+
+// ── main ──────────────────────────────────────────────────────────────
+function listCards() {
+  const all = readdirSync(OUT_DIR)
+    .filter((f) => f.endsWith(".json"))
+    .map((f) => f.replace(/\.json$/, ""));
+  if (ONLY) {
+    const keep = new Set(ONLY);
+    return all.filter((f) => keep.has(f));
+  }
+  return all.sort();
+}
+
+async function enrichOne(cardName) {
+  const file = resolve(OUT_DIR, `${cardName}.json`);
+  const data = JSON.parse(readFileSync(file, "utf8"));
+  const worldSetting = data.worldSetting ?? "";
+  const characters = Array.isArray(data.characters) ? data.characters : [];
+  let changed = false;
+  for (const c of characters) {
+    if (!c.voiceDescription) {
+      console.warn(`    [skip] ${c.name}: no voiceDescription`);
+      continue;
+    }
+    if (!FORCE && typeof c.stepfunVoiceId === "string" && c.stepfunVoiceId) {
+      continue; // already enriched
+    }
+    const id = await pickVoiceId(c.name, c.voiceDescription, worldSetting);
+    c.stepfunVoiceId = id;
+    changed = true;
+    console.log(`    ${c.name} → ${id}`);
+  }
+  if (changed) {
+    writeFileSync(file, JSON.stringify(data));
+  }
+  return changed;
+}
+
+async function worker(queue, counters) {
+  while (queue.length > 0) {
+    const name = queue.shift();
+    if (!name) return;
+    try {
+      const changed = await enrichOne(name);
+      if (changed) {
+        counters.enriched++;
+        console.log(`[${counters.done}/${counters.total}] ${name} enriched`);
+      } else {
+        counters.skipped++;
+        console.log(`[${counters.done}/${counters.total}] ${name} skip (no change)`);
+      }
+    } catch (e) {
+      counters.failed++;
+      const msg = e instanceof Error ? e.message : String(e);
+      console.error(`[${counters.done}/${counters.total}] ${name} FAIL: ${msg}`);
+    }
+    counters.done++;
+    await sleep(1000); // be nice to rate limits between cards
+  }
+}
+
+async function main() {
+  if (!existsSync(OUT_DIR)) {
+    console.error(`Output dir not found: ${OUT_DIR}`);
+    process.exit(2);
+  }
+  const cards = listCards();
+  if (cards.length === 0) {
+    console.log(`No cards found in ${OUT_DIR}`);
+    return;
+  }
+  console.log(
+    `[enrich] ${cards.length} cards in ${OUT_DIR} | force=${FORCE} | only=${ONLY ? ONLY.join(",") : "all"} | concurrency=${CONCURRENCY}`,
+  );
+
+  const counters = { done: 0, total: cards.length, enriched: 0, skipped: 0, failed: 0 };
+  const queue = [...cards];
+  const t0 = Date.now();
+  await Promise.all(
+    Array.from({ length: CONCURRENCY }, () => worker(queue, counters)),
+  );
+  console.log(
+    `\n[enrich] done in ${Math.round((Date.now() - t0) / 1000)}s — enriched ${counters.enriched}, skipped ${counters.skipped}, failed ${counters.failed}`,
+  );
+  process.exit(counters.failed ? 1 : 0);
+}
+
+main();