feat(tts): Xiaomi MiMo per-beat voice + MOCK_IMAGE testing aid (#3)

Adds an optional Xiaomi MiMo TTS layer on top of the scene/beat engine and a MOCK_IMAGE flag for cheap local TTS iteration.

- Per-character voice provisioning via MiMo voice design → clone, reference audio persisted in session
- Per-line free-form delivery direction (Director writes "鼓起勇气又害羞，声音发颤" style instructions; sent to MiMo's director channel, never read aloud)
- Per-beat audio served with the scene response; frontend plays via hidden <audio> with typewriter synced to audio duration; mute toggle persisted via localStorage lazy initializer
- Graceful degradation: any TTS step failing → silent beat, game continues
- MOCK_IMAGE=true returns a sharp-generated placeholder PNG so local TTS iteration doesn't burn image tokens
- Recommended config in .env.example: MiMo Token Plan covers TEXT/VISION/TTS with one key (mimo-v2.5-pro for text, mimo-v2.5 omni for vision, mimo-v2.5-tts for TTS)

Squashed from #3:
- feat(tts): 小米 MiMo 逐 beat 配音 + 按 session 角色音色 + 自由文本配音指导
- feat(engine): MOCK_IMAGE 占位图便于本地测试
- fix(tts): address Copilot review on PR #3
- fix(tts): Copilot round-2 review feedback

Known limitation: Session.characters carries the full WAV reference audio (~200-300KB/character base64) and round-trips through every /api/scene, /api/vision, /api/insert-beat request. This is intrinsic to MiMo's design→clone model (voice identity IS the audio, no server-side voiceId). Fixing requires server-side storage, which is out of scope; documented for future hardening.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
2026-05-28 20:45:21 +08:00
parent d1f13d51a3
commit fcd4e6c1ab
18 changed files with 787 additions and 62 deletions
@@ -13,6 +13,7 @@
  },
  "dependencies": {
    "@yume/ai-client": "workspace:*",
+    "@yume/tts-client": "workspace:*",
    "@yume/types": "workspace:*",
    "sharp": "^0.33.5"
  }
@@ -4,6 +4,8 @@ import type {
  BeatChoice,
  BeatChoiceEffect,
  BeatNext,
+  Character,
+  InsertBeatPartial,
  ProviderConfig,
  Scene,
  Session,
@@ -43,13 +45,20 @@ type RawBeat = {
  narration?: string;
  speaker?: string;
  line?: string;
+  lineDelivery?: string;
  next?: RawNext;
 };

+// One entry of the model's raw `characterUpdates` array, before validation.
+// Both fields are optional at this stage; coerceCharacterUpdates trims them
+// and drops any entry that ends up without a name or a description.
+type RawCharacterUpdate = {
+  name?: string;
+  description?: string;
+};
+
 type RawScene = {
  scenePrompt?: string;
  entryBeatId?: string;
  beats?: RawBeat[];
+  characterUpdates?: RawCharacterUpdate[];
 };

 function coerceEffect(raw: RawEffect | undefined): BeatChoiceEffect {
@@ -90,15 +99,28 @@ function coerceBeat(raw: RawBeat, idx: number, totalBeats: number): Beat {
  // last/dangling continue into a real scene-change exit so the player can
  // never get stuck self-looping on it.
  const fallback = idx + 1 < totalBeats ? `b${idx + 2}` : "";
+  const line = raw.line?.trim() || undefined;
  return {
    id,
    narration: raw.narration?.trim() || undefined,
    speaker: raw.speaker?.trim() || undefined,
-    line: raw.line?.trim() || undefined,
+    line,
+    // lineDelivery only meaningful when there is a line to deliver.
+    lineDelivery: line ? raw.lineDelivery?.trim() || undefined : undefined,
    next: coerceNext(raw.next, fallback),
  };
 }

+// Normalizes the model-supplied `characterUpdates` array into Character
+// entries (name + description, no voice yet).
+//
+// `raw` comes from loosely parsed model JSON, so its declared type is only a
+// hope: the value may not be an array, entries may be null or primitives, and
+// fields may be non-strings. Anything malformed is skipped rather than allowed
+// to throw — one bad entry must not fail the whole scene.
+//
+// Returns the entries whose trimmed name AND trimmed description are both
+// non-empty, in their original order. Returns [] when `raw` is not an array.
+function coerceCharacterUpdates(raw: RawCharacterUpdate[] | undefined): Character[] {
+  if (!Array.isArray(raw)) return [];
+
+  // Non-string values (numbers, objects, null, undefined) count as empty.
+  const clean = (value: unknown): string =>
+    typeof value === "string" ? value.trim() : "";
+
+  const updates: Character[] = [];
+  for (const entry of raw as unknown[]) {
+    // Skip `null`, strings, numbers… — only plain objects can carry fields.
+    if (typeof entry !== "object" || entry === null) continue;
+    const fields = entry as { name?: unknown; description?: unknown };
+    const name = clean(fields.name);
+    const description = clean(fields.description);
+    if (name && description) updates.push({ name, description });
+  }
+  return updates;
+}
+
 const FALLBACK_SEED = "故事继续推进";

 function fallbackExitChoice(beatId: string): BeatChoice {
@@ -230,10 +252,15 @@ function newSceneId(): string {
 //  Called both on real scene transitions AND on speculative prefetch.
 // ──────────────────────────────────────────────────────────────────────

+// What directScene resolves to: the generated scene, plus the validated
+// character entries taken from the model's `characterUpdates` output (an
+// empty array when the model supplied none that passed validation).
+export type SceneResult = {
+  scene: Scene;
+  characterUpdates: Character[];
+};
+
 export async function directScene(
  config: ProviderConfig,
  session: Session,
-): Promise<Scene> {
+): Promise<SceneResult> {
  const raw = await chat(
    config,
    [
@@ -264,10 +291,13 @@ export async function directScene(
      : beats[0]!.id;

  return {
-    id: newSceneId(),
-    scenePrompt: parsed.scenePrompt?.trim() || "an empty scene",
-    beats,
-    entryBeatId,
+    scene: {
+      id: newSceneId(),
+      scenePrompt: parsed.scenePrompt?.trim() || "an empty scene",
+      beats,
+      entryBeatId,
+    },
+    characterUpdates: coerceCharacterUpdates(parsed.characterUpdates),
  };
 }

@@ -280,7 +310,7 @@ export async function directInsertBeat(
  config: ProviderConfig,
  session: Session,
  freeformAction: string,
-): Promise<{ narration?: string; speaker?: string; line?: string }> {
+): Promise<InsertBeatPartial> {
  const raw = await chat(
    config,
    [
@@ -293,15 +323,12 @@ export async function directInsertBeat(
    { temperature: 0.9, responseFormat: "json_object" },
  );

-  const parsed = parseJsonLoose<{
-    narration?: string;
-    speaker?: string;
-    line?: string;
-  }>(raw);
+  const parsed = parseJsonLoose<InsertBeatPartial>(raw);

  const narration = parsed.narration?.trim() || undefined;
  const speaker = parsed.speaker?.trim() || undefined;
  const line = parsed.line?.trim() || undefined;
+  const lineDelivery = line ? parsed.lineDelivery?.trim() || undefined : undefined;

  // If the model returned nothing usable, supply a fallback narration so the
  // frontend doesn't append a silent empty beat that renders no dialogue —
@@ -309,5 +336,5 @@ export async function directInsertBeat(
  if (!narration && !speaker && !line) {
    return { narration: "（你停下脚步，环视片刻。）" };
  }
-  return { narration, speaker, line };
+  return { narration, speaker, line, lineDelivery };
 }
@@ -5,4 +5,7 @@ export {
  requestInsertBeat,
 } from "./orchestrator";
 export { annotateClick } from "./annotate";
+export { voiceBeat, voiceScene } from "./voice";
+export type { SceneResult } from "./director";
+export type { InsertBeatPartial } from "@yume/types";
 export * from "./prompts";
@@ -0,0 +1,25 @@
+import sharp from "sharp";
+
+// Memoized placeholder render. Holds the promise rather than the resolved
+// string, so callers that arrive while the first render is still in flight
+// share it instead of each rasterizing their own copy.
+let cached: Promise<string> | undefined;
+
+// Rasterizes the static placeholder SVG and returns the PNG as base64.
+async function renderMockImage(): Promise<string> {
+  const W = 1792;
+  const H = 1024;
+  const svg = `<svg xmlns="http://www.w3.org/2000/svg" width="${W}" height="${H}">
+    <rect width="${W}" height="${H}" fill="#161109"/>
+    <rect x="2" y="2" width="${W - 4}" height="${H - 4}" fill="none"
+          stroke="#5a4628" stroke-width="3" stroke-dasharray="14 10"/>
+    <text x="50%" y="45%" fill="#b88f4a" font-family="Georgia, serif"
+          font-size="72" letter-spacing="6" text-anchor="middle">MOCK IMAGE</text>
+    <text x="50%" y="53%" fill="#6e5430" font-family="Georgia, serif"
+          font-size="30" letter-spacing="3" text-anchor="middle">TTS TEST — image generation skipped</text>
+  </svg>`;
+
+  const png = await sharp(Buffer.from(svg)).png().toBuffer();
+  return png.toString("base64");
+}
+
+// A static 1792×1024 (7:4) placeholder used when MOCK_IMAGE=true, so we can
+// exercise the TTS path without paying for image generation.
+//
+// Returns the base64-encoded PNG. The render happens at most once per
+// successful generation: concurrent callers await the same promise, and later
+// callers get the already-resolved one. Rejects if rasterization fails.
+export async function mockImageBase64(): Promise<string> {
+  if (!cached) {
+    cached = renderMockImage().catch((err: unknown) => {
+      // Forget a failed render so the next call retries instead of replaying
+      // the same rejection forever.
+      cached = undefined;
+      throw err;
+    });
+  }
+  return cached;
+}
@@ -1,7 +1,10 @@
 import type {
+  BeatAudio,
+  Character,
  EngineConfig,
  InsertBeatRequest,
  InsertBeatResponse,
+  Scene,
  SceneRequest,
  SceneResponse,
  Session,
@@ -12,15 +15,55 @@ import type {
 } from "@yume/types";
 import { annotateClick } from "./annotate";
 import { directInsertBeat, directScene } from "./director";
+import { mockImageBase64 } from "./mockImage";
 import { render } from "./renderer";
 import { interpret } from "./vision";
+import { voiceBeat, voiceScene } from "./voice";

 function newSessionId(): string {
  return `s_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`;
 }

+// Folds `updates` into the character registry, keyed by name.
+//
+// - No updates: the `existing` array itself is returned, untouched.
+// - Unknown name: the update is appended after the existing entries.
+// - Known name: the update replaces the entry in place, but a voice already
+//   attached to that name is carried over — revising a description must never
+//   silently re-provision a voice the player has already heard.
+function mergeCharacters(existing: Character[], updates: Character[]): Character[] {
+  if (updates.length === 0) return existing;
+
+  const registry = new Map<string, Character>();
+  for (const known of existing) registry.set(known.name, known);
+
+  for (const incoming of updates) {
+    const heardVoice = registry.get(incoming.name)?.voice;
+    registry.set(
+      incoming.name,
+      heardVoice ? { ...incoming, voice: heardVoice } : incoming,
+    );
+  }
+
+  return [...registry.values()];
+}
+
+// Picks the image source for a scene: the memoized mock placeholder when
+// `config.mockImage` is set, otherwise the real renderer driven by
+// `config.image`. Resolves to the image string that callers return as
+// `imageBase64`.
+async function renderImage(
+  config: EngineConfig,
+  scene: Scene,
+  styleGuide: string,
+): Promise<string> {
+  return config.mockImage
+    ? mockImageBase64()
+    : render(config.image, scene, styleGuide);
+}
+
+// Voices a scene when TTS is configured.
+//
+// Without `config.tts`, nothing is synthesized and the session's characters
+// are echoed back unchanged. With it, the result carries the registry returned
+// by voiceScene, and `beatAudio` is present only when at least one beat
+// actually produced audio (an empty map is reported as undefined).
+async function runVoiceScene(
+  config: EngineConfig,
+  session: Session,
+  scene: Scene,
+): Promise<{
+  beatAudio?: Record<string, BeatAudio>;
+  characters: Character[];
+}> {
+  const { tts } = config;
+  if (!tts) return { characters: session.characters };
+
+  const { beatAudio, characters } = await voiceScene(tts, session, scene);
+  const hasAudio = Object.keys(beatAudio).length > 0;
+  return {
+    beatAudio: hasAudio ? beatAudio : undefined,
+    characters,
+  };
+}
+
 // ──────────────────────────────────────────────────────────────────────
-//  startSession — first scene + image
+//  startSession — first scene + image + per-beat voice
 // ──────────────────────────────────────────────────────────────────────

 export async function startSession(
@@ -33,31 +76,55 @@ export async function startSession(
    worldSetting: req.worldSetting.trim(),
    styleGuide: req.styleGuide.trim(),
    history: [],
+    characters: [],
  };

-  const scene = await directScene(config.text, session);
-  const imageBase64 = await render(config.image, scene, session.styleGuide);
+  const { scene, characterUpdates } = await directScene(config.text, session);
+  const preVoiceSession: Session = {
+    ...session,
+    characters: mergeCharacters(session.characters, characterUpdates),
+  };
+
+  const [imageBase64, voiceRes] = await Promise.all([
+    renderImage(config, scene, preVoiceSession.styleGuide),
+    runVoiceScene(config, preVoiceSession, scene),
+  ]);

  return {
    sessionId: session.id,
    scene,
    imageBase64,
+    characters: voiceRes.characters,
+    beatAudio: voiceRes.beatAudio,
  };
 }

 // ──────────────────────────────────────────────────────────────────────
-//  requestScene — generate the NEXT scene + image.
-//  Frontend passes a session whose latest history entry has `exit` set.
-//  Also used for prefetch speculation (frontend synthesizes the exit).
+//  requestScene — generate the NEXT scene + image + per-beat voice.
+//  Used both on real scene transitions and on speculative prefetch.
 // ──────────────────────────────────────────────────────────────────────

 export async function requestScene(
  config: EngineConfig,
  req: SceneRequest,
 ): Promise<SceneResponse> {
-  const scene = await directScene(config.text, req.session);
-  const imageBase64 = await render(config.image, scene, req.session.styleGuide);
-  return { scene, imageBase64 };
+  const { scene, characterUpdates } = await directScene(config.text, req.session);
+  const preVoiceSession: Session = {
+    ...req.session,
+    characters: mergeCharacters(req.session.characters, characterUpdates),
+  };
+
+  const [imageBase64, voiceRes] = await Promise.all([
+    renderImage(config, scene, preVoiceSession.styleGuide),
+    runVoiceScene(config, preVoiceSession, scene),
+  ]);
+
+  return {
+    scene,
+    imageBase64,
+    characters: voiceRes.characters,
+    beatAudio: voiceRes.beatAudio,
+  };
 }

 // ──────────────────────────────────────────────────────────────────────
@@ -75,6 +142,7 @@ export async function visionDecide(

 // ──────────────────────────────────────────────────────────────────────
 //  requestInsertBeat — generates a transient in-scene beat (no image regen)
+//  and voices the line if any.
 // ──────────────────────────────────────────────────────────────────────

 export async function requestInsertBeat(
@@ -86,5 +154,49 @@ export async function requestInsertBeat(
    req.session,
    req.freeformAction,
  );
-  return { partial };
+
+  // INSERT_BEAT prompt forbids new characters — but if the director violates
+  // it, voiceBeat's name-inferred fallback would silently provision and persist
+  // the hallucinated speaker. Strip the speaker attribution and promote the
+  // line into narration so the player still sees the text (the client only
+  // renders `line` when there is a `speaker`).
+  if (
+    partial.speaker &&
+    !req.session.characters.some((c) => c.name === partial.speaker)
+  ) {
+    console.warn(
+      `[insert-beat] unregistered speaker "${partial.speaker}" ignored`,
+    );
+    const promotedNarration =
+      [partial.narration, partial.line].filter(Boolean).join("\n") || undefined;
+    return {
+      partial: {
+        narration: promotedNarration,
+        speaker: undefined,
+        line: undefined,
+        lineDelivery: undefined,
+      },
+      characters: req.session.characters,
+    };
+  }
+
+  if (!config.tts) {
+    // Always echo characters so callers don't need a ?? fallback.
+    return { partial, characters: req.session.characters };
+  }
+
+  // Insert beats stay in-scene and (per the INSERT_BEAT prompt) reuse the
+  // registered cast, so we voice against the existing character set.
+  const voiceRes = await voiceBeat(
+    config.tts,
+    req.session,
+    req.session.characters,
+    partial,
+  );
+
+  return {
+    partial,
+    characters: voiceRes.characters,
+    audio: voiceRes.audio,
+  };
 }
@@ -4,11 +4,12 @@ import type { Scene, Session } from "@yume/types";
 //  Director — emits one Scene (background + a graph of beats) at a time.
 // ──────────────────────────────────────────────────────────────────────

-export const DIRECTOR_SYSTEM = `你是一个交互视觉小说的「场景导演」。每次基于世界观、画风、玩家历史，输出**一个完整的场景**。
+export const DIRECTOR_SYSTEM = `你是一个交互视觉小说的「场景导演」。每次基于世界观、画风、玩家历史、已登记角色，输出**一个完整的场景**，并为每句台词配上细腻的配音导演指令。

 一个场景包含：
 - 一张背景图（你给出英文 scenePrompt）
 - 一组对话节拍 beats，玩家会按顺序经历它们
+- 任何**首次登场**的角色，需在 characterUpdates 里登记其专属音色设计

 每个 beat 是玩家会看到的一段叙述 / 对话 / 选择。beat 之间通过 next 字段连接：
 - "continue": 玩家点击图片背景 / 按继续，自然推进到下一个 beat
@@ -30,27 +31,42 @@ choice 的 effect 有两种：
 - choice 至少 2 个，至多 4 个，互不重复

 文本风格约束：
- narration / line 用中文，scenePrompt 用英文
+- narration / line 用中文（**纯净可显示文本**，绝不要写 (叹气)(语速快) 这类标注 —— 那是给配音的，会被玩家看见）
+- scenePrompt / lineDelivery / characterUpdates 内的文字按下方专门说明
 - 单个 beat 的 narration 与 line 加起来 ≤80 字
 - 单个 choice label ≤15 字
- scenePrompt 只描述画面里看到什么，不要描述 UI
+- scenePrompt 用英文，只描述画面里看到什么，不要描述 UI
+
+配音相关字段：
+- 每个有 line 的 beat **必须**给出 lineDelivery —— 自由中文的"配音导演指令"，描述该句台词怎么念（情绪 / 语气 / 语速 / 气息 / 停顿 / 重音 / 音色起伏）。例："鼓起勇气又害羞，声音发颤、偏小，句尾带一丝气声，语速偏慢"。平淡场合写"平静自然、语速适中"即可，但要贴当下情境。
+- characterUpdates 仅当**有新角色首次出现**时列出该新角色的音色设计；已登记的角色不要重复列出。
+- characterUpdates[].description **必须以明确性别开头**（"女性，…" / "男性，…"），随后描述：年龄、音色质感、性格情绪基调、语速节奏、人设腔调、口音方言。例："女性，约17岁少女，音色清亮带点稚嫩甜美，性格开朗，语速偏快，标准普通话"。
+
+角色与台词的硬性规则（影响配音正确性）：
+- 任何 beat 的 speaker 字段一旦填了名字，**该名字必须**：① 在"已登记角色"列表中存在，或 ② 本次输出的 characterUpdates 里登记。绝不允许 speaker 是个未登记的陌生名字。
+- speaker 名字必须与登记名**完全一致**，不要加「（回忆）」「学姐」之类后缀或别名。

 必须输出严格 JSON，结构如下：
 {
  "scenePrompt": "english scene description, no UI",
  "entryBeatId": "b1",
+  "characterUpdates": [
+    { "name": "夏海", "description": "女性，约17岁少女，音色清亮带点稚嫩甜美…" }
+  ],
  "beats": [
    {
      "id": "b1",
-      "narration": "可空",
+      "narration": "可空（纯净文本）",
      "speaker": "可空",
-      "line": "可空",
+      "line": "可空（纯净文本）",
+      "lineDelivery": "line 非空时必填：配音导演指令",
      "next": { "type": "continue", "nextBeatId": "b2" }
    },
    {
      "id": "b2",
-      "speaker": "...",
-      "line": "...",
+      "speaker": "夏海",
+      "line": "学长，我有话想对你说。",
+      "lineDelivery": "鼓起勇气，但又有点害羞，语速偏慢，句尾微微上扬",
      "next": {
        "type": "choice",
        "choices": [
@@ -77,6 +93,13 @@ export function buildDirectorUserMessage(session: Session): string {
  parts.push(`世界观：${session.worldSetting}`);
  parts.push(`画风：${session.styleGuide}`);

+  if (session.characters.length > 0) {
+    parts.push("\n已登记角色（speaker 必须用这些名字之一，或在本次 characterUpdates 里登记新名）：");
+    for (const c of session.characters) {
+      parts.push(`- ${c.name}：${c.description}`);
+    }
+  }
+
  if (session.history.length === 0) {
    parts.push("\n这是故事的开场。请生成第一个场景，严格以 JSON 格式返回。");
    return parts.join("\n");
@@ -142,19 +165,22 @@ export function buildDirectorUserMessage(session: Session): string {
 export const INSERT_BEAT_SYSTEM = `你是视觉小说编剧。玩家在当前场景内做了一个**不会换场景的自由动作**（比如看一眼桌上的相框、想了想刚才那句话）。请基于此动作，写出一个**单独的、过渡性的 beat**：可以是旁白、角色台词、或两者结合。

 文本风格约束：
- narration / line 用中文
+- narration / line 用中文，**纯净可显示文本**，不要写 (叹气) 这类配音标注
 - narration 与 line 加起来 ≤80 字
 - 不要打破当前场景的物理状态（玩家仍在原地、对面仍是同一个角色）
 - 不要生成选项或下一步指引 —— 玩家点击会自然回到原 beat
+- 如果有 line，speaker 必须用**已登记角色**里的名字（绝不允许引入新角色）
+- 如果有 line，**必须**给出 lineDelivery（配音导演指令，自由中文，描述这句话怎么念）

 必须输出严格 JSON：
 {
  "narration": "...",
  "speaker": "...",
-  "line": "..."
+  "line": "...",
+  "lineDelivery": "..."
 }

-字段都可为空字符串。不要输出 JSON 以外的任何文本。`;
+narration/speaker/line/lineDelivery 都可为空字符串。不要输出 JSON 以外的任何文本。`;

 export function buildInsertBeatUserMessage(
  session: Session,
@@ -163,9 +189,16 @@ export function buildInsertBeatUserMessage(
  const parts: string[] = [];
  parts.push(`世界观：${session.worldSetting}`);

+  if (session.characters.length > 0) {
+    parts.push("\n已登记角色（speaker 只能用这些名字）：");
+    for (const c of session.characters) {
+      parts.push(`- ${c.name}`);
+    }
+  }
+
  const current = session.history.at(-1);
  if (current) {
-    parts.push(`当前场景：${current.scene.scenePrompt}`);
+    parts.push(`\n当前场景：${current.scene.scenePrompt}`);
    const lastBeatId = current.visitedBeatIds.at(-1) ?? current.scene.entryBeatId;
    const lastBeat = current.scene.beats.find((b) => b.id === lastBeatId);
    if (lastBeat) {
@@ -0,0 +1,106 @@
+import { provisionVoice, synthesize } from "@yume/tts-client";
+import type {
+  BeatAudio,
+  Character,
+  CharacterVoice,
+  Scene,
+  Session,
+  TtsConfig,
+} from "@yume/types";
+
+// Minimal beat shape that voiceBeat needs. Every field is optional, so both
+// full scene beats (passed by voiceScene) and insert-beat partials (passed by
+// requestInsertBeat) fit. voiceBeat reads `speaker`, `line` and
+// `lineDelivery`; it does not read `id`.
+export type BeatLike = {
+  id?: string;
+  speaker?: string;
+  line?: string;
+  lineDelivery?: string;
+};
+
+// Fallback voice-design prompt for a speaker that is missing from the
+// registry. It asks the voice designer to infer gender, age and temperament
+// from the name itself, with the world setting as context, so an unregistered
+// speaker never ends up borrowing another character's voice.
+function inferredSpeakerDescription(name: string, session: Session): string {
+  const { worldSetting } = session;
+  return `请根据角色名「${name}」推断其性别、年龄与气质，生成最贴合的音色。所属世界观：${worldSetting}`;
+}
+
+// Synthesizes audio for one beat, provisioning the speaker's voice on demand.
+//
+// The input `characters` array is never mutated; when a voice has to be
+// provisioned, a new array is built and returned.
+//
+// - No speaker or no line (e.g. narration-only beats): no audio, registry
+//   returned unchanged.
+// - Speaker registered with a voice: that voice is reused.
+// - Speaker registered without a voice: one is provisioned from the stored
+//   description and attached to that entry.
+// - Speaker not registered: a voice is provisioned from a name-inferred
+//   description and a new entry is appended.
+//
+// Never throws for TTS failures: any error is logged and the beat degrades to
+// silent, still returning whatever registry had been built by then.
+export async function voiceBeat(
+  cfg: TtsConfig,
+  session: Session,
+  characters: Character[],
+  beat: BeatLike,
+): Promise<{ audio?: BeatAudio; characters: Character[] }> {
+  const { speaker, line, lineDelivery } = beat;
+  if (!speaker || !line) return { characters };
+
+  // Declared outside the try so the catch can hand back a registry that
+  // already contains a just-provisioned voice. If synthesis fails after
+  // provisioning succeeded, dropping that voice would force the next beat for
+  // this speaker to pay for (and wait on) a second voice design.
+  let registry: Character[] = characters;
+
+  try {
+    const position = characters.findIndex((entry) => entry.name === speaker);
+    const known = position === -1 ? undefined : characters[position];
+    let voice: CharacterVoice | undefined = known?.voice;
+
+    if (!voice) {
+      if (known) {
+        // Registered but not yet voiced: design from the stored description.
+        const designed = await provisionVoice(cfg, known.description);
+        voice = designed;
+        registry = characters.map((entry, i) =>
+          i === position ? { ...entry, voice: designed } : entry,
+        );
+      } else {
+        // Unregistered speaker: design from the name and register the result.
+        const description = inferredSpeakerDescription(speaker, session);
+        voice = await provisionVoice(cfg, description);
+        registry = [...characters, { name: speaker, description, voice }];
+      }
+    }
+
+    const spoken = await synthesize(cfg, voice, line, lineDelivery);
+    return {
+      audio: { base64: spoken.audioBase64, mime: spoken.mimeType },
+      characters: registry,
+    };
+  } catch (err) {
+    const msg = err instanceof Error ? err.message : String(err);
+    console.error(`[voice] degraded: ${msg}`);
+    return { characters: registry };
+  }
+}
+
+// Voices all beats of a scene, one after another.
+//
+// Deliberately sequential: the registry returned by each voiceBeat call is fed
+// into the next, so a speaker who appears in several beats is provisioned once
+// and then reuses that voice. Running the beats in parallel would let two
+// beats for the same speaker both provision. The original author judged the
+// added latency to fit within the time image generation already takes.
+//
+// Returns the audio keyed by beat id (only beats that produced audio appear)
+// together with the final registry.
+export async function voiceScene(
+  cfg: TtsConfig,
+  session: Session,
+  scene: Scene,
+): Promise<{
+  beatAudio: Record<string, BeatAudio>;
+  characters: Character[];
+}> {
+  const beatAudio: Record<string, BeatAudio> = {};
+  let registry = session.characters;
+
+  for (const beat of scene.beats) {
+    const { audio, characters } = await voiceBeat(cfg, session, registry, beat);
+    registry = characters;
+    if (audio) beatAudio[beat.id] = audio;
+  }
+
+  return { beatAudio, characters: registry };
+}