feat(tts): Xiaomi MiMo per-beat voice + MOCK_IMAGE testing aid (#3)

Adds an optional Xiaomi MiMo TTS layer on top of the scene/beat engine, plus a MOCK_IMAGE flag for cheap local TTS iteration.

- Per-character voice provisioning via MiMo voice design → clone; the reference audio is persisted in the session
- Per-line free-form delivery direction (the Director writes instructions in the style of "鼓起勇气又害羞，声音发颤"; these are sent to MiMo's director channel and are never read aloud)
- Per-beat audio is served with the scene response; the frontend plays it via a hidden <audio> element with the typewriter synced to the audio duration; the mute toggle is persisted via a localStorage lazy initializer
- Graceful degradation: if any TTS step fails, the beat is silent and the game continues
- MOCK_IMAGE=true returns a sharp-generated placeholder PNG so local TTS iteration doesn't burn image tokens
- Recommended config in .env.example: the MiMo Token Plan covers TEXT/VISION/TTS with one key (mimo-v2.5-pro for text, mimo-v2.5 omni for vision, mimo-v2.5-tts for TTS)

Squashed from #3:
- feat(tts): 小米 MiMo 逐 beat 配音 + 按 session 角色音色 + 自由文本配音指导
- feat(engine): MOCK_IMAGE 占位图便于本地测试
- fix(tts): address Copilot review on PR #3
- fix(tts): Copilot round-2 review feedback

Known limitation: Session.characters carries the full WAV reference audio (~200-300KB per character, base64-encoded), and it round-trips through every /api/scene, /api/vision, and /api/insert-beat request. This is intrinsic to MiMo's design → clone model (the voice identity IS the audio; there is no server-side voiceId). Fixing it requires server-side storage, which is out of scope; it is documented here for future hardening.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
2026-05-28 20:45:21 +08:00
parent d1f13d51a3
commit fcd4e6c1ab
18 changed files with 787 additions and 62 deletions
@@ -4,6 +4,8 @@ import type {
  BeatChoice,
  BeatChoiceEffect,
  BeatNext,
+  Character,
+  InsertBeatPartial,
  ProviderConfig,
  Scene,
  Session,
@@ -43,13 +45,20 @@ type RawBeat = {
  narration?: string;
  speaker?: string;
  line?: string;
+  lineDelivery?: string;
  next?: RawNext;
 };

+type RawCharacterUpdate = {
+  name?: string;
+  description?: string;
+};
+
 type RawScene = {
  scenePrompt?: string;
  entryBeatId?: string;
  beats?: RawBeat[];
+  characterUpdates?: RawCharacterUpdate[];
 };

 function coerceEffect(raw: RawEffect | undefined): BeatChoiceEffect {
@@ -90,15 +99,28 @@ function coerceBeat(raw: RawBeat, idx: number, totalBeats: number): Beat {
  // last/dangling continue into a real scene-change exit so the player can
  // never get stuck self-looping on it.
  const fallback = idx + 1 < totalBeats ? `b${idx + 2}` : "";
+  const line = raw.line?.trim() || undefined;
  return {
    id,
    narration: raw.narration?.trim() || undefined,
    speaker: raw.speaker?.trim() || undefined,
-    line: raw.line?.trim() || undefined,
+    line,
+    // lineDelivery only meaningful when there is a line to deliver.
+    lineDelivery: line ? raw.lineDelivery?.trim() || undefined : undefined,
    next: coerceNext(raw.next, fallback),
  };
 }

+function coerceCharacterUpdates(raw: RawCharacterUpdate[] | undefined): Character[] {
+  if (!Array.isArray(raw)) return [];
+  return raw
+    .map((c) => ({
+      name: c.name?.trim() ?? "",
+      description: c.description?.trim() ?? "",
+    }))
+    .filter((c) => c.name && c.description);
+}
+
 const FALLBACK_SEED = "故事继续推进";

 function fallbackExitChoice(beatId: string): BeatChoice {
@@ -230,10 +252,15 @@ function newSceneId(): string {
 //  Called both on real scene transitions AND on speculative prefetch.
 // ──────────────────────────────────────────────────────────────────────

+export type SceneResult = {
+  scene: Scene;
+  characterUpdates: Character[];
+};
+
 export async function directScene(
  config: ProviderConfig,
  session: Session,
-): Promise<Scene> {
+): Promise<SceneResult> {
  const raw = await chat(
    config,
    [
@@ -264,10 +291,13 @@ export async function directScene(
      : beats[0]!.id;

  return {
-    id: newSceneId(),
-    scenePrompt: parsed.scenePrompt?.trim() || "an empty scene",
-    beats,
-    entryBeatId,
+    scene: {
+      id: newSceneId(),
+      scenePrompt: parsed.scenePrompt?.trim() || "an empty scene",
+      beats,
+      entryBeatId,
+    },
+    characterUpdates: coerceCharacterUpdates(parsed.characterUpdates),
  };
 }

@@ -280,7 +310,7 @@ export async function directInsertBeat(
  config: ProviderConfig,
  session: Session,
  freeformAction: string,
-): Promise<{ narration?: string; speaker?: string; line?: string }> {
+): Promise<InsertBeatPartial> {
  const raw = await chat(
    config,
    [
@@ -293,15 +323,12 @@ export async function directInsertBeat(
    { temperature: 0.9, responseFormat: "json_object" },
  );

-  const parsed = parseJsonLoose<{
-    narration?: string;
-    speaker?: string;
-    line?: string;
-  }>(raw);
+  const parsed = parseJsonLoose<InsertBeatPartial>(raw);

  const narration = parsed.narration?.trim() || undefined;
  const speaker = parsed.speaker?.trim() || undefined;
  const line = parsed.line?.trim() || undefined;
+  const lineDelivery = line ? parsed.lineDelivery?.trim() || undefined : undefined;

  // If the model returned nothing usable, supply a fallback narration so the
  // frontend doesn't append a silent empty beat that renders no dialogue —
@@ -309,5 +336,5 @@ export async function directInsertBeat(
  if (!narration && !speaker && !line) {
    return { narration: "（你停下脚步，环视片刻。）" };
  }
-  return { narration, speaker, line };
+  return { narration, speaker, line, lineDelivery };
 }