feat(tts): Xiaomi MiMo per-beat voice + MOCK_IMAGE testing aid (#3)

Adds an optional Xiaomi MiMo TTS layer on top of the scene/beat engine, plus a MOCK_IMAGE flag for cheap local TTS iteration.

- Per-character voice provisioning via MiMo voice design → clone; the reference audio is persisted in the session
- Per-line free-form delivery direction (the Director writes instructions in the style of "鼓起勇气又害羞，声音发颤"; they are sent to MiMo's director channel and never read aloud)
- Per-beat audio served with the scene response; the frontend plays it via a hidden <audio> element with the typewriter synced to the audio duration; the mute toggle is persisted via a localStorage lazy initializer
- Graceful degradation: if any TTS step fails, the beat is silent and the game continues
- MOCK_IMAGE=true returns a sharp-generated placeholder PNG so local TTS iteration doesn't burn image tokens
- Recommended config in .env.example: the MiMo Token Plan covers TEXT/VISION/TTS with one key (mimo-v2.5-pro for text, mimo-v2.5 omni for vision, mimo-v2.5-tts for TTS)

Squashed from #3:
- feat(tts): 小米 MiMo 逐 beat 配音 + 按 session 角色音色 + 自由文本配音指导
- feat(engine): MOCK_IMAGE 占位图便于本地测试
- fix(tts): address Copilot review on PR #3
- fix(tts): Copilot round-2 review feedback

Known limitation: Session.characters carries the full WAV reference audio (~200-300KB/character base64) and round-trips through every /api/scene, /api/vision, /api/insert-beat request. This is intrinsic to MiMo's design→clone model (the voice identity IS the audio; there is no server-side voiceId). Fixing it requires server-side storage, which is out of scope; documented for future hardening.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
2026-05-28 20:45:21 +08:00
parent d1f13d51a3
commit fcd4e6c1ab
18 changed files with 787 additions and 62 deletions
@@ -9,6 +9,8 @@ export type Beat = {
  narration?: string;
  speaker?: string;
  line?: string;
+  /** Free-form voice-acting direction for the line, sent to TTS only. Never displayed. */
+  lineDelivery?: string;
  next: BeatNext;
 };

@@ -54,6 +56,30 @@ export type SceneHistoryEntry = {
  exit?: SceneExit;
 };

+// ──────────────────────────────────────────────────────────────────────
+//  Characters & voices (TTS)
+// ──────────────────────────────────────────────────────────────────────
+
+export type CharacterVoice = {
+  provider: "xiaomi";
+  /** Xiaomi MiMo design output stored as reference audio for later clones. */
+  referenceAudioBase64: string;
+  mimeType: string;
+};
+
+export type Character = {
+  name: string;
+  /** Free-form voice design description; must begin with explicit gender. */
+  description: string;
+  voice?: CharacterVoice;
+};
+
+/** A single beat's synthesized audio, attached to the response. */
+export type BeatAudio = {
+  base64: string;
+  mime: string;
+};
+
 // ──────────────────────────────────────────────────────────────────────
 //  Session
 // ──────────────────────────────────────────────────────────────────────
@@ -64,6 +90,8 @@ export type Session = {
  worldSetting: string;
  styleGuide: string;
  history: SceneHistoryEntry[];
+  /** Character registry — accumulates across scenes; voices persist for reuse. */
+  characters: Character[];
 };

 // ──────────────────────────────────────────────────────────────────────
@@ -87,10 +115,21 @@ export type ProviderConfig = {
  model: string;
 };

+export type TtsConfig = {
+  baseUrl: string;
+  apiKey: string;
+  /** Base model name; adapter derives "-voicedesign" / "-voiceclone" suffixes. */
+  speechModel: string;
+};
+
 export type EngineConfig = {
  text: ProviderConfig;
  image: ProviderConfig;
  vision: ProviderConfig;
+  /** Optional — when missing the game runs silently (no TTS). */
+  tts?: TtsConfig;
+  /** When true the renderer returns a placeholder PNG instead of calling the image API. */
+  mockImage?: boolean;
 };

 // ──────────────────────────────────────────────────────────────────────
@@ -106,6 +145,10 @@ export type StartResponse = {
  sessionId: string;
  scene: Scene;
  imageBase64: string;
+  /** Post-voice character registry (with provisioned voices). */
+  characters: Character[];
+  /** Per-beat synthesized audio, keyed by beat.id. */
+  beatAudio?: Record<string, BeatAudio>;
 };

 // /api/scene — generates the next Scene, given session whose latest
@@ -118,6 +161,8 @@ export type SceneRequest = {
 export type SceneResponse = {
  scene: Scene;
  imageBase64: string;
+  characters: Character[];
+  beatAudio?: Record<string, BeatAudio>;
 };

 // /api/vision — interprets a background click on the current image and
@@ -141,10 +186,16 @@ export type InsertBeatRequest = {
  freeformAction: string;
 };

-export type InsertBeatResponse = {
-  partial: {
-    narration?: string;
-    speaker?: string;
-    line?: string;
-  };
+/** Partial beat fields produced by the insert-beat director. */
+export type InsertBeatPartial = {
+  narration?: string;
+  speaker?: string;
+  line?: string;
+  lineDelivery?: string;
+};
+
+export type InsertBeatResponse = {
+  partial: InsertBeatPartial;
+  characters: Character[];
+  audio?: BeatAudio;
 };