feat(tts): Xiaomi MiMo per-beat voice + MOCK_IMAGE testing aid (#3)

Adds an optional Xiaomi MiMo TTS layer on top of the scene/beat engine and a MOCK_IMAGE flag for cheap local TTS iteration.

- Per-character voice provisioning via MiMo voice design → clone, reference audio persisted in session
- Per-line free-form delivery direction (Director writes "鼓起勇气又害羞，声音发颤" style instructions; sent to MiMo's director channel, never read aloud)
- Per-beat audio served with the scene response; frontend plays via hidden <audio> with typewriter synced to audio duration; mute toggle persisted in localStorage (read back via a useState lazy initializer)
- Graceful degradation: any TTS step failing → silent beat, game continues
- MOCK_IMAGE=true returns a sharp-generated placeholder PNG so local TTS iteration doesn't burn image tokens
- Recommended config in .env.example: MiMo Token Plan covers TEXT/VISION/TTS with one key (mimo-v2.5-pro for text, mimo-v2.5 omni for vision, mimo-v2.5-tts for TTS)

Squashed from #3:
- feat(tts): 小米 MiMo 逐 beat 配音 + 按 session 角色音色 + 自由文本配音指导
- feat(engine): MOCK_IMAGE 占位图便于本地测试
- fix(tts): address Copilot review on PR #3
- fix(tts): Copilot round-2 review feedback

Known limitation: Session.characters carries the full WAV reference audio (~200-300KB/character base64) and round-trips through every /api/scene, /api/vision, /api/insert-beat request. This is intrinsic to MiMo's design→clone model (voice identity IS the audio, no server-side voiceId). Fixing requires server-side storage which is out of scope; documented for future hardening.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
2026-05-28 20:45:21 +08:00
parent d1f13d51a3
commit fcd4e6c1ab
18 changed files with 787 additions and 62 deletions
@@ -1,24 +1,45 @@
 # =============================================================
 # 云梦 — AI 视觉小说
-# Three independently configurable AI providers
-# Any OpenAI-compatible endpoint works (OpenRouter, OpenAI,
-# Anthropic via OpenAI-compat proxy, Gemini, DeepSeek, Ollama).
+# Recommended setup: Xiaomi MiMo Token Plan for TEXT / VISION / TTS
+# (one API key covers all three) + any image provider for IMAGE.
 #
+# Any OpenAI-compatible endpoint works for any slot — OpenRouter,
+# OpenAI, Anthropic via OpenAI-compat proxy, Gemini, DeepSeek, etc.
 # Image generation uses the chat-completions + modalities API
 # (OpenRouter-style), NOT the legacy /images/generations endpoint.
 # =============================================================

-# ---- 1. Text LLM (story director) -----------------------------
-TEXT_BASE_URL=https://openrouter.ai/api/v1
-TEXT_API_KEY=sk-or-v1-xxx
-TEXT_MODEL=~anthropic/claude-sonnet-latest
+# ---- 1. Text LLM · scene director ----------------------------------
+# Recommended: MiMo V2.5 Pro (1M context, native JSON-mode, strong CN)
+# Token Plan host: https://token-plan-sgp.xiaomimimo.com/v1
+# Pay-as-you-go host: https://api.xiaomimimo.com/v1 (sk- keys)
+TEXT_BASE_URL=https://token-plan-sgp.xiaomimimo.com/v1
+TEXT_API_KEY=tp-xxx
+TEXT_MODEL=mimo-v2.5-pro

-# ---- 2. Image generator (renders the whole UI screen) ---------
+# ---- 2. Image generator (renders the scene background) -------------
+# Any provider supporting chat-completions + modalities image output.
 IMAGE_BASE_URL=https://openrouter.ai/api/v1
 IMAGE_API_KEY=sk-or-v1-xxx
 IMAGE_MODEL=openai/gpt-5.4-image-2

-# ---- 3. Vision model (interprets where the user clicked) ------
-VISION_BASE_URL=https://openrouter.ai/api/v1
-VISION_API_KEY=sk-or-v1-xxx
-VISION_MODEL=~google/gemini-flash-latest
+# ---- 3. Vision model · multimodal click interpretation -------------
+# Recommended: MiMo V2.5 omni — multimodal.
+# ⚠️ DO NOT use mimo-v2.5-pro for this slot — Pro is text-only and
+#    rejects image_url content parts.
+VISION_BASE_URL=https://token-plan-sgp.xiaomimimo.com/v1
+VISION_API_KEY=tp-xxx
+VISION_MODEL=mimo-v2.5
+
+# ---- 4. TTS · Xiaomi MiMo (optional — leave blank to disable) ------
+# Per-character voice design → clone, with per-line delivery direction.
+# Voice identity = the reference audio kept in the session (no server expiry).
+# The adapter appends -voicedesign / -voiceclone to TTS_SPEECH_MODEL.
+TTS_BASE_URL=https://token-plan-sgp.xiaomimimo.com/v1
+TTS_API_KEY=tp-xxx
+TTS_SPEECH_MODEL=mimo-v2.5-tts
+
+# ---- 5. MOCK_IMAGE — skip image generation (cheap TTS testing) -----
+# true (exact lowercase string) → return a placeholder image instead of calling the image model.
+# Text/story/voice still run normally. Great for iterating on TTS.
+MOCK_IMAGE=false
@@ -14,6 +14,7 @@ import { PlayCanvas, type Phase } from "@/components/PlayCanvas";
 import { PRESETS } from "@/lib/presets";
 import type {
  Beat,
+  BeatAudio,
  BeatChoice,
  InsertBeatResponse,
  Scene,
@@ -24,6 +25,8 @@ import type {
  VisionResponse,
 } from "@yume/types";

+const MUTED_STORAGE_KEY = "yume:muted";
+
 // ──────────────────────────────────────────────────────────────────────
 //  Prefetch pool — speculative SceneResponses keyed by choice path.
 //
@@ -133,7 +136,16 @@ function prefetchScenePath(
            nextSceneSeed: sole.effect.nextSceneSeed,
          },
        };
-        prefetchScenePath(pool, baseSession, [...steps, nextStep], depth + 1);
+        // Carry forward the character registry (data.characters) that the parent
+        // prefetch result already settled; it may include characters introduced
+        // by the intermediate scene. Without this, prefetches at depth 2 and
+        // deeper start from the original baseSession.characters, and a later
+        // transition through this survivor would silently drop voices the player has already heard.
+        const carriedBase: Session = {
+          ...baseSession,
+          characters: data.characters,
+        };
+        prefetchScenePath(pool, carriedBase, [...steps, nextStep], depth + 1);
      }
    }

@@ -181,6 +193,18 @@ function PlayInner() {
  const [currentScene, setCurrentScene] = useState<Scene | null>(null);
  const [currentBeatId, setCurrentBeatId] = useState<string | null>(null);
  const [imageBase64, setImageBase64] = useState<string | null>(null);
+  const [beatAudioMap, setBeatAudioMap] = useState<Record<string, BeatAudio>>({});
+  // Lazy-initialize from localStorage so PlayCanvas never mounts with the
+  // wrong muted value (an effect-based read would briefly let audio play
+  // before the preference settled in a scenario where audio arrives early).
+  const [muted, setMuted] = useState<boolean>(() => {
+    if (typeof window === "undefined") return false;
+    try {
+      return window.localStorage.getItem(MUTED_STORAGE_KEY) === "1";
+    } catch {
+      return false;
+    }
+  });
  const [pendingClick, setPendingClick] = useState<{
    x: number;
    y: number;
@@ -203,6 +227,10 @@ function PlayInner() {
    return currentScene.beats.find((b) => b.id === currentBeatId) ?? null;
  }, [currentScene, currentBeatId]);

+  const currentBeatAudio = currentBeat ? beatAudioMap[currentBeat.id] : undefined;
+  const audioBase64 = currentBeatAudio?.base64 ?? null;
+  const audioMime = currentBeatAudio?.mime ?? null;
+
  useEffect(() => {
    sessionRef.current = session;
  }, [session]);
@@ -231,6 +259,19 @@ function PlayInner() {
    });
  }, [currentBeatId]);

+  // ── Mute persistence (read is via the useState lazy initializer above) ─
+  const toggleMuted = useCallback(() => {
+    setMuted((prev) => {
+      const next = !prev;
+      try {
+        window.localStorage.setItem(MUTED_STORAGE_KEY, next ? "1" : "0");
+      } catch {
+        // ignore
+      }
+      return next;
+    });
+  }, []);
+
  // ── Presentation mode toggle ─────────────────────────────────────────
  const togglePresentation = useCallback(async () => {
    const entering = !presentation;
@@ -327,12 +368,14 @@ function PlayInner() {
              visitedBeatIds: [data.scene.entryBeatId],
            },
          ],
+          characters: data.characters,
        };
        visitedBeatsRef.current = [data.scene.entryBeatId];
        setSession(initial);
        setCurrentScene(data.scene);
        setCurrentBeatId(data.scene.entryBeatId);
        setImageBase64(data.imageBase64);
+        setBeatAudioMap(data.beatAudio ?? {});
        setPhase("ready");
      })
      .catch((e) => setError(String(e)));
@@ -409,12 +452,14 @@ function PlayInner() {
            visitedBeatIds: [result.scene.entryBeatId],
          },
        ],
+        characters: result.characters,
      };
      visitedBeatsRef.current = [result.scene.entryBeatId];
      setSession(newSession);
      setCurrentScene(result.scene);
      setCurrentBeatId(result.scene.entryBeatId);
      setImageBase64(result.imageBase64);
+      setBeatAudioMap(result.beatAudio ?? {});
      setLastExitLabel(exitLabel);
      setPhase("ready");
    } catch (e) {
@@ -514,7 +559,8 @@ function PlayInner() {
          };
          throw new Error(j.error ?? insertRes.statusText);
        }
-        const { partial } = (await insertRes.json()) as InsertBeatResponse;
+        const { partial, characters: insertChars, audio } =
+          (await insertRes.json()) as InsertBeatResponse;

        const fromBeatId =
          currentBeatRef.current?.id ?? currentScene.entryBeatId;
@@ -526,6 +572,7 @@ function PlayInner() {
          narration: partial.narration,
          speaker: partial.speaker,
          line: partial.line,
+          lineDelivery: partial.lineDelivery,
          next: { type: "continue", nextBeatId: fromBeatId },
        };

@@ -541,11 +588,15 @@ function PlayInner() {
                history: s.history.map((h, i, arr) =>
                  i === arr.length - 1 ? { ...h, scene: patched } : h,
                ),
+                characters: insertChars,
              }
            : s,
        );
        setCurrentScene(patched);
        setCurrentBeatId(newBeatId);
+        if (audio) {
+          setBeatAudioMap((m) => ({ ...m, [newBeatId]: audio }));
+        }
        setLastExitLabel(decision.intent.freeformAction);
        setPhase("ready");
        setPendingClick(null);
@@ -627,6 +678,9 @@ function PlayInner() {
      <div className="fixed inset-0 bg-black flex items-center justify-center z-50">
        <PlayCanvas
          imageBase64={imageBase64}
+          audioBase64={audioBase64}
+          audioMime={audioMime}
+          muted={muted}
          phase={phase}
          beat={currentBeat}
          pendingClick={pendingClick}
@@ -666,6 +720,9 @@ function PlayInner() {
      <main className="flex-1 flex flex-col items-center justify-center px-4 md:px-8 py-6 md:py-10">
        <PlayCanvas
          imageBase64={imageBase64}
+          audioBase64={audioBase64}
+          audioMime={audioMime}
+          muted={muted}
          phase={phase}
          beat={currentBeat}
          pendingClick={pendingClick}
@@ -700,7 +757,17 @@ function PlayInner() {
          F · 演 · 示
        </button>
        <div className="text-[9px] smallcaps text-clay-400 num">Ⅰ · Ⅰ</div>
-        <span className="text-[9px] w-[60px]" aria-hidden />
+        <button
+          type="button"
+          onClick={toggleMuted}
+          className="text-[9px] smallcaps text-clay-400 hover:text-clay-700 transition-colors flex items-center gap-2 w-[80px] justify-end"
+          aria-label={muted ? "取消静音" : "静音"}
+        >
+          <i
+            className={`fa-solid ${muted ? "fa-volume-xmark" : "fa-volume-high"} text-[10px]`}
+          />
+          {muted ? "静 · 音" : "有 · 声"}
+        </button>
      </footer>
    </div>
  );
@@ -13,30 +13,66 @@ export type Phase =
 const SHADOW =
  "0 1px 0 rgba(45,24,16,0.05), 0 36px 64px -28px rgba(45,24,16,0.25), 0 8px 18px -6px rgba(45,24,16,0.10)";

+const DEFAULT_CHAR_MS = 28;
+const MIN_CHAR_MS = 30;
+// Voice playback speed multiplier. >1 speeds up the (somewhat slow) MiMo voice
+// while preserving pitch. Typewriter pacing is divided by the same factor.
+const SPEECH_RATE = 1.2;
+// If audio metadata never arrives within this window, give up waiting and
+// let the typewriter run at default speed.
+const AUDIO_WAIT_TIMEOUT_MS = 2500;
+
 // ── Typewriter hook ────────────────────────────────────────────────────
 // Returns the progressively-revealed text, a `done` flag, and a `skip()` that
 // instantly completes the current text. Reset is keyed by `resetKey` (the beat
 // id) rather than the text, so a new beat whose line happens to match the
-// previous one still replays from scratch. `done` is derived synchronously
-// (not from a post-paint effect) so a stale "done" frame never paints.
+// previous one still replays from scratch.
+//
+// When `targetDurationMs` is positive we space characters to span that audio
+// duration (never faster than MIN_CHAR_MS per character), keeping text and voice
+// in lockstep; 0 falls back to DEFAULT_CHAR_MS. While `waitForAudio` is true and
+// the duration is still undefined, the typewriter holds (so text doesn't race ahead of an audio that's still loading).
 function useTypewriter(
  text: string,
  resetKey: string,
-  speed = 28,
+  opts: { targetDurationMs?: number; waitForAudio: boolean } = {
+    waitForAudio: false,
+  },
 ): { shown: string; done: boolean; skip: () => void } {
+  const { targetDurationMs, waitForAudio } = opts;
  const [displayed, setDisplayed] = useState("");
  const [prevKey, setPrevKey] = useState(resetKey);
  const timer = useRef<ReturnType<typeof setInterval> | null>(null);
+  // Sticky once the player has skipped this beat: prevents a late-arriving
+  // audio metadata event from re-triggering the effect and replaying the text.
+  const skippedRef = useRef(false);

  // Render-phase reset (React "adjust state on prop change" pattern): when the
  // beat changes, drop the old progress before this render commits.
  if (resetKey !== prevKey) {
    setPrevKey(resetKey);
    setDisplayed("");
+    skippedRef.current = false;
  }

  useEffect(() => {
    if (!text) return;
+    // `=== undefined` (not `!targetDurationMs`): 0 means "audio failed or
+    // timed out — run at default speed". A truthy check would treat that 0 as
+    // "still waiting" and stall the typewriter forever on those fallback paths.
+    if (waitForAudio && targetDurationMs === undefined) return;
+    // If the player skipped, settle on the full text and don't restart even
+    // when audio metadata arrives late and re-triggers this effect.
+    if (skippedRef.current) {
+      setDisplayed(text);
+      return;
+    }
+
+    const speed =
+      targetDurationMs && text.length > 0
+        ? Math.max(MIN_CHAR_MS, targetDurationMs / text.length)
+        : DEFAULT_CHAR_MS;
+
    let i = 0;
    timer.current = setInterval(() => {
      i += 1;
@@ -50,13 +86,14 @@ function useTypewriter(
      if (timer.current) clearInterval(timer.current);
      timer.current = null;
    };
-  }, [resetKey, text, speed]);
+  }, [resetKey, text, targetDurationMs, waitForAudio]);

  const skip = useCallback(() => {
    if (timer.current) {
      clearInterval(timer.current);
      timer.current = null;
    }
+    skippedRef.current = true;
    setDisplayed(text);
  }, [text]);

@@ -123,6 +160,9 @@ function ChoiceButton({
 // ── Main component ─────────────────────────────────────────────────────
 export function PlayCanvas({
  imageBase64,
+  audioBase64,
+  audioMime,
+  muted,
  phase,
  beat,
  pendingClick,
@@ -132,6 +172,9 @@ export function PlayCanvas({
  fullViewport = false,
 }: {
  imageBase64: string | null;
+  audioBase64: string | null;
+  audioMime: string | null;
+  muted: boolean;
  phase: Phase;
  beat: Beat | null;
  pendingClick: { x: number; y: number } | null;
@@ -141,7 +184,11 @@ export function PlayCanvas({
  fullViewport?: boolean;
 }) {
  const imgRef = useRef<HTMLImageElement>(null);
+  const audioRef = useRef<HTMLAudioElement>(null);
  const [dims, setDims] = useState<{ w: number; h: number } | null>(null);
+  const [audioDurationMs, setAudioDurationMs] = useState<number | undefined>(
+    undefined,
+  );

  const isChoiceBeat = beat?.next.type === "choice";
  const choices: BeatChoice[] = isChoiceBeat
@@ -150,7 +197,56 @@ export function PlayCanvas({

  const displayBody = beat?.speaker ? beat.line ?? "" : beat?.narration ?? "";
  const { shown: typedBody, done: typingDone, skip: skipTypewriter } =
-    useTypewriter(displayBody, beat?.id ?? "", 30);
+    useTypewriter(displayBody, beat?.id ?? "", {
+      targetDurationMs: audioDurationMs,
+      waitForAudio: Boolean(audioBase64),
+    });
+
+  // ── Audio source change ──────────────────────────────────────────────
+  // Reset duration when audio source changes; if loading takes too long,
+  // unblock the typewriter via timeout so text doesn't stall.
+  useEffect(() => {
+    setAudioDurationMs(undefined);
+    if (!audioBase64) return;
+    const timer = setTimeout(() => {
+      setAudioDurationMs((prev) => prev ?? 0);
+    }, AUDIO_WAIT_TIMEOUT_MS);
+    return () => clearTimeout(timer);
+  }, [audioBase64]);
+
+  // ── Mute sync · playback rate · resume on unmute ─────────────────────
+  useEffect(() => {
+    const el = audioRef.current;
+    if (!el) return;
+    el.muted = muted;
+    el.playbackRate = SPEECH_RATE;
+    if (!muted && audioBase64 && el.paused) {
+      el.play().catch(() => {
+        // autoplay blocked — stays silent until `muted` or `audioBase64` changes and re-runs this effect
+      });
+    }
+  }, [muted, audioBase64]);
+
+  function handleAudioMetadata() {
+    const el = audioRef.current;
+    if (!el) return;
+    el.playbackRate = SPEECH_RATE;
+    // Effective playback time is shorter once sped up — keep the typewriter in sync.
+    const ms = Number.isFinite(el.duration)
+      ? (el.duration * 1000) / SPEECH_RATE
+      : 0;
+    setAudioDurationMs(ms > 0 ? ms : 0);
+    if (!muted) {
+      el.play().catch(() => {
+        // autoplay blocked
+      });
+    }
+  }
+
+  function handleAudioError() {
+    // Treat as zero duration so the typewriter runs at default speed.
+    setAudioDurationMs(0);
+  }

  function handleImageClick(e: React.MouseEvent<HTMLImageElement>) {
    if (phase !== "ready" || !imgRef.current || !beat) return;
@@ -197,6 +293,19 @@ export function PlayCanvas({
    <div
      className={`flex flex-col items-center ${fullViewport ? "w-full h-full justify-center" : "w-full"}`}
    >
+      {/* Hidden audio element — voice playback for the current beat */}
+      {audioBase64 && (
+        <audio
+          key={audioBase64.slice(-48)}
+          ref={audioRef}
+          src={`data:${audioMime ?? "audio/wav"};base64,${audioBase64}`}
+          preload="auto"
+          onLoadedMetadata={handleAudioMetadata}
+          onError={handleAudioError}
+          className="hidden"
+        />
+      )}
+
      {imageBase64 ? (
        <div
          className="relative inline-block"
@@ -1,4 +1,4 @@
-import type { EngineConfig } from "@yume/types";
+import type { EngineConfig, TtsConfig } from "@yume/types";

 function readVar(name: string): string {
  const v = process.env[name];
@@ -6,6 +6,22 @@ function readVar(name: string): string {
  return v;
 }

+function readOptionalVar(name: string): string | undefined {
+  const v = process.env[name];
+  return v && v.length > 0 ? v : undefined;
+}
+
+function loadTtsConfig(): TtsConfig | undefined {
+  const baseUrl = readOptionalVar("TTS_BASE_URL");
+  const apiKey = readOptionalVar("TTS_API_KEY");
+  const speechModel = readOptionalVar("TTS_SPEECH_MODEL");
+
+  // Missing any → TTS disabled (game runs silently).
+  if (!baseUrl || !apiKey || !speechModel) return undefined;
+
+  return { baseUrl, apiKey, speechModel };
+}
+
 export function loadEngineConfig(): EngineConfig {
  return {
    text: {
@@ -23,5 +39,7 @@ export function loadEngineConfig(): EngineConfig {
      apiKey: readVar("VISION_API_KEY"),
      model: readVar("VISION_MODEL"),
    },
+    tts: loadTtsConfig(),
+    mockImage: readOptionalVar("MOCK_IMAGE") === "true",
  };
 }
@@ -4,7 +4,12 @@ import type { NextConfig } from "next";
 const config: NextConfig = {
  reactStrictMode: true,
  typedRoutes: false,
-  transpilePackages: ["@yume/engine", "@yume/ai-client", "@yume/types"],
+  transpilePackages: [
+    "@yume/engine",
+    "@yume/ai-client",
+    "@yume/types",
+    "@yume/tts-client",
+  ],
  serverExternalPackages: ["sharp"],
  turbopack: {
    root: path.join(__dirname, "..", ".."),