feat(tts): Xiaomi MiMo per-beat voice + MOCK_IMAGE testing aid (#3)

Adds an optional Xiaomi MiMo TTS layer on top of the scene/beat engine and a MOCK_IMAGE flag for cheap local TTS iteration.

- Per-character voice provisioning via MiMo voice design → clone, reference audio persisted in session
- Per-line free-form delivery direction (Director writes "鼓起勇气又害羞，声音发颤" style instructions; sent to MiMo's director channel, never read aloud)
- Per-beat audio served with the scene response; frontend plays via hidden <audio> with typewriter synced to audio duration; mute preference persisted in localStorage and restored via a useState lazy initializer
- Graceful degradation: any TTS step failing → silent beat, game continues
- MOCK_IMAGE=true returns a sharp-generated placeholder PNG so local TTS iteration doesn't burn image tokens
- Recommended config in .env.example: MiMo Token Plan covers TEXT/VISION/TTS with one key (mimo-v2.5-pro for text, mimo-v2.5 omni for vision, mimo-v2.5-tts for TTS)

Squashed from #3:
- feat(tts): 小米 MiMo 逐 beat 配音 + 按 session 角色音色 + 自由文本配音指导
- feat(engine): MOCK_IMAGE 占位图便于本地测试
- fix(tts): address Copilot review on PR #3
- fix(tts): Copilot round-2 review feedback

Known limitation: Session.characters carries the full WAV reference audio (~200-300KB/character base64) and round-trips through every /api/scene, /api/vision, /api/insert-beat request. This is intrinsic to MiMo's design→clone model (voice identity IS the audio, no server-side voiceId). Fixing it requires server-side storage, which is out of scope; documented for future hardening.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
2026-05-28 20:45:21 +08:00
parent d1f13d51a3
commit fcd4e6c1ab
18 changed files with 787 additions and 62 deletions
@@ -1,24 +1,45 @@
 # =============================================================
 # 云梦 — AI 视觉小说
-# Three independently configurable AI providers
+# Recommended setup: Xiaomi MiMo Token Plan for TEXT / VISION / TTS
-# Any OpenAI-compatible endpoint works (OpenRouter, OpenAI,
+# (one API key covers all three) + any image provider for IMAGE.
 # Anthropic via OpenAI-compat proxy, Gemini, DeepSeek, Ollama).
 #
 # Any OpenAI-compatible endpoint works for any slot — OpenRouter,
 # OpenAI, Anthropic via OpenAI-compat proxy, Gemini, DeepSeek, etc.
 # Image generation uses the chat-completions + modalities API
 # (OpenRouter-style), NOT the legacy /images/generations endpoint.
 # =============================================================
-# ---- 1. Text LLM (story director) -----------------------------
+# ---- 1. Text LLM · scene director ----------------------------------
-TEXT_BASE_URL=https://openrouter.ai/api/v1
+# Recommended: MiMo V2.5 Pro (1M context, native JSON-mode, strong CN)
-TEXT_API_KEY=sk-or-v1-xxx
+# Token Plan host: https://token-plan-sgp.xiaomimimo.com/v1
-TEXT_MODEL=~anthropic/claude-sonnet-latest
+# Pay-as-you-go host: https://api.xiaomimimo.com/v1 (sk- keys)
 TEXT_BASE_URL=https://token-plan-sgp.xiaomimimo.com/v1
 TEXT_API_KEY=tp-xxx
 TEXT_MODEL=mimo-v2.5-pro
-# ---- 2. Image generator (renders the whole UI screen) ---------
+# ---- 2. Image generator (renders the scene background) -------------
 # Any provider supporting chat-completions + modalities image output.
 IMAGE_BASE_URL=https://openrouter.ai/api/v1
 IMAGE_API_KEY=sk-or-v1-xxx
 IMAGE_MODEL=openai/gpt-5.4-image-2
-# ---- 3. Vision model (interprets where the user clicked) ------
+# ---- 3. Vision model · multimodal click interpretation -------------
-VISION_BASE_URL=https://openrouter.ai/api/v1
+# Recommended: MiMo V2.5 omni — multimodal.
-VISION_API_KEY=sk-or-v1-xxx
+# ⚠️ DO NOT use mimo-v2.5-pro for this slot — Pro is text-only and
-VISION_MODEL=~google/gemini-flash-latest
+#    rejects image_url content parts.
 VISION_BASE_URL=https://token-plan-sgp.xiaomimimo.com/v1
 VISION_API_KEY=tp-xxx
 VISION_MODEL=mimo-v2.5
 # ---- 4. TTS · Xiaomi MiMo (optional — leave blank to disable) ------
 # Per-character voice design → clone, with per-line delivery direction.
 # Voice identity = the reference audio kept in the session (no server expiry).
 # The adapter appends -voicedesign / -voiceclone to TTS_SPEECH_MODEL.
 TTS_BASE_URL=https://token-plan-sgp.xiaomimimo.com/v1
 TTS_API_KEY=tp-xxx
 TTS_SPEECH_MODEL=mimo-v2.5-tts
 # ---- 5. MOCK_IMAGE — skip image generation (cheap TTS testing) -----
 # true → return a placeholder image instead of calling the image model.
 # Text/story/voice still run normally. Great for iterating on TTS.
 MOCK_IMAGE=false
@@ -14,6 +14,7 @@ import { PlayCanvas, type Phase } from "@/components/PlayCanvas";
 import { PRESETS } from "@/lib/presets";
 import type {
  Beat,
  BeatAudio,
  BeatChoice,
  InsertBeatResponse,
  Scene,
@@ -24,6 +25,8 @@ import type {
  VisionResponse,
 } from "@yume/types";
 const MUTED_STORAGE_KEY = "yume:muted";
 // ──────────────────────────────────────────────────────────────────────
 //  Prefetch pool — speculative SceneResponses keyed by choice path.
 //
@@ -133,7 +136,16 @@ function prefetchScenePath(
            nextSceneSeed: sole.effect.nextSceneSeed,
          },
        };
-        prefetchScenePath(pool, baseSession, [...steps, nextStep], depth + 1);
+        // Carry forward the registry that the parent prefetch result already
        // settled (it may include characters introduced by the intermediate
        // scene). Without this, the L2+ prefetch starts from the original
        // base.characters and a later transition through this survivor would
        // silently drop voices the player has already heard.
        const carriedBase: Session = {
          ...baseSession,
          characters: data.characters,
        };
        prefetchScenePath(pool, carriedBase, [...steps, nextStep], depth + 1);
      }
    }
@@ -181,6 +193,18 @@ function PlayInner() {
  const [currentScene, setCurrentScene] = useState<Scene | null>(null);
  const [currentBeatId, setCurrentBeatId] = useState<string | null>(null);
  const [imageBase64, setImageBase64] = useState<string | null>(null);
  const [beatAudioMap, setBeatAudioMap] = useState<Record<string, BeatAudio>>({});
  // Lazy-initialize from localStorage so PlayCanvas never mounts with the
  // wrong muted value (an effect-based read would briefly let audio play
  // before the preference settled in a scenario where audio arrives early).
  const [muted, setMuted] = useState<boolean>(() => {
    if (typeof window === "undefined") return false;
    try {
      return window.localStorage.getItem(MUTED_STORAGE_KEY) === "1";
    } catch {
      return false;
    }
  });
  const [pendingClick, setPendingClick] = useState<{
    x: number;
    y: number;
@@ -203,6 +227,10 @@ function PlayInner() {
    return currentScene.beats.find((b) => b.id === currentBeatId) ?? null;
  }, [currentScene, currentBeatId]);
  const currentBeatAudio = currentBeat ? beatAudioMap[currentBeat.id] : undefined;
  const audioBase64 = currentBeatAudio?.base64 ?? null;
  const audioMime = currentBeatAudio?.mime ?? null;
  useEffect(() => {
    sessionRef.current = session;
  }, [session]);
@@ -231,6 +259,19 @@ function PlayInner() {
    });
  }, [currentBeatId]);
  // ── Mute toggle ──────────────────────────────────────────────────────
  // Flips the muted flag and mirrors the new value into localStorage
  // ("1" = muted, "0" = audible). The stored value is read back by the
  // `muted` useState lazy initializer the next time the page mounts.
  const toggleMuted = useCallback(() => {
    setMuted((wasMuted) => {
      const nowMuted = !wasMuted;
      const stored = nowMuted ? "1" : "0";
      try {
        window.localStorage.setItem(MUTED_STORAGE_KEY, stored);
      } catch {
        // Storage unavailable — the toggle still applies for this session.
      }
      return nowMuted;
    });
  }, []);
  // ── Presentation mode toggle ─────────────────────────────────────────
  const togglePresentation = useCallback(async () => {
    const entering = !presentation;
@@ -327,12 +368,14 @@ function PlayInner() {
              visitedBeatIds: [data.scene.entryBeatId],
            },
          ],
          characters: data.characters,
        };
        visitedBeatsRef.current = [data.scene.entryBeatId];
        setSession(initial);
        setCurrentScene(data.scene);
        setCurrentBeatId(data.scene.entryBeatId);
        setImageBase64(data.imageBase64);
        setBeatAudioMap(data.beatAudio ?? {});
        setPhase("ready");
      })
      .catch((e) => setError(String(e)));
@@ -409,12 +452,14 @@ function PlayInner() {
            visitedBeatIds: [result.scene.entryBeatId],
          },
        ],
        characters: result.characters,
      };
      visitedBeatsRef.current = [result.scene.entryBeatId];
      setSession(newSession);
      setCurrentScene(result.scene);
      setCurrentBeatId(result.scene.entryBeatId);
      setImageBase64(result.imageBase64);
      setBeatAudioMap(result.beatAudio ?? {});
      setLastExitLabel(exitLabel);
      setPhase("ready");
    } catch (e) {
@@ -514,7 +559,8 @@ function PlayInner() {
          };
          throw new Error(j.error ?? insertRes.statusText);
        }
-        const { partial } = (await insertRes.json()) as InsertBeatResponse;
+        const { partial, characters: insertChars, audio } =
          (await insertRes.json()) as InsertBeatResponse;
        const fromBeatId =
          currentBeatRef.current?.id ?? currentScene.entryBeatId;
@@ -526,6 +572,7 @@ function PlayInner() {
          narration: partial.narration,
          speaker: partial.speaker,
          line: partial.line,
          lineDelivery: partial.lineDelivery,
          next: { type: "continue", nextBeatId: fromBeatId },
        };
@@ -541,11 +588,15 @@ function PlayInner() {
                history: s.history.map((h, i, arr) =>
                  i === arr.length - 1 ? { ...h, scene: patched } : h,
                ),
                characters: insertChars,
              }
            : s,
        );
        setCurrentScene(patched);
        setCurrentBeatId(newBeatId);
        if (audio) {
          setBeatAudioMap((m) => ({ ...m, [newBeatId]: audio }));
        }
        setLastExitLabel(decision.intent.freeformAction);
        setPhase("ready");
        setPendingClick(null);
@@ -627,6 +678,9 @@ function PlayInner() {
      <div className="fixed inset-0 bg-black flex items-center justify-center z-50">
        <PlayCanvas
          imageBase64={imageBase64}
          audioBase64={audioBase64}
          audioMime={audioMime}
          muted={muted}
          phase={phase}
          beat={currentBeat}
          pendingClick={pendingClick}
@@ -666,6 +720,9 @@ function PlayInner() {
      <main className="flex-1 flex flex-col items-center justify-center px-4 md:px-8 py-6 md:py-10">
        <PlayCanvas
          imageBase64={imageBase64}
          audioBase64={audioBase64}
          audioMime={audioMime}
          muted={muted}
          phase={phase}
          beat={currentBeat}
          pendingClick={pendingClick}
@@ -700,7 +757,17 @@ function PlayInner() {
          F · 演 · 示
        </button>
        <div className="text-[9px] smallcaps text-clay-400 num">Ⅰ · Ⅰ</div>
-        <span className="text-[9px] w-[60px]" aria-hidden />
+        <button
          type="button"
          onClick={toggleMuted}
          className="text-[9px] smallcaps text-clay-400 hover:text-clay-700 transition-colors flex items-center gap-2 w-[80px] justify-end"
          aria-label={muted ? "取消静音" : "静音"}
        >
          <i
            className={`fa-solid ${muted ? "fa-volume-xmark" : "fa-volume-high"} text-[10px]`}
          />
          {muted ? "静 · 音" : "有 · 声"}
        </button>
      </footer>
    </div>
  );
@@ -13,30 +13,66 @@ export type Phase =
 const SHADOW =
  "0 1px 0 rgba(45,24,16,0.05), 0 36px 64px -28px rgba(45,24,16,0.25), 0 8px 18px -6px rgba(45,24,16,0.10)";
 // Per-character typewriter interval (ms) used when no audio duration is
 // available to pace against.
 const DEFAULT_CHAR_MS = 28;
 // Lower bound (ms) on the per-character interval when pacing is derived from
 // an audio duration, so short clips cannot make the text flash by.
 // NOTE(review): this floor (30) is above DEFAULT_CHAR_MS (28), so audio-paced
 // text is never faster than ~30 ms/char even though un-voiced text runs at
 // 28 ms/char — confirm this is intended.
 const MIN_CHAR_MS = 30;
 // Voice playback speed multiplier. >1 speeds up the (somewhat slow) MiMo voice
 // while preserving pitch. Typewriter pacing is divided by the same factor.
 const SPEECH_RATE = 1.2;
 // If audio metadata never arrives within this window, give up waiting and
 // let the typewriter run at default speed.
 const AUDIO_WAIT_TIMEOUT_MS = 2500;
 // ── Typewriter hook ────────────────────────────────────────────────────
 // Returns the progressively-revealed text, a `done` flag, and a `skip()` that
 // instantly completes the current text. Reset is keyed by `resetKey` (the beat
 // id) rather than the text, so a new beat whose line happens to match the
-// previous one still replays from scratch. `done` is derived synchronously
+// previous one still replays from scratch.
-// (not from a post-paint effect) so a stale "done" frame never paints.
+//
 // When `targetDurationMs` is provided we space characters to span that audio
 // duration, keeping text and voice in lockstep. While `waitForAudio` is true
 // and we don't yet know a duration, the typewriter holds (so text doesn't
 // race ahead of an audio that's still loading).
 function useTypewriter(
  text: string,
  resetKey: string,
-  speed = 28,
+  opts: { targetDurationMs?: number; waitForAudio: boolean } = {
    waitForAudio: false,
  },
 ): { shown: string; done: boolean; skip: () => void } {
  const { targetDurationMs, waitForAudio } = opts;
  const [displayed, setDisplayed] = useState("");
  const [prevKey, setPrevKey] = useState(resetKey);
  const timer = useRef<ReturnType<typeof setInterval> | null>(null);
  // Sticky once the player has skipped this beat: prevents a late-arriving
  // audio metadata event from re-triggering the effect and replaying the text.
  const skippedRef = useRef(false);
  // Render-phase reset (React "adjust state on prop change" pattern): when the
  // beat changes, drop the old progress before this render commits.
  if (resetKey !== prevKey) {
    setPrevKey(resetKey);
    setDisplayed("");
    skippedRef.current = false;
  }
  useEffect(() => {
    if (!text) return;
    // `=== undefined` (not `!targetDurationMs`): 0 means "audio failed or
    // timed out — run at default speed". The original truthy check stalled
    // the typewriter forever on those fallback paths.
    if (waitForAudio && targetDurationMs === undefined) return;
    // If the player skipped, settle on the full text and don't restart even
    // when audio metadata arrives late and re-triggers this effect.
    if (skippedRef.current) {
      setDisplayed(text);
      return;
    }
    const speed =
      targetDurationMs && text.length > 0
        ? Math.max(MIN_CHAR_MS, targetDurationMs / text.length)
        : DEFAULT_CHAR_MS;
    let i = 0;
    timer.current = setInterval(() => {
      i += 1;
@@ -50,13 +86,14 @@ function useTypewriter(
      if (timer.current) clearInterval(timer.current);
      timer.current = null;
    };
-  }, [resetKey, text, speed]);
+  }, [resetKey, text, targetDurationMs, waitForAudio]);
  const skip = useCallback(() => {
    if (timer.current) {
      clearInterval(timer.current);
      timer.current = null;
    }
    skippedRef.current = true;
    setDisplayed(text);
  }, [text]);
@@ -123,6 +160,9 @@ function ChoiceButton({
 // ── Main component ─────────────────────────────────────────────────────
 export function PlayCanvas({
  imageBase64,
  audioBase64,
  audioMime,
  muted,
  phase,
  beat,
  pendingClick,
@@ -132,6 +172,9 @@ export function PlayCanvas({
  fullViewport = false,
 }: {
  imageBase64: string | null;
  audioBase64: string | null;
  audioMime: string | null;
  muted: boolean;
  phase: Phase;
  beat: Beat | null;
  pendingClick: { x: number; y: number } | null;
@@ -141,7 +184,11 @@ export function PlayCanvas({
  fullViewport?: boolean;
 }) {
  const imgRef = useRef<HTMLImageElement>(null);
  const audioRef = useRef<HTMLAudioElement>(null);
  const [dims, setDims] = useState<{ w: number; h: number } | null>(null);
  const [audioDurationMs, setAudioDurationMs] = useState<number | undefined>(
    undefined,
  );
  const isChoiceBeat = beat?.next.type === "choice";
  const choices: BeatChoice[] = isChoiceBeat
@@ -150,7 +197,56 @@ export function PlayCanvas({
  const displayBody = beat?.speaker ? beat.line ?? "" : beat?.narration ?? "";
  const { shown: typedBody, done: typingDone, skip: skipTypewriter } =
-    useTypewriter(displayBody, beat?.id ?? "", 30);
+    useTypewriter(displayBody, beat?.id ?? "", {
      targetDurationMs: audioDurationMs,
      waitForAudio: Boolean(audioBase64),
    });
  // ── Audio source change ──────────────────────────────────────────────
  // Reset duration when audio source changes; if loading takes too long,
  // unblock the typewriter via timeout so text doesn't stall.
  useEffect(() => {
    setAudioDurationMs(undefined);
    if (!audioBase64) return;
    const timer = setTimeout(() => {
      setAudioDurationMs((prev) => prev ?? 0);
    }, AUDIO_WAIT_TIMEOUT_MS);
    return () => clearTimeout(timer);
  }, [audioBase64]);
  // ── Mute toggle ───────────────────────────────────────────────────────
  useEffect(() => {
    const el = audioRef.current;
    if (!el) return;
    el.muted = muted;
    el.playbackRate = SPEECH_RATE;
    if (!muted && audioBase64 && el.paused) {
      el.play().catch(() => {
        // autoplay blocked — silent until next interaction
      });
    }
  }, [muted, audioBase64]);
  // Fired by the <audio> element once its metadata is loaded: applies the
  // speech rate, publishes the effective clip length so the typewriter can
  // pace itself against it, and starts playback unless the player is muted.
  function handleAudioMetadata() {
    const player = audioRef.current;
    if (!player) return;
    player.playbackRate = SPEECH_RATE;
    // The clip finishes sooner once sped up, so scale the duration by the
    // same factor. A non-finite or non-positive duration is reported as 0,
    // which makes the typewriter fall back to its default speed.
    let effectiveMs = 0;
    if (Number.isFinite(player.duration)) {
      const scaledMs = (player.duration * 1000) / SPEECH_RATE;
      if (scaledMs > 0) effectiveMs = scaledMs;
    }
    setAudioDurationMs(effectiveMs);
    if (muted) return;
    player.play().catch(() => {
      // autoplay blocked
    });
  }
  // Fired by the <audio> element's onError: the clip cannot be played, so
  // report a known duration of 0 instead of leaving it undefined.
  function handleAudioError() {
    // Treat as zero duration so the typewriter runs at default speed.
    setAudioDurationMs(0);
  }
  function handleImageClick(e: React.MouseEvent<HTMLImageElement>) {
    if (phase !== "ready" || !imgRef.current || !beat) return;
@@ -197,6 +293,19 @@ export function PlayCanvas({
    <div
      className={`flex flex-col items-center ${fullViewport ? "w-full h-full justify-center" : "w-full"}`}
    >
      {/* Hidden audio element — voice playback for the current beat */}
      {audioBase64 && (
        <audio
          key={audioBase64.slice(-48)}
          ref={audioRef}
          src={`data:${audioMime ?? "audio/wav"};base64,${audioBase64}`}
          preload="auto"
          onLoadedMetadata={handleAudioMetadata}
          onError={handleAudioError}
          className="hidden"
        />
      )}
      {imageBase64 ? (
        <div
          className="relative inline-block"
@@ -1,4 +1,4 @@
-import type { EngineConfig } from "@yume/types";
+import type { EngineConfig, TtsConfig } from "@yume/types";
 function readVar(name: string): string {
  const v = process.env[name];
@@ -6,6 +6,22 @@ function readVar(name: string): string {
  return v;
 }
 // Reads an environment variable that is allowed to be absent. An unset or
 // empty variable yields undefined; any other value is returned untouched
 // (no trimming).
 function readOptionalVar(name: string): string | undefined {
  const value = process.env[name];
  if (value === undefined || value === "") return undefined;
  return value;
 }
 // Builds the optional TTS provider config from TTS_BASE_URL, TTS_API_KEY and
 // TTS_SPEECH_MODEL. All three must be set and non-empty; otherwise TTS is
 // treated as disabled and undefined is returned (the game runs silently).
 function loadTtsConfig(): TtsConfig | undefined {
  const [baseUrl, apiKey, speechModel] = [
    "TTS_BASE_URL",
    "TTS_API_KEY",
    "TTS_SPEECH_MODEL",
  ].map((envName) => readOptionalVar(envName));
  if (baseUrl && apiKey && speechModel) {
    return { baseUrl, apiKey, speechModel };
  }
  return undefined;
 }
 export function loadEngineConfig(): EngineConfig {
  return {
    text: {
@@ -23,5 +39,7 @@ export function loadEngineConfig(): EngineConfig {
      apiKey: readVar("VISION_API_KEY"),
      model: readVar("VISION_MODEL"),
    },
    tts: loadTtsConfig(),
    mockImage: readOptionalVar("MOCK_IMAGE") === "true",
  };
 }
@@ -4,7 +4,12 @@ import type { NextConfig } from "next";
 const config: NextConfig = {
  reactStrictMode: true,
  typedRoutes: false,
-  transpilePackages: ["@yume/engine", "@yume/ai-client", "@yume/types"],
+  transpilePackages: [
    "@yume/engine",
    "@yume/ai-client",
    "@yume/types",
    "@yume/tts-client",
  ],
  serverExternalPackages: ["sharp"],
  turbopack: {
    root: path.join(__dirname, "..", ".."),
@@ -13,6 +13,7 @@
  },
  "dependencies": {
    "@yume/ai-client": "workspace:*",
    "@yume/tts-client": "workspace:*",
    "@yume/types": "workspace:*",
    "sharp": "^0.33.5"
  }
@@ -4,6 +4,8 @@ import type {
  BeatChoice,
  BeatChoiceEffect,
  BeatNext,
  Character,
  InsertBeatPartial,
  ProviderConfig,
  Scene,
  Session,
@@ -43,13 +45,20 @@ type RawBeat = {
  narration?: string;
  speaker?: string;
  line?: string;
  lineDelivery?: string;
  next?: RawNext;
 };
 // One entry of the director's `characterUpdates` array as parsed from model
 // JSON, before validation. Both fields are optional here;
 // coerceCharacterUpdates trims them and drops entries missing either one.
 type RawCharacterUpdate = {
  // Character name as emitted by the model.
  name?: string;
  // Free-text description for the character as emitted by the model.
  description?: string;
 };
 // Unvalidated scene payload as parsed from the director model's JSON reply.
 // Every field is optional because the model may omit any of them.
 type RawScene = {
  // Scene description used for the background image.
  scenePrompt?: string;
  // Id of the beat the scene starts on.
  entryBeatId?: string;
  // Raw beats, coerced individually before use.
  beats?: RawBeat[];
  // Raw character entries, normalised by coerceCharacterUpdates.
  characterUpdates?: RawCharacterUpdate[];
 };
 function coerceEffect(raw: RawEffect | undefined): BeatChoiceEffect {
@@ -90,15 +99,28 @@ function coerceBeat(raw: RawBeat, idx: number, totalBeats: number): Beat {
  // last/dangling continue into a real scene-change exit so the player can
  // never get stuck self-looping on it.
  const fallback = idx + 1 < totalBeats ? `b${idx + 2}` : "";
  const line = raw.line?.trim() || undefined;
  return {
    id,
    narration: raw.narration?.trim() || undefined,
    speaker: raw.speaker?.trim() || undefined,
-    line: raw.line?.trim() || undefined,
+    line,
    // lineDelivery only meaningful when there is a line to deliver.
    lineDelivery: line ? raw.lineDelivery?.trim() || undefined : undefined,
    next: coerceNext(raw.next, fallback),
  };
 }
 // Normalises the director's raw `characterUpdates` into clean Character
 // entries (trimmed name + description, no voice yet).
 //
 // The input is untrusted model JSON, so it is validated defensively:
 //  - a non-array value yields an empty list;
 //  - entries that are not objects (null, strings, numbers, …) are skipped;
 //  - a name/description that is not a string is treated as empty;
 //  - entries whose trimmed name or description is empty are dropped.
 // A single malformed entry therefore never throws and never discards the
 // valid entries around it.
 function coerceCharacterUpdates(raw: RawCharacterUpdate[] | undefined): Character[] {
  if (!Array.isArray(raw)) return [];
  const asTrimmedText = (value: unknown): string =>
    typeof value === "string" ? value.trim() : "";
  const cleaned: Character[] = [];
  // Iterate as unknown[]: the declared element type is only what we hope the
  // model sent, not what it is guaranteed to have sent.
  for (const entry of raw as unknown[]) {
    if (typeof entry !== "object" || entry === null) continue;
    const candidate = entry as { name?: unknown; description?: unknown };
    const name = asTrimmedText(candidate.name);
    const description = asTrimmedText(candidate.description);
    if (name && description) cleaned.push({ name, description });
  }
  return cleaned;
 }
 const FALLBACK_SEED = "故事继续推进";
 function fallbackExitChoice(beatId: string): BeatChoice {
@@ -230,10 +252,15 @@ function newSceneId(): string {
 //  Called both on real scene transitions AND on speculative prefetch.
 // ──────────────────────────────────────────────────────────────────────
 // What directScene resolves to: the generated scene plus any character
 // entries the director emitted alongside it.
 export type SceneResult = {
  // The generated scene (id, scenePrompt, beats, entryBeatId).
  scene: Scene;
  // Validated entries from the model's `characterUpdates`; empty when none.
  characterUpdates: Character[];
 };
 export async function directScene(
  config: ProviderConfig,
  session: Session,
-): Promise<Scene> {
+): Promise<SceneResult> {
  const raw = await chat(
    config,
    [
@@ -264,10 +291,13 @@ export async function directScene(
      : beats[0]!.id;
  return {
-    id: newSceneId(),
+    scene: {
-    scenePrompt: parsed.scenePrompt?.trim() || "an empty scene",
+      id: newSceneId(),
-    beats,
+      scenePrompt: parsed.scenePrompt?.trim() || "an empty scene",
-    entryBeatId,
+      beats,
      entryBeatId,
    },
    characterUpdates: coerceCharacterUpdates(parsed.characterUpdates),
  };
 }
@@ -280,7 +310,7 @@ export async function directInsertBeat(
  config: ProviderConfig,
  session: Session,
  freeformAction: string,
-): Promise<{ narration?: string; speaker?: string; line?: string }> {
+): Promise<InsertBeatPartial> {
  const raw = await chat(
    config,
    [
@@ -293,15 +323,12 @@ export async function directInsertBeat(
    { temperature: 0.9, responseFormat: "json_object" },
  );
-  const parsed = parseJsonLoose<{
+  const parsed = parseJsonLoose<InsertBeatPartial>(raw);
    narration?: string;
    speaker?: string;
    line?: string;
  }>(raw);
  const narration = parsed.narration?.trim() || undefined;
  const speaker = parsed.speaker?.trim() || undefined;
  const line = parsed.line?.trim() || undefined;
  const lineDelivery = line ? parsed.lineDelivery?.trim() || undefined : undefined;
  // If the model returned nothing usable, supply a fallback narration so the
  // frontend doesn't append a silent empty beat that renders no dialogue —
@@ -309,5 +336,5 @@ export async function directInsertBeat(
  if (!narration && !speaker && !line) {
    return { narration: "（你停下脚步，环视片刻。）" };
  }
-  return { narration, speaker, line };
+  return { narration, speaker, line, lineDelivery };
 }
@@ -5,4 +5,7 @@ export {
  requestInsertBeat,
 } from "./orchestrator";
 export { annotateClick } from "./annotate";
 export { voiceBeat, voiceScene } from "./voice";
 export type { SceneResult } from "./director";
 export type { InsertBeatPartial } from "@yume/types";
 export * from "./prompts";
@@ -0,0 +1,25 @@
 import sharp from "sharp";
 // Memoized base64 PNG; stays undefined until the first render completes.
 let cached: string | undefined;
 // A static 1792×1024 (7:4, close to 16:9) placeholder used when
 // MOCK_IMAGE=true, so we can exercise the TTS path without paying for image
 // generation. Generated once, then memoized.
 // Returns the PNG as a bare base64 string (no data-URL prefix).
 export async function mockImageBase64(): Promise<string> {
  if (cached) return cached;
  const W = 1792;
  const H = 1024;
  // Dark panel with a dashed border and two centered caption lines.
  const svg = `<svg xmlns="http://www.w3.org/2000/svg" width="${W}" height="${H}">
    <rect width="${W}" height="${H}" fill="#161109"/>
    <rect x="2" y="2" width="${W - 4}" height="${H - 4}" fill="none"
          stroke="#5a4628" stroke-width="3" stroke-dasharray="14 10"/>
    <text x="50%" y="45%" fill="#b88f4a" font-family="Georgia, serif"
          font-size="72" letter-spacing="6" text-anchor="middle">MOCK IMAGE</text>
    <text x="50%" y="53%" fill="#6e5430" font-family="Georgia, serif"
          font-size="30" letter-spacing="3" text-anchor="middle">TTS TEST — image generation skipped</text>
  </svg>`;
  // Rasterize the SVG to PNG with sharp, then base64-encode the bytes.
  const png = await sharp(Buffer.from(svg)).png().toBuffer();
  cached = png.toString("base64");
  return cached;
 }
@@ -1,7 +1,10 @@
 import type {
  BeatAudio,
  Character,
  EngineConfig,
  InsertBeatRequest,
  InsertBeatResponse,
  Scene,
  SceneRequest,
  SceneResponse,
  Session,
@@ -12,15 +15,55 @@ import type {
 } from "@yume/types";
 import { annotateClick } from "./annotate";
 import { directInsertBeat, directScene } from "./director";
 import { mockImageBase64 } from "./mockImage";
 import { render } from "./renderer";
 import { interpret } from "./vision";
 import { voiceBeat, voiceScene } from "./voice";
 // Mints a session id of the form `s_<epoch ms>_<up to 6 base-36 chars>`.
 function newSessionId(): string {
  const stamp = Date.now();
  const suffix = Math.random().toString(36).slice(2, 8);
  return `s_${stamp}_${suffix}`;
 }
 // Folds freshly reported characters into the registry, keyed by name.
 // A name that is already registered takes the update's fields but keeps the
 // voice it already had, so revising a description never re-provisions a
 // voice the player has already heard. Unknown names are appended after the
 // existing entries. With no updates, the input array is returned as-is.
 function mergeCharacters(existing: Character[], updates: Character[]): Character[] {
  if (updates.length === 0) return existing;
  const registry = new Map<string, Character>();
  for (const known of existing) {
    registry.set(known.name, known);
  }
  for (const incoming of updates) {
    const heardVoice = registry.get(incoming.name)?.voice;
    const merged = heardVoice ? { ...incoming, voice: heardVoice } : incoming;
    registry.set(incoming.name, merged);
  }
  return Array.from(registry.values());
 }
 // Produces the scene image as base64: the memoized placeholder when
 // config.mockImage is set, otherwise a real render through the image provider.
 async function renderImage(
  config: EngineConfig,
  scene: Scene,
  styleGuide: string,
 ): Promise<string> {
  return config.mockImage
    ? mockImageBase64()
    : render(config.image, scene, styleGuide);
 }
 // Voices a scene when TTS is configured. Without a TTS config the session's
 // characters are returned unchanged and no audio is produced. With one, the
 // characters come from voiceScene's result, and `beatAudio` is left
 // undefined when no beat produced audio.
 async function runVoiceScene(
  config: EngineConfig,
  session: Session,
  scene: Scene,
 ): Promise<{
  beatAudio?: Record<string, BeatAudio>;
  characters: Character[];
 }> {
  const tts = config.tts;
  if (!tts) return { characters: session.characters };
  const { beatAudio, characters } = await voiceScene(tts, session, scene);
  const hasAudio = Object.keys(beatAudio).length > 0;
  return {
    beatAudio: hasAudio ? beatAudio : undefined,
    characters,
  };
 }
 // ──────────────────────────────────────────────────────────────────────
-//  startSession — first scene + image
+//  startSession — first scene + image + per-beat voice
 // ──────────────────────────────────────────────────────────────────────
 export async function startSession(
@@ -33,31 +76,55 @@ export async function startSession(
    worldSetting: req.worldSetting.trim(),
    styleGuide: req.styleGuide.trim(),
    history: [],
    characters: [],
  };
-  const scene = await directScene(config.text, session);
+  const { scene, characterUpdates } = await directScene(config.text, session);
-  const imageBase64 = await render(config.image, scene, session.styleGuide);
+  const preVoiceSession: Session = {
    ...session,
    characters: mergeCharacters(session.characters, characterUpdates),
  };
  const [imageBase64, voiceRes] = await Promise.all([
    renderImage(config, scene, preVoiceSession.styleGuide),
    runVoiceScene(config, preVoiceSession, scene),
  ]);
  return {
    sessionId: session.id,
    scene,
    imageBase64,
    characters: voiceRes.characters,
    beatAudio: voiceRes.beatAudio,
  };
 }
 // ──────────────────────────────────────────────────────────────────────
-//  requestScene — generate the NEXT scene + image.
+//  requestScene — generate the NEXT scene + image + per-beat voice.
-//  Frontend passes a session whose latest history entry has `exit` set.
+//  Used both on real scene transitions and on speculative prefetch.
 //  Also used for prefetch speculation (frontend synthesizes the exit).
 // ──────────────────────────────────────────────────────────────────────
 export async function requestScene(
  config: EngineConfig,
  req: SceneRequest,
 ): Promise<SceneResponse> {
-  const scene = await directScene(config.text, req.session);
+  const { scene, characterUpdates } = await directScene(config.text, req.session);
-  const imageBase64 = await render(config.image, scene, req.session.styleGuide);
+  const preVoiceSession: Session = {
-  return { scene, imageBase64 };
+    ...req.session,
    characters: mergeCharacters(req.session.characters, characterUpdates),
  };
  const [imageBase64, voiceRes] = await Promise.all([
    renderImage(config, scene, preVoiceSession.styleGuide),
    runVoiceScene(config, preVoiceSession, scene),
  ]);
  return {
    scene,
    imageBase64,
    characters: voiceRes.characters,
    beatAudio: voiceRes.beatAudio,
  };
 }
 // ──────────────────────────────────────────────────────────────────────
@@ -75,6 +142,7 @@ export async function visionDecide(
 // ──────────────────────────────────────────────────────────────────────
 //  requestInsertBeat — generates a transient in-scene beat (no image regen)
 //  and voices the line if any.
 // ──────────────────────────────────────────────────────────────────────
 export async function requestInsertBeat(
@@ -86,5 +154,49 @@ export async function requestInsertBeat(
    req.session,
    req.freeformAction,
  );
-  return { partial };
+
  // INSERT_BEAT prompt forbids new characters — but if the director violates
  // it, voiceBeat's name-inferred fallback would silently provision and persist
  // the hallucinated speaker. Strip the speaker attribution and promote the
  // line into narration so the player still sees the text (the client only
  // renders `line` when there is a `speaker`).
  if (
    partial.speaker &&
    !req.session.characters.some((c) => c.name === partial.speaker)
  ) {
    console.warn(
      `[insert-beat] unregistered speaker "${partial.speaker}" ignored`,
    );
    const promotedNarration =
      [partial.narration, partial.line].filter(Boolean).join("\n") || undefined;
    return {
      partial: {
        narration: promotedNarration,
        speaker: undefined,
        line: undefined,
        lineDelivery: undefined,
      },
      characters: req.session.characters,
    };
  }
  if (!config.tts) {
    // Always echo characters so callers don't need a ?? fallback.
    return { partial, characters: req.session.characters };
  }
  // Insert beats stay in-scene and (per the INSERT_BEAT prompt) reuse the
  // registered cast, so we voice against the existing character set.
  const voiceRes = await voiceBeat(
    config.tts,
    req.session,
    req.session.characters,
    partial,
  );
  return {
    partial,
    characters: voiceRes.characters,
    audio: voiceRes.audio,
  };
 }
@@ -4,11 +4,12 @@ import type { Scene, Session } from "@yume/types";
 //  Director — emits one Scene (background + a graph of beats) at a time.
 // ──────────────────────────────────────────────────────────────────────
-export const DIRECTOR_SYSTEM = `你是一个交互视觉小说的「场景导演」。每次基于世界观、画风、玩家历史，输出**一个完整的场景**。
+export const DIRECTOR_SYSTEM = `你是一个交互视觉小说的「场景导演」。每次基于世界观、画风、玩家历史、已登记角色，输出**一个完整的场景**，并为每句台词配上细腻的配音导演指令。
 一个场景包含：
 - 一张背景图（你给出英文 scenePrompt）
 - 一组对话节拍 beats，玩家会按顺序经历它们
 - 任何**首次登场**的角色，需在 characterUpdates 里登记其专属音色设计
 每个 beat 是玩家会看到的一段叙述 / 对话 / 选择。beat 之间通过 next 字段连接：
 - "continue": 玩家点击图片背景 / 按继续，自然推进到下一个 beat
@@ -30,27 +31,42 @@ choice 的 effect 有两种：
 - choice 至少 2 个，至多 4 个，互不重复
 文本风格约束：
- narration / line 用中文，scenePrompt 用英文
+- narration / line 用中文（**纯净可显示文本**，绝不要写 (叹气)(语速快) 这类标注 —— 那是给配音的，会被玩家看见）
 - scenePrompt / lineDelivery / characterUpdates 内的文字按下方专门说明
 - 单个 beat 的 narration 与 line 加起来 ≤80 字
 - 单个 choice label ≤15 字
- scenePrompt 只描述画面里看到什么，不要描述 UI
+- scenePrompt 用英文，只描述画面里看到什么，不要描述 UI
 配音相关字段：
 - 每个有 line 的 beat **必须**给出 lineDelivery —— 自由中文的"配音导演指令"，描述该句台词怎么念（情绪 / 语气 / 语速 / 气息 / 停顿 / 重音 / 音色起伏）。例："鼓起勇气又害羞，声音发颤、偏小，句尾带一丝气声，语速偏慢"。平淡场合写"平静自然、语速适中"即可，但要贴当下情境。
 - characterUpdates 仅当**有新角色首次出现**时列出该新角色的音色设计；已登记的角色不要重复列出。
 - characterUpdates[].description **必须以明确性别开头**（"女性，…" / "男性，…"），随后描述：年龄、音色质感、性格情绪基调、语速节奏、人设腔调、口音方言。例："女性，约17岁少女，音色清亮带点稚嫩甜美，性格开朗，语速偏快，标准普通话"。
 角色与台词的硬性规则（影响配音正确性）：
 - 任何 beat 的 speaker 字段一旦填了名字，**该名字必须**：① 在"已登记角色"列表中存在，或 ② 本次输出的 characterUpdates 里登记。绝不允许 speaker 是个未登记的陌生名字。
 - speaker 名字必须与登记名**完全一致**，不要加「（回忆）」「学姐」之类后缀或别名。
 必须输出严格 JSON，结构如下：
 {
  "scenePrompt": "english scene description, no UI",
  "entryBeatId": "b1",
  "characterUpdates": [
    { "name": "夏海", "description": "女性，约17岁少女，音色清亮带点稚嫩甜美…" }
  ],
  "beats": [
    {
      "id": "b1",
-      "narration": "可空",
+      "narration": "可空（纯净文本）",
      "speaker": "可空",
-      "line": "可空",
+      "line": "可空（纯净文本）",
      "lineDelivery": "line 非空时必填：配音导演指令",
      "next": { "type": "continue", "nextBeatId": "b2" }
    },
    {
      "id": "b2",
-      "speaker": "...",
+      "speaker": "夏海",
-      "line": "...",
+      "line": "学长，我有话想对你说。",
      "lineDelivery": "鼓起勇气，但又有点害羞，语速偏慢，句尾微微上扬",
      "next": {
        "type": "choice",
        "choices": [
@@ -77,6 +93,13 @@ export function buildDirectorUserMessage(session: Session): string {
  parts.push(`世界观：${session.worldSetting}`);
  parts.push(`画风：${session.styleGuide}`);
  if (session.characters.length > 0) {
    parts.push("\n已登记角色（speaker 必须用这些名字之一，或在本次 characterUpdates 里登记新名）：");
    for (const c of session.characters) {
      parts.push(`- ${c.name}：${c.description}`);
    }
  }
  if (session.history.length === 0) {
    parts.push("\n这是故事的开场。请生成第一个场景，严格以 JSON 格式返回。");
    return parts.join("\n");
@@ -142,19 +165,22 @@ export function buildDirectorUserMessage(session: Session): string {
 export const INSERT_BEAT_SYSTEM = `你是视觉小说编剧。玩家在当前场景内做了一个**不会换场景的自由动作**（比如看一眼桌上的相框、想了想刚才那句话）。请基于此动作，写出一个**单独的、过渡性的 beat**：可以是旁白、角色台词、或两者结合。
 文本风格约束：
- narration / line 用中文
+- narration / line 用中文，**纯净可显示文本**，不要写 (叹气) 这类配音标注
 - narration 与 line 加起来 ≤80 字
 - 不要打破当前场景的物理状态（玩家仍在原地、对面仍是同一个角色）
 - 不要生成选项或下一步指引 —— 玩家点击会自然回到原 beat
 - 如果有 line，speaker 必须用**已登记角色**里的名字（绝不允许引入新角色）
 - 如果有 line，**必须**给出 lineDelivery（配音导演指令，自由中文，描述这句话怎么念）
 必须输出严格 JSON：
 {
  "narration": "...",
  "speaker": "...",
-  "line": "..."
+  "line": "...",
  "lineDelivery": "..."
 }
-字段都可为空字符串。不要输出 JSON 以外的任何文本。`;
+narration/speaker/line/lineDelivery 都可为空字符串。不要输出 JSON 以外的任何文本。`;
 export function buildInsertBeatUserMessage(
  session: Session,
@@ -163,9 +189,16 @@ export function buildInsertBeatUserMessage(
  const parts: string[] = [];
  parts.push(`世界观：${session.worldSetting}`);
  if (session.characters.length > 0) {
    parts.push("\n已登记角色（speaker 只能用这些名字）：");
    for (const c of session.characters) {
      parts.push(`- ${c.name}`);
    }
  }
  const current = session.history.at(-1);
  if (current) {
-    parts.push(`当前场景：${current.scene.scenePrompt}`);
+    parts.push(`\n当前场景：${current.scene.scenePrompt}`);
    const lastBeatId = current.visitedBeatIds.at(-1) ?? current.scene.entryBeatId;
    const lastBeat = current.scene.beats.find((b) => b.id === lastBeatId);
    if (lastBeat) {
@@ -0,0 +1,106 @@
 import { provisionVoice, synthesize } from "@yume/tts-client";
 import type {
  BeatAudio,
  Character,
  CharacterVoice,
  Scene,
  Session,
  TtsConfig,
 } from "@yume/types";
 /**
  * Minimal beat shape accepted by voiceBeat. Every field is optional; a beat
  * is only voiced when both `speaker` and `line` are non-empty.
  */
 export type BeatLike = {
  id?: string;
  /** Name matched against the character registry to pick (or provision) a voice. */
  speaker?: string;
  /** Text handed to the synthesizer as the spoken content. */
  line?: string;
  /** Optional delivery direction forwarded to the synthesizer alongside the line. */
  lineDelivery?: string;
 };
 /**
  * Builds a voice-design description for a speaker that is absent from the
  * character registry. The text asks the voice designer to infer gender, age
  * and temperament from the name itself and appends the session's world
  * setting, so the voice is never borrowed from a different character.
  *
  * @param name Speaker name exactly as it appears on the beat.
  * @param session Session whose `worldSetting` is appended for context.
  * @returns The description string to pass to voice provisioning.
  */
 function inferredSpeakerDescription(name: string, session: Session): string {
  const { worldSetting } = session;
  const instruction = `请根据角色名「${name}」推断其性别、年龄与气质，生成最贴合的音色。`;
  return `${instruction}所属世界观：${worldSetting}`;
 }
 /**
  * Voices one beat against the given character registry.
  *
  * Beats lacking a speaker or a line are skipped and yield no audio. Otherwise
  * the speaker's voice is resolved in one of three ways:
  *  - registered character that already has a voice: the voice is reused;
  *  - registered character without a voice: a voice is provisioned from the
  *    registered description and stored on a copy of that entry;
  *  - unregistered speaker: a voice is provisioned from a name-inferred
  *    description and the speaker is appended to the registry.
  *
  * Never throws: any provisioning or synthesis failure is logged and the beat
  * comes back without audio.
  *
  * @param cfg TTS provider configuration.
  * @param session Current session, used when a description has to be inferred.
  * @param characters Registry to resolve the speaker against; not mutated.
  * @param beat Beat to voice.
  * @returns The registry (a new array when a voice was provisioned) plus the
  *   audio when synthesis succeeded.
  */
 export async function voiceBeat(
  cfg: TtsConfig,
  session: Session,
  characters: Character[],
  beat: BeatLike,
 ): Promise<{ audio?: BeatAudio; characters: Character[] }> {
  const { speaker, line, lineDelivery } = beat;
  if (!speaker || !line) {
    return { characters };
  }
  // Tracked outside the try block: if synthesis fails after a voice was
  // provisioned, the catch still hands back the extended registry so the
  // freshly designed voice is not thrown away and paid for again next time.
  let registry: Character[] = characters;
  try {
    const position = characters.findIndex((c) => c.name === speaker);
    const registered = position === -1 ? undefined : characters[position];
    let voice: CharacterVoice | undefined;
    if (registered?.voice) {
      voice = registered.voice;
    } else if (registered) {
      voice = await provisionVoice(cfg, registered.description);
      registry = characters.map((c, i) =>
        i === position ? { ...c, voice } : c,
      );
    } else {
      const description = inferredSpeakerDescription(speaker, session);
      voice = await provisionVoice(cfg, description);
      registry = [...characters, { name: speaker, description, voice }];
    }
    const clip = await synthesize(cfg, voice, line, lineDelivery);
    return {
      audio: { base64: clip.audioBase64, mime: clip.mimeType },
      characters: registry,
    };
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    console.error(`[voice] degraded: ${reason}`);
    return { characters: registry };
  }
 }
 /**
  * Voices every beat of a scene, in order.
  *
  * Beats are processed strictly one after another so that the registry
  * returned by one beat is the registry handed to the next: a speaker who
  * appears in several beats is provisioned once and the voice is then shared.
  * Running the beats in parallel would race on provisioning and could create
  * duplicate voices. The sequential latency was judged acceptable next to the
  * image generation that accompanies a scene.
  *
  * @param cfg TTS provider configuration.
  * @param session Current session; its `characters` seed the registry.
  * @param scene Scene whose beats are voiced.
  * @returns Audio keyed by beat id (only for beats that produced audio) and
  *   the registry after the last beat.
  */
 export async function voiceScene(
  cfg: TtsConfig,
  session: Session,
  scene: Scene,
 ): Promise<{
  beatAudio: Record<string, BeatAudio>;
  characters: Character[];
 }> {
  const beatAudio: Record<string, BeatAudio> = {};
  let registry = session.characters;
  for (const beat of scene.beats) {
    const { audio, characters } = await voiceBeat(cfg, session, registry, beat);
    registry = characters;
    if (audio) {
      beatAudio[beat.id] = audio;
    }
  }
  return { beatAudio, characters: registry };
 }
@@ -0,0 +1,17 @@
 {
  "name": "@yume/tts-client",
  "version": "0.1.0",
  "private": true,
  "type": "module",
  "main": "./src/index.ts",
  "types": "./src/index.ts",
  "exports": {
    ".": "./src/index.ts"
  },
  "scripts": {
    "typecheck": "tsc --noEmit"
  },
  "dependencies": {
    "@yume/types": "workspace:*"
  }
 }
@@ -0,0 +1 @@
 // Package entry point: re-exports the Xiaomi adapter under the
 // provider-neutral names `provisionVoice` and `synthesize`.
 export { xiaomiProvision as provisionVoice, xiaomiSynthesize as synthesize } from "./xiaomi";
@@ -0,0 +1,113 @@
 import type { CharacterVoice, TtsConfig } from "@yume/types";
 // Xiaomi MiMo currently outputs wav / pcm16 only (mp3 not supported for output).
 // The reference clip we persist is therefore wav. Kept as a single switch so we
 // can flip to mp3 the day the API supports it.
 // Sent as `audio.format` in both the voice-design and voice-clone requests.
 const OUTPUT_FORMAT = "wav";
 // MIME type recorded on provisioned voices and returned with synthesized audio;
 // must be changed together with OUTPUT_FORMAT.
 const OUTPUT_MIME = "audio/wav";
 /**
  * Builds the HTTP headers shared by every request to the TTS endpoint: a JSON
  * content type plus the API key in the `api-key` header.
  *
  * @param cfg TTS configuration supplying `apiKey`.
  * @returns Header map suitable for `fetch`.
  */
 function buildHeaders(cfg: TtsConfig): HeadersInit {
  const headers: Record<string, string> = {
    "Content-Type": "application/json",
  };
  headers["api-key"] = cfg.apiKey;
  return headers;
 }
 /**
  * Joins a base URL and an endpoint path with exactly one slash between them.
  *
  * Every trailing slash on the base is removed (not just one), and a
  * separator is inserted when the path does not start with "/", so a
  * configured base such as "https://host/v1//" or a path such as
  * "chat/completions" still yields a well-formed URL. An empty path returns
  * the trimmed base unchanged.
  *
  * @param baseUrl Configured API base URL, with or without trailing slashes.
  * @param path Endpoint path, normally starting with "/".
  * @returns The combined URL.
  */
 function joinUrl(baseUrl: string, path: string): string {
  const base = baseUrl.replace(/\/+$/, "");
  if (path === "" || path.startsWith("/")) {
    return `${base}${path}`;
  }
  return `${base}/${path}`;
 }
 /**
  * Derives the voice-design model name by appending "-voicedesign" to the
  * configured base speech model.
  */
 function designModel(cfg: TtsConfig): string {
  const { speechModel } = cfg;
  return `${speechModel}-voicedesign`;
 }
 /**
  * Derives the voice-clone model name by appending "-voiceclone" to the
  * configured base speech model.
  */
 function cloneModel(cfg: TtsConfig): string {
  const { speechModel } = cfg;
  return `${speechModel}-voiceclone`;
 }
 /**
  * The parts of a chat-completions response body that this adapter reads.
  * Everything is optional: extractAudio takes the audio from `choices` and
  * falls back to `error` / `message` to build a diagnostic when it is absent.
  */
 type ChatAudioResponse = {
  /** Audio payload is read from choices[0].message.audio.data. */
  choices?: Array<{ message?: { audio?: { data?: string } } }>;
  /** Structured error; its message is preferred for diagnostics. */
  error?: { message?: string };
  /** Top-level message, used for diagnostics when error.message is absent. */
  message?: string;
 };
 /**
  * Pulls the audio payload out of a chat-completions response.
  *
  * @param json Parsed response body.
  * @param where Label of the calling step, embedded in the error message.
  * @returns The value of choices[0].message.audio.data.
  * @throws Error when that value is missing or empty. The message carries the
  *   provider's error text (or the serialized body as a last resort),
  *   truncated to 300 characters.
  */
 function extractAudio(json: ChatAudioResponse, where: string): string {
  const payload = json.choices?.[0]?.message?.audio?.data;
  if (payload) {
    return payload;
  }
  const detail = json.error?.message ?? json.message ?? JSON.stringify(json);
  throw new Error(`Xiaomi ${where} returned no audio: ${detail.slice(0, 300)}`);
 }
 /**
  * Designs a new voice from a free-form description.
  *
  * Posts to `<baseUrl>/chat/completions` using the voice-design model. The
  * description travels in the `user` message and a fixed sample sentence in
  * the `assistant` message; the audio that comes back is kept as the voice's
  * reference clip.
  *
  * @param cfg TTS provider configuration.
  * @param description Free-form voice design description.
  * @returns The provisioned voice carrying the base64 reference audio.
  * @throws Error on a non-OK HTTP status (status plus up to 300 characters of
  *   the body) or when the response contains no audio.
  */
 export async function xiaomiProvision(
  cfg: TtsConfig,
  description: string,
 ): Promise<CharacterVoice> {
  const endpoint = joinUrl(cfg.baseUrl, "/chat/completions");
  const payload = JSON.stringify({
    model: designModel(cfg),
    messages: [
      { role: "user", content: description },
      { role: "assistant", content: "你好，这是音色试听样本。" },
    ],
    audio: { format: OUTPUT_FORMAT },
  });
  const response = await fetch(endpoint, {
    method: "POST",
    headers: buildHeaders(cfg),
    body: payload,
  });
  if (response.ok) {
    const parsed = (await response.json()) as ChatAudioResponse;
    return {
      provider: "xiaomi",
      referenceAudioBase64: extractAudio(parsed, "voicedesign"),
      mimeType: OUTPUT_MIME,
    };
  }
  const detail = await response.text();
  throw new Error(`Xiaomi voicedesign ${response.status}: ${detail.slice(0, 300)}`);
 }
 /**
  * Synthesizes one line of speech in a previously provisioned voice.
  *
  * Posts to `<baseUrl>/chat/completions` using the voice-clone model. The
  * voice's reference audio is attached as a base64 data URI.
  *
  * @param cfg TTS provider configuration.
  * @param voice Provisioned voice whose reference audio is cloned.
  * @param text The line to be spoken.
  * @param delivery Optional free-form delivery direction; trimmed, and sent as
  *   an empty string when absent.
  * @returns The synthesized audio as base64 together with its MIME type.
  * @throws Error on a non-OK HTTP status (status plus up to 300 characters of
  *   the body) or when the response contains no audio.
  */
 export async function xiaomiSynthesize(
  cfg: TtsConfig,
  voice: CharacterVoice,
  text: string,
  delivery?: string,
 ): Promise<{ audioBase64: string; mimeType: string }> {
  const endpoint = joinUrl(cfg.baseUrl, "/chat/completions");
  // The delivery direction goes in the `user` (director) message so it shapes
  // the performance without being read aloud; only the `assistant` message
  // holds the spoken text.
  const direction = delivery?.trim() ?? "";
  const referenceClip = `data:${voice.mimeType};base64,${voice.referenceAudioBase64}`;
  const response = await fetch(endpoint, {
    method: "POST",
    headers: buildHeaders(cfg),
    body: JSON.stringify({
      model: cloneModel(cfg),
      messages: [
        { role: "user", content: direction },
        { role: "assistant", content: text },
      ],
      audio: {
        format: OUTPUT_FORMAT,
        voice: referenceClip,
      },
    }),
  });
  if (response.ok) {
    const parsed = (await response.json()) as ChatAudioResponse;
    return {
      audioBase64: extractAudio(parsed, "voiceclone"),
      mimeType: OUTPUT_MIME,
    };
  }
  const detail = await response.text();
  throw new Error(`Xiaomi voiceclone ${response.status}: ${detail.slice(0, 300)}`);
 }
@@ -0,0 +1,7 @@
 {
  "extends": "../../tsconfig.base.json",
  "compilerOptions": {
    "noEmit": true
  },
  "include": ["src/**/*"]
 }
@@ -9,6 +9,8 @@ export type Beat = {
  narration?: string;
  speaker?: string;
  line?: string;
  /** Free-form voice-acting direction for the line, sent to TTS only. Never displayed. */
  lineDelivery?: string;
  next: BeatNext;
 };
@@ -54,6 +56,30 @@ export type SceneHistoryEntry = {
  exit?: SceneExit;
 };
 // ──────────────────────────────────────────────────────────────────────
 //  Characters & voices (TTS)
 // ──────────────────────────────────────────────────────────────────────
 /** A provisioned voice for a character, kept so later lines can be cloned from it. */
 export type CharacterVoice = {
  /** Provider that produced the voice; "xiaomi" is the only value this type allows. */
  provider: "xiaomi";
  /** Xiaomi MiMo design output stored as reference audio for later clones. */
  referenceAudioBase64: string;
  /** MIME type of the reference audio. */
  mimeType: string;
 };
 /** One entry of the session's character registry. */
 export type Character = {
  /** Character name; beats refer to the character through their `speaker` field. */
  name: string;
  /** Free-form voice design description; must begin with explicit gender. */
  description: string;
  /** Provisioned voice; absent until a voice has been designed for this character. */
  voice?: CharacterVoice;
 };
 /** A single beat's synthesized audio, attached to the response. */
 export type BeatAudio = {
  /** Base64-encoded audio data. */
  base64: string;
  /** MIME type of the audio data. */
  mime: string;
 };
 // ──────────────────────────────────────────────────────────────────────
 //  Session
 // ──────────────────────────────────────────────────────────────────────
@@ -64,6 +90,8 @@ export type Session = {
  worldSetting: string;
  styleGuide: string;
  history: SceneHistoryEntry[];
  /** Character registry — accumulates across scenes; voices persist for reuse. */
  characters: Character[];
 };
 // ──────────────────────────────────────────────────────────────────────
@@ -87,10 +115,21 @@ export type ProviderConfig = {
  model: string;
 };
 /** Connection settings for the text-to-speech provider. */
 export type TtsConfig = {
  /** API base URL; the adapter appends "/chat/completions" to it. */
  baseUrl: string;
  /** Credential sent in the `api-key` request header. */
  apiKey: string;
  /** Base model name; adapter derives "-voicedesign" / "-voiceclone" suffixes. */
  speechModel: string;
 };
 /**
  * Top-level engine configuration: one provider config each for text, image
  * and vision, plus the optional TTS settings and the mock-image switch.
  */
 export type EngineConfig = {
  text: ProviderConfig;
  image: ProviderConfig;
  vision: ProviderConfig;
  /** Optional — when missing the game runs silently (no TTS). */
  tts?: TtsConfig;
  /** When true the renderer returns a placeholder PNG instead of calling the image API. */
  mockImage?: boolean;
 };
 // ──────────────────────────────────────────────────────────────────────
@@ -106,6 +145,10 @@ export type StartResponse = {
  sessionId: string;
  scene: Scene;
  imageBase64: string;
  /** Post-voice character registry (with provisioned voices). */
  characters: Character[];
  /** Per-beat synthesized audio, keyed by beat.id. */
  beatAudio?: Record<string, BeatAudio>;
 };
 // /api/scene — generates the next Scene, given session whose latest
@@ -118,6 +161,8 @@ export type SceneRequest = {
 /** Response body of /api/scene. */
 export type SceneResponse = {
  /** The generated scene. */
  scene: Scene;
  /** NOTE(review): presumably the scene's background image as base64 — confirm against the renderer. */
  imageBase64: string;
  /** Character registry returned with the scene, including any voices provisioned while voicing it. */
  characters: Character[];
  /** Synthesized audio keyed by beat id; may be omitted (the field is optional). */
  beatAudio?: Record<string, BeatAudio>;
 };
 // /api/vision — interprets a background click on the current image and
@@ -141,10 +186,16 @@ export type InsertBeatRequest = {
  freeformAction: string;
 };
-export type InsertBeatResponse = {
+/** Partial beat fields produced by the insert-beat director. */
-  partial: {
+export type InsertBeatPartial = {
-    narration?: string;
+  narration?: string;
-    speaker?: string;
+  speaker?: string;
-    line?: string;
+  line?: string;
-  };
+  lineDelivery?: string;
 };
 export type InsertBeatResponse = {
  partial: InsertBeatPartial;
  characters: Character[];
  audio?: BeatAudio;
 };
@@ -69,6 +69,9 @@ importers:
      '@yume/ai-client':
        specifier: workspace:*
        version: link:../ai-client
      '@yume/tts-client':
        specifier: workspace:*
        version: link:../tts-client
      '@yume/types':
        specifier: workspace:*
        version: link:../types
@@ -76,6 +79,12 @@ importers:
        specifier: ^0.33.5
        version: 0.33.5
  packages/tts-client:
    dependencies:
      '@yume/types':
        specifier: workspace:*
        version: link:../types
  packages/types: {}
 packages:
		`@@ -0,0 +1 @@`
							`export { xiaomiProvision as provisionVoice, xiaomiSynthesize as synthesize } from "./xiaomi";`