feat(tts): Xiaomi MiMo per-beat voice + MOCK_IMAGE testing aid (#3)

Adds an optional Xiaomi MiMo TTS layer on top of the scene/beat engine, plus a MOCK_IMAGE flag for cheap local TTS iteration.

- Per-character voice provisioning via MiMo voice design → clone, reference audio persisted in session
- Per-line free-form delivery direction (Director writes "鼓起勇气又害羞,声音发颤" style instructions; sent to MiMo's director channel, never read aloud)
- Per-beat audio served with the scene response; frontend plays via hidden <audio> with typewriter synced to audio duration; mute toggle persisted via localStorage lazy initializer
- Graceful degradation: any TTS step failing → silent beat, game continues
- MOCK_IMAGE=true returns a sharp-generated placeholder PNG so local TTS iteration doesn't burn image tokens
- Recommended config in .env.example: MiMo Token Plan covers TEXT/VISION/TTS with one key (mimo-v2.5-pro for text, mimo-v2.5 omni for vision, mimo-v2.5-tts for TTS)

Squashed from #3:
- feat(tts): 小米 MiMo 逐 beat 配音 + 按 session 角色音色 + 自由文本配音指导
- feat(engine): MOCK_IMAGE 占位图便于本地测试
- fix(tts): address Copilot review on PR #3
- fix(tts): Copilot round-2 review feedback

Known limitation: Session.characters carries the full WAV reference audio (~200-300KB/character base64) and round-trips through every /api/scene, /api/vision, /api/insert-beat request. This is intrinsic to MiMo's design→clone model (voice identity IS the audio, no server-side voiceId). Fixing this requires server-side storage, which is out of scope; the limitation is documented here for future hardening.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
This commit is contained in:
Zonghao Yuan
2026-05-28 20:45:21 +08:00
committed by GitHub
parent d1f13d51a3
commit fcd4e6c1ab
18 changed files with 787 additions and 62 deletions
+40 -13
View File
@@ -4,6 +4,8 @@ import type {
BeatChoice,
BeatChoiceEffect,
BeatNext,
Character,
InsertBeatPartial,
ProviderConfig,
Scene,
Session,
@@ -43,13 +45,20 @@ type RawBeat = {
narration?: string;
speaker?: string;
line?: string;
lineDelivery?: string;
next?: RawNext;
};
/**
 * Loosely-typed character update entry. Both fields are optional at this
 * stage; coerceCharacterUpdates trims them and drops any entry that lacks a
 * non-empty name or description.
 * NOTE(review): presumably this mirrors the model's raw JSON output — confirm
 * against the prompt/schema.
 */
type RawCharacterUpdate = {
name?: string;
description?: string;
};
/**
 * Loosely-typed scene payload in which every field is optional.
 * NOTE(review): presumably the shape the model's JSON reply is parsed into
 * before coercion — confirm against the parse call in directScene.
 */
type RawScene = {
scenePrompt?: string;
entryBeatId?: string;
beats?: RawBeat[];
// Optional character entries, typed as the input coerceCharacterUpdates accepts.
characterUpdates?: RawCharacterUpdate[];
};
function coerceEffect(raw: RawEffect | undefined): BeatChoiceEffect {
@@ -90,15 +99,28 @@ function coerceBeat(raw: RawBeat, idx: number, totalBeats: number): Beat {
// last/dangling continue into a real scene-change exit so the player can
// never get stuck self-looping on it.
const fallback = idx + 1 < totalBeats ? `b${idx + 2}` : "";
const line = raw.line?.trim() || undefined;
return {
id,
narration: raw.narration?.trim() || undefined,
speaker: raw.speaker?.trim() || undefined,
line: raw.line?.trim() || undefined,
line,
// lineDelivery only meaningful when there is a line to deliver.
lineDelivery: line ? raw.lineDelivery?.trim() || undefined : undefined,
next: coerceNext(raw.next, fallback),
};
}
/**
 * Normalizes raw character updates into well-formed `Character` entries.
 *
 * The input is treated as untrusted at runtime: despite its static type it may
 * contain `null`, primitives, or non-string fields. Such entries are dropped
 * rather than allowed to throw, so one malformed element cannot fail the call.
 *
 * @param raw - Candidate list of character updates; anything that is not an
 *   array yields an empty result.
 * @returns Entries whose `name` and `description` are both non-empty strings
 *   after trimming, in their original order.
 */
function coerceCharacterUpdates(raw: RawCharacterUpdate[] | undefined): Character[] {
  if (!Array.isArray(raw)) return [];

  // Widen to unknown so every element is narrowed explicitly before use.
  const entries: readonly unknown[] = raw;
  const updates: Character[] = [];

  for (const entry of entries) {
    // `null` and primitives have no usable fields; skip instead of throwing.
    if (typeof entry !== "object" || entry === null) continue;

    const { name, description } = entry as { name?: unknown; description?: unknown };
    // Non-string fields are treated as missing (calling .trim() on them would throw).
    const cleanName = typeof name === "string" ? name.trim() : "";
    const cleanDescription = typeof description === "string" ? description.trim() : "";

    if (cleanName && cleanDescription) {
      updates.push({ name: cleanName, description: cleanDescription });
    }
  }

  return updates;
}
const FALLBACK_SEED = "故事继续推进";
function fallbackExitChoice(beatId: string): BeatChoice {
@@ -230,10 +252,15 @@ function newSceneId(): string {
// Called both on real scene transitions AND on speculative prefetch.
// ──────────────────────────────────────────────────────────────────────
/** Value resolved by directScene: the assembled scene plus any character updates. */
export type SceneResult = {
// The scene itself (id, scenePrompt, beats, entryBeatId).
scene: Scene;
// Character entries that passed coerceCharacterUpdates (non-empty name and description).
characterUpdates: Character[];
};
export async function directScene(
config: ProviderConfig,
session: Session,
): Promise<Scene> {
): Promise<SceneResult> {
const raw = await chat(
config,
[
@@ -264,10 +291,13 @@ export async function directScene(
: beats[0]!.id;
return {
id: newSceneId(),
scenePrompt: parsed.scenePrompt?.trim() || "an empty scene",
beats,
entryBeatId,
scene: {
id: newSceneId(),
scenePrompt: parsed.scenePrompt?.trim() || "an empty scene",
beats,
entryBeatId,
},
characterUpdates: coerceCharacterUpdates(parsed.characterUpdates),
};
}
@@ -280,7 +310,7 @@ export async function directInsertBeat(
config: ProviderConfig,
session: Session,
freeformAction: string,
): Promise<{ narration?: string; speaker?: string; line?: string }> {
): Promise<InsertBeatPartial> {
const raw = await chat(
config,
[
@@ -293,15 +323,12 @@ export async function directInsertBeat(
{ temperature: 0.9, responseFormat: "json_object" },
);
const parsed = parseJsonLoose<{
narration?: string;
speaker?: string;
line?: string;
}>(raw);
const parsed = parseJsonLoose<InsertBeatPartial>(raw);
const narration = parsed.narration?.trim() || undefined;
const speaker = parsed.speaker?.trim() || undefined;
const line = parsed.line?.trim() || undefined;
const lineDelivery = line ? parsed.lineDelivery?.trim() || undefined : undefined;
// If the model returned nothing usable, supply a fallback narration so the
// frontend doesn't append a silent empty beat that renders no dialogue —
@@ -309,5 +336,5 @@ export async function directInsertBeat(
if (!narration && !speaker && !line) {
return { narration: "(你停下脚步,环视片刻。)" };
}
return { narration, speaker, line };
return { narration, speaker, line, lineDelivery };
}