feat(tts): Xiaomi MiMo per-beat voice + MOCK_IMAGE testing aid (#3)
Adds optional Xiaomi MiMo TTS layer on top of the scene/beat engine and a MOCK_IMAGE flag for cheap local TTS iteration. - Per-character voice provisioning via MiMo voice design → clone, reference audio persisted in session - Per-line free-form delivery direction (Director writes "鼓起勇气又害羞,声音发颤" style instructions; sent to MiMo's director channel, never read aloud) - Per-beat audio served with the scene response; frontend plays via hidden <audio> with typewriter synced to audio duration; mute toggle persisted via localStorage lazy initializer - Graceful degradation: any TTS step failing → silent beat, game continues - MOCK_IMAGE=true returns a sharp-generated placeholder PNG so local TTS iteration doesn't burn image tokens - Recommended config in .env.example: MiMo Token Plan covers TEXT/VISION/TTS with one key (mimo-v2.5-pro for text, mimo-v2.5 omni for vision, mimo-v2.5-tts for TTS) Squashed from #3: - feat(tts): 小米 MiMo 逐 beat 配音 + 按 session 角色音色 + 自由文本配音指导 - feat(engine): MOCK_IMAGE 占位图便于本地测试 - fix(tts): address Copilot review on PR #3 - fix(tts): Copilot round-2 review feedback Known limitation: Session.characters carries the full WAV reference audio (~200-300KB/character base64) and round-trips through every /api/scene, /api/vision, /api/insert-beat request. This is intrinsic to MiMo's design→clone model (voice identity IS the audio, no server-side voiceId). Fixing requires server-side storage which is out of scope; documented for future hardening. 🤖 Generated with [Claude Code](https://claude.com/claude-code)
This commit is contained in:
@@ -9,6 +9,8 @@ export type Beat = {
|
||||
narration?: string;
|
||||
speaker?: string;
|
||||
line?: string;
|
||||
/** Free-form voice-acting direction for the line, sent to TTS only. Never displayed. */
|
||||
lineDelivery?: string;
|
||||
next: BeatNext;
|
||||
};
|
||||
|
||||
@@ -54,6 +56,30 @@ export type SceneHistoryEntry = {
|
||||
exit?: SceneExit;
|
||||
};
|
||||
|
||||
// ──────────────────────────────────────────────────────────────────────
|
||||
// Characters & voices (TTS)
|
||||
// ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Voice attached to a character for TTS.
 *
 * The voice is carried inline as base64 reference audio, so this payload
 * travels wherever the owning object is serialized.
 */
export type CharacterVoice = {
|
||||
/** TTS backend identifier; "xiaomi" is the only value this type permits. */
provider: "xiaomi";
|
||||
/** Xiaomi MiMo design output stored as reference audio for later clones. */
|
||||
referenceAudioBase64: string;
|
||||
/** NOTE(review): presumably the MIME type of the reference audio above — confirm against the TTS adapter. */
mimeType: string;
|
||||
};
|
||||
|
||||
/** A named character with a voice design description and, optionally, a voice. */
export type Character = {
|
||||
/** NOTE(review): presumably the value matched against a beat's `speaker` — confirm against the engine. */
name: string;
|
||||
/** Free-form voice design description; must begin with explicit gender. */
|
||||
description: string;
|
||||
/** TTS voice for this character, if one is attached. */
voice?: CharacterVoice;
|
||||
};
|
||||
|
||||
/** A single beat's synthesized audio, attached to the response. */
|
||||
export type BeatAudio = {
|
||||
/** Audio payload as a base64 string. */
base64: string;
|
||||
/**
 * NOTE(review): presumably the MIME type of `base64`. CharacterVoice names
 * its comparable field `mimeType`; the two are intentionally left as-is here.
 */
mime: string;
|
||||
};
|
||||
|
||||
// ──────────────────────────────────────────────────────────────────────
|
||||
// Session
|
||||
// ──────────────────────────────────────────────────────────────────────
|
||||
@@ -64,6 +90,8 @@ export type Session = {
|
||||
worldSetting: string;
|
||||
styleGuide: string;
|
||||
history: SceneHistoryEntry[];
|
||||
/** Character registry — accumulates across scenes; voices persist for reuse. */
|
||||
characters: Character[];
|
||||
};
|
||||
|
||||
// ──────────────────────────────────────────────────────────────────────
|
||||
@@ -87,10 +115,21 @@ export type ProviderConfig = {
|
||||
model: string;
|
||||
};
|
||||
|
||||
/** Settings for the TTS backend; referenced by the optional `EngineConfig.tts`. */
export type TtsConfig = {
|
||||
/** NOTE(review): presumably the root URL of the TTS HTTP API — confirm against the adapter. */
baseUrl: string;
|
||||
/** NOTE(review): presumably the credential sent with TTS requests — confirm against the adapter. */
apiKey: string;
|
||||
/** Base model name; adapter derives "-voicedesign" / "-voiceclone" suffixes. */
|
||||
speechModel: string;
|
||||
};
|
||||
|
||||
/**
 * Engine-wide configuration: one ProviderConfig each for text, image and
 * vision, plus optional TTS settings and an image-mocking switch.
 */
export type EngineConfig = {
|
||||
text: ProviderConfig;
|
||||
image: ProviderConfig;
|
||||
vision: ProviderConfig;
|
||||
/** Optional — when missing the game runs silently (no TTS). */
|
||||
tts?: TtsConfig;
|
||||
/** When true the renderer returns a placeholder PNG instead of calling the image API. */
|
||||
mockImage?: boolean;
|
||||
};
|
||||
|
||||
// ──────────────────────────────────────────────────────────────────────
|
||||
@@ -106,6 +145,10 @@ export type StartResponse = {
|
||||
sessionId: string;
|
||||
scene: Scene;
|
||||
imageBase64: string;
|
||||
/** Post-voice character registry (with provisioned voices). */
|
||||
characters: Character[];
|
||||
/** Per-beat synthesized audio, keyed by beat.id. */
|
||||
beatAudio?: Record<string, BeatAudio>;
|
||||
};
|
||||
|
||||
// /api/scene — generates the next Scene, given session whose latest
|
||||
@@ -118,6 +161,8 @@ export type SceneRequest = {
|
||||
/** NOTE(review): presumably the response body of /api/scene — confirm against the route handler. */
export type SceneResponse = {
|
||||
scene: Scene;
|
||||
imageBase64: string;
|
||||
/**
 * NOTE(review): StartResponse documents its same-named field as the
 * post-voice character registry (with provisioned voices); presumably the
 * same applies here — confirm.
 */
characters: Character[];
|
||||
/**
 * NOTE(review): StartResponse documents its same-named field as per-beat
 * synthesized audio keyed by beat.id; presumably the same applies here.
 */
beatAudio?: Record<string, BeatAudio>;
|
||||
};
|
||||
|
||||
// /api/vision — interprets a background click on the current image and
|
||||
@@ -141,10 +186,16 @@ export type InsertBeatRequest = {
|
||||
freeformAction: string;
|
||||
};
|
||||
|
||||
export type InsertBeatResponse = {
|
||||
partial: {
|
||||
narration?: string;
|
||||
speaker?: string;
|
||||
line?: string;
|
||||
};
|
||||
/** Partial beat fields produced by the insert-beat director. */
|
||||
export type InsertBeatPartial = {
|
||||
narration?: string;
|
||||
speaker?: string;
|
||||
line?: string;
|
||||
/**
 * NOTE(review): Beat.lineDelivery is documented as free-form voice-acting
 * direction sent to TTS only and never displayed; presumably identical here.
 */
lineDelivery?: string;
|
||||
};
|
||||
|
||||
/** NOTE(review): presumably the response body of /api/insert-beat — confirm against the route handler. */
export type InsertBeatResponse = {
|
||||
/** Beat fields produced for the inserted beat. */
partial: InsertBeatPartial;
|
||||
/** NOTE(review): presumably the character registry after this request (including any voices) — confirm. */
characters: Character[];
|
||||
/** Audio for the inserted beat; optional, so callers must handle its absence. */
audio?: BeatAudio;
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user