feat(tts): Xiaomi MiMo per-beat voice + MOCK_IMAGE testing aid (#3)
Adds optional Xiaomi MiMo TTS layer on top of the scene/beat engine and a MOCK_IMAGE flag for cheap local TTS iteration.

- Per-character voice provisioning via MiMo voice design → clone, reference audio persisted in session
- Per-line free-form delivery direction (Director writes "鼓起勇气又害羞,声音发颤" style instructions; sent to MiMo's director channel, never read aloud)
- Per-beat audio served with the scene response; frontend plays via hidden <audio> with typewriter synced to audio duration; mute toggle persisted via localStorage lazy initializer
- Graceful degradation: any TTS step failing → silent beat, game continues
- MOCK_IMAGE=true returns a sharp-generated placeholder PNG so local TTS iteration doesn't burn image tokens
- Recommended config in .env.example: MiMo Token Plan covers TEXT/VISION/TTS with one key (mimo-v2.5-pro for text, mimo-v2.5 omni for vision, mimo-v2.5-tts for TTS)

Squashed from #3:

- feat(tts): 小米 MiMo 逐 beat 配音 + 按 session 角色音色 + 自由文本配音指导
- feat(engine): MOCK_IMAGE 占位图便于本地测试
- fix(tts): address Copilot review on PR #3
- fix(tts): Copilot round-2 review feedback

Known limitation: Session.characters carries the full WAV reference audio (~200-300KB/character base64) and round-trips through every /api/scene, /api/vision, /api/insert-beat request. This is intrinsic to MiMo's design→clone model (voice identity IS the audio, no server-side voiceId). Fixing requires server-side storage which is out of scope; documented for future hardening.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
This commit is contained in:
@@ -0,0 +1,17 @@
|
||||
{
|
||||
"name": "@yume/tts-client",
|
||||
"version": "0.1.0",
|
||||
"private": true,
|
||||
"type": "module",
|
||||
"main": "./src/index.ts",
|
||||
"types": "./src/index.ts",
|
||||
"exports": {
|
||||
".": "./src/index.ts"
|
||||
},
|
||||
"scripts": {
|
||||
"typecheck": "tsc --noEmit"
|
||||
},
|
||||
"dependencies": {
|
||||
"@yume/types": "workspace:*"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1 @@
|
||||
// Package entry point: re-exports the two functions implemented in ./xiaomi
// under the names `provisionVoice` and `synthesize`.
export { xiaomiProvision as provisionVoice, xiaomiSynthesize as synthesize } from "./xiaomi";
|
||||
@@ -0,0 +1,113 @@
|
||||
import type { CharacterVoice, TtsConfig } from "@yume/types";
|
||||
|
||||
// Xiaomi MiMo currently outputs wav / pcm16 only (mp3 not supported for output).
|
||||
// The reference clip we persist is therefore wav. Kept as a single switch so we
|
||||
// can flip to mp3 the day the API supports it (OUTPUT_MIME below must be
// updated together with it).
|
||||
// Sent as `audio.format` in both the voice-design and voice-clone request bodies.
const OUTPUT_FORMAT = "wav";
|
||||
// MIME type attached to the audio returned by both exported functions; it has
// to describe the encoding selected by OUTPUT_FORMAT.
const OUTPUT_MIME = "audio/wav";
|
||||
|
||||
/**
 * Builds the HTTP headers attached to the Xiaomi MiMo requests issued from this module.
 *
 * @param cfg - TTS configuration; only `apiKey` is read here.
 * @returns Headers declaring a JSON body and carrying the key in the `api-key` header.
 */
function buildHeaders(cfg: TtsConfig): HeadersInit {
  const headers: Record<string, string> = {};
  headers["Content-Type"] = "application/json";
  headers["api-key"] = cfg.apiKey;
  return headers;
}
|
||||
|
||||
/**
 * Concatenates an API base URL and an endpoint path.
 *
 * Every trailing slash on `baseUrl` is dropped before `path` is appended, so a
 * base configured as `https://host/v1`, `https://host/v1/` or `https://host/v1//`
 * all yield the same endpoint instead of one containing a doubled slash.
 *
 * @param baseUrl - API root, with or without trailing slashes.
 * @param path - Endpoint path, appended verbatim (callers pass it with its leading slash).
 * @returns The combined URL.
 */
function joinUrl(baseUrl: string, path: string): string {
  let end = baseUrl.length;
  // Manual scan rather than /\/+$/: linear time, no regex backtracking.
  while (end > 0 && baseUrl[end - 1] === "/") {
    end -= 1;
  }
  return `${baseUrl.slice(0, end)}${path}`;
}
|
||||
|
||||
/**
 * Derives the model identifier used for the voice-design request.
 *
 * @param cfg - TTS configuration; only `speechModel` is read.
 * @returns `speechModel` with the `-voicedesign` suffix appended.
 */
function designModel(cfg: TtsConfig): string {
  const suffix = "-voicedesign";
  const { speechModel } = cfg;
  return `${speechModel}${suffix}`;
}
|
||||
|
||||
/**
 * Derives the model identifier used for the voice-clone (synthesis) request.
 *
 * @param cfg - TTS configuration; only `speechModel` is read.
 * @returns `speechModel` with the `-voiceclone` suffix appended.
 */
function cloneModel(cfg: TtsConfig): string {
  const suffix = "-voiceclone";
  const { speechModel } = cfg;
  return `${speechModel}${suffix}`;
}
|
||||
|
||||
/**
 * The parts of the `/chat/completions` JSON body that this module reads.
 * Every field is optional: the payload is an unvalidated network response.
 */
type ChatAudioResponse = {
  /** Completion choices; the audio payload is read from the first entry. */
  choices?: {
    message?: {
      audio?: {
        /** Audio payload; the callers in this file treat it as base64 text. */
        data?: string;
      };
    };
  }[];
  /** Read first as the failure description when no audio is present. */
  error?: { message?: string };
  /** Read second as the failure description when no audio is present. */
  message?: string;
};
|
||||
|
||||
/**
 * Pulls the audio payload out of a chat-completions response body.
 *
 * The body is an unvalidated network payload, so it is read defensively: a
 * literal `null` body, a missing/empty payload, or a payload that is not a
 * string are all reported through the same descriptive error.
 *
 * @param json - Parsed response body.
 * @param where - Label of the calling step; interpolated into the error message.
 * @returns The non-empty string found at `choices[0].message.audio.data`.
 * @throws Error when no usable audio is present. The message carries the
 *   service-reported `error.message`, else the top-level `message`, else the
 *   serialized body, truncated to 300 characters.
 */
function extractAudio(json: ChatAudioResponse, where: string): string {
  // Optional-chain from the root: a `null` body must not surface as a TypeError.
  const data: unknown = json?.choices?.[0]?.message?.audio?.data;
  if (typeof data === "string" && data.length > 0) {
    return data;
  }

  // Only string descriptions are usable; anything else would break `.slice` below.
  const candidates: unknown[] = [json?.error?.message, json?.message];
  const reported = candidates.find((c): c is string => typeof c === "string");
  // JSON.stringify yields undefined for an `undefined` input at runtime; fall back to String().
  const err = reported ?? JSON.stringify(json) ?? String(json);
  throw new Error(`Xiaomi ${where} returned no audio: ${err.slice(0, 300)}`);
}
|
||||
|
||||
/**
 * Provisions a character voice through the voice-design model.
 *
 * Posts `description` as the `user` message together with a fixed sample
 * sentence as the `assistant` message, and wraps the audio found in the
 * response as the character's reference clip.
 * NOTE(review): presumably the fixed sentence is what the sample clip speaks —
 * confirm against the MiMo API documentation.
 *
 * @param cfg - TTS configuration (base URL, API key, speech model).
 * @param description - Free-form description of the desired voice.
 * @returns The voice record holding the reference audio and its MIME type.
 * @throws Error on a non-2xx response (status plus the first 300 characters of
 *   the body), or when the response carries no audio.
 */
export async function xiaomiProvision(
  cfg: TtsConfig,
  description: string,
): Promise<CharacterVoice> {
  const payload = {
    model: designModel(cfg),
    messages: [
      { role: "user", content: description },
      { role: "assistant", content: "你好,这是音色试听样本。" },
    ],
    audio: { format: OUTPUT_FORMAT },
  };

  const response = await fetch(joinUrl(cfg.baseUrl, "/chat/completions"), {
    method: "POST",
    headers: buildHeaders(cfg),
    body: JSON.stringify(payload),
  });

  if (response.ok) {
    const parsed = (await response.json()) as ChatAudioResponse;
    return {
      provider: "xiaomi",
      referenceAudioBase64: extractAudio(parsed, "voicedesign"),
      mimeType: OUTPUT_MIME,
    };
  }

  // Failure path: surface the status and a bounded excerpt of the body.
  const detail = await response.text();
  throw new Error(`Xiaomi voicedesign ${response.status}: ${detail.slice(0, 300)}`);
}
|
||||
|
||||
/**
 * Synthesizes one line of speech in a previously provisioned voice via the
 * voice-clone model.
 *
 * @param cfg - TTS configuration (base URL, API key, speech model).
 * @param voice - Voice record; its reference audio is sent as a base64 data URI.
 * @param text - The line to be spoken.
 * @param delivery - Optional free-form performance direction; trimmed, and sent
 *   as an empty string when omitted.
 * @returns The audio payload from the response and its MIME type.
 * @throws Error on a non-2xx response (status plus the first 300 characters of
 *   the body), or when the response carries no audio.
 */
export async function xiaomiSynthesize(
  cfg: TtsConfig,
  voice: CharacterVoice,
  text: string,
  delivery?: string,
): Promise<{ audioBase64: string; mimeType: string }> {
  // Direction travels in the `user` (director) message so it steers the
  // performance without being voiced; the `assistant` message holds only the
  // words to be spoken.
  const direction = delivery?.trim() ?? "";
  const referenceClip = `data:${voice.mimeType};base64,${voice.referenceAudioBase64}`;

  const payload = {
    model: cloneModel(cfg),
    messages: [
      { role: "user", content: direction },
      { role: "assistant", content: text },
    ],
    audio: { format: OUTPUT_FORMAT, voice: referenceClip },
  };

  const response = await fetch(joinUrl(cfg.baseUrl, "/chat/completions"), {
    method: "POST",
    headers: buildHeaders(cfg),
    body: JSON.stringify(payload),
  });

  if (response.ok) {
    const parsed = (await response.json()) as ChatAudioResponse;
    return { audioBase64: extractAudio(parsed, "voiceclone"), mimeType: OUTPUT_MIME };
  }

  // Failure path: surface the status and a bounded excerpt of the body.
  const detail = await response.text();
  throw new Error(`Xiaomi voiceclone ${response.status}: ${detail.slice(0, 300)}`);
}
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"extends": "../../tsconfig.base.json",
|
||||
"compilerOptions": {
|
||||
"noEmit": true
|
||||
},
|
||||
"include": ["src/**/*"]
|
||||
}
|
||||
Reference in New Issue
Block a user