feat(tts): Xiaomi MiMo per-beat voice + MOCK_IMAGE testing aid (#3)
Adds optional Xiaomi MiMo TTS layer on top of the scene/beat engine and a MOCK_IMAGE flag for cheap local TTS iteration. - Per-character voice provisioning via MiMo voice design → clone, reference audio persisted in session - Per-line free-form delivery direction (Director writes "鼓起勇气又害羞,声音发颤" style instructions; sent to MiMo's director channel, never read aloud) - Per-beat audio served with the scene response; frontend plays via hidden <audio> with typewriter synced to audio duration; mute toggle persisted via localStorage lazy initializer - Graceful degradation: any TTS step failing → silent beat, game continues - MOCK_IMAGE=true returns a sharp-generated placeholder PNG so local TTS iteration doesn't burn image tokens - Recommended config in .env.example: MiMo Token Plan covers TEXT/VISION/TTS with one key (mimo-v2.5-pro for text, mimo-v2.5 omni for vision, mimo-v2.5-tts for TTS) Squashed from #3: - feat(tts): 小米 MiMo 逐 beat 配音 + 按 session 角色音色 + 自由文本配音指导 - feat(engine): MOCK_IMAGE 占位图便于本地测试 - fix(tts): address Copilot review on PR #3 - fix(tts): Copilot round-2 review feedback Known limitation: Session.characters carries the full WAV reference audio (~200-300KB/character base64) and round-trips through every /api/scene, /api/vision, /api/insert-beat request. This is intrinsic to MiMo's design→clone model (voice identity IS the audio, no server-side voiceId). Fixing requires server-side storage which is out of scope; documented for future hardening. 🤖 Generated with [Claude Code](https://claude.com/claude-code)
This commit is contained in:
@@ -13,6 +13,7 @@
|
||||
},
|
||||
"dependencies": {
|
||||
"@yume/ai-client": "workspace:*",
|
||||
"@yume/tts-client": "workspace:*",
|
||||
"@yume/types": "workspace:*",
|
||||
"sharp": "^0.33.5"
|
||||
}
|
||||
|
||||
@@ -4,6 +4,8 @@ import type {
|
||||
BeatChoice,
|
||||
BeatChoiceEffect,
|
||||
BeatNext,
|
||||
Character,
|
||||
InsertBeatPartial,
|
||||
ProviderConfig,
|
||||
Scene,
|
||||
Session,
|
||||
@@ -43,13 +45,20 @@ type RawBeat = {
|
||||
narration?: string;
|
||||
speaker?: string;
|
||||
line?: string;
|
||||
lineDelivery?: string;
|
||||
next?: RawNext;
|
||||
};
|
||||
|
||||
type RawCharacterUpdate = {
|
||||
name?: string;
|
||||
description?: string;
|
||||
};
|
||||
|
||||
type RawScene = {
|
||||
scenePrompt?: string;
|
||||
entryBeatId?: string;
|
||||
beats?: RawBeat[];
|
||||
characterUpdates?: RawCharacterUpdate[];
|
||||
};
|
||||
|
||||
function coerceEffect(raw: RawEffect | undefined): BeatChoiceEffect {
|
||||
@@ -90,15 +99,28 @@ function coerceBeat(raw: RawBeat, idx: number, totalBeats: number): Beat {
|
||||
// last/dangling continue into a real scene-change exit so the player can
|
||||
// never get stuck self-looping on it.
|
||||
const fallback = idx + 1 < totalBeats ? `b${idx + 2}` : "";
|
||||
const line = raw.line?.trim() || undefined;
|
||||
return {
|
||||
id,
|
||||
narration: raw.narration?.trim() || undefined,
|
||||
speaker: raw.speaker?.trim() || undefined,
|
||||
line: raw.line?.trim() || undefined,
|
||||
line,
|
||||
// lineDelivery only meaningful when there is a line to deliver.
|
||||
lineDelivery: line ? raw.lineDelivery?.trim() || undefined : undefined,
|
||||
next: coerceNext(raw.next, fallback),
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Normalizes the director's raw `characterUpdates` array into clean
 * `Character` entries.
 *
 * The input comes straight from loosely-parsed model JSON, so nothing about
 * its shape is trusted: a non-array yields `[]`, and any entry that is not an
 * object, or whose `name` / `description` is not a string, is skipped instead
 * of throwing (a single malformed entry must not abort the whole scene).
 *
 * @param raw - The `characterUpdates` value as parsed from the model output.
 * @returns Entries with trimmed, non-empty `name` and `description`, in their
 *   original order.
 */
function coerceCharacterUpdates(raw: RawCharacterUpdate[] | undefined): Character[] {
  if (!Array.isArray(raw)) return [];

  // Widen to unknown: the declared element type is only what we *hope* the
  // model produced.
  const entries: readonly unknown[] = raw;
  const updates: Character[] = [];

  for (const entry of entries) {
    if (typeof entry !== "object" || entry === null) continue;

    const { name, description } = entry as { name?: unknown; description?: unknown };
    if (typeof name !== "string" || typeof description !== "string") continue;

    const cleanName = name.trim();
    const cleanDescription = description.trim();
    // Both fields are required: a voice cannot be designed without a
    // description, and an unnamed character can never be matched to a speaker.
    if (cleanName && cleanDescription) {
      updates.push({ name: cleanName, description: cleanDescription });
    }
  }

  return updates;
}
|
||||
|
||||
const FALLBACK_SEED = "故事继续推进";
|
||||
|
||||
function fallbackExitChoice(beatId: string): BeatChoice {
|
||||
@@ -230,10 +252,15 @@ function newSceneId(): string {
|
||||
// Called both on real scene transitions AND on speculative prefetch.
|
||||
// ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
export type SceneResult = {
|
||||
scene: Scene;
|
||||
characterUpdates: Character[];
|
||||
};
|
||||
|
||||
export async function directScene(
|
||||
config: ProviderConfig,
|
||||
session: Session,
|
||||
): Promise<Scene> {
|
||||
): Promise<SceneResult> {
|
||||
const raw = await chat(
|
||||
config,
|
||||
[
|
||||
@@ -264,10 +291,13 @@ export async function directScene(
|
||||
: beats[0]!.id;
|
||||
|
||||
return {
|
||||
id: newSceneId(),
|
||||
scenePrompt: parsed.scenePrompt?.trim() || "an empty scene",
|
||||
beats,
|
||||
entryBeatId,
|
||||
scene: {
|
||||
id: newSceneId(),
|
||||
scenePrompt: parsed.scenePrompt?.trim() || "an empty scene",
|
||||
beats,
|
||||
entryBeatId,
|
||||
},
|
||||
characterUpdates: coerceCharacterUpdates(parsed.characterUpdates),
|
||||
};
|
||||
}
|
||||
|
||||
@@ -280,7 +310,7 @@ export async function directInsertBeat(
|
||||
config: ProviderConfig,
|
||||
session: Session,
|
||||
freeformAction: string,
|
||||
): Promise<{ narration?: string; speaker?: string; line?: string }> {
|
||||
): Promise<InsertBeatPartial> {
|
||||
const raw = await chat(
|
||||
config,
|
||||
[
|
||||
@@ -293,15 +323,12 @@ export async function directInsertBeat(
|
||||
{ temperature: 0.9, responseFormat: "json_object" },
|
||||
);
|
||||
|
||||
const parsed = parseJsonLoose<{
|
||||
narration?: string;
|
||||
speaker?: string;
|
||||
line?: string;
|
||||
}>(raw);
|
||||
const parsed = parseJsonLoose<InsertBeatPartial>(raw);
|
||||
|
||||
const narration = parsed.narration?.trim() || undefined;
|
||||
const speaker = parsed.speaker?.trim() || undefined;
|
||||
const line = parsed.line?.trim() || undefined;
|
||||
const lineDelivery = line ? parsed.lineDelivery?.trim() || undefined : undefined;
|
||||
|
||||
// If the model returned nothing usable, supply a fallback narration so the
|
||||
// frontend doesn't append a silent empty beat that renders no dialogue —
|
||||
@@ -309,5 +336,5 @@ export async function directInsertBeat(
|
||||
if (!narration && !speaker && !line) {
|
||||
return { narration: "(你停下脚步,环视片刻。)" };
|
||||
}
|
||||
return { narration, speaker, line };
|
||||
return { narration, speaker, line, lineDelivery };
|
||||
}
|
||||
|
||||
@@ -5,4 +5,7 @@ export {
|
||||
requestInsertBeat,
|
||||
} from "./orchestrator";
|
||||
export { annotateClick } from "./annotate";
|
||||
export { voiceBeat, voiceScene } from "./voice";
|
||||
export type { SceneResult } from "./director";
|
||||
export type { InsertBeatPartial } from "@yume/types";
|
||||
export * from "./prompts";
|
||||
|
||||
@@ -0,0 +1,25 @@
|
||||
import sharp from "sharp";
|
||||
|
||||
// Memoized in-flight / completed render. Caching the promise (rather than the
// resolved string) means concurrent first calls share a single sharp render
// instead of each rasterizing the SVG.
let cached: Promise<string> | undefined;

// Rasterizes the static placeholder SVG to PNG and returns it base64-encoded.
async function renderMockImagePng(): Promise<string> {
  const W = 1792;
  const H = 1024;
  const svg = `<svg xmlns="http://www.w3.org/2000/svg" width="${W}" height="${H}">
  <rect width="${W}" height="${H}" fill="#161109"/>
  <rect x="2" y="2" width="${W - 4}" height="${H - 4}" fill="none"
        stroke="#5a4628" stroke-width="3" stroke-dasharray="14 10"/>
  <text x="50%" y="45%" fill="#b88f4a" font-family="Georgia, serif"
        font-size="72" letter-spacing="6" text-anchor="middle">MOCK IMAGE</text>
  <text x="50%" y="53%" fill="#6e5430" font-family="Georgia, serif"
        font-size="30" letter-spacing="3" text-anchor="middle">TTS TEST — image generation skipped</text>
</svg>`;

  const png = await sharp(Buffer.from(svg)).png().toBuffer();
  return png.toString("base64");
}

/**
 * Returns a static 1792×1024 placeholder PNG (base64, no data-URL prefix).
 *
 * Used when MOCK_IMAGE=true so the TTS path can be exercised without paying
 * for image generation. The image is rendered at most once per process; later
 * calls (including calls made while the first render is still running) reuse
 * the same result.
 *
 * @returns The base64-encoded PNG bytes.
 * @throws Whatever the underlying render rejects with. A failed render is not
 *   memoized, so the next call retries.
 */
export async function mockImageBase64(): Promise<string> {
  if (!cached) {
    cached = renderMockImagePng().catch((err: unknown) => {
      // Do not pin a rejected promise in the cache forever.
      cached = undefined;
      throw err;
    });
  }
  return cached;
}
|
||||
@@ -1,7 +1,10 @@
|
||||
import type {
|
||||
BeatAudio,
|
||||
Character,
|
||||
EngineConfig,
|
||||
InsertBeatRequest,
|
||||
InsertBeatResponse,
|
||||
Scene,
|
||||
SceneRequest,
|
||||
SceneResponse,
|
||||
Session,
|
||||
@@ -12,15 +15,55 @@ import type {
|
||||
} from "@yume/types";
|
||||
import { annotateClick } from "./annotate";
|
||||
import { directInsertBeat, directScene } from "./director";
|
||||
import { mockImageBase64 } from "./mockImage";
|
||||
import { render } from "./renderer";
|
||||
import { interpret } from "./vision";
|
||||
import { voiceBeat, voiceScene } from "./voice";
|
||||
|
||||
/**
 * Builds a session id of the form `s_<epoch-millis>_<random base-36 suffix>`.
 * The suffix is up to six characters taken from `Math.random()`, so ids are
 * not cryptographically unpredictable.
 */
function newSessionId(): string {
  const stamp = Date.now();
  const suffix = Math.random().toString(36).substring(2, 8);
  return ["s", stamp, suffix].join("_");
}
|
||||
|
||||
/**
 * Folds `updates` into the `existing` character registry, keyed by name.
 *
 * An update replaces the entry with the same name, except that a voice
 * already attached to the previous entry is carried over onto the new one, so
 * revising a description never drops a voice the player has already heard.
 * Names not seen before are appended after the existing entries.
 *
 * @param existing - Current registry; returned as-is when there are no updates.
 * @param updates - Entries to add or overwrite.
 * @returns The merged registry (a new array unless `updates` is empty).
 */
function mergeCharacters(existing: Character[], updates: Character[]): Character[] {
  if (!updates.length) return existing;

  const registry = new Map<string, Character>();
  for (const known of existing) {
    registry.set(known.name, known);
  }

  for (const incoming of updates) {
    const keptVoice = registry.get(incoming.name)?.voice;
    const merged = keptVoice ? { ...incoming, voice: keptVoice } : incoming;
    registry.set(incoming.name, merged);
  }

  return [...registry.values()];
}
|
||||
|
||||
/**
 * Produces the scene image as base64.
 *
 * When `config.mockImage` is set, the memoized placeholder from
 * `mockImageBase64()` is returned and the image provider is never called;
 * otherwise the scene is rendered through `render` with the image provider
 * config and the given style guide.
 */
async function renderImage(
  config: EngineConfig,
  scene: Scene,
  styleGuide: string,
): Promise<string> {
  return config.mockImage
    ? mockImageBase64()
    : render(config.image, scene, styleGuide);
}
|
||||
|
||||
/**
 * Voices a scene when TTS is configured.
 *
 * Without `config.tts` this is a no-op that echoes the session's characters.
 * Otherwise it delegates to `voiceScene` and returns the resulting registry;
 * `beatAudio` is reported as `undefined` when no beat produced audio, so
 * callers never receive an empty map.
 */
async function runVoiceScene(
  config: EngineConfig,
  session: Session,
  scene: Scene,
): Promise<{
  beatAudio?: Record<string, BeatAudio>;
  characters: Character[];
}> {
  const tts = config.tts;
  if (!tts) {
    return { characters: session.characters };
  }

  const { beatAudio, characters } = await voiceScene(tts, session, scene);
  const hasAudio = Object.keys(beatAudio).length > 0;
  return {
    beatAudio: hasAudio ? beatAudio : undefined,
    characters,
  };
}
|
||||
|
||||
// ──────────────────────────────────────────────────────────────────────
|
||||
// startSession — first scene + image
|
||||
// startSession — first scene + image + per-beat voice
|
||||
// ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
export async function startSession(
|
||||
@@ -33,31 +76,55 @@ export async function startSession(
|
||||
worldSetting: req.worldSetting.trim(),
|
||||
styleGuide: req.styleGuide.trim(),
|
||||
history: [],
|
||||
characters: [],
|
||||
};
|
||||
|
||||
const scene = await directScene(config.text, session);
|
||||
const imageBase64 = await render(config.image, scene, session.styleGuide);
|
||||
const { scene, characterUpdates } = await directScene(config.text, session);
|
||||
const preVoiceSession: Session = {
|
||||
...session,
|
||||
characters: mergeCharacters(session.characters, characterUpdates),
|
||||
};
|
||||
|
||||
const [imageBase64, voiceRes] = await Promise.all([
|
||||
renderImage(config, scene, preVoiceSession.styleGuide),
|
||||
runVoiceScene(config, preVoiceSession, scene),
|
||||
]);
|
||||
|
||||
return {
|
||||
sessionId: session.id,
|
||||
scene,
|
||||
imageBase64,
|
||||
characters: voiceRes.characters,
|
||||
beatAudio: voiceRes.beatAudio,
|
||||
};
|
||||
}
|
||||
|
||||
// ──────────────────────────────────────────────────────────────────────
|
||||
// requestScene — generate the NEXT scene + image.
|
||||
// Frontend passes a session whose latest history entry has `exit` set.
|
||||
// Also used for prefetch speculation (frontend synthesizes the exit).
|
||||
// requestScene — generate the NEXT scene + image + per-beat voice.
|
||||
// Used both on real scene transitions and on speculative prefetch.
|
||||
// ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
export async function requestScene(
|
||||
config: EngineConfig,
|
||||
req: SceneRequest,
|
||||
): Promise<SceneResponse> {
|
||||
const scene = await directScene(config.text, req.session);
|
||||
const imageBase64 = await render(config.image, scene, req.session.styleGuide);
|
||||
return { scene, imageBase64 };
|
||||
const { scene, characterUpdates } = await directScene(config.text, req.session);
|
||||
const preVoiceSession: Session = {
|
||||
...req.session,
|
||||
characters: mergeCharacters(req.session.characters, characterUpdates),
|
||||
};
|
||||
|
||||
const [imageBase64, voiceRes] = await Promise.all([
|
||||
renderImage(config, scene, preVoiceSession.styleGuide),
|
||||
runVoiceScene(config, preVoiceSession, scene),
|
||||
]);
|
||||
|
||||
return {
|
||||
scene,
|
||||
imageBase64,
|
||||
characters: voiceRes.characters,
|
||||
beatAudio: voiceRes.beatAudio,
|
||||
};
|
||||
}
|
||||
|
||||
// ──────────────────────────────────────────────────────────────────────
|
||||
@@ -75,6 +142,7 @@ export async function visionDecide(
|
||||
|
||||
// ──────────────────────────────────────────────────────────────────────
|
||||
// requestInsertBeat — generates a transient in-scene beat (no image regen)
|
||||
// and voices the line if any.
|
||||
// ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
export async function requestInsertBeat(
|
||||
@@ -86,5 +154,49 @@ export async function requestInsertBeat(
|
||||
req.session,
|
||||
req.freeformAction,
|
||||
);
|
||||
return { partial };
|
||||
|
||||
// INSERT_BEAT prompt forbids new characters — but if the director violates
|
||||
// it, voiceBeat's name-inferred fallback would silently provision and persist
|
||||
// the hallucinated speaker. Strip the speaker attribution and promote the
|
||||
// line into narration so the player still sees the text (the client only
|
||||
// renders `line` when there is a `speaker`).
|
||||
if (
|
||||
partial.speaker &&
|
||||
!req.session.characters.some((c) => c.name === partial.speaker)
|
||||
) {
|
||||
console.warn(
|
||||
`[insert-beat] unregistered speaker "${partial.speaker}" ignored`,
|
||||
);
|
||||
const promotedNarration =
|
||||
[partial.narration, partial.line].filter(Boolean).join("\n") || undefined;
|
||||
return {
|
||||
partial: {
|
||||
narration: promotedNarration,
|
||||
speaker: undefined,
|
||||
line: undefined,
|
||||
lineDelivery: undefined,
|
||||
},
|
||||
characters: req.session.characters,
|
||||
};
|
||||
}
|
||||
|
||||
if (!config.tts) {
|
||||
// Always echo characters so callers don't need a ?? fallback.
|
||||
return { partial, characters: req.session.characters };
|
||||
}
|
||||
|
||||
// Insert beats stay in-scene and (per the INSERT_BEAT prompt) reuse the
|
||||
// registered cast, so we voice against the existing character set.
|
||||
const voiceRes = await voiceBeat(
|
||||
config.tts,
|
||||
req.session,
|
||||
req.session.characters,
|
||||
partial,
|
||||
);
|
||||
|
||||
return {
|
||||
partial,
|
||||
characters: voiceRes.characters,
|
||||
audio: voiceRes.audio,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -4,11 +4,12 @@ import type { Scene, Session } from "@yume/types";
|
||||
// Director — emits one Scene (background + a graph of beats) at a time.
|
||||
// ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
export const DIRECTOR_SYSTEM = `你是一个交互视觉小说的「场景导演」。每次基于世界观、画风、玩家历史,输出**一个完整的场景**。
|
||||
export const DIRECTOR_SYSTEM = `你是一个交互视觉小说的「场景导演」。每次基于世界观、画风、玩家历史、已登记角色,输出**一个完整的场景**,并为每句台词配上细腻的配音导演指令。
|
||||
|
||||
一个场景包含:
|
||||
- 一张背景图(你给出英文 scenePrompt)
|
||||
- 一组对话节拍 beats,玩家会按顺序经历它们
|
||||
- 任何**首次登场**的角色,需在 characterUpdates 里登记其专属音色设计
|
||||
|
||||
每个 beat 是玩家会看到的一段叙述 / 对话 / 选择。beat 之间通过 next 字段连接:
|
||||
- "continue": 玩家点击图片背景 / 按继续,自然推进到下一个 beat
|
||||
@@ -30,27 +31,42 @@ choice 的 effect 有两种:
|
||||
- choice 至少 2 个,至多 4 个,互不重复
|
||||
|
||||
文本风格约束:
|
||||
- narration / line 用中文,scenePrompt 用英文
|
||||
- narration / line 用中文(**纯净可显示文本**,绝不要写 (叹气)(语速快) 这类标注 —— 那是给配音的,会被玩家看见)
|
||||
- scenePrompt / lineDelivery / characterUpdates 内的文字按下方专门说明
|
||||
- 单个 beat 的 narration 与 line 加起来 ≤80 字
|
||||
- 单个 choice label ≤15 字
|
||||
- scenePrompt 只描述画面里看到什么,不要描述 UI
|
||||
- scenePrompt 用英文,只描述画面里看到什么,不要描述 UI
|
||||
|
||||
配音相关字段:
|
||||
- 每个有 line 的 beat **必须**给出 lineDelivery —— 自由中文的"配音导演指令",描述该句台词怎么念(情绪 / 语气 / 语速 / 气息 / 停顿 / 重音 / 音色起伏)。例:"鼓起勇气又害羞,声音发颤、偏小,句尾带一丝气声,语速偏慢"。平淡场合写"平静自然、语速适中"即可,但要贴当下情境。
|
||||
- characterUpdates 仅当**有新角色首次出现**时列出该新角色的音色设计;已登记的角色不要重复列出。
|
||||
- characterUpdates[].description **必须以明确性别开头**("女性,…" / "男性,…"),随后描述:年龄、音色质感、性格情绪基调、语速节奏、人设腔调、口音方言。例:"女性,约17岁少女,音色清亮带点稚嫩甜美,性格开朗,语速偏快,标准普通话"。
|
||||
|
||||
角色与台词的硬性规则(影响配音正确性):
|
||||
- 任何 beat 的 speaker 字段一旦填了名字,**该名字必须**:① 在"已登记角色"列表中存在,或 ② 本次输出的 characterUpdates 里登记。绝不允许 speaker 是个未登记的陌生名字。
|
||||
- speaker 名字必须与登记名**完全一致**,不要加「(回忆)」「学姐」之类后缀或别名。
|
||||
|
||||
必须输出严格 JSON,结构如下:
|
||||
{
|
||||
"scenePrompt": "english scene description, no UI",
|
||||
"entryBeatId": "b1",
|
||||
"characterUpdates": [
|
||||
{ "name": "夏海", "description": "女性,约17岁少女,音色清亮带点稚嫩甜美…" }
|
||||
],
|
||||
"beats": [
|
||||
{
|
||||
"id": "b1",
|
||||
"narration": "可空",
|
||||
"narration": "可空(纯净文本)",
|
||||
"speaker": "可空",
|
||||
"line": "可空",
|
||||
"line": "可空(纯净文本)",
|
||||
"lineDelivery": "line 非空时必填:配音导演指令",
|
||||
"next": { "type": "continue", "nextBeatId": "b2" }
|
||||
},
|
||||
{
|
||||
"id": "b2",
|
||||
"speaker": "...",
|
||||
"line": "...",
|
||||
"speaker": "夏海",
|
||||
"line": "学长,我有话想对你说。",
|
||||
"lineDelivery": "鼓起勇气,但又有点害羞,语速偏慢,句尾微微上扬",
|
||||
"next": {
|
||||
"type": "choice",
|
||||
"choices": [
|
||||
@@ -77,6 +93,13 @@ export function buildDirectorUserMessage(session: Session): string {
|
||||
parts.push(`世界观:${session.worldSetting}`);
|
||||
parts.push(`画风:${session.styleGuide}`);
|
||||
|
||||
if (session.characters.length > 0) {
|
||||
parts.push("\n已登记角色(speaker 必须用这些名字之一,或在本次 characterUpdates 里登记新名):");
|
||||
for (const c of session.characters) {
|
||||
parts.push(`- ${c.name}:${c.description}`);
|
||||
}
|
||||
}
|
||||
|
||||
if (session.history.length === 0) {
|
||||
parts.push("\n这是故事的开场。请生成第一个场景,严格以 JSON 格式返回。");
|
||||
return parts.join("\n");
|
||||
@@ -142,19 +165,22 @@ export function buildDirectorUserMessage(session: Session): string {
|
||||
export const INSERT_BEAT_SYSTEM = `你是视觉小说编剧。玩家在当前场景内做了一个**不会换场景的自由动作**(比如看一眼桌上的相框、想了想刚才那句话)。请基于此动作,写出一个**单独的、过渡性的 beat**:可以是旁白、角色台词、或两者结合。
|
||||
|
||||
文本风格约束:
|
||||
- narration / line 用中文
|
||||
- narration / line 用中文,**纯净可显示文本**,不要写 (叹气) 这类配音标注
|
||||
- narration 与 line 加起来 ≤80 字
|
||||
- 不要打破当前场景的物理状态(玩家仍在原地、对面仍是同一个角色)
|
||||
- 不要生成选项或下一步指引 —— 玩家点击会自然回到原 beat
|
||||
- 如果有 line,speaker 必须用**已登记角色**里的名字(绝不允许引入新角色)
|
||||
- 如果有 line,**必须**给出 lineDelivery(配音导演指令,自由中文,描述这句话怎么念)
|
||||
|
||||
必须输出严格 JSON:
|
||||
{
|
||||
"narration": "...",
|
||||
"speaker": "...",
|
||||
"line": "..."
|
||||
"line": "...",
|
||||
"lineDelivery": "..."
|
||||
}
|
||||
|
||||
字段都可为空字符串。不要输出 JSON 以外的任何文本。`;
|
||||
narration/speaker/line/lineDelivery 都可为空字符串。不要输出 JSON 以外的任何文本。`;
|
||||
|
||||
export function buildInsertBeatUserMessage(
|
||||
session: Session,
|
||||
@@ -163,9 +189,16 @@ export function buildInsertBeatUserMessage(
|
||||
const parts: string[] = [];
|
||||
parts.push(`世界观:${session.worldSetting}`);
|
||||
|
||||
if (session.characters.length > 0) {
|
||||
parts.push("\n已登记角色(speaker 只能用这些名字):");
|
||||
for (const c of session.characters) {
|
||||
parts.push(`- ${c.name}`);
|
||||
}
|
||||
}
|
||||
|
||||
const current = session.history.at(-1);
|
||||
if (current) {
|
||||
parts.push(`当前场景:${current.scene.scenePrompt}`);
|
||||
parts.push(`\n当前场景:${current.scene.scenePrompt}`);
|
||||
const lastBeatId = current.visitedBeatIds.at(-1) ?? current.scene.entryBeatId;
|
||||
const lastBeat = current.scene.beats.find((b) => b.id === lastBeatId);
|
||||
if (lastBeat) {
|
||||
|
||||
@@ -0,0 +1,106 @@
|
||||
import { provisionVoice, synthesize } from "@yume/tts-client";
|
||||
import type {
|
||||
BeatAudio,
|
||||
Character,
|
||||
CharacterVoice,
|
||||
Scene,
|
||||
Session,
|
||||
TtsConfig,
|
||||
} from "@yume/types";
|
||||
|
||||
export type BeatLike = {
|
||||
id?: string;
|
||||
speaker?: string;
|
||||
line?: string;
|
||||
lineDelivery?: string;
|
||||
};
|
||||
|
||||
/**
 * Builds a voice-design description for a speaker that has no registry entry.
 *
 * The text asks the voice designer to infer gender, age and temperament from
 * the character's name, and appends the session's world setting as context,
 * so the voice is derived from the name rather than borrowed from another
 * character.
 */
function inferredSpeakerDescription(name: string, session: Session): string {
  const instruction = `请根据角色名「${name}」推断其性别、年龄与气质,生成最贴合的音色。`;
  const worldContext = `所属世界观:${session.worldSetting}`;
  return instruction + worldContext;
}
|
||||
|
||||
/**
 * Synthesizes audio for one beat against a character registry.
 *
 * Only beats with both a `speaker` and a `line` are voiced; anything else
 * returns the registry untouched and no audio. The speaker's voice is resolved
 * in this order:
 *   1. a registered character that already has a voice reuses it;
 *   2. a registered character without a voice gets one provisioned from its
 *      description;
 *   3. an unregistered speaker gets a voice provisioned from a name-inferred
 *      description and is appended to the registry.
 *
 * Failures never propagate: any error is logged and the beat comes back
 * without audio.
 *
 * @param cfg - TTS provider configuration.
 * @param session - Session, used for the world setting when inferring a description.
 * @param characters - Current registry; never mutated.
 * @param beat - The beat to voice.
 * @returns The (possibly extended) registry, plus `audio` when synthesis succeeded.
 */
export async function voiceBeat(
  cfg: TtsConfig,
  session: Session,
  characters: Character[],
  beat: BeatLike,
): Promise<{ audio?: BeatAudio; characters: Character[] }> {
  const { speaker, line, lineDelivery } = beat;
  if (!(speaker && line)) {
    return { characters };
  }

  // Declared outside the try so the catch can still hand back a registry that
  // includes a voice provisioned just before synthesis failed; otherwise that
  // voice would be discarded and provisioned again for the speaker's next beat.
  let registry: Character[] = characters;

  try {
    const slot = characters.findIndex((c) => c.name === speaker);
    const known = slot === -1 ? undefined : characters[slot];
    let voice: CharacterVoice | undefined = known?.voice;

    if (!voice) {
      if (known) {
        voice = await provisionVoice(cfg, known.description);
        registry = characters.map((c, i) => (i === slot ? { ...c, voice } : c));
      } else {
        const description = inferredSpeakerDescription(speaker, session);
        voice = await provisionVoice(cfg, description);
        registry = [...characters, { name: speaker, description, voice }];
      }
    }

    const clip = await synthesize(cfg, voice, line, lineDelivery);
    return {
      audio: { base64: clip.audioBase64, mime: clip.mimeType },
      characters: registry,
    };
  } catch (err) {
    const msg = err instanceof Error ? err.message : String(err);
    console.error(`[voice] degraded: ${msg}`);
    return { characters: registry };
  }
}
|
||||
|
||||
/**
 * Voices every beat of a scene, in order.
 *
 * Beats are processed one at a time on purpose: each `voiceBeat` call receives
 * the registry returned by the previous call, so a voice provisioned for a
 * speaker in an earlier beat is visible to (and reused by) later beats rather
 * than being provisioned again by a concurrent call.
 *
 * @param cfg - TTS provider configuration.
 * @param session - Session whose `characters` seed the registry.
 * @param scene - Scene whose beats are voiced.
 * @returns Audio keyed by beat id (only beats that produced audio) and the
 *   final registry.
 */
export async function voiceScene(
  cfg: TtsConfig,
  session: Session,
  scene: Scene,
): Promise<{
  beatAudio: Record<string, BeatAudio>;
  characters: Character[];
}> {
  const beatAudio: Record<string, BeatAudio> = {};
  let registry = session.characters;

  for (const beat of scene.beats) {
    const { audio, characters } = await voiceBeat(cfg, session, registry, beat);
    registry = characters;
    if (audio) {
      beatAudio[beat.id] = audio;
    }
  }

  return { beatAudio, characters: registry };
}
|
||||
Reference in New Issue
Block a user