feat(tts): Xiaomi MiMo per-beat voice + MOCK_IMAGE testing aid (#3)
Adds optional Xiaomi MiMo TTS layer on top of the scene/beat engine and a MOCK_IMAGE flag for cheap local TTS iteration. - Per-character voice provisioning via MiMo voice design → clone, reference audio persisted in session - Per-line free-form delivery direction (Director writes "鼓起勇气又害羞,声音发颤" style instructions; sent to MiMo's director channel, never read aloud) - Per-beat audio served with the scene response; frontend plays via hidden <audio> with typewriter synced to audio duration; mute toggle persisted via localStorage lazy initializer - Graceful degradation: any TTS step failing → silent beat, game continues - MOCK_IMAGE=true returns a sharp-generated placeholder PNG so local TTS iteration doesn't burn image tokens - Recommended config in .env.example: MiMo Token Plan covers TEXT/VISION/TTS with one key (mimo-v2.5-pro for text, mimo-v2.5 omni for vision, mimo-v2.5-tts for TTS) Squashed from #3: - feat(tts): 小米 MiMo 逐 beat 配音 + 按 session 角色音色 + 自由文本配音指导 - feat(engine): MOCK_IMAGE 占位图便于本地测试 - fix(tts): address Copilot review on PR #3 - fix(tts): Copilot round-2 review feedback Known limitation: Session.characters carries the full WAV reference audio (~200-300KB/character base64) and round-trips through every /api/scene, /api/vision, /api/insert-beat request. This is intrinsic to MiMo's design→clone model (voice identity IS the audio, no server-side voiceId). Fixing requires server-side storage which is out of scope; documented for future hardening. 🤖 Generated with [Claude Code](https://claude.com/claude-code)
This commit is contained in:
+33
-12
@@ -1,24 +1,45 @@
|
|||||||
# =============================================================
|
# =============================================================
|
||||||
# 云梦 — AI 视觉小说
|
# 云梦 — AI 视觉小说
|
||||||
# Three independently configurable AI providers
|
# Recommended setup: Xiaomi MiMo Token Plan for TEXT / VISION / TTS
|
||||||
# Any OpenAI-compatible endpoint works (OpenRouter, OpenAI,
|
# (one API key covers all three) + any image provider for IMAGE.
|
||||||
# Anthropic via OpenAI-compat proxy, Gemini, DeepSeek, Ollama).
|
|
||||||
#
|
#
|
||||||
|
# Any OpenAI-compatible endpoint works for any slot — OpenRouter,
|
||||||
|
# OpenAI, Anthropic via OpenAI-compat proxy, Gemini, DeepSeek, etc.
|
||||||
# Image generation uses the chat-completions + modalities API
|
# Image generation uses the chat-completions + modalities API
|
||||||
# (OpenRouter-style), NOT the legacy /images/generations endpoint.
|
# (OpenRouter-style), NOT the legacy /images/generations endpoint.
|
||||||
# =============================================================
|
# =============================================================
|
||||||
|
|
||||||
# ---- 1. Text LLM (story director) -----------------------------
|
# ---- 1. Text LLM · scene director ----------------------------------
|
||||||
TEXT_BASE_URL=https://openrouter.ai/api/v1
|
# Recommended: MiMo V2.5 Pro (1M context, native JSON-mode, strong CN)
|
||||||
TEXT_API_KEY=sk-or-v1-xxx
|
# Token Plan host: https://token-plan-sgp.xiaomimimo.com/v1
|
||||||
TEXT_MODEL=~anthropic/claude-sonnet-latest
|
# Pay-as-you-go host: https://api.xiaomimimo.com/v1 (sk- keys)
|
||||||
|
TEXT_BASE_URL=https://token-plan-sgp.xiaomimimo.com/v1
|
||||||
|
TEXT_API_KEY=tp-xxx
|
||||||
|
TEXT_MODEL=mimo-v2.5-pro
|
||||||
|
|
||||||
# ---- 2. Image generator (renders the whole UI screen) ---------
|
# ---- 2. Image generator (renders the scene background) -------------
|
||||||
|
# Any provider supporting chat-completions + modalities image output.
|
||||||
IMAGE_BASE_URL=https://openrouter.ai/api/v1
|
IMAGE_BASE_URL=https://openrouter.ai/api/v1
|
||||||
IMAGE_API_KEY=sk-or-v1-xxx
|
IMAGE_API_KEY=sk-or-v1-xxx
|
||||||
IMAGE_MODEL=openai/gpt-5.4-image-2
|
IMAGE_MODEL=openai/gpt-5.4-image-2
|
||||||
|
|
||||||
# ---- 3. Vision model (interprets where the user clicked) ------
|
# ---- 3. Vision model · multimodal click interpretation -------------
|
||||||
VISION_BASE_URL=https://openrouter.ai/api/v1
|
# Recommended: MiMo V2.5 omni — multimodal.
|
||||||
VISION_API_KEY=sk-or-v1-xxx
|
# ⚠️ DO NOT use mimo-v2.5-pro for this slot — Pro is text-only and
|
||||||
VISION_MODEL=~google/gemini-flash-latest
|
# rejects image_url content parts.
|
||||||
|
VISION_BASE_URL=https://token-plan-sgp.xiaomimimo.com/v1
|
||||||
|
VISION_API_KEY=tp-xxx
|
||||||
|
VISION_MODEL=mimo-v2.5
|
||||||
|
|
||||||
|
# ---- 4. TTS · Xiaomi MiMo (optional — leave blank to disable) ------
|
||||||
|
# Per-character voice design → clone, with per-line delivery direction.
|
||||||
|
# Voice identity = the reference audio kept in the session (no server expiry).
|
||||||
|
# The adapter appends -voicedesign / -voiceclone to TTS_SPEECH_MODEL.
|
||||||
|
TTS_BASE_URL=https://token-plan-sgp.xiaomimimo.com/v1
|
||||||
|
TTS_API_KEY=tp-xxx
|
||||||
|
TTS_SPEECH_MODEL=mimo-v2.5-tts
|
||||||
|
|
||||||
|
# ---- 5. MOCK_IMAGE — skip image generation (cheap TTS testing) -----
|
||||||
|
# true → return a placeholder image instead of calling the image model.
|
||||||
|
# Text/story/voice still run normally. Great for iterating on TTS.
|
||||||
|
MOCK_IMAGE=false
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ import { PlayCanvas, type Phase } from "@/components/PlayCanvas";
|
|||||||
import { PRESETS } from "@/lib/presets";
|
import { PRESETS } from "@/lib/presets";
|
||||||
import type {
|
import type {
|
||||||
Beat,
|
Beat,
|
||||||
|
BeatAudio,
|
||||||
BeatChoice,
|
BeatChoice,
|
||||||
InsertBeatResponse,
|
InsertBeatResponse,
|
||||||
Scene,
|
Scene,
|
||||||
@@ -24,6 +25,8 @@ import type {
|
|||||||
VisionResponse,
|
VisionResponse,
|
||||||
} from "@yume/types";
|
} from "@yume/types";
|
||||||
|
|
||||||
|
const MUTED_STORAGE_KEY = "yume:muted";
|
||||||
|
|
||||||
// ──────────────────────────────────────────────────────────────────────
|
// ──────────────────────────────────────────────────────────────────────
|
||||||
// Prefetch pool — speculative SceneResponses keyed by choice path.
|
// Prefetch pool — speculative SceneResponses keyed by choice path.
|
||||||
//
|
//
|
||||||
@@ -133,7 +136,16 @@ function prefetchScenePath(
|
|||||||
nextSceneSeed: sole.effect.nextSceneSeed,
|
nextSceneSeed: sole.effect.nextSceneSeed,
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
prefetchScenePath(pool, baseSession, [...steps, nextStep], depth + 1);
|
// Carry forward the registry that the parent prefetch result already
|
||||||
|
// settled (it may include characters introduced by the intermediate
|
||||||
|
// scene). Without this, the L2+ prefetch starts from the original
|
||||||
|
// base.characters and a later transition through this survivor would
|
||||||
|
// silently drop voices the player has already heard.
|
||||||
|
const carriedBase: Session = {
|
||||||
|
...baseSession,
|
||||||
|
characters: data.characters,
|
||||||
|
};
|
||||||
|
prefetchScenePath(pool, carriedBase, [...steps, nextStep], depth + 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -181,6 +193,18 @@ function PlayInner() {
|
|||||||
const [currentScene, setCurrentScene] = useState<Scene | null>(null);
|
const [currentScene, setCurrentScene] = useState<Scene | null>(null);
|
||||||
const [currentBeatId, setCurrentBeatId] = useState<string | null>(null);
|
const [currentBeatId, setCurrentBeatId] = useState<string | null>(null);
|
||||||
const [imageBase64, setImageBase64] = useState<string | null>(null);
|
const [imageBase64, setImageBase64] = useState<string | null>(null);
|
||||||
|
const [beatAudioMap, setBeatAudioMap] = useState<Record<string, BeatAudio>>({});
|
||||||
|
// Lazy-initialize from localStorage so PlayCanvas never mounts with the
|
||||||
|
// wrong muted value (an effect-based read would briefly let audio play
|
||||||
|
// before the preference settled in a scenario where audio arrives early).
|
||||||
|
const [muted, setMuted] = useState<boolean>(() => {
|
||||||
|
if (typeof window === "undefined") return false;
|
||||||
|
try {
|
||||||
|
return window.localStorage.getItem(MUTED_STORAGE_KEY) === "1";
|
||||||
|
} catch {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
});
|
||||||
const [pendingClick, setPendingClick] = useState<{
|
const [pendingClick, setPendingClick] = useState<{
|
||||||
x: number;
|
x: number;
|
||||||
y: number;
|
y: number;
|
||||||
@@ -203,6 +227,10 @@ function PlayInner() {
|
|||||||
return currentScene.beats.find((b) => b.id === currentBeatId) ?? null;
|
return currentScene.beats.find((b) => b.id === currentBeatId) ?? null;
|
||||||
}, [currentScene, currentBeatId]);
|
}, [currentScene, currentBeatId]);
|
||||||
|
|
||||||
|
const currentBeatAudio = currentBeat ? beatAudioMap[currentBeat.id] : undefined;
|
||||||
|
const audioBase64 = currentBeatAudio?.base64 ?? null;
|
||||||
|
const audioMime = currentBeatAudio?.mime ?? null;
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
sessionRef.current = session;
|
sessionRef.current = session;
|
||||||
}, [session]);
|
}, [session]);
|
||||||
@@ -231,6 +259,19 @@ function PlayInner() {
|
|||||||
});
|
});
|
||||||
}, [currentBeatId]);
|
}, [currentBeatId]);
|
||||||
|
|
||||||
|
// ── Mute persistence (read is via the useState lazy initializer above) ─
|
||||||
|
const toggleMuted = useCallback(() => {
|
||||||
|
setMuted((prev) => {
|
||||||
|
const next = !prev;
|
||||||
|
try {
|
||||||
|
window.localStorage.setItem(MUTED_STORAGE_KEY, next ? "1" : "0");
|
||||||
|
} catch {
|
||||||
|
// ignore
|
||||||
|
}
|
||||||
|
return next;
|
||||||
|
});
|
||||||
|
}, []);
|
||||||
|
|
||||||
// ── Presentation mode toggle ─────────────────────────────────────────
|
// ── Presentation mode toggle ─────────────────────────────────────────
|
||||||
const togglePresentation = useCallback(async () => {
|
const togglePresentation = useCallback(async () => {
|
||||||
const entering = !presentation;
|
const entering = !presentation;
|
||||||
@@ -327,12 +368,14 @@ function PlayInner() {
|
|||||||
visitedBeatIds: [data.scene.entryBeatId],
|
visitedBeatIds: [data.scene.entryBeatId],
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
|
characters: data.characters,
|
||||||
};
|
};
|
||||||
visitedBeatsRef.current = [data.scene.entryBeatId];
|
visitedBeatsRef.current = [data.scene.entryBeatId];
|
||||||
setSession(initial);
|
setSession(initial);
|
||||||
setCurrentScene(data.scene);
|
setCurrentScene(data.scene);
|
||||||
setCurrentBeatId(data.scene.entryBeatId);
|
setCurrentBeatId(data.scene.entryBeatId);
|
||||||
setImageBase64(data.imageBase64);
|
setImageBase64(data.imageBase64);
|
||||||
|
setBeatAudioMap(data.beatAudio ?? {});
|
||||||
setPhase("ready");
|
setPhase("ready");
|
||||||
})
|
})
|
||||||
.catch((e) => setError(String(e)));
|
.catch((e) => setError(String(e)));
|
||||||
@@ -409,12 +452,14 @@ function PlayInner() {
|
|||||||
visitedBeatIds: [result.scene.entryBeatId],
|
visitedBeatIds: [result.scene.entryBeatId],
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
|
characters: result.characters,
|
||||||
};
|
};
|
||||||
visitedBeatsRef.current = [result.scene.entryBeatId];
|
visitedBeatsRef.current = [result.scene.entryBeatId];
|
||||||
setSession(newSession);
|
setSession(newSession);
|
||||||
setCurrentScene(result.scene);
|
setCurrentScene(result.scene);
|
||||||
setCurrentBeatId(result.scene.entryBeatId);
|
setCurrentBeatId(result.scene.entryBeatId);
|
||||||
setImageBase64(result.imageBase64);
|
setImageBase64(result.imageBase64);
|
||||||
|
setBeatAudioMap(result.beatAudio ?? {});
|
||||||
setLastExitLabel(exitLabel);
|
setLastExitLabel(exitLabel);
|
||||||
setPhase("ready");
|
setPhase("ready");
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
@@ -514,7 +559,8 @@ function PlayInner() {
|
|||||||
};
|
};
|
||||||
throw new Error(j.error ?? insertRes.statusText);
|
throw new Error(j.error ?? insertRes.statusText);
|
||||||
}
|
}
|
||||||
const { partial } = (await insertRes.json()) as InsertBeatResponse;
|
const { partial, characters: insertChars, audio } =
|
||||||
|
(await insertRes.json()) as InsertBeatResponse;
|
||||||
|
|
||||||
const fromBeatId =
|
const fromBeatId =
|
||||||
currentBeatRef.current?.id ?? currentScene.entryBeatId;
|
currentBeatRef.current?.id ?? currentScene.entryBeatId;
|
||||||
@@ -526,6 +572,7 @@ function PlayInner() {
|
|||||||
narration: partial.narration,
|
narration: partial.narration,
|
||||||
speaker: partial.speaker,
|
speaker: partial.speaker,
|
||||||
line: partial.line,
|
line: partial.line,
|
||||||
|
lineDelivery: partial.lineDelivery,
|
||||||
next: { type: "continue", nextBeatId: fromBeatId },
|
next: { type: "continue", nextBeatId: fromBeatId },
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -541,11 +588,15 @@ function PlayInner() {
|
|||||||
history: s.history.map((h, i, arr) =>
|
history: s.history.map((h, i, arr) =>
|
||||||
i === arr.length - 1 ? { ...h, scene: patched } : h,
|
i === arr.length - 1 ? { ...h, scene: patched } : h,
|
||||||
),
|
),
|
||||||
|
characters: insertChars,
|
||||||
}
|
}
|
||||||
: s,
|
: s,
|
||||||
);
|
);
|
||||||
setCurrentScene(patched);
|
setCurrentScene(patched);
|
||||||
setCurrentBeatId(newBeatId);
|
setCurrentBeatId(newBeatId);
|
||||||
|
if (audio) {
|
||||||
|
setBeatAudioMap((m) => ({ ...m, [newBeatId]: audio }));
|
||||||
|
}
|
||||||
setLastExitLabel(decision.intent.freeformAction);
|
setLastExitLabel(decision.intent.freeformAction);
|
||||||
setPhase("ready");
|
setPhase("ready");
|
||||||
setPendingClick(null);
|
setPendingClick(null);
|
||||||
@@ -627,6 +678,9 @@ function PlayInner() {
|
|||||||
<div className="fixed inset-0 bg-black flex items-center justify-center z-50">
|
<div className="fixed inset-0 bg-black flex items-center justify-center z-50">
|
||||||
<PlayCanvas
|
<PlayCanvas
|
||||||
imageBase64={imageBase64}
|
imageBase64={imageBase64}
|
||||||
|
audioBase64={audioBase64}
|
||||||
|
audioMime={audioMime}
|
||||||
|
muted={muted}
|
||||||
phase={phase}
|
phase={phase}
|
||||||
beat={currentBeat}
|
beat={currentBeat}
|
||||||
pendingClick={pendingClick}
|
pendingClick={pendingClick}
|
||||||
@@ -666,6 +720,9 @@ function PlayInner() {
|
|||||||
<main className="flex-1 flex flex-col items-center justify-center px-4 md:px-8 py-6 md:py-10">
|
<main className="flex-1 flex flex-col items-center justify-center px-4 md:px-8 py-6 md:py-10">
|
||||||
<PlayCanvas
|
<PlayCanvas
|
||||||
imageBase64={imageBase64}
|
imageBase64={imageBase64}
|
||||||
|
audioBase64={audioBase64}
|
||||||
|
audioMime={audioMime}
|
||||||
|
muted={muted}
|
||||||
phase={phase}
|
phase={phase}
|
||||||
beat={currentBeat}
|
beat={currentBeat}
|
||||||
pendingClick={pendingClick}
|
pendingClick={pendingClick}
|
||||||
@@ -700,7 +757,17 @@ function PlayInner() {
|
|||||||
F · 演 · 示
|
F · 演 · 示
|
||||||
</button>
|
</button>
|
||||||
<div className="text-[9px] smallcaps text-clay-400 num">Ⅰ · Ⅰ</div>
|
<div className="text-[9px] smallcaps text-clay-400 num">Ⅰ · Ⅰ</div>
|
||||||
<span className="text-[9px] w-[60px]" aria-hidden />
|
<button
|
||||||
|
type="button"
|
||||||
|
onClick={toggleMuted}
|
||||||
|
className="text-[9px] smallcaps text-clay-400 hover:text-clay-700 transition-colors flex items-center gap-2 w-[80px] justify-end"
|
||||||
|
aria-label={muted ? "取消静音" : "静音"}
|
||||||
|
>
|
||||||
|
<i
|
||||||
|
className={`fa-solid ${muted ? "fa-volume-xmark" : "fa-volume-high"} text-[10px]`}
|
||||||
|
/>
|
||||||
|
{muted ? "静 · 音" : "有 · 声"}
|
||||||
|
</button>
|
||||||
</footer>
|
</footer>
|
||||||
</div>
|
</div>
|
||||||
);
|
);
|
||||||
|
|||||||
@@ -13,30 +13,66 @@ export type Phase =
|
|||||||
const SHADOW =
|
const SHADOW =
|
||||||
"0 1px 0 rgba(45,24,16,0.05), 0 36px 64px -28px rgba(45,24,16,0.25), 0 8px 18px -6px rgba(45,24,16,0.10)";
|
"0 1px 0 rgba(45,24,16,0.05), 0 36px 64px -28px rgba(45,24,16,0.25), 0 8px 18px -6px rgba(45,24,16,0.10)";
|
||||||
|
|
||||||
|
const DEFAULT_CHAR_MS = 28;
|
||||||
|
const MIN_CHAR_MS = 30;
|
||||||
|
// Voice playback speed multiplier. >1 speeds up the (somewhat slow) MiMo voice
|
||||||
|
// while preserving pitch. Typewriter pacing is divided by the same factor.
|
||||||
|
const SPEECH_RATE = 1.2;
|
||||||
|
// If audio metadata never arrives within this window, give up waiting and
|
||||||
|
// let the typewriter run at default speed.
|
||||||
|
const AUDIO_WAIT_TIMEOUT_MS = 2500;
|
||||||
|
|
||||||
// ── Typewriter hook ────────────────────────────────────────────────────
|
// ── Typewriter hook ────────────────────────────────────────────────────
|
||||||
// Returns the progressively-revealed text, a `done` flag, and a `skip()` that
|
// Returns the progressively-revealed text, a `done` flag, and a `skip()` that
|
||||||
// instantly completes the current text. Reset is keyed by `resetKey` (the beat
|
// instantly completes the current text. Reset is keyed by `resetKey` (the beat
|
||||||
// id) rather than the text, so a new beat whose line happens to match the
|
// id) rather than the text, so a new beat whose line happens to match the
|
||||||
// previous one still replays from scratch. `done` is derived synchronously
|
// previous one still replays from scratch.
|
||||||
// (not from a post-paint effect) so a stale "done" frame never paints.
|
//
|
||||||
|
// When `targetDurationMs` is provided we space characters to span that audio
|
||||||
|
// duration, keeping text and voice in lockstep. While `waitForAudio` is true
|
||||||
|
// and we don't yet know a duration, the typewriter holds (so text doesn't
|
||||||
|
// race ahead of an audio that's still loading).
|
||||||
function useTypewriter(
|
function useTypewriter(
|
||||||
text: string,
|
text: string,
|
||||||
resetKey: string,
|
resetKey: string,
|
||||||
speed = 28,
|
opts: { targetDurationMs?: number; waitForAudio: boolean } = {
|
||||||
|
waitForAudio: false,
|
||||||
|
},
|
||||||
): { shown: string; done: boolean; skip: () => void } {
|
): { shown: string; done: boolean; skip: () => void } {
|
||||||
|
const { targetDurationMs, waitForAudio } = opts;
|
||||||
const [displayed, setDisplayed] = useState("");
|
const [displayed, setDisplayed] = useState("");
|
||||||
const [prevKey, setPrevKey] = useState(resetKey);
|
const [prevKey, setPrevKey] = useState(resetKey);
|
||||||
const timer = useRef<ReturnType<typeof setInterval> | null>(null);
|
const timer = useRef<ReturnType<typeof setInterval> | null>(null);
|
||||||
|
// Sticky once the player has skipped this beat: prevents a late-arriving
|
||||||
|
// audio metadata event from re-triggering the effect and replaying the text.
|
||||||
|
const skippedRef = useRef(false);
|
||||||
|
|
||||||
// Render-phase reset (React "adjust state on prop change" pattern): when the
|
// Render-phase reset (React "adjust state on prop change" pattern): when the
|
||||||
// beat changes, drop the old progress before this render commits.
|
// beat changes, drop the old progress before this render commits.
|
||||||
if (resetKey !== prevKey) {
|
if (resetKey !== prevKey) {
|
||||||
setPrevKey(resetKey);
|
setPrevKey(resetKey);
|
||||||
setDisplayed("");
|
setDisplayed("");
|
||||||
|
skippedRef.current = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
if (!text) return;
|
if (!text) return;
|
||||||
|
// `=== undefined` (not `!targetDurationMs`): 0 means "audio failed or
|
||||||
|
// timed out — run at default speed". The original truthy check stalled
|
||||||
|
// the typewriter forever on those fallback paths.
|
||||||
|
if (waitForAudio && targetDurationMs === undefined) return;
|
||||||
|
// If the player skipped, settle on the full text and don't restart even
|
||||||
|
// when audio metadata arrives late and re-triggers this effect.
|
||||||
|
if (skippedRef.current) {
|
||||||
|
setDisplayed(text);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const speed =
|
||||||
|
targetDurationMs && text.length > 0
|
||||||
|
? Math.max(MIN_CHAR_MS, targetDurationMs / text.length)
|
||||||
|
: DEFAULT_CHAR_MS;
|
||||||
|
|
||||||
let i = 0;
|
let i = 0;
|
||||||
timer.current = setInterval(() => {
|
timer.current = setInterval(() => {
|
||||||
i += 1;
|
i += 1;
|
||||||
@@ -50,13 +86,14 @@ function useTypewriter(
|
|||||||
if (timer.current) clearInterval(timer.current);
|
if (timer.current) clearInterval(timer.current);
|
||||||
timer.current = null;
|
timer.current = null;
|
||||||
};
|
};
|
||||||
}, [resetKey, text, speed]);
|
}, [resetKey, text, targetDurationMs, waitForAudio]);
|
||||||
|
|
||||||
const skip = useCallback(() => {
|
const skip = useCallback(() => {
|
||||||
if (timer.current) {
|
if (timer.current) {
|
||||||
clearInterval(timer.current);
|
clearInterval(timer.current);
|
||||||
timer.current = null;
|
timer.current = null;
|
||||||
}
|
}
|
||||||
|
skippedRef.current = true;
|
||||||
setDisplayed(text);
|
setDisplayed(text);
|
||||||
}, [text]);
|
}, [text]);
|
||||||
|
|
||||||
@@ -123,6 +160,9 @@ function ChoiceButton({
|
|||||||
// ── Main component ─────────────────────────────────────────────────────
|
// ── Main component ─────────────────────────────────────────────────────
|
||||||
export function PlayCanvas({
|
export function PlayCanvas({
|
||||||
imageBase64,
|
imageBase64,
|
||||||
|
audioBase64,
|
||||||
|
audioMime,
|
||||||
|
muted,
|
||||||
phase,
|
phase,
|
||||||
beat,
|
beat,
|
||||||
pendingClick,
|
pendingClick,
|
||||||
@@ -132,6 +172,9 @@ export function PlayCanvas({
|
|||||||
fullViewport = false,
|
fullViewport = false,
|
||||||
}: {
|
}: {
|
||||||
imageBase64: string | null;
|
imageBase64: string | null;
|
||||||
|
audioBase64: string | null;
|
||||||
|
audioMime: string | null;
|
||||||
|
muted: boolean;
|
||||||
phase: Phase;
|
phase: Phase;
|
||||||
beat: Beat | null;
|
beat: Beat | null;
|
||||||
pendingClick: { x: number; y: number } | null;
|
pendingClick: { x: number; y: number } | null;
|
||||||
@@ -141,7 +184,11 @@ export function PlayCanvas({
|
|||||||
fullViewport?: boolean;
|
fullViewport?: boolean;
|
||||||
}) {
|
}) {
|
||||||
const imgRef = useRef<HTMLImageElement>(null);
|
const imgRef = useRef<HTMLImageElement>(null);
|
||||||
|
const audioRef = useRef<HTMLAudioElement>(null);
|
||||||
const [dims, setDims] = useState<{ w: number; h: number } | null>(null);
|
const [dims, setDims] = useState<{ w: number; h: number } | null>(null);
|
||||||
|
const [audioDurationMs, setAudioDurationMs] = useState<number | undefined>(
|
||||||
|
undefined,
|
||||||
|
);
|
||||||
|
|
||||||
const isChoiceBeat = beat?.next.type === "choice";
|
const isChoiceBeat = beat?.next.type === "choice";
|
||||||
const choices: BeatChoice[] = isChoiceBeat
|
const choices: BeatChoice[] = isChoiceBeat
|
||||||
@@ -150,7 +197,56 @@ export function PlayCanvas({
|
|||||||
|
|
||||||
const displayBody = beat?.speaker ? beat.line ?? "" : beat?.narration ?? "";
|
const displayBody = beat?.speaker ? beat.line ?? "" : beat?.narration ?? "";
|
||||||
const { shown: typedBody, done: typingDone, skip: skipTypewriter } =
|
const { shown: typedBody, done: typingDone, skip: skipTypewriter } =
|
||||||
useTypewriter(displayBody, beat?.id ?? "", 30);
|
useTypewriter(displayBody, beat?.id ?? "", {
|
||||||
|
targetDurationMs: audioDurationMs,
|
||||||
|
waitForAudio: Boolean(audioBase64),
|
||||||
|
});
|
||||||
|
|
||||||
|
// ── Audio source change ──────────────────────────────────────────────
|
||||||
|
// Reset duration when audio source changes; if loading takes too long,
|
||||||
|
// unblock the typewriter via timeout so text doesn't stall.
|
||||||
|
useEffect(() => {
|
||||||
|
setAudioDurationMs(undefined);
|
||||||
|
if (!audioBase64) return;
|
||||||
|
const timer = setTimeout(() => {
|
||||||
|
setAudioDurationMs((prev) => prev ?? 0);
|
||||||
|
}, AUDIO_WAIT_TIMEOUT_MS);
|
||||||
|
return () => clearTimeout(timer);
|
||||||
|
}, [audioBase64]);
|
||||||
|
|
||||||
|
// ── Mute toggle ───────────────────────────────────────────────────────
|
||||||
|
useEffect(() => {
|
||||||
|
const el = audioRef.current;
|
||||||
|
if (!el) return;
|
||||||
|
el.muted = muted;
|
||||||
|
el.playbackRate = SPEECH_RATE;
|
||||||
|
if (!muted && audioBase64 && el.paused) {
|
||||||
|
el.play().catch(() => {
|
||||||
|
// autoplay blocked — silent until next interaction
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}, [muted, audioBase64]);
|
||||||
|
|
||||||
|
/**
 * `loadedmetadata` handler for the hidden voice `<audio>` element.
 *
 * Applies the speech-rate multiplier, publishes the effective (sped-up)
 * clip length so the typewriter can pace itself against it, and starts
 * playback unless the player has muted audio.
 */
function handleAudioMetadata() {
  const audio = audioRef.current;
  if (!audio) return;

  audio.playbackRate = SPEECH_RATE;

  // Wall-clock playback time shrinks by SPEECH_RATE. A non-finite or
  // non-positive duration is reported as 0 rather than left undefined.
  let effectiveMs = 0;
  if (Number.isFinite(audio.duration)) {
    const scaledMs = (audio.duration * 1000) / SPEECH_RATE;
    if (scaledMs > 0) {
      effectiveMs = scaledMs;
    }
  }
  setAudioDurationMs(effectiveMs);

  if (muted) return;
  audio.play().catch(() => {
    // autoplay blocked
  });
}
|
||||||
|
|
||||||
|
function handleAudioError() {
|
||||||
|
// Treat as zero duration so the typewriter runs at default speed.
|
||||||
|
setAudioDurationMs(0);
|
||||||
|
}
|
||||||
|
|
||||||
function handleImageClick(e: React.MouseEvent<HTMLImageElement>) {
|
function handleImageClick(e: React.MouseEvent<HTMLImageElement>) {
|
||||||
if (phase !== "ready" || !imgRef.current || !beat) return;
|
if (phase !== "ready" || !imgRef.current || !beat) return;
|
||||||
@@ -197,6 +293,19 @@ export function PlayCanvas({
|
|||||||
<div
|
<div
|
||||||
className={`flex flex-col items-center ${fullViewport ? "w-full h-full justify-center" : "w-full"}`}
|
className={`flex flex-col items-center ${fullViewport ? "w-full h-full justify-center" : "w-full"}`}
|
||||||
>
|
>
|
||||||
|
{/* Hidden audio element — voice playback for the current beat */}
|
||||||
|
{audioBase64 && (
|
||||||
|
<audio
|
||||||
|
key={audioBase64.slice(-48)}
|
||||||
|
ref={audioRef}
|
||||||
|
src={`data:${audioMime ?? "audio/wav"};base64,${audioBase64}`}
|
||||||
|
preload="auto"
|
||||||
|
onLoadedMetadata={handleAudioMetadata}
|
||||||
|
onError={handleAudioError}
|
||||||
|
className="hidden"
|
||||||
|
/>
|
||||||
|
)}
|
||||||
|
|
||||||
{imageBase64 ? (
|
{imageBase64 ? (
|
||||||
<div
|
<div
|
||||||
className="relative inline-block"
|
className="relative inline-block"
|
||||||
|
|||||||
+19
-1
@@ -1,4 +1,4 @@
|
|||||||
import type { EngineConfig } from "@yume/types";
|
import type { EngineConfig, TtsConfig } from "@yume/types";
|
||||||
|
|
||||||
function readVar(name: string): string {
|
function readVar(name: string): string {
|
||||||
const v = process.env[name];
|
const v = process.env[name];
|
||||||
@@ -6,6 +6,22 @@ function readVar(name: string): string {
|
|||||||
return v;
|
return v;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Reads an optional environment variable.
 *
 * Surrounding whitespace is stripped before the emptiness check, so a value
 * such as `TTS_API_KEY= ` (a "blank" entry with a stray space) counts as
 * unset instead of being handed downstream as a bogus value, and
 * `MOCK_IMAGE=true ` still compares equal to `"true"`.
 *
 * @param name - Environment variable name to look up in `process.env`.
 * @returns The trimmed value, or `undefined` when the variable is missing,
 *   empty, or whitespace-only.
 */
function readOptionalVar(name: string): string | undefined {
  const value = process.env[name]?.trim();
  return value ? value : undefined;
}
|
||||||
|
|
||||||
|
/**
 * Builds the optional TTS provider configuration from the environment.
 *
 * Reads `TTS_BASE_URL`, `TTS_API_KEY` and `TTS_SPEECH_MODEL`. TTS is
 * all-or-nothing: unless every one of the three is present the function
 * returns `undefined`, which means TTS is disabled and the game runs silently.
 *
 * @returns The complete TTS settings, or `undefined` when any value is missing.
 */
function loadTtsConfig(): TtsConfig | undefined {
  const [baseUrl, apiKey, speechModel] = [
    "TTS_BASE_URL",
    "TTS_API_KEY",
    "TTS_SPEECH_MODEL",
  ].map((name) => readOptionalVar(name));

  if (baseUrl && apiKey && speechModel) {
    return { baseUrl, apiKey, speechModel };
  }
  // Missing any → TTS disabled (game runs silently).
  return undefined;
}
|
||||||
|
|
||||||
export function loadEngineConfig(): EngineConfig {
|
export function loadEngineConfig(): EngineConfig {
|
||||||
return {
|
return {
|
||||||
text: {
|
text: {
|
||||||
@@ -23,5 +39,7 @@ export function loadEngineConfig(): EngineConfig {
|
|||||||
apiKey: readVar("VISION_API_KEY"),
|
apiKey: readVar("VISION_API_KEY"),
|
||||||
model: readVar("VISION_MODEL"),
|
model: readVar("VISION_MODEL"),
|
||||||
},
|
},
|
||||||
|
tts: loadTtsConfig(),
|
||||||
|
mockImage: readOptionalVar("MOCK_IMAGE") === "true",
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,7 +4,12 @@ import type { NextConfig } from "next";
|
|||||||
const config: NextConfig = {
|
const config: NextConfig = {
|
||||||
reactStrictMode: true,
|
reactStrictMode: true,
|
||||||
typedRoutes: false,
|
typedRoutes: false,
|
||||||
transpilePackages: ["@yume/engine", "@yume/ai-client", "@yume/types"],
|
transpilePackages: [
|
||||||
|
"@yume/engine",
|
||||||
|
"@yume/ai-client",
|
||||||
|
"@yume/types",
|
||||||
|
"@yume/tts-client",
|
||||||
|
],
|
||||||
serverExternalPackages: ["sharp"],
|
serverExternalPackages: ["sharp"],
|
||||||
turbopack: {
|
turbopack: {
|
||||||
root: path.join(__dirname, "..", ".."),
|
root: path.join(__dirname, "..", ".."),
|
||||||
|
|||||||
@@ -13,6 +13,7 @@
|
|||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@yume/ai-client": "workspace:*",
|
"@yume/ai-client": "workspace:*",
|
||||||
|
"@yume/tts-client": "workspace:*",
|
||||||
"@yume/types": "workspace:*",
|
"@yume/types": "workspace:*",
|
||||||
"sharp": "^0.33.5"
|
"sharp": "^0.33.5"
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,6 +4,8 @@ import type {
|
|||||||
BeatChoice,
|
BeatChoice,
|
||||||
BeatChoiceEffect,
|
BeatChoiceEffect,
|
||||||
BeatNext,
|
BeatNext,
|
||||||
|
Character,
|
||||||
|
InsertBeatPartial,
|
||||||
ProviderConfig,
|
ProviderConfig,
|
||||||
Scene,
|
Scene,
|
||||||
Session,
|
Session,
|
||||||
@@ -43,13 +45,20 @@ type RawBeat = {
|
|||||||
narration?: string;
|
narration?: string;
|
||||||
speaker?: string;
|
speaker?: string;
|
||||||
line?: string;
|
line?: string;
|
||||||
|
lineDelivery?: string;
|
||||||
next?: RawNext;
|
next?: RawNext;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
type RawCharacterUpdate = {
|
||||||
|
name?: string;
|
||||||
|
description?: string;
|
||||||
|
};
|
||||||
|
|
||||||
type RawScene = {
|
type RawScene = {
|
||||||
scenePrompt?: string;
|
scenePrompt?: string;
|
||||||
entryBeatId?: string;
|
entryBeatId?: string;
|
||||||
beats?: RawBeat[];
|
beats?: RawBeat[];
|
||||||
|
characterUpdates?: RawCharacterUpdate[];
|
||||||
};
|
};
|
||||||
|
|
||||||
function coerceEffect(raw: RawEffect | undefined): BeatChoiceEffect {
|
function coerceEffect(raw: RawEffect | undefined): BeatChoiceEffect {
|
||||||
@@ -90,15 +99,28 @@ function coerceBeat(raw: RawBeat, idx: number, totalBeats: number): Beat {
|
|||||||
// last/dangling continue into a real scene-change exit so the player can
|
// last/dangling continue into a real scene-change exit so the player can
|
||||||
// never get stuck self-looping on it.
|
// never get stuck self-looping on it.
|
||||||
const fallback = idx + 1 < totalBeats ? `b${idx + 2}` : "";
|
const fallback = idx + 1 < totalBeats ? `b${idx + 2}` : "";
|
||||||
|
const line = raw.line?.trim() || undefined;
|
||||||
return {
|
return {
|
||||||
id,
|
id,
|
||||||
narration: raw.narration?.trim() || undefined,
|
narration: raw.narration?.trim() || undefined,
|
||||||
speaker: raw.speaker?.trim() || undefined,
|
speaker: raw.speaker?.trim() || undefined,
|
||||||
line: raw.line?.trim() || undefined,
|
line,
|
||||||
|
// lineDelivery only meaningful when there is a line to deliver.
|
||||||
|
lineDelivery: line ? raw.lineDelivery?.trim() || undefined : undefined,
|
||||||
next: coerceNext(raw.next, fallback),
|
next: coerceNext(raw.next, fallback),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Normalizes the `characterUpdates` array from the parsed scene JSON.
 *
 * The payload is model-generated, so its runtime shape is not guaranteed to
 * match `RawCharacterUpdate`. Entries that are not objects, or whose `name` /
 * `description` are not strings, are skipped instead of throwing on `.trim()`,
 * so one malformed entry cannot abort the whole scene.
 *
 * @param raw - Value found under `characterUpdates`; may be absent or malformed.
 * @returns One `{ name, description }` per usable entry, both trimmed and
 *   non-empty, in input order. Returns `[]` when `raw` is not an array.
 */
function coerceCharacterUpdates(raw: RawCharacterUpdate[] | undefined): Character[] {
  if (!Array.isArray(raw)) return [];

  const updates: Character[] = [];
  const entries: readonly unknown[] = raw;
  for (const entry of entries) {
    if (typeof entry !== "object" || entry === null) continue;

    // Runtime shape is unverified; read the fields as unknown and narrow below.
    const { name, description } = entry as { name?: unknown; description?: unknown };
    if (typeof name !== "string" || typeof description !== "string") continue;

    const trimmedName = name.trim();
    const trimmedDescription = description.trim();
    if (trimmedName && trimmedDescription) {
      updates.push({ name: trimmedName, description: trimmedDescription });
    }
  }
  return updates;
}
|
||||||
|
|
||||||
const FALLBACK_SEED = "故事继续推进";
|
const FALLBACK_SEED = "故事继续推进";
|
||||||
|
|
||||||
function fallbackExitChoice(beatId: string): BeatChoice {
|
function fallbackExitChoice(beatId: string): BeatChoice {
|
||||||
@@ -230,10 +252,15 @@ function newSceneId(): string {
|
|||||||
// Called both on real scene transitions AND on speculative prefetch.
|
// Called both on real scene transitions AND on speculative prefetch.
|
||||||
// ──────────────────────────────────────────────────────────────────────
|
// ──────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * What `directScene` resolves to: the generated scene together with the
 * character entries the director listed in `characterUpdates` for that scene
 * (already trimmed and filtered; empty when none were listed).
 */
export type SceneResult = {
  /** The scene built from the director's output. */
  scene: Scene;
  /** Sanitized `characterUpdates` from the same director response. */
  characterUpdates: Character[];
};
|
||||||
|
|
||||||
export async function directScene(
|
export async function directScene(
|
||||||
config: ProviderConfig,
|
config: ProviderConfig,
|
||||||
session: Session,
|
session: Session,
|
||||||
): Promise<Scene> {
|
): Promise<SceneResult> {
|
||||||
const raw = await chat(
|
const raw = await chat(
|
||||||
config,
|
config,
|
||||||
[
|
[
|
||||||
@@ -264,10 +291,13 @@ export async function directScene(
|
|||||||
: beats[0]!.id;
|
: beats[0]!.id;
|
||||||
|
|
||||||
return {
|
return {
|
||||||
id: newSceneId(),
|
scene: {
|
||||||
scenePrompt: parsed.scenePrompt?.trim() || "an empty scene",
|
id: newSceneId(),
|
||||||
beats,
|
scenePrompt: parsed.scenePrompt?.trim() || "an empty scene",
|
||||||
entryBeatId,
|
beats,
|
||||||
|
entryBeatId,
|
||||||
|
},
|
||||||
|
characterUpdates: coerceCharacterUpdates(parsed.characterUpdates),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -280,7 +310,7 @@ export async function directInsertBeat(
|
|||||||
config: ProviderConfig,
|
config: ProviderConfig,
|
||||||
session: Session,
|
session: Session,
|
||||||
freeformAction: string,
|
freeformAction: string,
|
||||||
): Promise<{ narration?: string; speaker?: string; line?: string }> {
|
): Promise<InsertBeatPartial> {
|
||||||
const raw = await chat(
|
const raw = await chat(
|
||||||
config,
|
config,
|
||||||
[
|
[
|
||||||
@@ -293,15 +323,12 @@ export async function directInsertBeat(
|
|||||||
{ temperature: 0.9, responseFormat: "json_object" },
|
{ temperature: 0.9, responseFormat: "json_object" },
|
||||||
);
|
);
|
||||||
|
|
||||||
const parsed = parseJsonLoose<{
|
const parsed = parseJsonLoose<InsertBeatPartial>(raw);
|
||||||
narration?: string;
|
|
||||||
speaker?: string;
|
|
||||||
line?: string;
|
|
||||||
}>(raw);
|
|
||||||
|
|
||||||
const narration = parsed.narration?.trim() || undefined;
|
const narration = parsed.narration?.trim() || undefined;
|
||||||
const speaker = parsed.speaker?.trim() || undefined;
|
const speaker = parsed.speaker?.trim() || undefined;
|
||||||
const line = parsed.line?.trim() || undefined;
|
const line = parsed.line?.trim() || undefined;
|
||||||
|
const lineDelivery = line ? parsed.lineDelivery?.trim() || undefined : undefined;
|
||||||
|
|
||||||
// If the model returned nothing usable, supply a fallback narration so the
|
// If the model returned nothing usable, supply a fallback narration so the
|
||||||
// frontend doesn't append a silent empty beat that renders no dialogue —
|
// frontend doesn't append a silent empty beat that renders no dialogue —
|
||||||
@@ -309,5 +336,5 @@ export async function directInsertBeat(
|
|||||||
if (!narration && !speaker && !line) {
|
if (!narration && !speaker && !line) {
|
||||||
return { narration: "(你停下脚步,环视片刻。)" };
|
return { narration: "(你停下脚步,环视片刻。)" };
|
||||||
}
|
}
|
||||||
return { narration, speaker, line };
|
return { narration, speaker, line, lineDelivery };
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,4 +5,7 @@ export {
|
|||||||
requestInsertBeat,
|
requestInsertBeat,
|
||||||
} from "./orchestrator";
|
} from "./orchestrator";
|
||||||
export { annotateClick } from "./annotate";
|
export { annotateClick } from "./annotate";
|
||||||
|
export { voiceBeat, voiceScene } from "./voice";
|
||||||
|
export type { SceneResult } from "./director";
|
||||||
|
export type { InsertBeatPartial } from "@yume/types";
|
||||||
export * from "./prompts";
|
export * from "./prompts";
|
||||||
|
|||||||
@@ -0,0 +1,25 @@
|
|||||||
|
import sharp from "sharp";
|
||||||
|
|
||||||
|
let cached: string | undefined;
|
||||||
|
|
||||||
|
// Placeholder image for MOCK_IMAGE=true runs: lets the TTS path be exercised
// without calling the image API. The PNG is rendered from an inline SVG the
// first time it is requested and the base64 result is kept in `cached` for
// every later call.
export async function mockImageBase64(): Promise<string> {
  if (cached) return cached;

  const width = 1792;
  const height = 1024;
  // The dashed frame is inset by 2px on each side.
  const frameWidth = width - 4;
  const frameHeight = height - 4;

  const svg = `<svg xmlns="http://www.w3.org/2000/svg" width="${width}" height="${height}">
<rect width="${width}" height="${height}" fill="#161109"/>
<rect x="2" y="2" width="${frameWidth}" height="${frameHeight}" fill="none"
stroke="#5a4628" stroke-width="3" stroke-dasharray="14 10"/>
<text x="50%" y="45%" fill="#b88f4a" font-family="Georgia, serif"
font-size="72" letter-spacing="6" text-anchor="middle">MOCK IMAGE</text>
<text x="50%" y="53%" fill="#6e5430" font-family="Georgia, serif"
font-size="30" letter-spacing="3" text-anchor="middle">TTS TEST — image generation skipped</text>
</svg>`;

  const pngBuffer = await sharp(Buffer.from(svg)).png().toBuffer();
  const encoded = pngBuffer.toString("base64");
  cached = encoded;
  return encoded;
}
|
||||||
@@ -1,7 +1,10 @@
|
|||||||
import type {
|
import type {
|
||||||
|
BeatAudio,
|
||||||
|
Character,
|
||||||
EngineConfig,
|
EngineConfig,
|
||||||
InsertBeatRequest,
|
InsertBeatRequest,
|
||||||
InsertBeatResponse,
|
InsertBeatResponse,
|
||||||
|
Scene,
|
||||||
SceneRequest,
|
SceneRequest,
|
||||||
SceneResponse,
|
SceneResponse,
|
||||||
Session,
|
Session,
|
||||||
@@ -12,15 +15,55 @@ import type {
|
|||||||
} from "@yume/types";
|
} from "@yume/types";
|
||||||
import { annotateClick } from "./annotate";
|
import { annotateClick } from "./annotate";
|
||||||
import { directInsertBeat, directScene } from "./director";
|
import { directInsertBeat, directScene } from "./director";
|
||||||
|
import { mockImageBase64 } from "./mockImage";
|
||||||
import { render } from "./renderer";
|
import { render } from "./renderer";
|
||||||
import { interpret } from "./vision";
|
import { interpret } from "./vision";
|
||||||
|
import { voiceBeat, voiceScene } from "./voice";
|
||||||
|
|
||||||
function newSessionId(): string {
|
function newSessionId(): string {
|
||||||
return `s_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`;
|
return `s_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Fold `updates` into the character registry, keyed by name.
// - No updates: the existing array is returned untouched (same reference).
// - Name already registered with a voice: the update's fields win, but the
//   stored voice is carried over so a revised description never swaps out a
//   voice the player has already heard.
// - Otherwise the update entry is stored as-is.
// Existing names keep their position; new names are appended in update order.
function mergeCharacters(existing: Character[], updates: Character[]): Character[] {
  if (updates.length === 0) return existing;

  const registry = new Map<string, Character>();
  for (const known of existing) {
    registry.set(known.name, known);
  }

  for (const incoming of updates) {
    const keptVoice = registry.get(incoming.name)?.voice;
    registry.set(
      incoming.name,
      keptVoice ? { ...incoming, voice: keptVoice } : incoming,
    );
  }

  return [...registry.values()];
}
|
||||||
|
|
||||||
|
// Produce the scene image as base64: the memoized placeholder when
// `config.mockImage` is set, otherwise a real render through the image provider.
async function renderImage(
  config: EngineConfig,
  scene: Scene,
  styleGuide: string,
): Promise<string> {
  return config.mockImage
    ? mockImageBase64()
    : render(config.image, scene, styleGuide);
}
|
||||||
|
|
||||||
|
// Voice a scene if TTS is configured.
// Without `config.tts` the session's characters are echoed back and no audio
// is attached. With it, `voiceScene` runs and its audio map is returned only
// when it holds at least one entry (an empty map becomes `undefined`).
async function runVoiceScene(
  config: EngineConfig,
  session: Session,
  scene: Scene,
): Promise<{
  beatAudio?: Record<string, BeatAudio>;
  characters: Character[];
}> {
  const { tts } = config;
  if (!tts) return { characters: session.characters };

  const { beatAudio, characters } = await voiceScene(tts, session, scene);
  const hasAudio = Object.keys(beatAudio).length > 0;
  return {
    beatAudio: hasAudio ? beatAudio : undefined,
    characters,
  };
}
|
||||||
|
|
||||||
// ──────────────────────────────────────────────────────────────────────
|
// ──────────────────────────────────────────────────────────────────────
|
||||||
// startSession — first scene + image
|
// startSession — first scene + image + per-beat voice
|
||||||
// ──────────────────────────────────────────────────────────────────────
|
// ──────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
export async function startSession(
|
export async function startSession(
|
||||||
@@ -33,31 +76,55 @@ export async function startSession(
|
|||||||
worldSetting: req.worldSetting.trim(),
|
worldSetting: req.worldSetting.trim(),
|
||||||
styleGuide: req.styleGuide.trim(),
|
styleGuide: req.styleGuide.trim(),
|
||||||
history: [],
|
history: [],
|
||||||
|
characters: [],
|
||||||
};
|
};
|
||||||
|
|
||||||
const scene = await directScene(config.text, session);
|
const { scene, characterUpdates } = await directScene(config.text, session);
|
||||||
const imageBase64 = await render(config.image, scene, session.styleGuide);
|
const preVoiceSession: Session = {
|
||||||
|
...session,
|
||||||
|
characters: mergeCharacters(session.characters, characterUpdates),
|
||||||
|
};
|
||||||
|
|
||||||
|
const [imageBase64, voiceRes] = await Promise.all([
|
||||||
|
renderImage(config, scene, preVoiceSession.styleGuide),
|
||||||
|
runVoiceScene(config, preVoiceSession, scene),
|
||||||
|
]);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
sessionId: session.id,
|
sessionId: session.id,
|
||||||
scene,
|
scene,
|
||||||
imageBase64,
|
imageBase64,
|
||||||
|
characters: voiceRes.characters,
|
||||||
|
beatAudio: voiceRes.beatAudio,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
// ──────────────────────────────────────────────────────────────────────
|
// ──────────────────────────────────────────────────────────────────────
|
||||||
// requestScene — generate the NEXT scene + image.
|
// requestScene — generate the NEXT scene + image + per-beat voice.
|
||||||
// Frontend passes a session whose latest history entry has `exit` set.
|
// Used both on real scene transitions and on speculative prefetch.
|
||||||
// Also used for prefetch speculation (frontend synthesizes the exit).
|
|
||||||
// ──────────────────────────────────────────────────────────────────────
|
// ──────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
export async function requestScene(
|
export async function requestScene(
|
||||||
config: EngineConfig,
|
config: EngineConfig,
|
||||||
req: SceneRequest,
|
req: SceneRequest,
|
||||||
): Promise<SceneResponse> {
|
): Promise<SceneResponse> {
|
||||||
const scene = await directScene(config.text, req.session);
|
const { scene, characterUpdates } = await directScene(config.text, req.session);
|
||||||
const imageBase64 = await render(config.image, scene, req.session.styleGuide);
|
const preVoiceSession: Session = {
|
||||||
return { scene, imageBase64 };
|
...req.session,
|
||||||
|
characters: mergeCharacters(req.session.characters, characterUpdates),
|
||||||
|
};
|
||||||
|
|
||||||
|
const [imageBase64, voiceRes] = await Promise.all([
|
||||||
|
renderImage(config, scene, preVoiceSession.styleGuide),
|
||||||
|
runVoiceScene(config, preVoiceSession, scene),
|
||||||
|
]);
|
||||||
|
|
||||||
|
return {
|
||||||
|
scene,
|
||||||
|
imageBase64,
|
||||||
|
characters: voiceRes.characters,
|
||||||
|
beatAudio: voiceRes.beatAudio,
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
// ──────────────────────────────────────────────────────────────────────
|
// ──────────────────────────────────────────────────────────────────────
|
||||||
@@ -75,6 +142,7 @@ export async function visionDecide(
|
|||||||
|
|
||||||
// ──────────────────────────────────────────────────────────────────────
|
// ──────────────────────────────────────────────────────────────────────
|
||||||
// requestInsertBeat — generates a transient in-scene beat (no image regen)
|
// requestInsertBeat — generates a transient in-scene beat (no image regen)
|
||||||
|
// and voices the line if any.
|
||||||
// ──────────────────────────────────────────────────────────────────────
|
// ──────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
export async function requestInsertBeat(
|
export async function requestInsertBeat(
|
||||||
@@ -86,5 +154,49 @@ export async function requestInsertBeat(
|
|||||||
req.session,
|
req.session,
|
||||||
req.freeformAction,
|
req.freeformAction,
|
||||||
);
|
);
|
||||||
return { partial };
|
|
||||||
|
// INSERT_BEAT prompt forbids new characters — but if the director violates
|
||||||
|
// it, voiceBeat's name-inferred fallback would silently provision and persist
|
||||||
|
// the hallucinated speaker. Strip the speaker attribution and promote the
|
||||||
|
// line into narration so the player still sees the text (the client only
|
||||||
|
// renders `line` when there is a `speaker`).
|
||||||
|
if (
|
||||||
|
partial.speaker &&
|
||||||
|
!req.session.characters.some((c) => c.name === partial.speaker)
|
||||||
|
) {
|
||||||
|
console.warn(
|
||||||
|
`[insert-beat] unregistered speaker "${partial.speaker}" ignored`,
|
||||||
|
);
|
||||||
|
const promotedNarration =
|
||||||
|
[partial.narration, partial.line].filter(Boolean).join("\n") || undefined;
|
||||||
|
return {
|
||||||
|
partial: {
|
||||||
|
narration: promotedNarration,
|
||||||
|
speaker: undefined,
|
||||||
|
line: undefined,
|
||||||
|
lineDelivery: undefined,
|
||||||
|
},
|
||||||
|
characters: req.session.characters,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!config.tts) {
|
||||||
|
// Always echo characters so callers don't need a ?? fallback.
|
||||||
|
return { partial, characters: req.session.characters };
|
||||||
|
}
|
||||||
|
|
||||||
|
// Insert beats stay in-scene and (per the INSERT_BEAT prompt) reuse the
|
||||||
|
// registered cast, so we voice against the existing character set.
|
||||||
|
const voiceRes = await voiceBeat(
|
||||||
|
config.tts,
|
||||||
|
req.session,
|
||||||
|
req.session.characters,
|
||||||
|
partial,
|
||||||
|
);
|
||||||
|
|
||||||
|
return {
|
||||||
|
partial,
|
||||||
|
characters: voiceRes.characters,
|
||||||
|
audio: voiceRes.audio,
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,11 +4,12 @@ import type { Scene, Session } from "@yume/types";
|
|||||||
// Director — emits one Scene (background + a graph of beats) at a time.
|
// Director — emits one Scene (background + a graph of beats) at a time.
|
||||||
// ──────────────────────────────────────────────────────────────────────
|
// ──────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
export const DIRECTOR_SYSTEM = `你是一个交互视觉小说的「场景导演」。每次基于世界观、画风、玩家历史,输出**一个完整的场景**。
|
export const DIRECTOR_SYSTEM = `你是一个交互视觉小说的「场景导演」。每次基于世界观、画风、玩家历史、已登记角色,输出**一个完整的场景**,并为每句台词配上细腻的配音导演指令。
|
||||||
|
|
||||||
一个场景包含:
|
一个场景包含:
|
||||||
- 一张背景图(你给出英文 scenePrompt)
|
- 一张背景图(你给出英文 scenePrompt)
|
||||||
- 一组对话节拍 beats,玩家会按顺序经历它们
|
- 一组对话节拍 beats,玩家会按顺序经历它们
|
||||||
|
- 任何**首次登场**的角色,需在 characterUpdates 里登记其专属音色设计
|
||||||
|
|
||||||
每个 beat 是玩家会看到的一段叙述 / 对话 / 选择。beat 之间通过 next 字段连接:
|
每个 beat 是玩家会看到的一段叙述 / 对话 / 选择。beat 之间通过 next 字段连接:
|
||||||
- "continue": 玩家点击图片背景 / 按继续,自然推进到下一个 beat
|
- "continue": 玩家点击图片背景 / 按继续,自然推进到下一个 beat
|
||||||
@@ -30,27 +31,42 @@ choice 的 effect 有两种:
|
|||||||
- choice 至少 2 个,至多 4 个,互不重复
|
- choice 至少 2 个,至多 4 个,互不重复
|
||||||
|
|
||||||
文本风格约束:
|
文本风格约束:
|
||||||
- narration / line 用中文,scenePrompt 用英文
|
- narration / line 用中文(**纯净可显示文本**,绝不要写 (叹气)(语速快) 这类标注 —— 那是给配音的,会被玩家看见)
|
||||||
|
- scenePrompt / lineDelivery / characterUpdates 内的文字按下方专门说明
|
||||||
- 单个 beat 的 narration 与 line 加起来 ≤80 字
|
- 单个 beat 的 narration 与 line 加起来 ≤80 字
|
||||||
- 单个 choice label ≤15 字
|
- 单个 choice label ≤15 字
|
||||||
- scenePrompt 只描述画面里看到什么,不要描述 UI
|
- scenePrompt 用英文,只描述画面里看到什么,不要描述 UI
|
||||||
|
|
||||||
|
配音相关字段:
|
||||||
|
- 每个有 line 的 beat **必须**给出 lineDelivery —— 自由中文的"配音导演指令",描述该句台词怎么念(情绪 / 语气 / 语速 / 气息 / 停顿 / 重音 / 音色起伏)。例:"鼓起勇气又害羞,声音发颤、偏小,句尾带一丝气声,语速偏慢"。平淡场合写"平静自然、语速适中"即可,但要贴当下情境。
|
||||||
|
- characterUpdates 仅当**有新角色首次出现**时列出该新角色的音色设计;已登记的角色不要重复列出。
|
||||||
|
- characterUpdates[].description **必须以明确性别开头**("女性,…" / "男性,…"),随后描述:年龄、音色质感、性格情绪基调、语速节奏、人设腔调、口音方言。例:"女性,约17岁少女,音色清亮带点稚嫩甜美,性格开朗,语速偏快,标准普通话"。
|
||||||
|
|
||||||
|
角色与台词的硬性规则(影响配音正确性):
|
||||||
|
- 任何 beat 的 speaker 字段一旦填了名字,**该名字必须**:① 在"已登记角色"列表中存在,或 ② 本次输出的 characterUpdates 里登记。绝不允许 speaker 是个未登记的陌生名字。
|
||||||
|
- speaker 名字必须与登记名**完全一致**,不要加「(回忆)」「学姐」之类后缀或别名。
|
||||||
|
|
||||||
必须输出严格 JSON,结构如下:
|
必须输出严格 JSON,结构如下:
|
||||||
{
|
{
|
||||||
"scenePrompt": "english scene description, no UI",
|
"scenePrompt": "english scene description, no UI",
|
||||||
"entryBeatId": "b1",
|
"entryBeatId": "b1",
|
||||||
|
"characterUpdates": [
|
||||||
|
{ "name": "夏海", "description": "女性,约17岁少女,音色清亮带点稚嫩甜美…" }
|
||||||
|
],
|
||||||
"beats": [
|
"beats": [
|
||||||
{
|
{
|
||||||
"id": "b1",
|
"id": "b1",
|
||||||
"narration": "可空",
|
"narration": "可空(纯净文本)",
|
||||||
"speaker": "可空",
|
"speaker": "可空",
|
||||||
"line": "可空",
|
"line": "可空(纯净文本)",
|
||||||
|
"lineDelivery": "line 非空时必填:配音导演指令",
|
||||||
"next": { "type": "continue", "nextBeatId": "b2" }
|
"next": { "type": "continue", "nextBeatId": "b2" }
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": "b2",
|
"id": "b2",
|
||||||
"speaker": "...",
|
"speaker": "夏海",
|
||||||
"line": "...",
|
"line": "学长,我有话想对你说。",
|
||||||
|
"lineDelivery": "鼓起勇气,但又有点害羞,语速偏慢,句尾微微上扬",
|
||||||
"next": {
|
"next": {
|
||||||
"type": "choice",
|
"type": "choice",
|
||||||
"choices": [
|
"choices": [
|
||||||
@@ -77,6 +93,13 @@ export function buildDirectorUserMessage(session: Session): string {
|
|||||||
parts.push(`世界观:${session.worldSetting}`);
|
parts.push(`世界观:${session.worldSetting}`);
|
||||||
parts.push(`画风:${session.styleGuide}`);
|
parts.push(`画风:${session.styleGuide}`);
|
||||||
|
|
||||||
|
if (session.characters.length > 0) {
|
||||||
|
parts.push("\n已登记角色(speaker 必须用这些名字之一,或在本次 characterUpdates 里登记新名):");
|
||||||
|
for (const c of session.characters) {
|
||||||
|
parts.push(`- ${c.name}:${c.description}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (session.history.length === 0) {
|
if (session.history.length === 0) {
|
||||||
parts.push("\n这是故事的开场。请生成第一个场景,严格以 JSON 格式返回。");
|
parts.push("\n这是故事的开场。请生成第一个场景,严格以 JSON 格式返回。");
|
||||||
return parts.join("\n");
|
return parts.join("\n");
|
||||||
@@ -142,19 +165,22 @@ export function buildDirectorUserMessage(session: Session): string {
|
|||||||
export const INSERT_BEAT_SYSTEM = `你是视觉小说编剧。玩家在当前场景内做了一个**不会换场景的自由动作**(比如看一眼桌上的相框、想了想刚才那句话)。请基于此动作,写出一个**单独的、过渡性的 beat**:可以是旁白、角色台词、或两者结合。
|
export const INSERT_BEAT_SYSTEM = `你是视觉小说编剧。玩家在当前场景内做了一个**不会换场景的自由动作**(比如看一眼桌上的相框、想了想刚才那句话)。请基于此动作,写出一个**单独的、过渡性的 beat**:可以是旁白、角色台词、或两者结合。
|
||||||
|
|
||||||
文本风格约束:
|
文本风格约束:
|
||||||
- narration / line 用中文
|
- narration / line 用中文,**纯净可显示文本**,不要写 (叹气) 这类配音标注
|
||||||
- narration 与 line 加起来 ≤80 字
|
- narration 与 line 加起来 ≤80 字
|
||||||
- 不要打破当前场景的物理状态(玩家仍在原地、对面仍是同一个角色)
|
- 不要打破当前场景的物理状态(玩家仍在原地、对面仍是同一个角色)
|
||||||
- 不要生成选项或下一步指引 —— 玩家点击会自然回到原 beat
|
- 不要生成选项或下一步指引 —— 玩家点击会自然回到原 beat
|
||||||
|
- 如果有 line,speaker 必须用**已登记角色**里的名字(绝不允许引入新角色)
|
||||||
|
- 如果有 line,**必须**给出 lineDelivery(配音导演指令,自由中文,描述这句话怎么念)
|
||||||
|
|
||||||
必须输出严格 JSON:
|
必须输出严格 JSON:
|
||||||
{
|
{
|
||||||
"narration": "...",
|
"narration": "...",
|
||||||
"speaker": "...",
|
"speaker": "...",
|
||||||
"line": "..."
|
"line": "...",
|
||||||
|
"lineDelivery": "..."
|
||||||
}
|
}
|
||||||
|
|
||||||
字段都可为空字符串。不要输出 JSON 以外的任何文本。`;
|
narration/speaker/line/lineDelivery 都可为空字符串。不要输出 JSON 以外的任何文本。`;
|
||||||
|
|
||||||
export function buildInsertBeatUserMessage(
|
export function buildInsertBeatUserMessage(
|
||||||
session: Session,
|
session: Session,
|
||||||
@@ -163,9 +189,16 @@ export function buildInsertBeatUserMessage(
|
|||||||
const parts: string[] = [];
|
const parts: string[] = [];
|
||||||
parts.push(`世界观:${session.worldSetting}`);
|
parts.push(`世界观:${session.worldSetting}`);
|
||||||
|
|
||||||
|
if (session.characters.length > 0) {
|
||||||
|
parts.push("\n已登记角色(speaker 只能用这些名字):");
|
||||||
|
for (const c of session.characters) {
|
||||||
|
parts.push(`- ${c.name}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const current = session.history.at(-1);
|
const current = session.history.at(-1);
|
||||||
if (current) {
|
if (current) {
|
||||||
parts.push(`当前场景:${current.scene.scenePrompt}`);
|
parts.push(`\n当前场景:${current.scene.scenePrompt}`);
|
||||||
const lastBeatId = current.visitedBeatIds.at(-1) ?? current.scene.entryBeatId;
|
const lastBeatId = current.visitedBeatIds.at(-1) ?? current.scene.entryBeatId;
|
||||||
const lastBeat = current.scene.beats.find((b) => b.id === lastBeatId);
|
const lastBeat = current.scene.beats.find((b) => b.id === lastBeatId);
|
||||||
if (lastBeat) {
|
if (lastBeat) {
|
||||||
|
|||||||
@@ -0,0 +1,106 @@
|
|||||||
|
import { provisionVoice, synthesize } from "@yume/tts-client";
|
||||||
|
import type {
|
||||||
|
BeatAudio,
|
||||||
|
Character,
|
||||||
|
CharacterVoice,
|
||||||
|
Scene,
|
||||||
|
Session,
|
||||||
|
TtsConfig,
|
||||||
|
} from "@yume/types";
|
||||||
|
|
||||||
|
/**
 * The minimal beat shape `voiceBeat` needs. Every field is optional, so both
 * full scene beats and insert-beat partials (which carry no id) can be voiced.
 */
export type BeatLike = {
  /** Beat id; `voiceScene` uses it to key the per-beat audio map. */
  id?: string;
  /** Speaking character's name; a beat without it is not voiced. */
  speaker?: string;
  /** Spoken text; a beat without it is not voiced. */
  line?: string;
  /** Optional delivery direction forwarded to synthesis alongside the line. */
  lineDelivery?: string;
};
|
||||||
|
|
||||||
|
// Fallback voice-design prompt for a speaker that is not in the registry:
// asks the voice model to infer gender, age and temperament from the name,
// with the world setting appended as context, so the voice is never borrowed
// from some other character's description.
function inferredSpeakerDescription(name: string, session: Session): string {
  const { worldSetting } = session;
  return `请根据角色名「${name}」推断其性别、年龄与气质,生成最贴合的音色。所属世界观:${worldSetting}`;
}
|
||||||
|
|
||||||
|
// Synthesize audio for one beat and return the (possibly extended) registry.
//
// - A beat lacking a speaker or a line is skipped: no audio, registry unchanged.
// - Registered speaker with a voice: that voice is reused.
// - Registered speaker without a voice: one is provisioned from the stored
//   description and written into a copy of the registry.
// - Unregistered speaker: a description is inferred from the name, a voice is
//   provisioned, and a new entry is appended.
//
// Any failure is logged and degrades to "no audio". The registry returned on
// failure still includes a voice provisioned before the error, so the next
// beat for that speaker does not have to pay to design it again.
export async function voiceBeat(
  cfg: TtsConfig,
  session: Session,
  characters: Character[],
  beat: BeatLike,
): Promise<{ audio?: BeatAudio; characters: Character[] }> {
  const { speaker, line, lineDelivery } = beat;
  if (!speaker || !line) {
    return { characters };
  }

  // Declared outside the try so the catch can hand back partial progress.
  let registry: Character[] = characters;

  try {
    const position = characters.findIndex((c) => c.name === speaker);
    const known = position === -1 ? undefined : characters[position];
    let voice: CharacterVoice | undefined = known?.voice;

    if (!voice) {
      if (known) {
        const designed = await provisionVoice(cfg, known.description);
        voice = designed;
        registry = characters.map((c, i) =>
          i === position ? { ...c, voice: designed } : c,
        );
      } else {
        const description = inferredSpeakerDescription(speaker, session);
        voice = await provisionVoice(cfg, description);
        registry = [...characters, { name: speaker, description, voice }];
      }
    }

    const clip = await synthesize(cfg, voice, line, lineDelivery);
    return {
      audio: { base64: clip.audioBase64, mime: clip.mimeType },
      characters: registry,
    };
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    console.error(`[voice] degraded: ${reason}`);
    return { characters: registry };
  }
}
|
||||||
|
|
||||||
|
// Voice all beats of a scene, one after another.
// The loop is deliberately sequential: each call receives the registry produced
// by the previous one, so a speaker appearing in several beats is provisioned
// once and reused. Running the beats in parallel would race and provision
// duplicates. Beats that yield no audio are simply absent from the returned map.
export async function voiceScene(
  cfg: TtsConfig,
  session: Session,
  scene: Scene,
): Promise<{
  beatAudio: Record<string, BeatAudio>;
  characters: Character[];
}> {
  const beatAudio: Record<string, BeatAudio> = {};
  let registry = session.characters;

  for (const beat of scene.beats) {
    const { audio, characters } = await voiceBeat(cfg, session, registry, beat);
    registry = characters;
    if (audio) {
      beatAudio[beat.id] = audio;
    }
  }

  return { beatAudio, characters: registry };
}
|
||||||
@@ -0,0 +1,17 @@
|
|||||||
|
{
|
||||||
|
"name": "@yume/tts-client",
|
||||||
|
"version": "0.1.0",
|
||||||
|
"private": true,
|
||||||
|
"type": "module",
|
||||||
|
"main": "./src/index.ts",
|
||||||
|
"types": "./src/index.ts",
|
||||||
|
"exports": {
|
||||||
|
".": "./src/index.ts"
|
||||||
|
},
|
||||||
|
"scripts": {
|
||||||
|
"typecheck": "tsc --noEmit"
|
||||||
|
},
|
||||||
|
"dependencies": {
|
||||||
|
"@yume/types": "workspace:*"
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
export { xiaomiProvision as provisionVoice, xiaomiSynthesize as synthesize } from "./xiaomi";
|
||||||
@@ -0,0 +1,113 @@
|
|||||||
|
import type { CharacterVoice, TtsConfig } from "@yume/types";
|
||||||
|
|
||||||
|
// Xiaomi MiMo currently outputs wav / pcm16 only (mp3 not supported for output).
|
||||||
|
// The reference clip we persist is therefore wav. Kept as a single switch so we
|
||||||
|
// can flip to mp3 the day the API supports it.
|
||||||
|
const OUTPUT_FORMAT = "wav";
|
||||||
|
const OUTPUT_MIME = "audio/wav";
|
||||||
|
|
||||||
|
// Request headers shared by both Xiaomi calls: JSON body plus the key in the
// `api-key` header.
function buildHeaders(cfg: TtsConfig): HeadersInit {
  const { apiKey } = cfg;
  const headers = {
    "Content-Type": "application/json",
    "api-key": apiKey,
  };
  return headers;
}
|
||||||
|
|
||||||
|
// Join a configured base URL and an endpoint path with exactly one slash
// between them. All trailing slashes on the base are removed (not just one),
// and a non-empty path missing its leading slash gets one, so a sloppy
// base URL setting cannot produce "//chat/completions" or "hostchat".
// An empty path returns the trimmed base unchanged.
function joinUrl(baseUrl: string, path: string): string {
  const base = baseUrl.replace(/\/+$/, "");
  const suffix = path === "" || path.startsWith("/") ? path : `/${path}`;
  return `${base}${suffix}`;
}
|
||||||
|
|
||||||
|
// Model id for the voice-design step: the configured base model name with a
// "-voicedesign" suffix.
function designModel(cfg: TtsConfig): string {
  const { speechModel } = cfg;
  return speechModel + "-voicedesign";
}
|
||||||
|
|
||||||
|
// Model id for the voice-clone (synthesis) step: the configured base model
// name with a "-voiceclone" suffix.
function cloneModel(cfg: TtsConfig): string {
  const { speechModel } = cfg;
  return speechModel + "-voiceclone";
}
|
||||||
|
|
||||||
|
/**
 * The slice of the chat-completions response body this adapter reads.
 * Everything is optional because the body is only cast, never validated.
 */
type ChatAudioResponse = {
  /** Audio payload is taken from `choices[0].message.audio.data`. */
  choices?: Array<{ message?: { audio?: { data?: string } } }>;
  /** Error detail, preferred when no audio is present. */
  error?: { message?: string };
  /** Top-level message, used as the next fallback for error text. */
  message?: string;
};
|
||||||
|
|
||||||
|
// Return the audio payload from a response body, or throw when there is none.
// The thrown message names the calling step (`where`) and includes the best
// available explanation: error.message, then the top-level message, then the
// serialized body. The explanation is capped at 300 characters.
function extractAudio(json: ChatAudioResponse, where: string): string {
  const payload = json.choices?.[0]?.message?.audio?.data;
  if (payload) return payload;

  const reason = json.error?.message ?? json.message ?? JSON.stringify(json);
  throw new Error(`Xiaomi ${where} returned no audio: ${reason.slice(0, 300)}`);
}
|
||||||
|
|
||||||
|
// Design a voice from a free-form description.
// Posts to {baseUrl}/chat/completions with the "-voicedesign" model: the
// description travels in the user message and a fixed sample sentence in the
// assistant message. The returned audio is kept as the character's reference
// clip for later synthesis calls.
// Throws on a non-2xx status (status plus the first 300 chars of the body) or
// when the response carries no audio.
export async function xiaomiProvision(
  cfg: TtsConfig,
  description: string,
): Promise<CharacterVoice> {
  const response = await fetch(joinUrl(cfg.baseUrl, "/chat/completions"), {
    method: "POST",
    headers: buildHeaders(cfg),
    body: JSON.stringify({
      model: designModel(cfg),
      messages: [
        { role: "user", content: description },
        { role: "assistant", content: "你好,这是音色试听样本。" },
      ],
      audio: { format: OUTPUT_FORMAT },
    }),
  });

  if (!response.ok) {
    const detail = (await response.text()).slice(0, 300);
    throw new Error(`Xiaomi voicedesign ${response.status}: ${detail}`);
  }

  const payload = (await response.json()) as ChatAudioResponse;
  return {
    provider: "xiaomi",
    referenceAudioBase64: extractAudio(payload, "voicedesign"),
    mimeType: OUTPUT_MIME,
  };
}
|
||||||
|
|
||||||
|
// Speak `text` in a previously provisioned voice.
// Posts to {baseUrl}/chat/completions with the "-voiceclone" model. The
// optional delivery direction goes in the user message (empty string when
// absent) and the text to be spoken goes in the assistant message, so the
// direction shapes the performance without being part of the spoken text.
// The voice is passed as a data URI built from the stored reference clip.
// Throws on a non-2xx status (status plus the first 300 chars of the body) or
// when the response carries no audio.
export async function xiaomiSynthesize(
  cfg: TtsConfig,
  voice: CharacterVoice,
  text: string,
  delivery?: string,
): Promise<{ audioBase64: string; mimeType: string }> {
  const direction = delivery?.trim() ?? "";
  const referenceClip = `data:${voice.mimeType};base64,${voice.referenceAudioBase64}`;

  const response = await fetch(joinUrl(cfg.baseUrl, "/chat/completions"), {
    method: "POST",
    headers: buildHeaders(cfg),
    body: JSON.stringify({
      model: cloneModel(cfg),
      messages: [
        { role: "user", content: direction },
        { role: "assistant", content: text },
      ],
      audio: {
        format: OUTPUT_FORMAT,
        voice: referenceClip,
      },
    }),
  });

  if (!response.ok) {
    const detail = (await response.text()).slice(0, 300);
    throw new Error(`Xiaomi voiceclone ${response.status}: ${detail}`);
  }

  const payload = (await response.json()) as ChatAudioResponse;
  return {
    audioBase64: extractAudio(payload, "voiceclone"),
    mimeType: OUTPUT_MIME,
  };
}
|
||||||
@@ -0,0 +1,7 @@
|
|||||||
|
{
|
||||||
|
"extends": "../../tsconfig.base.json",
|
||||||
|
"compilerOptions": {
|
||||||
|
"noEmit": true
|
||||||
|
},
|
||||||
|
"include": ["src/**/*"]
|
||||||
|
}
|
||||||
@@ -9,6 +9,8 @@ export type Beat = {
|
|||||||
narration?: string;
|
narration?: string;
|
||||||
speaker?: string;
|
speaker?: string;
|
||||||
line?: string;
|
line?: string;
|
||||||
|
/** Free-form voice-acting direction for the line, sent to TTS only. Never displayed. */
|
||||||
|
lineDelivery?: string;
|
||||||
next: BeatNext;
|
next: BeatNext;
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -54,6 +56,30 @@ export type SceneHistoryEntry = {
|
|||||||
exit?: SceneExit;
|
exit?: SceneExit;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// ──────────────────────────────────────────────────────────────────────
|
||||||
|
// Characters & voices (TTS)
|
||||||
|
// ──────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * A provisioned voice. The voice is identified by the reference audio clip
 * itself rather than by a server-side id, so the clip travels with the character.
 */
export type CharacterVoice = {
  /** Which TTS adapter produced this voice. */
  provider: "xiaomi";
  /** Xiaomi MiMo design output stored as reference audio for later clones. */
  referenceAudioBase64: string;
  /** MIME type of the reference audio; used when building its data URI. */
  mimeType: string;
};
|
||||||
|
|
||||||
|
/** One entry in the session's character registry. */
export type Character = {
  /** Registry key; beats are matched to characters by exact name equality. */
  name: string;
  /** Free-form voice design description; must begin with explicit gender. */
  description: string;
  /** Present once a voice has been provisioned; absent until then. */
  voice?: CharacterVoice;
};
|
||||||
|
|
||||||
|
/** A single beat's synthesized audio, attached to the response. */
export type BeatAudio = {
  /** Base64-encoded audio data. */
  base64: string;
  /** MIME type of the audio data. */
  mime: string;
};
|
||||||
|
|
||||||
// ──────────────────────────────────────────────────────────────────────
|
// ──────────────────────────────────────────────────────────────────────
|
||||||
// Session
|
// Session
|
||||||
// ──────────────────────────────────────────────────────────────────────
|
// ──────────────────────────────────────────────────────────────────────
|
||||||
@@ -64,6 +90,8 @@ export type Session = {
|
|||||||
worldSetting: string;
|
worldSetting: string;
|
||||||
styleGuide: string;
|
styleGuide: string;
|
||||||
history: SceneHistoryEntry[];
|
history: SceneHistoryEntry[];
|
||||||
|
/** Character registry — accumulates across scenes; voices persist for reuse. */
|
||||||
|
characters: Character[];
|
||||||
};
|
};
|
||||||
|
|
||||||
// ──────────────────────────────────────────────────────────────────────
|
// ──────────────────────────────────────────────────────────────────────
|
||||||
@@ -87,10 +115,21 @@ export type ProviderConfig = {
|
|||||||
model: string;
|
model: string;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
 * Settings for the TTS provider. Used as the optional `tts` field of
 * `EngineConfig`; when that field is missing the game runs silently.
 */
export type TtsConfig = {
|
||||||
|
// NOTE(review): presumably the root URL of the TTS HTTP endpoint — confirm against the adapter.
baseUrl: string;
|
||||||
|
// NOTE(review): presumably the credential sent with each TTS request — confirm against the adapter.
apiKey: string;
|
||||||
|
/** Base model name; adapter derives "-voicedesign" / "-voiceclone" suffixes. */
|
||||||
|
speechModel: string;
|
||||||
|
};
|
||||||
|
|
||||||
export type EngineConfig = {
|
export type EngineConfig = {
|
||||||
text: ProviderConfig;
|
text: ProviderConfig;
|
||||||
image: ProviderConfig;
|
image: ProviderConfig;
|
||||||
vision: ProviderConfig;
|
vision: ProviderConfig;
|
||||||
|
/** Optional — when missing the game runs silently (no TTS). */
|
||||||
|
tts?: TtsConfig;
|
||||||
|
/** When true the renderer returns a placeholder PNG instead of calling the image API. */
|
||||||
|
mockImage?: boolean;
|
||||||
};
|
};
|
||||||
|
|
||||||
// ──────────────────────────────────────────────────────────────────────
|
// ──────────────────────────────────────────────────────────────────────
|
||||||
@@ -106,6 +145,10 @@ export type StartResponse = {
|
|||||||
sessionId: string;
|
sessionId: string;
|
||||||
scene: Scene;
|
scene: Scene;
|
||||||
imageBase64: string;
|
imageBase64: string;
|
||||||
|
/** Post-voice character registry (with provisioned voices). */
|
||||||
|
characters: Character[];
|
||||||
|
/** Per-beat synthesized audio, keyed by beat.id. */
|
||||||
|
beatAudio?: Record<string, BeatAudio>;
|
||||||
};
|
};
|
||||||
|
|
||||||
// /api/scene — generates the next Scene, given session whose latest
|
// /api/scene — generates the next Scene, given session whose latest
|
||||||
@@ -118,6 +161,8 @@ export type SceneRequest = {
|
|||||||
export type SceneResponse = {
|
export type SceneResponse = {
|
||||||
scene: Scene;
|
scene: Scene;
|
||||||
imageBase64: string;
|
imageBase64: string;
|
||||||
|
characters: Character[];
|
||||||
|
beatAudio?: Record<string, BeatAudio>;
|
||||||
};
|
};
|
||||||
|
|
||||||
// /api/vision — interprets a background click on the current image and
|
// /api/vision — interprets a background click on the current image and
|
||||||
@@ -141,10 +186,16 @@ export type InsertBeatRequest = {
|
|||||||
freeformAction: string;
|
freeformAction: string;
|
||||||
};
|
};
|
||||||
|
|
||||||
export type InsertBeatResponse = {
|
/** Partial beat fields produced by the insert-beat director. */
|
||||||
partial: {
|
export type InsertBeatPartial = {
|
||||||
narration?: string;
|
narration?: string;
|
||||||
speaker?: string;
|
speaker?: string;
|
||||||
line?: string;
|
line?: string;
|
||||||
};
|
lineDelivery?: string;
|
||||||
|
};
|
||||||
|
|
||||||
|
export type InsertBeatResponse = {
|
||||||
|
partial: InsertBeatPartial;
|
||||||
|
characters: Character[];
|
||||||
|
audio?: BeatAudio;
|
||||||
};
|
};
|
||||||
|
|||||||
Generated
+9
@@ -69,6 +69,9 @@ importers:
|
|||||||
'@yume/ai-client':
|
'@yume/ai-client':
|
||||||
specifier: workspace:*
|
specifier: workspace:*
|
||||||
version: link:../ai-client
|
version: link:../ai-client
|
||||||
|
'@yume/tts-client':
|
||||||
|
specifier: workspace:*
|
||||||
|
version: link:../tts-client
|
||||||
'@yume/types':
|
'@yume/types':
|
||||||
specifier: workspace:*
|
specifier: workspace:*
|
||||||
version: link:../types
|
version: link:../types
|
||||||
@@ -76,6 +79,12 @@ importers:
|
|||||||
specifier: ^0.33.5
|
specifier: ^0.33.5
|
||||||
version: 0.33.5
|
version: 0.33.5
|
||||||
|
|
||||||
|
packages/tts-client:
|
||||||
|
dependencies:
|
||||||
|
'@yume/types':
|
||||||
|
specifier: workspace:*
|
||||||
|
version: link:../types
|
||||||
|
|
||||||
packages/types: {}
|
packages/types: {}
|
||||||
|
|
||||||
packages:
|
packages:
|
||||||
|
|||||||
Reference in New Issue
Block a user