diff --git a/apps/web/.env.example b/apps/web/.env.example index daf42be..aa0d983 100644 --- a/apps/web/.env.example +++ b/apps/web/.env.example @@ -1,24 +1,45 @@ # ============================================================= # 云梦 — AI 视觉小说 -# Three independently configurable AI providers -# Any OpenAI-compatible endpoint works (OpenRouter, OpenAI, -# Anthropic via OpenAI-compat proxy, Gemini, DeepSeek, Ollama). +# Recommended setup: Xiaomi MiMo Token Plan for TEXT / VISION / TTS +# (one API key covers all three) + any image provider for IMAGE. # +# Any OpenAI-compatible endpoint works for any slot — OpenRouter, +# OpenAI, Anthropic via OpenAI-compat proxy, Gemini, DeepSeek, etc. # Image generation uses the chat-completions + modalities API # (OpenRouter-style), NOT the legacy /images/generations endpoint. # ============================================================= -# ---- 1. Text LLM (story director) ----------------------------- -TEXT_BASE_URL=https://openrouter.ai/api/v1 -TEXT_API_KEY=sk-or-v1-xxx -TEXT_MODEL=~anthropic/claude-sonnet-latest +# ---- 1. Text LLM · scene director ---------------------------------- +# Recommended: MiMo V2.5 Pro (1M context, native JSON-mode, strong CN) +# Token Plan host: https://token-plan-sgp.xiaomimimo.com/v1 +# Pay-as-you-go host: https://api.xiaomimimo.com/v1 (sk- keys) +TEXT_BASE_URL=https://token-plan-sgp.xiaomimimo.com/v1 +TEXT_API_KEY=tp-xxx +TEXT_MODEL=mimo-v2.5-pro -# ---- 2. Image generator (renders the whole UI screen) --------- +# ---- 2. Image generator (renders the scene background) ------------- +# Any provider supporting chat-completions + modalities image output. IMAGE_BASE_URL=https://openrouter.ai/api/v1 IMAGE_API_KEY=sk-or-v1-xxx IMAGE_MODEL=openai/gpt-5.4-image-2 -# ---- 3. Vision model (interprets where the user clicked) ------ -VISION_BASE_URL=https://openrouter.ai/api/v1 -VISION_API_KEY=sk-or-v1-xxx -VISION_MODEL=~google/gemini-flash-latest +# ---- 3. Vision model · multimodal click interpretation ------------- +# Recommended: MiMo V2.5 omni — multimodal. +# ⚠️ DO NOT use mimo-v2.5-pro for this slot — Pro is text-only and +# rejects image_url content parts. +VISION_BASE_URL=https://token-plan-sgp.xiaomimimo.com/v1 +VISION_API_KEY=tp-xxx +VISION_MODEL=mimo-v2.5 + +# ---- 4. TTS · Xiaomi MiMo (optional — leave blank to disable) ------ +# Per-character voice design → clone, with per-line delivery direction. +# Voice identity = the reference audio kept in the session (no server expiry). +# The adapter appends -voicedesign / -voiceclone to TTS_SPEECH_MODEL. +TTS_BASE_URL=https://token-plan-sgp.xiaomimimo.com/v1 +TTS_API_KEY=tp-xxx +TTS_SPEECH_MODEL=mimo-v2.5-tts + +# ---- 5. MOCK_IMAGE — skip image generation (cheap TTS testing) ----- +# true → return a placeholder image instead of calling the image model. +# Text/story/voice still run normally. Great for iterating on TTS. +MOCK_IMAGE=false diff --git a/apps/web/app/play/page.tsx b/apps/web/app/play/page.tsx index 0e56ebd..586f4ac 100644 --- a/apps/web/app/play/page.tsx +++ b/apps/web/app/play/page.tsx @@ -14,6 +14,7 @@ import { PlayCanvas, type Phase } from "@/components/PlayCanvas"; import { PRESETS } from "@/lib/presets"; import type { Beat, + BeatAudio, BeatChoice, InsertBeatResponse, Scene, @@ -24,6 +25,8 @@ import type { VisionResponse, } from "@yume/types"; +const MUTED_STORAGE_KEY = "yume:muted"; + // ────────────────────────────────────────────────────────────────────── // Prefetch pool — speculative SceneResponses keyed by choice path. // @@ -133,7 +136,16 @@ function prefetchScenePath( nextSceneSeed: sole.effect.nextSceneSeed, }, }; - prefetchScenePath(pool, baseSession, [...steps, nextStep], depth + 1); + // Carry forward the registry that the parent prefetch result already + // settled (it may include characters introduced by the intermediate + // scene). Without this, the L2+ prefetch starts from the original + // base.characters and a later transition through this survivor would + // silently drop voices the player has already heard. + const carriedBase: Session = { + ...baseSession, + characters: data.characters, + }; + prefetchScenePath(pool, carriedBase, [...steps, nextStep], depth + 1); } } @@ -181,6 +193,18 @@ function PlayInner() { const [currentScene, setCurrentScene] = useState(null); const [currentBeatId, setCurrentBeatId] = useState(null); const [imageBase64, setImageBase64] = useState(null); + const [beatAudioMap, setBeatAudioMap] = useState>({}); + // Lazy-initialize from localStorage so PlayCanvas never mounts with the + // wrong muted value (an effect-based read would briefly let audio play + // before the preference settled in a scenario where audio arrives early). + const [muted, setMuted] = useState(() => { + if (typeof window === "undefined") return false; + try { + return window.localStorage.getItem(MUTED_STORAGE_KEY) === "1"; + } catch { + return false; + } + }); const [pendingClick, setPendingClick] = useState<{ x: number; y: number; @@ -203,6 +227,10 @@ function PlayInner() { return currentScene.beats.find((b) => b.id === currentBeatId) ?? null; }, [currentScene, currentBeatId]); + const currentBeatAudio = currentBeat ? beatAudioMap[currentBeat.id] : undefined; + const audioBase64 = currentBeatAudio?.base64 ?? null; + const audioMime = currentBeatAudio?.mime ?? null; + useEffect(() => { sessionRef.current = session; }, [session]); @@ -231,6 +259,19 @@ function PlayInner() { }); }, [currentBeatId]); + // ── Mute persistence (read is via the useState lazy initializer above) ─ + const toggleMuted = useCallback(() => { + setMuted((prev) => { + const next = !prev; + try { + window.localStorage.setItem(MUTED_STORAGE_KEY, next ? "1" : "0"); + } catch { + // ignore + } + return next; + }); + }, []); + // ── Presentation mode toggle ───────────────────────────────────────── const togglePresentation = useCallback(async () => { const entering = !presentation; @@ -327,12 +368,14 @@ function PlayInner() { visitedBeatIds: [data.scene.entryBeatId], }, ], + characters: data.characters, }; visitedBeatsRef.current = [data.scene.entryBeatId]; setSession(initial); setCurrentScene(data.scene); setCurrentBeatId(data.scene.entryBeatId); setImageBase64(data.imageBase64); + setBeatAudioMap(data.beatAudio ?? {}); setPhase("ready"); }) .catch((e) => setError(String(e))); @@ -409,12 +452,14 @@ function PlayInner() { visitedBeatIds: [result.scene.entryBeatId], }, ], + characters: result.characters, }; visitedBeatsRef.current = [result.scene.entryBeatId]; setSession(newSession); setCurrentScene(result.scene); setCurrentBeatId(result.scene.entryBeatId); setImageBase64(result.imageBase64); + setBeatAudioMap(result.beatAudio ?? {}); setLastExitLabel(exitLabel); setPhase("ready"); } catch (e) { @@ -514,7 +559,8 @@ function PlayInner() { }; throw new Error(j.error ?? insertRes.statusText); } - const { partial } = (await insertRes.json()) as InsertBeatResponse; + const { partial, characters: insertChars, audio } = + (await insertRes.json()) as InsertBeatResponse; const fromBeatId = currentBeatRef.current?.id ?? currentScene.entryBeatId; @@ -526,6 +572,7 @@ function PlayInner() { narration: partial.narration, speaker: partial.speaker, line: partial.line, + lineDelivery: partial.lineDelivery, next: { type: "continue", nextBeatId: fromBeatId }, }; @@ -541,11 +588,15 @@ function PlayInner() { history: s.history.map((h, i, arr) => i === arr.length - 1 ? { ...h, scene: patched } : h, ), + characters: insertChars, } : s, ); setCurrentScene(patched); setCurrentBeatId(newBeatId); + if (audio) { + setBeatAudioMap((m) => ({ ...m, [newBeatId]: audio })); + } setLastExitLabel(decision.intent.freeformAction); setPhase("ready"); setPendingClick(null); @@ -627,6 +678,9 @@ function PlayInner() {
Ⅰ · Ⅰ
- +
); diff --git a/apps/web/components/PlayCanvas.tsx b/apps/web/components/PlayCanvas.tsx index 51acdaa..9e77dde 100644 --- a/apps/web/components/PlayCanvas.tsx +++ b/apps/web/components/PlayCanvas.tsx @@ -13,30 +13,66 @@ export type Phase = const SHADOW = "0 1px 0 rgba(45,24,16,0.05), 0 36px 64px -28px rgba(45,24,16,0.25), 0 8px 18px -6px rgba(45,24,16,0.10)"; +const DEFAULT_CHAR_MS = 28; +const MIN_CHAR_MS = 30; +// Voice playback speed multiplier. >1 speeds up the (somewhat slow) MiMo voice +// while preserving pitch. Typewriter pacing is divided by the same factor. +const SPEECH_RATE = 1.2; +// If audio metadata never arrives within this window, give up waiting and +// let the typewriter run at default speed. +const AUDIO_WAIT_TIMEOUT_MS = 2500; + // ── Typewriter hook ──────────────────────────────────────────────────── // Returns the progressively-revealed text, a `done` flag, and a `skip()` that // instantly completes the current text. Reset is keyed by `resetKey` (the beat // id) rather than the text, so a new beat whose line happens to match the -// previous one still replays from scratch. `done` is derived synchronously -// (not from a post-paint effect) so a stale "done" frame never paints. +// previous one still replays from scratch. +// +// When `targetDurationMs` is provided we space characters to span that audio +// duration, keeping text and voice in lockstep. While `waitForAudio` is true +// and we don't yet know a duration, the typewriter holds (so text doesn't +// race ahead of an audio that's still loading). function useTypewriter( text: string, resetKey: string, - speed = 28, + opts: { targetDurationMs?: number; waitForAudio: boolean } = { + waitForAudio: false, + }, ): { shown: string; done: boolean; skip: () => void } { + const { targetDurationMs, waitForAudio } = opts; const [displayed, setDisplayed] = useState(""); const [prevKey, setPrevKey] = useState(resetKey); const timer = useRef | null>(null); + // Sticky once the player has skipped this beat: prevents a late-arriving + // audio metadata event from re-triggering the effect and replaying the text. + const skippedRef = useRef(false); // Render-phase reset (React "adjust state on prop change" pattern): when the // beat changes, drop the old progress before this render commits. if (resetKey !== prevKey) { setPrevKey(resetKey); setDisplayed(""); + skippedRef.current = false; } useEffect(() => { if (!text) return; + // `=== undefined` (not `!targetDurationMs`): 0 means "audio failed or + // timed out — run at default speed". The original truthy check stalled + // the typewriter forever on those fallback paths. + if (waitForAudio && targetDurationMs === undefined) return; + // If the player skipped, settle on the full text and don't restart even + // when audio metadata arrives late and re-triggers this effect. + if (skippedRef.current) { + setDisplayed(text); + return; + } + + const speed = + targetDurationMs && text.length > 0 + ? Math.max(MIN_CHAR_MS, targetDurationMs / text.length) + : DEFAULT_CHAR_MS; + let i = 0; timer.current = setInterval(() => { i += 1; @@ -50,13 +86,14 @@ function useTypewriter( if (timer.current) clearInterval(timer.current); timer.current = null; }; - }, [resetKey, text, speed]); + }, [resetKey, text, targetDurationMs, waitForAudio]); const skip = useCallback(() => { if (timer.current) { clearInterval(timer.current); timer.current = null; } + skippedRef.current = true; setDisplayed(text); }, [text]); @@ -123,6 +160,9 @@ function ChoiceButton({ // ── Main component ───────────────────────────────────────────────────── export function PlayCanvas({ imageBase64, + audioBase64, + audioMime, + muted, phase, beat, pendingClick, @@ -132,6 +172,9 @@ export function PlayCanvas({ fullViewport = false, }: { imageBase64: string | null; + audioBase64: string | null; + audioMime: string | null; + muted: boolean; phase: Phase; beat: Beat | null; pendingClick: { x: number; y: number } | null; @@ -141,7 +184,11 @@ export function PlayCanvas({ fullViewport?: boolean; }) { const imgRef = useRef(null); + const audioRef = useRef(null); const [dims, setDims] = useState<{ w: number; h: number } | null>(null); + const [audioDurationMs, setAudioDurationMs] = useState( + undefined, + ); const isChoiceBeat = beat?.next.type === "choice"; const choices: BeatChoice[] = isChoiceBeat @@ -150,7 +197,56 @@ export function PlayCanvas({ const displayBody = beat?.speaker ? beat.line ?? "" : beat?.narration ?? ""; const { shown: typedBody, done: typingDone, skip: skipTypewriter } = - useTypewriter(displayBody, beat?.id ?? "", 30); + useTypewriter(displayBody, beat?.id ?? "", { + targetDurationMs: audioDurationMs, + waitForAudio: Boolean(audioBase64), + }); + + // ── Audio source change ────────────────────────────────────────────── + // Reset duration when audio source changes; if loading takes too long, + // unblock the typewriter via timeout so text doesn't stall. + useEffect(() => { + setAudioDurationMs(undefined); + if (!audioBase64) return; + const timer = setTimeout(() => { + setAudioDurationMs((prev) => prev ?? 0); + }, AUDIO_WAIT_TIMEOUT_MS); + return () => clearTimeout(timer); + }, [audioBase64]); + + // ── Mute toggle ─────────────────────────────────────────────────────── + useEffect(() => { + const el = audioRef.current; + if (!el) return; + el.muted = muted; + el.playbackRate = SPEECH_RATE; + if (!muted && audioBase64 && el.paused) { + el.play().catch(() => { + // autoplay blocked — silent until next interaction + }); + } + }, [muted, audioBase64]); + + function handleAudioMetadata() { + const el = audioRef.current; + if (!el) return; + el.playbackRate = SPEECH_RATE; + // Effective playback time is shorter once sped up — keep the typewriter in sync. + const ms = Number.isFinite(el.duration) + ? (el.duration * 1000) / SPEECH_RATE + : 0; + setAudioDurationMs(ms > 0 ? ms : 0); + if (!muted) { + el.play().catch(() => { + // autoplay blocked + }); + } + } + + function handleAudioError() { + // Treat as zero duration so the typewriter runs at default speed. + setAudioDurationMs(0); + } function handleImageClick(e: React.MouseEvent) { if (phase !== "ready" || !imgRef.current || !beat) return; @@ -197,6 +293,19 @@ export function PlayCanvas({
+ {/* Hidden audio element — voice playback for the current beat */} + {audioBase64 && ( +