diff --git a/README.en.md b/README.en.md index e600b6d..93fac18 100644 --- a/README.en.md +++ b/README.en.md @@ -159,6 +159,12 @@ With the recommended trio, each scene's cost comes mainly from the image generat By default the browser fetches images directly from the provider — no setup needed; leave `NEXT_PUBLIC_IMAGE_PROXY_URL` blank and you're completely unaffected. You only want this if you hit progressive "top-to-bottom" image loading (Chrome's `ERR_QUIC_PROTOCOL_ERROR` on some networks paints partial PNGs row by row): deploy a tiny Cloudflare Worker that re-fetches images server-side and serves them atomically over HTTP/2. One-click deploy at **[infiplot-image-proxy](https://github.com/zonghaoyuan/infiplot-image-proxy)**, then paste the `workers.dev` URL it prints into `NEXT_PUBLIC_IMAGE_PROXY_URL`. +**5. Let players bring their own voice Key (optional, recommended)** + +Xiaomi rate-limits the TTS model by RPM/TPM. When a public deployment has many people playing at once through a single shared `TTS_API_KEY`, those limits are easy to hit — the symptom is **story and visuals work fine, but there's no audio**. To fix this, players can optionally enter **their own** Xiaomi MiMo key on the homepage (free to obtain). Synthesis then runs **browser-direct to Xiaomi**, the **key stays in the player's browser and never touches your server**, and they get stable voice with lower latency. It's purely additive: leave it blank and playback falls back to your server key exactly as before. + +See the [Bring-your-own voice Key guide](docs/xiaomi-tts-key.md) for how to obtain and enter one. 
+ --- ## Roadmap diff --git a/README.ja.md b/README.ja.md index ee2fdcd..bb536ad 100644 --- a/README.ja.md +++ b/README.ja.md @@ -158,6 +158,12 @@ InfiPlot は 4 種類のモデルプロバイダと通信します。**テキス デフォルトではブラウザが画像プロバイダーに直接アクセスするため、設定は不要です —— `NEXT_PUBLIC_IMAGE_PROXY_URL` を空欄のままにすれば、まったく影響ありません。画像が「上から順に」表示される現象(一部のネットワークで Chrome の `ERR_QUIC_PROTOCOL_ERROR` により PNG が行ごとに描画される)に遭遇した場合のみ必要です。小さな Cloudflare Worker をデプロイすると、画像をサーバー側で再取得し HTTP/2 で一括返却します。ワンクリックデプロイは **[infiplot-image-proxy](https://github.com/zonghaoyuan/infiplot-image-proxy)** を参照し、出力された `workers.dev` の URL を `NEXT_PUBLIC_IMAGE_PROXY_URL` に設定してください。 +**5. プレイヤー自身の音声 Key(任意・推奨)** + +Xiaomi は TTS モデルに RPM/TPM 制限を設けています。公開デプロイで多数のプレイヤーが単一の `TTS_API_KEY` を共有して同時にプレイすると、この制限に達しやすく、**ストーリーも画像も正常なのに音声だけ出ない**という症状になります。対策として、プレイヤーはトップページで**自分の** Xiaomi MiMo Key(無料で取得可)を任意で入力できます。合成は**ブラウザから Xiaomi へ直接**行われ、**Key はプレイヤーのブラウザ内にのみ保存され、あなたのサーバーを一切経由しません**。これにより安定した音声と低遅延が得られます。完全な追加機能であり、未入力ならこれまで通りサーバー側の Key にフォールバックします。 + +取得・入力の手順は [音声 Key 持ち込みガイド](docs/xiaomi-tts-key.md) を参照してください。 + --- ## Roadmap diff --git a/README.md b/README.md index f597861..80c38ce 100644 --- a/README.md +++ b/README.md @@ -158,6 +158,12 @@ InfiPlot 会与四类模型供应商通信。**文本(Text)和视觉(Visio 默认浏览器直连图片供应商,无需任何配置 —— 留空 `NEXT_PUBLIC_IMAGE_PROXY_URL` 即可,完全不受影响。只有当你遇到图片「层层加载」(Chrome 在某些网络下 `ERR_QUIC_PROTOCOL_ERROR` 导致 PNG 逐行渲染)时才需要它:部署一个极小的 Cloudflare Worker,把图片改为服务端转发 + HTTP/2 原子返回。一键部署见 **[infiplot-image-proxy](https://github.com/zonghaoyuan/infiplot-image-proxy)**,然后把它给出的 `workers.dev` 地址填进 `NEXT_PUBLIC_IMAGE_PROXY_URL`。 +**5. 
玩家自带配音 Key(可选,推荐)** + +小米对 TTS 模型有 RPM/TPM 限额。当你的公共部署有多人同时游玩、共用同一把 `TTS_API_KEY` 时,很容易撞到限额,表现为**剧情、画面都正常,唯独没有声音**。为此,玩家可以在首页可选地填入**自己的**小米 MiMo Key(免费申请)——配音请求由**浏览器直连小米**完成,**Key 只存在玩家本地、绝不经过你的服务器**,从而获得稳定配音与更低延迟。这是纯增强:不填则照常使用你部署的服务器 Key,行为不变。 + +申请与填写步骤见 [自带配音 Key 教程](docs/xiaomi-tts-key.md)。 + --- ## Roadmap diff --git a/app/api/insert-beat/route.ts b/app/api/insert-beat/route.ts index ca36c55..e40450a 100644 --- a/app/api/insert-beat/route.ts +++ b/app/api/insert-beat/route.ts @@ -22,7 +22,9 @@ export async function POST(req: Request) { } try { - const config = loadEngineConfig(req.headers); + const base = loadEngineConfig(req.headers); + // See StartRequest.clientTts — BYO clients synth in-browser, so drop server TTS. + const config = body.clientTts ? { ...base, tts: undefined } : base; const result = await requestInsertBeat(config, body); return NextResponse.json(result); } catch (err) { diff --git a/app/api/scene/route.ts b/app/api/scene/route.ts index 11cb26b..e7a8127 100644 --- a/app/api/scene/route.ts +++ b/app/api/scene/route.ts @@ -23,7 +23,9 @@ export async function POST(req: Request) { } try { - const config = loadEngineConfig(req.headers); + const base = loadEngineConfig(req.headers); + // See StartRequest.clientTts — BYO clients synth in-browser, so drop server TTS. + const config = body.clientTts ? { ...base, tts: undefined } : base; const result = await requestScene(config, body); return NextResponse.json(result); } catch (err) { diff --git a/app/api/start/route.ts b/app/api/start/route.ts index 378850e..5c9af7c 100644 --- a/app/api/start/route.ts +++ b/app/api/start/route.ts @@ -41,7 +41,11 @@ export async function POST(req: Request) { } try { - const config = loadEngineConfig(req.headers); + const base = loadEngineConfig(req.headers); + // BYO key: the browser provisions + synths voices directly against Xiaomi + // (key never reaches us), so strip server-side TTS so the engine skips all + // provisioning + synth. See StartRequest.clientTts. 
+ const config = body.clientTts ? { ...base, tts: undefined } : base; const result = await startSession(config, body); return NextResponse.json(result); } catch (err) { diff --git a/app/page.tsx b/app/page.tsx index d5b8251..8395c2a 100644 --- a/app/page.tsx +++ b/app/page.tsx @@ -10,14 +10,8 @@ import { PLOT_STYLES, type Gender, } from "@/lib/options"; - -/* ============================================================================ - InfiPlot · 首页(编辑式视觉风格 · 居中构图,呼应低保真原型) - - 顶部 Header:左上角衬线 wordmark logo -"use client"; - -import { useRouter } from "next/navigation"; -import { useEffect, useRef, useState, type ReactNode } from "react"; +import { readStoredTtsConfig } from "@/lib/clientTtsConfig"; +import { TtsKeyModal } from "@/components/TtsKeyModal"; /* ============================================================================ InfiPlot · 首页(编辑式视觉风格 · 居中构图,呼应低保真原型) @@ -1771,7 +1765,12 @@ export default function HomePage() { // 顶部使用提示:默认展示,用户可点 × 永久关闭(localStorage:infiplot:hintClosed)。 const [hintClosed, setHintClosed] = useState(false); + // 自带 TTS Key 弹窗:可选增强,Key 只存浏览器、绝不经过服务器。 + const [ttsOpen, setTtsOpen] = useState(false); + const [ttsConfigured, setTtsConfigured] = useState(false); + const styleRow = OPTS.findIndex((o) => o.modal); + const voiceRow = OPTS.findIndex((o) => o.label === "语音配音"); const genderIndex = sel[0] ?? 0; const gender = (OPTS[0]!.items[genderIndex] as Gender) ?? "男性向"; const phrases = EXAMPLE_PHRASES[gender]; @@ -1826,6 +1825,11 @@ export default function HomePage() { } }; + // 启动时回填「已启用」徽标——读 localStorage 判断用户是否已存过 Key。 + useEffect(() => { + setTtsConfigured(readStoredTtsConfig() != null); + }, []); + // 输入框随内容自动增高:长文本整段可见(打字与点卡片填入都覆盖)。 useEffect(() => { const el = inputRef.current; @@ -2067,6 +2071,30 @@ export default function HomePage() { ))} + {/* 自带 TTS Key 入口:公共语音模型有 RPM/TPM 限额,高并发易静音; + 填自己的小米 MiMo Key(免费)→ 稳定配音、延迟更低,且 Key 只存本地。 */} +
+ +
+ {/* 使用提示:可被用户永久关闭(localStorage:infiplot:hintClosed) */} {!hintClosed && (
@@ -2235,6 +2263,22 @@ export default function HomePage() { {byoApiOpen && ( setByoApiOpen(false)} /> )} + + {ttsOpen && ( + setTtsOpen(false)} + onSaved={(configured) => { + setTtsConfigured(configured); + // 启用自带 Key 时顺手把「语音配音」拨到「开启」——否则用户配了 Key + // 却还是静音,体验自相矛盾。停用时不动其选择,尊重用户原本的偏好。 + if (configured && voiceRow >= 0) { + const onIdx = OPTS[voiceRow]!.items.indexOf("开启"); + if (onIdx >= 0) + setSel((s) => s.map((v, j) => (j === voiceRow ? onIdx : v))); + } + }} + /> + )}
); } diff --git a/app/play/page.tsx b/app/play/page.tsx index 5900733..0793efd 100644 --- a/app/play/page.tsx +++ b/app/play/page.tsx @@ -11,19 +11,25 @@ import { useState, } from "react"; import { PlayCanvas, type Phase } from "@/components/PlayCanvas"; +import { TtsKeyModal } from "@/components/TtsKeyModal"; import { annotateClick } from "@/lib/annotateClient"; +import { loadClientTtsConfig } from "@/lib/clientTtsConfig"; import { PRESETS } from "@/lib/presets"; +import { provisionVoice, synthesize } from "@infiplot/tts-client"; import type { Beat, BeatAudio, BeatAudioResponse, BeatChoice, + Character, + CharacterVoice, InsertBeatResponse, Scene, SceneExit, SceneResponse, Session, StartResponse, + TtsConfig, VisionResponse, } from "@infiplot/types"; import { track } from "@/lib/analytics"; @@ -47,6 +53,11 @@ function getByoHeaders(): Record { return {}; } +// Consecutive silent (no-audio) beats before we surface the BYO-key nudge to a +// non-BYO, unmuted player. Set high enough that one transient miss won't trip +// it, low enough to catch a scene that's clearly being rate-limited. +const SILENCE_NUDGE_THRESHOLD = 3; + // Cap how long we wait for the browser to download + decode a scene image // before giving up and rendering anyway. 
Runware's CDN is usually <2s for a // 1792×1024 PNG, but over slow links / VPN / strict corp networks the same @@ -274,6 +285,7 @@ function prefetchScenePath( baseSession: Session, steps: ScenePathStep[], depth: number, + clientTts: boolean, ): void { if (depth >= PREFETCH_MAX_DEPTH) return; const key = pathKey(steps); @@ -288,7 +300,7 @@ function prefetchScenePath( "Content-Type": "application/json", ...getByoHeaders(), }, - body: JSON.stringify({ session: specSession }), + body: JSON.stringify({ session: specSession, clientTts }), signal: abort.signal, }); if (!res.ok) { @@ -327,7 +339,13 @@ function prefetchScenePath( characters: data.characters, storyState: data.storyState, }; - prefetchScenePath(pool, carriedBase, [...steps, nextStep], depth + 1); + prefetchScenePath( + pool, + carriedBase, + [...steps, nextStep], + depth + 1, + clientTts, + ); } } @@ -362,6 +380,44 @@ function clearPool(pool: Map): void { pool.clear(); } +// ────────────────────────────────────────────────────────────────────── +// BYO voice resolution (client-direct Xiaomi TTS). +// +// In BYO mode the server skips all TTS (clientTts:true), so the browser must +// obtain each speaker's reference audio itself. `cache` is keyed by character +// NAME and persists for the whole session, so a voice locked in on a +// character's first speaking beat stays identical across every later scene — +// even though /api/scene returns its characters without `.voice`. Storing the +// in-flight Promise (not the resolved value) dedupes the burst of concurrent +// beats by the same speaker into ONE voicedesign call, which matters because +// Xiaomi rate-limits voicedesign hard. 
+// ────────────────────────────────────────────────────────────────────── + +async function resolveByoVoice( + cache: Map>, + cfg: TtsConfig, + speaker: Character, +): Promise { + const cached = cache.get(speaker.name); + if (cached) return cached; + // Prebaked cards ship baked reference audio — reuse it directly (cross-key + // synth with the user's key works), keeping the prebaked voice identical. + if (speaker.voice) { + const ready = Promise.resolve(speaker.voice); + cache.set(speaker.name, ready); + return ready; + } + if (!speaker.voiceDescription) return null; + const p = provisionVoice(cfg, speaker.voiceDescription); + cache.set(speaker.name, p); + try { + return await p; + } catch (e) { + cache.delete(speaker.name); // failed provision — let a later beat retry + throw e; + } +} + // ────────────────────────────────────────────────────────────────────── // Component // ────────────────────────────────────────────────────────────────────── @@ -402,6 +458,16 @@ function PlayInner() { const [error, setError] = useState(null); const [presentation, setPresentation] = useState(false); const [lastExitLabel, setLastExitLabel] = useState(null); + // Consecutive server-side TTS misses (null audio / failed /api/beat-audio). + // Climbs when the shared server key is rate-limited by MiMo — the exact pain + // BYO fixes — so the play page can nudge non-BYO users to add their own key. + // Reset to 0 on a successful server synth or when a BYO key is enabled; the BYO synth path never counts. + const [silenceStrikes, setSilenceStrikes] = useState(0); + // Once the player dismisses the silence nudge, keep it gone for this session. + const [nudgeDismissed, setNudgeDismissed] = useState(false); + // The in-place BYO-key modal, opened from the silence nudge so the player can + // add a key without leaving the play page. 
+ const [ttsModalOpen, setTtsModalOpen] = useState(false); const startedRef = useRef(false); const poolRef = useRef>(new Map()); @@ -416,6 +482,21 @@ function PlayInner() { // 不再单独维护 audioEnabledRef —— 单一来源避免两个 flag 漂移。 const mutedRef = useRef(muted); + // Resolved bring-your-own Xiaomi TTS config (region preset + key), read once + // from localStorage. When non-null, the browser provisions + synths voices + // directly against Xiaomi — the key never touches our server — and every + // start/scene/insert-beat request carries clientTts:true so the engine skips + // server-side TTS. null = user hasn't opted in (server default / silent). + const [byoTtsConfig, setByoTtsConfig] = useState(() => + loadClientTtsConfig(), + ); + const byoTtsRef = useRef(byoTtsConfig); + // BYO voice cache (see resolveByoVoice). Keyed by character name; persists + // across scenes so each speaker is provisioned at most once per session. + const provisionedVoicesRef = useRef>>( + new Map(), + ); + // Mirrors for use inside async handlers (closure-stable) const sessionRef = useRef(null); const currentSceneRef = useRef(null); @@ -496,34 +577,72 @@ function PlayInner() { // 「首页选关闭」也走这条路:bootstrap 时 muted 已被初始化为 true。 if (!beat.speaker || !beat.line) return; const speaker = sess.characters.find((c) => c.name === beat.speaker); - if (!speaker?.voice) return; // not yet provisioned — server can't synth anyway + if (!speaker) return; + + const byo = byoTtsRef.current; + // Non-BYO relies on the server having provisioned speaker.voice. BYO + // skipped server TTS, so it needs a baked voice (prebaked card) or a + // voiceDescription to provision from in the browser. 
+ if (!byo && !speaker.voice) return; + if (byo && !speaker.voice && !speaker.voiceDescription) return; + if (beatAudioAbortRef.current.has(beat.id)) return; const abort = new AbortController(); beatAudioAbortRef.current.set(beat.id, abort); try { - const res = await fetch("/api/beat-audio", { - method: "POST", - headers: { - "Content-Type": "application/json", - ...getByoHeaders(), - }, - body: JSON.stringify({ - beat: { id: beat.id, line: beat.line, lineDelivery: beat.lineDelivery }, - voice: speaker.voice, - }), - signal: abort.signal, - }); - if (!res.ok) return; - const json = (await res.json()) as BeatAudioResponse; - // Skip the state write if we've been aborted between the .ok check and + let audio: BeatAudio | null = null; + if (byo) { + // Client-direct: provision (once per speaker, cached) + synth against + // Xiaomi with the user's own key — no /api/beat-audio round-trip and + // the key never touches our server. + const voice = await resolveByoVoice( + provisionedVoicesRef.current, + byo, + speaker, + ); + if (!voice || abort.signal.aborted) return; + const out = await synthesize( + byo, + voice, + beat.line, + beat.lineDelivery, + abort.signal, + ); + audio = { base64: out.audioBase64, mime: out.mimeType }; + } else { + const res = await fetch("/api/beat-audio", { + method: "POST", + headers: { + "Content-Type": "application/json", + ...getByoHeaders(), + }, + body: JSON.stringify({ + beat: { id: beat.id, line: beat.line, lineDelivery: beat.lineDelivery }, + voice: speaker.voice, + }), + signal: abort.signal, + }); + if (!res.ok) { + setSilenceStrikes((n) => Math.min(n + 1, 99)); + return; + } + const json = (await res.json()) as BeatAudioResponse; + audio = json.audio; + // Null audio usually means MiMo rate-limited or timed out the shared + // key — track the streak; a real clip resets it. 
+ if (audio) setSilenceStrikes(0); + else setSilenceStrikes((n) => Math.min(n + 1, 99)); + } + // Skip the state write if we've been aborted between the await and // here — beat ids are scene-local, so a late arrival from a prior // scene would otherwise overwrite the current scene's audio under the // same id. - if (json.audio && !abort.signal.aborted) { - setBeatAudioMap((m) => ({ ...m, [beat.id]: json.audio as BeatAudio })); + if (audio && !abort.signal.aborted) { + const settled = audio; + setBeatAudioMap((m) => ({ ...m, [beat.id]: settled })); } } catch { - // aborted or network error — silent fallback + // aborted / network / Xiaomi rate-limit — silent fallback (no audio) } finally { // Only clear the slot if it's still ours. An aborted prior fetch // running its finally late could otherwise delete the controller of a @@ -598,6 +717,27 @@ function PlayInner() { prefetchSceneAudio(); }, [muted, prefetchSceneAudio]); + // ── BYO key enabled/disabled from the play page (silence nudge → modal) ─ + // On enable: point the synth path at the user's key and immediately + // re-synthesize the current scene in-browser, so the voices the player just + // missed come back without a reload (their characters already carry + // server-provisioned `voice`, which resolveByoVoice reuses with the new key). + // On disable: just stop using it; later scenes fall back to the server. + const handleByoSaved = useCallback( + (configured: boolean) => { + const cfg = configured ? 
loadClientTtsConfig() : null; + byoTtsRef.current = cfg; + setByoTtsConfig(cfg); + if (cfg) { + setSilenceStrikes(0); + cancelBeatAudioFetches(); + setBeatAudioMap({}); + prefetchSceneAudio(); + } + }, + [prefetchSceneAudio], + ); + // ── Presentation mode toggle ───────────────────────────────────────── const togglePresentation = useCallback(async () => { const entering = !presentation; @@ -720,7 +860,10 @@ function PlayInner() { "Content-Type": "application/json", ...getByoHeaders(), }, - body: JSON.stringify(livePayload), + body: JSON.stringify({ + ...livePayload, + clientTts: !!byoTtsRef.current, + }), }).then(async (r) => { if (!r.ok) { const j = (await r.json().catch(() => ({}))) as { error?: string }; @@ -793,7 +936,7 @@ function PlayInner() { nextSceneSeed: choice.effect.nextSceneSeed, }, }; - prefetchScenePath(poolRef.current, s, [step], 0); + prefetchScenePath(poolRef.current, s, [step], 0, !!byoTtsRef.current); } }, [currentScene?.id, session?.id]); @@ -948,7 +1091,10 @@ function PlayInner() { "Content-Type": "application/json", ...getByoHeaders(), }, - body: JSON.stringify({ session: specSession }), + body: JSON.stringify({ + session: specSession, + clientTts: !!byoTtsRef.current, + }), }); if (!res.ok) { const j = (await res.json().catch(() => ({}))) as { error?: string }; @@ -995,6 +1141,7 @@ function PlayInner() { body: JSON.stringify({ session, freeformAction: decision.intent.freeformAction, + clientTts: !!byoTtsRef.current, }), }); if (!insertRes.ok) { @@ -1075,7 +1222,10 @@ function PlayInner() { "Content-Type": "application/json", ...getByoHeaders(), }, - body: JSON.stringify({ session: specSession }), + body: JSON.stringify({ + session: specSession, + clientTts: !!byoTtsRef.current, + }), }); if (!res.ok) { const j = (await res.json().catch(() => ({}))) as { @@ -1163,6 +1313,16 @@ function PlayInner() { const sceneCount = session?.history.length ?? 
0; const beatCount = visitedBeatsRef.current.length; + // Surface the BYO-key nudge only to an unmuted, non-BYO player whose last few + // beats came back silent (shared key rate-limited) — the exact pain BYO fixes. + // Dismissible for the session. + const showSilenceNudge = + phase === "ready" && + !muted && + !byoTtsConfig && + !nudgeDismissed && + silenceStrikes >= SILENCE_NUDGE_THRESHOLD; + return (
@@ -1207,18 +1367,46 @@ function PlayInner() { } aboveCanvasLeft={ - + <> + + + {/* Silence nudge — a compact pill right beside the mute toggle. + Clicking opens the BYO-key modal in place (no trip to the + homepage). The × dismisses it for the session. */} + {showSilenceNudge && ( + + + + + )} + } /> @@ -1235,7 +1423,16 @@ function PlayInner() {

)}
+ + + {ttsModalOpen && ( + setTtsModalOpen(false)} + onSaved={handleByoSaved} + footerNote="保存后会立即用这把 Key 在你的浏览器里合成当前这一幕的配音;本设备后续游玩也会自动使用此 Key。" + /> + )} ); } diff --git a/components/TtsKeyModal.tsx b/components/TtsKeyModal.tsx new file mode 100644 index 0000000..bcdd7cd --- /dev/null +++ b/components/TtsKeyModal.tsx @@ -0,0 +1,255 @@ +"use client"; + +// Bring-your-own Xiaomi MiMo TTS key modal — shared by the homepage and the +// play page. Two-step picker (key family → region for Token Plan only), key +// stored CLIENT-SIDE ONLY (see lib/clientTtsConfig). `onSaved(configured)` +// fires after a save/disable so each host can react (homepage flips the +// 语音配音 toggle; the play page re-synthesizes the current scene in-browser). +// `footerNote` lets the host tailor the closing hint to its own context. + +import { type ReactNode, useEffect, useState } from "react"; +import { + clearStoredTtsConfig, + readStoredTtsConfig, + writeStoredTtsConfig, +} from "@/lib/clientTtsConfig"; +import { + findTtsPreset, + PAYG_PRESET_ID, + TTS_KEY_DOC_URL, + TTS_REGION_PRESETS, +} from "@/lib/ttsPresets"; + +const DEFAULT_FOOTER_NOTE: ReactNode = + "提示:需将上方「语音配音」设为「开启」配音才会生效。保存后本设备后续游玩会自动使用此 Key。"; + +export function TtsKeyModal({ + onClose, + onSaved, + footerNote = DEFAULT_FOOTER_NOTE, +}: { + onClose: () => void; + onSaved: (configured: boolean) => void; + footerNote?: ReactNode; +}) { + // Read storage once; useState initializers ignore later renders, so local + // edits aren't clobbered and we don't re-hit localStorage every render. + const [initial] = useState(() => readStoredTtsConfig()); + // Two-step picker: choose key family first, then — only for Token Plan — a + // region. Pay-as-you-go (`sk-`) keys hit one fixed endpoint, so no region. + const initialKind = findTtsPreset(initial?.presetId)?.kind ?? "token-plan"; + const [keyType, setKeyType] = useState<"token-plan" | "payg">(initialKind); + const [regionId, setRegionId] = useState( + initialKind === "token-plan" + ? 
(initial?.presetId ?? TTS_REGION_PRESETS[0]!.id) + : TTS_REGION_PRESETS[0]!.id, + ); + const [apiKey, setApiKey] = useState(initial?.apiKey ?? ""); + const [showKey, setShowKey] = useState(false); + const [shown, setShown] = useState(false); + const alreadyConfigured = initial != null; + + useEffect(() => { + const id = requestAnimationFrame(() => setShown(true)); + return () => cancelAnimationFrame(id); + }, []); + + const close = () => { + setShown(false); + setTimeout(onClose, 280); + }; + const save = () => { + const key = apiKey.trim(); + if (!key) return; + const presetId = keyType === "payg" ? PAYG_PRESET_ID : regionId; + writeStoredTtsConfig({ presetId, apiKey: key }); + onSaved(true); + close(); + }; + const disable = () => { + clearStoredTtsConfig(); + onSaved(false); + close(); + }; + + return ( +
+
e.stopPropagation()} + className={ + "flex w-[560px] max-w-[94vw] max-h-[88vh] flex-col overflow-hidden rounded-sm border border-clay-900/15 bg-cream-50 shadow-2xl shadow-clay-900/25 transition-all duration-300 " + + (shown ? "opacity-100 scale-100" : "opacity-0 scale-95") + } + > +
+
+ + 自带配音 Key + + + 可选 · 用你自己的小米 MiMo 免费额度,配音更稳定、延迟更低 + +
+ +
+ +
+

+ 经常没有声音?公共语音模型有调用频率限额(RPM / TPM),同时游玩的人多时很容易撞到限额而静音。填入你自己的小米 MiMo API Key 后,配音将 + 直接在你的浏览器里合成 + 、使用你自己的免费额度 ——{" "} + Key 只保存在本地浏览器、绝不经过我们的服务器 + 。 +

+ +
+ K e y · 类 型 +
+ {( + [ + { kind: "token-plan", label: "套餐 Token Plan", sub: "tp- 开头" }, + { kind: "payg", label: "按量付费 Pay-as-you-go", sub: "sk- 开头" }, + ] as const + ).map((t) => { + const active = keyType === t.kind; + return ( + + ); + })} +
+
+ + {keyType === "token-plan" ? ( +
+ 区 域 节 点 +
+ {TTS_REGION_PRESETS.map((p) => { + const active = p.id === regionId; + return ( + + ); + })} +
+ + 选择与你的套餐订阅地区一致的节点(通常也是延迟最低的那个)。 + +
+ ) : ( +
+ + + 按量付费使用统一端点{" "} + api.xiaomimimo.com + ,无需选择区域。 + +
+ )} + +
+ + A P I · K e y + +
+ setApiKey(e.target.value)} + type={showKey ? "text" : "password"} + autoComplete="off" + spellCheck={false} + placeholder={ + keyType === "payg" + ? "粘贴 sk- 开头的按量 Key" + : "粘贴 tp- 开头的套餐 Key" + } + className="h-11 w-full rounded-sm border border-clay-900/15 bg-cream-100 pl-4 pr-11 font-sans text-sm text-clay-900 outline-none transition-colors focus:border-ember-500 placeholder:text-clay-400" + /> + +
+ + + 如何免费申请 Key?查看图文教程 + +
+ +

{footerNote}

+
+ +
+ {alreadyConfigured && ( + + )} + +
+
+
+ ); +} diff --git a/docs/xiaomi-tts-key.md b/docs/xiaomi-tts-key.md new file mode 100644 index 0000000..c8a3cd7 --- /dev/null +++ b/docs/xiaomi-tts-key.md @@ -0,0 +1,100 @@ +# 自带配音 Key 教程(小米 MiMo TTS) + +InfiPlot 的角色配音由小米 **MiMo-V2.5-TTS** 模型实时合成。本页教你免费申请一个属于自己的 API Key, +填进 InfiPlot 后即可获得**稳定的配音、更低的延迟**——而且这个 Key **只存在你的浏览器里,绝不会经过我们的服务器**。 + +> 本教程随仓库维护,链接长期有效。 + +--- + +## 为什么需要自带 Key? + +InfiPlot 默认用一把**公共的服务器 Key** 给所有人配音。小米对语音模型按 **RPM(每分钟请求数)/ TPM(每分钟 Token 数)** 做了限额,而且这个额度并不高。当很多人**同时**游玩时,公共 Key 很容易撞到限额,结果就是—— + +- 剧情、画面都正常,**唯独没有声音**(静音); +- 或者配音偶尔断断续续、要等很久。 + +填入你**自己的**免费 Key 后,你用的是自己独立的额度,不再和其他人抢,于是: + +- ✅ **配音稳定**,不再随机静音; +- ✅ **延迟更低**(可就近选区域,少一跳); +- ✅ **完全免费**——MiMo-V2.5-TTS 目前限时 **0x 计费**,不消耗套餐额度。 + +这是一个**可选增强**。不填也能正常玩,只是高峰期更容易遇到静音。 + +--- + +## 一、免费申请 API Key + +1. 打开小米 MiMo 开放平台并注册 / 登录: + - 注册即可领取免费额度(Token Plan)。 +2. 进入**控制台 → 套餐管理**: +3. 在该页面找到并**复制你的专属 API Key**。 + - 套餐 Key 形如 `tp-xxxxxxxx`;按量 Key 形如 `sk-xxxxxxxx`。两者相互独立、不能混用,**任选一个有效的即可**。 + - 妥善保管,**不要公开分享**这把 Key。 + +> MiMo-V2.5-TTS 系列当前为限时 **0x 计费**(不消耗套餐 Credits),所以配音这件事基本是免费的。具体以平台公告为准。 + +--- + +## 二、选择 Key 类型(套餐再选区域) + +小米有**两类 Key**,对应不同的端点。在 InfiPlot 里**先选类型**——看 Key 前缀就能判断:`tp-` 是套餐、`sk-` 是按量,两者不能混用。 + +**① 套餐 Token Plan(`tp-` 开头)** —— 再选一个**区域节点**,对应小米不同地区的 Token Plan: + +| 区域 | 说明 | 端点 | +| --- | --- | --- | +| 新加坡 · Singapore | 亚太地区推荐 | `https://token-plan-sgp.xiaomimimo.com/v1` | +| 中国大陆 · China | 中国大陆推荐 | `https://token-plan-cn.xiaomimimo.com/v1` | +| 欧洲 · Amsterdam | 欧洲推荐 | `https://token-plan-ams.xiaomimimo.com/v1` | + +选**离你最近**、且**与你套餐订阅区域一致**的那个——延迟最低、最不容易出错。一般跟着你注册时的区域走即可。 + +**② 按量付费 Pay-as-you-go(`sk-` 开头)** —— 使用统一端点 `https://api.xiaomimimo.com/v1`,**无需选择区域**。 + +--- + +## 三、在 InfiPlot 里填写 + +1. 回到 InfiPlot **首页**,在选项区下方点击 **「经常没声音?自带配音 Key(可选)」**。 +2. 在弹窗里: + - **选择 Key 类型**(套餐 / 按量);选「套餐」时再**选区域**,选「按量」则无需选区域; + - **粘贴你的 API Key**; +3. 点击 **「保存并启用」**。按钮会变成 **「自带配音 Key · 已启用」**,「语音配音」也会自动切到「开启」。 +4. 
开始游玩——配音将由你的浏览器**直连小米**完成。 + +想停用时,再次打开弹窗点击 **「停用并清除」** 即可,本地保存的 Key 会被一并删除。 + +--- + +## 四、隐私说明 + +- 你的 API Key **只保存在你当前浏览器的 `localStorage`**(键名 `infiplot:tts`)里。 +- 启用后,配音请求由**你的浏览器直接发给小米**对应的端点,携带你的 Key。 +- 我们的服务器**完全不参与**这条链路,**既看不到也不会记录**你的 Key。 +- 换设备 / 换浏览器 / 清缓存后需要重新填写,这是预期行为。 + +--- + +## 五、常见问题 + +**Q:填了 Key 还是没声音?** +- 确认「语音配音」是「开启」状态; +- 确认 **Key 类型选对了**:`tp-` 选「套餐」、`sk-` 选「按量付费」,选错端点会一直失败; +- 确认 Key 没填错、没多余空格,且仍有额度; +- 套餐 Key 可换一个**区域**试试(区域与订阅地不匹配也可能失败); +- 打开浏览器开发者工具的 Network 面板,看对 `*.xiaomimimo.com` 的请求返回了什么错误。 + +**Q:会产生费用吗?** +- MiMo-V2.5-TTS 当前限时 0x 计费,正常游玩配音不消耗套餐额度。最终以小米平台的计费公告为准。 + +**Q:`tp-` 和 `sk-` 用哪个?** +- 看你手里是哪种 Key 就选哪种类型——`tp-` 选「套餐 Token Plan」、`sk-` 选「按量付费」。两者不能混用,选错端点会鉴权失败。 + +**Q:我的 Key 安全吗?** +- 安全。它只存在你本地浏览器、只发给小米官方端点,不经过 InfiPlot 服务器。但请勿把 Key 贴到公开场合或分享给他人。 + +--- + +有问题欢迎在 [GitHub Issues](https://github.com/zonghaoyuan/infiplot/issues) 反馈。 diff --git a/lib/clientTtsConfig.ts b/lib/clientTtsConfig.ts new file mode 100644 index 0000000..0221d24 --- /dev/null +++ b/lib/clientTtsConfig.ts @@ -0,0 +1,86 @@ +// Bring-your-own Xiaomi MiMo TTS key — stored CLIENT-SIDE ONLY. +// +// When a user supplies their own key, we persist {presetId, apiKey} in +// localStorage and the browser talks to Xiaomi directly (see lib/tts-client). +// The key is therefore never sent to our server: no request body, no header, +// no log. resolveTtsConfig() turns the stored pair into the TtsConfig shape the +// tts-client adapter expects, mapping the chosen endpoint preset to its baseUrl. + +import type { TtsConfig } from "@infiplot/types"; +import { DEFAULT_TTS_SPEECH_MODEL, findTtsPreset } from "./ttsPresets"; + +const STORAGE_KEY = "infiplot:tts"; + +/** Exactly what we persist — endpoint choice + raw key. Resolved to a full + * TtsConfig (with baseUrl + model) at read time so a renamed/removed preset + * can't leave a stale baseUrl baked into storage. 
*/ +export type StoredTtsConfig = { + presetId: string; + apiKey: string; +}; + +/** Read + validate the persisted BYO config. Returns null when running on the + * server, when nothing is stored, on parse failure, or when the stored shape + * is no longer valid (unknown preset / empty key). */ +export function readStoredTtsConfig(): StoredTtsConfig | null { + if (typeof window === "undefined") return null; + try { + const raw = window.localStorage.getItem(STORAGE_KEY); + if (!raw) return null; + const parsed = JSON.parse(raw) as Partial; + const presetId = typeof parsed.presetId === "string" ? parsed.presetId : ""; + const apiKey = typeof parsed.apiKey === "string" ? parsed.apiKey : ""; + if (!findTtsPreset(presetId)) return null; + if (!apiKey.trim()) return null; + return { presetId, apiKey }; + } catch { + return null; + } +} + +/** Persist the BYO config. Trims the key so trailing whitespace from a paste + * never breaks the `api-key` header. */ +export function writeStoredTtsConfig(config: StoredTtsConfig): void { + if (typeof window === "undefined") return; + try { + const payload: StoredTtsConfig = { + presetId: config.presetId, + apiKey: config.apiKey.trim(), + }; + window.localStorage.setItem(STORAGE_KEY, JSON.stringify(payload)); + } catch { + // Storage disabled / quota / private mode — BYO simply stays off. + } +} + +export function clearStoredTtsConfig(): void { + if (typeof window === "undefined") return; + try { + window.localStorage.removeItem(STORAGE_KEY); + } catch { + // ignore + } +} + +/** Map a stored pair to the adapter-ready TtsConfig, resolving the endpoint + * preset to its baseUrl. Returns null when the preset is unknown or the key + * is blank — callers treat null as "no BYO; use server default / silent". 
*/ +export function resolveTtsConfig( + stored: StoredTtsConfig | null, +): TtsConfig | null { + if (!stored) return null; + const preset = findTtsPreset(stored.presetId); + if (!preset) return null; + const apiKey = stored.apiKey.trim(); + if (!apiKey) return null; + return { + baseUrl: preset.baseUrl, + apiKey, + speechModel: DEFAULT_TTS_SPEECH_MODEL, + }; +} + +/** Convenience: read storage and resolve in one step. */ +export function loadClientTtsConfig(): TtsConfig | null { + return resolveTtsConfig(readStoredTtsConfig()); +} diff --git a/lib/ttsPresets.ts b/lib/ttsPresets.ts new file mode 100644 index 0000000..06fc1e6 --- /dev/null +++ b/lib/ttsPresets.ts @@ -0,0 +1,77 @@ +// Xiaomi MiMo TTS endpoint presets. +// +// Xiaomi issues two independent key types, each with its own base URL: +// - Token Plan (套餐, `tp-` key): per-region endpoints token-plan-{sgp,cn,ams}. +// - Pay-as-you-go (按量, `sk-` key): the single unified endpoint api.xiaomimimo.com. +// +// Used CLIENT-SIDE ONLY: when a user supplies their own key, the browser calls +// one of these endpoints directly (all return permissive CORS allowing the +// `api-key` header), so the key never transits our server. Every endpoint +// serves the same `mimo-v2.5-tts` family; Token Plan users pick the region +// matching their subscription (also the closest hop → lower synth latency), +// pay-as-you-go users have no region to choose. See docs/xiaomi-tts-key.md. + +export type TtsPreset = { + id: string; + /** Which key family this endpoint serves — drives the two-step picker UI. */ + kind: "token-plan" | "payg"; + /** Human label shown in the picker (region for Token Plan, type for payg). */ + label: string; + /** OpenAI-style base; the TTS adapter appends `/chat/completions`. */ + baseUrl: string; +}; + +/** Base model name; the adapter derives `-voicedesign` / `-voiceclone`. 
*/ +export const DEFAULT_TTS_SPEECH_MODEL = "mimo-v2.5-tts"; + +/** + * In-repo tutorial for getting a free Xiaomi MiMo key + picking a region. + * Points at the default branch so it resolves once this lands on main (which + * is what production serves). Linked from the homepage BYO modal, the play + * page's silence nudge, and the README. + */ +export const TTS_KEY_DOC_URL = + "https://github.com/zonghaoyuan/infiplot/blob/main/docs/xiaomi-tts-key.md"; + +export const TTS_PRESETS: TtsPreset[] = [ + { + id: "sgp", + kind: "token-plan", + label: "新加坡 · Singapore", + baseUrl: "https://token-plan-sgp.xiaomimimo.com/v1", + }, + { + id: "cn", + kind: "token-plan", + label: "中国大陆 · China", + baseUrl: "https://token-plan-cn.xiaomimimo.com/v1", + }, + { + id: "ams", + kind: "token-plan", + label: "欧洲 · Amsterdam", + baseUrl: "https://token-plan-ams.xiaomimimo.com/v1", + }, + { + id: "payg", + kind: "payg", + label: "按量付费 · Pay-as-you-go", + baseUrl: "https://api.xiaomimimo.com/v1", + }, +]; + +/** Token Plan endpoints only — the region sub-options shown once the user + * picks the "套餐" key type. */ +export const TTS_REGION_PRESETS = TTS_PRESETS.filter( + (p) => p.kind === "token-plan", +); + +/** The single pay-as-you-go preset id (`sk-` keys have no region). */ +export const PAYG_PRESET_ID = "payg"; + +export function findTtsPreset( + id: string | null | undefined, +): TtsPreset | undefined { + if (!id) return undefined; + return TTS_PRESETS.find((p) => p.id === id); +} diff --git a/lib/types/index.ts b/lib/types/index.ts index 3ecaf1e..c5e6a35 100644 --- a/lib/types/index.ts +++ b/lib/types/index.ts @@ -300,6 +300,12 @@ export type StartRequest = { styleGuide: string; /** Optional user-uploaded style reference image — see Session.styleReferenceImage. */ styleReferenceImage?: string; + /** + * When true the client supplied its own Xiaomi TTS key and will provision + + * synth voices in the browser (key never touches our server). 
The route then + * drops `config.tts` so the engine skips all server-side TTS work. + */ + clientTts?: boolean; }; // /api/parse-style-image — vision LLM extracts a textual painting-style @@ -332,6 +338,8 @@ export type StartResponse = { // (frontend synthesizes a speculative exit). export type SceneRequest = { session: Session; + /** See StartRequest.clientTts — drops server-side TTS for BYO-key clients. */ + clientTts?: boolean; }; export type SceneResponse = { @@ -389,6 +397,8 @@ export type VisionResponse = { export type InsertBeatRequest = { session: Session; freeformAction: string; + /** See StartRequest.clientTts — drops server-side TTS for BYO-key clients. */ + clientTts?: boolean; }; /** Partial beat fields produced by the insert-beat director. */