feat(web): optional bring-your-own Xiaomi MiMo TTS key (browser-side synthesis)
Public users share one server TTS key, so Xiaomi's per-key RPM/TPM limits
cause silent playback under concurrency. This adds an OPTIONAL path: a user
can store their own Xiaomi MiMo key in the browser and synthesize voice
client-side against Xiaomi's CORS-open endpoints. The key lives only in
localStorage and is never sent to or logged by our server; the shared server
key still serves everyone who does not opt in.
- components/TtsKeyModal.tsx: shared key modal (key-family + region picker),
reused by both the home and play pages
- app/play/page.tsx: silence nudge moved beside the mute toggle; modal opens
in place instead of redirecting to the home page
- app/page.tsx: home page consumes the shared modal + readStoredTtsConfig
- lib/clientTtsConfig.ts, lib/ttsPresets.ts: browser config + region presets
- app/api/{start,scene,insert-beat}: thread per-request voice; lib/types update
- docs/xiaomi-tts-key.md + README note
Verified with tsc --noEmit (exit 0).
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,86 @@
|
||||
// Bring-your-own Xiaomi MiMo TTS key — stored CLIENT-SIDE ONLY.
|
||||
//
|
||||
// When a user supplies their own key, we persist {presetId, apiKey} in
|
||||
// localStorage and the browser talks to Xiaomi directly (see lib/tts-client).
|
||||
// The key is therefore never sent to our server: no request body, no header,
|
||||
// no log. resolveTtsConfig() turns the stored pair into the TtsConfig shape the
|
||||
// tts-client adapter expects, mapping the chosen endpoint preset to its baseUrl.
|
||||
|
||||
import type { TtsConfig } from "@infiplot/types";
|
||||
import { DEFAULT_TTS_SPEECH_MODEL, findTtsPreset } from "./ttsPresets";
|
||||
|
||||
// localStorage key under which the BYO TTS config is stored as a JSON string.
const STORAGE_KEY = "infiplot:tts";
|
||||
|
||||
/**
 * The persisted shape: the chosen endpoint preset plus the raw API key.
 *
 * Only the preset id is stored. The full TtsConfig (baseUrl + model) is
 * resolved at read time, so renaming or removing a preset can never leave a
 * stale baseUrl baked into storage.
 */
export type StoredTtsConfig = {
  /** Id of the endpoint preset the user selected. */
  presetId: string;
  /** The user's own API key, as entered. */
  apiKey: string;
};
|
||||
|
||||
/**
 * Read and validate the persisted BYO config.
 *
 * The stored value is treated as untrusted input: it is parsed as `unknown`
 * and narrowed field by field rather than asserted to a shape.
 *
 * @returns The stored pair with the key trimmed, or null when running on the
 *   server, when nothing is stored, when the stored value is not valid JSON
 *   or not an object, or when it fails validation (unknown preset / blank
 *   key).
 */
export function readStoredTtsConfig(): StoredTtsConfig | null {
  if (typeof window === "undefined") return null;

  try {
    const raw = window.localStorage.getItem(STORAGE_KEY);
    if (!raw) return null;

    const parsed: unknown = JSON.parse(raw);
    if (typeof parsed !== "object" || parsed === null) return null;

    const { presetId, apiKey } = parsed as Record<string, unknown>;
    if (typeof presetId !== "string" || !findTtsPreset(presetId)) return null;
    if (typeof apiKey !== "string") return null;

    // Return the same trimmed value that was validated, so a value written by
    // anything other than writeStoredTtsConfig cannot leak padding whitespace
    // to callers that use `apiKey` directly.
    const trimmedKey = apiKey.trim();
    if (!trimmedKey) return null;

    return { presetId, apiKey: trimmedKey };
  } catch {
    // Storage access or JSON.parse threw — behave as if nothing is stored.
    return null;
  }
}
|
||||
|
||||
/**
 * Save the BYO config to localStorage as JSON.
 *
 * The key is trimmed before saving so stray whitespace from a paste never
 * ends up in the `api-key` header. Does nothing on the server, and any error
 * raised while saving is swallowed.
 *
 * @param config - The preset id and raw key to persist.
 */
export function writeStoredTtsConfig(config: StoredTtsConfig): void {
  if (typeof window === "undefined") return;

  try {
    const { presetId, apiKey } = config;
    const serialized = JSON.stringify({ presetId, apiKey: apiKey.trim() });
    window.localStorage.setItem(STORAGE_KEY, serialized);
  } catch {
    // Storage disabled / quota / private mode — BYO simply stays off.
  }
}
|
||||
|
||||
/**
 * Remove the persisted BYO config, if any.
 *
 * Does nothing on the server; errors raised by storage are ignored.
 */
export function clearStoredTtsConfig(): void {
  const inBrowser = typeof window !== "undefined";
  if (!inBrowser) return;

  try {
    window.localStorage.removeItem(STORAGE_KEY);
  } catch {
    // ignore
  }
}
|
||||
|
||||
/**
 * Turn a stored {presetId, apiKey} pair into the adapter-ready TtsConfig.
 *
 * The preset id is resolved to its baseUrl and the key is trimmed.
 *
 * @param stored - The persisted pair, or null when nothing is stored.
 * @returns The resolved config, or null when there is no stored pair, the
 *   preset is unknown, or the key is blank. Callers treat null as "no BYO;
 *   use server default / silent".
 */
export function resolveTtsConfig(
  stored: StoredTtsConfig | null,
): TtsConfig | null {
  // Look the preset up first: the key is only inspected once the preset is
  // known to exist.
  const preset = stored ? findTtsPreset(stored.presetId) : undefined;
  if (!stored || !preset) return null;

  const trimmedKey = stored.apiKey.trim();
  return trimmedKey
    ? {
        baseUrl: preset.baseUrl,
        apiKey: trimmedKey,
        speechModel: DEFAULT_TTS_SPEECH_MODEL,
      }
    : null;
}
|
||||
|
||||
/**
 * Convenience wrapper: read the persisted pair and resolve it in one call.
 *
 * @returns The same value resolveTtsConfig() yields for the stored pair.
 */
export function loadClientTtsConfig(): TtsConfig | null {
  const stored = readStoredTtsConfig();
  return resolveTtsConfig(stored);
}
|
||||
@@ -0,0 +1,77 @@
|
||||
// Xiaomi MiMo TTS endpoint presets.
|
||||
//
|
||||
// Xiaomi issues two independent key types, each with its own base URL:
|
||||
// - Token Plan (套餐, `tp-` key): per-region endpoints token-plan-{sgp,cn,ams}.
|
||||
// - Pay-as-you-go (按量, `sk-` key): the single unified endpoint api.xiaomimimo.com.
|
||||
//
|
||||
// Used CLIENT-SIDE ONLY: when a user supplies their own key, the browser calls
|
||||
// one of these endpoints directly (all return permissive CORS allowing the
|
||||
// `api-key` header), so the key never transits our server. Every endpoint
|
||||
// serves the same `mimo-v2.5-tts` family; Token Plan users pick the region
|
||||
// matching their subscription (also the closest hop → lower synth latency),
|
||||
// pay-as-you-go users have no region to choose. See docs/xiaomi-tts-key.md.
|
||||
|
||||
/** One selectable Xiaomi MiMo TTS endpoint. */
export type TtsPreset = {
  /** Stable identifier; this is the value persisted as the user's choice. */
  id: string;
  /** Key family served by this endpoint — drives the two-step picker UI. */
  kind: "token-plan" | "payg";
  /** Picker label: the region for Token Plan, the key type for payg. */
  label: string;
  /** OpenAI-style base URL; the TTS adapter appends `/chat/completions`. */
  baseUrl: string;
};
|
||||
|
||||
/**
 * Base speech model name. The adapter derives the `-voicedesign` and
 * `-voiceclone` variants from it.
 */
export const DEFAULT_TTS_SPEECH_MODEL = "mimo-v2.5-tts";
|
||||
|
||||
/**
 * URL of the in-repo guide to getting a free Xiaomi MiMo key and choosing a
 * region.
 *
 * The link targets the `main` branch, so it only resolves once the guide has
 * landed there (main is what production serves). It is linked from the
 * homepage BYO modal, the play page's silence nudge, and the README.
 */
export const TTS_KEY_DOC_URL =
  "https://github.com/zonghaoyuan/infiplot/blob/main/docs/xiaomi-tts-key.md";
|
||||
|
||||
/**
 * Every selectable endpoint, in picker order: the three Token Plan regions
 * first, then the single pay-as-you-go endpoint.
 */
export const TTS_PRESETS: TtsPreset[] = [
  // Token Plan (`tp-` keys): one endpoint per region.
  {
    id: "sgp",
    kind: "token-plan",
    label: "新加坡 · Singapore",
    baseUrl: "https://token-plan-sgp.xiaomimimo.com/v1",
  },
  {
    id: "cn",
    kind: "token-plan",
    label: "中国大陆 · China",
    baseUrl: "https://token-plan-cn.xiaomimimo.com/v1",
  },
  {
    id: "ams",
    kind: "token-plan",
    label: "欧洲 · Amsterdam",
    baseUrl: "https://token-plan-ams.xiaomimimo.com/v1",
  },
  // Pay-as-you-go (`sk-` keys): a single unified endpoint.
  {
    id: "payg",
    kind: "payg",
    label: "按量付费 · Pay-as-you-go",
    baseUrl: "https://api.xiaomimimo.com/v1",
  },
];
|
||||
|
||||
/**
 * The Token Plan subset of TTS_PRESETS — the region sub-options shown once
 * the user picks the "套餐" key type.
 */
export const TTS_REGION_PRESETS = TTS_PRESETS.filter(
  ({ kind }) => kind === "token-plan",
);
|
||||
|
||||
/**
 * Id of the one pay-as-you-go preset. `sk-` keys have no region, so there is
 * nothing further to choose.
 */
export const PAYG_PRESET_ID = "payg";
|
||||
|
||||
/**
 * Look up a preset by id.
 *
 * @param id - The preset id; may be null, undefined, or empty.
 * @returns The matching entry of TTS_PRESETS, or undefined when `id` is
 *   falsy or matches no preset.
 */
export function findTtsPreset(
  id: string | null | undefined,
): TtsPreset | undefined {
  return id ? TTS_PRESETS.find((preset) => preset.id === id) : undefined;
}
|
||||
@@ -300,6 +300,12 @@ export type StartRequest = {
|
||||
styleGuide: string;
|
||||
/** Optional user-uploaded style reference image — see Session.styleReferenceImage. */
|
||||
styleReferenceImage?: string;
|
||||
/**
|
||||
* When true the client supplied its own Xiaomi TTS key and will provision +
|
||||
* synth voices in the browser (key never touches our server). The route then
|
||||
* drops `config.tts` so the engine skips all server-side TTS work.
|
||||
*/
|
||||
clientTts?: boolean;
|
||||
};
|
||||
|
||||
// /api/parse-style-image — vision LLM extracts a textual painting-style
|
||||
@@ -332,6 +338,8 @@ export type StartResponse = {
|
||||
// (frontend synthesizes a speculative exit).
|
||||
export type SceneRequest = {
  session: Session;
  /**
   * Set by clients that supplied their own TTS key; server-side TTS is
   * dropped for them. Same meaning as StartRequest.clientTts.
   */
  clientTts?: boolean;
};
|
||||
|
||||
export type SceneResponse = {
|
||||
@@ -389,6 +397,8 @@ export type VisionResponse = {
|
||||
export type InsertBeatRequest = {
  session: Session;
  freeformAction: string;
  /**
   * Set by clients that supplied their own TTS key; server-side TTS is
   * dropped for them. Same meaning as StartRequest.clientTts.
   */
  clientTts?: boolean;
};
|
||||
|
||||
/** Partial beat fields produced by the insert-beat director. */
|
||||
|
||||
Reference in New Issue
Block a user