Merge pull request #83 from zonghaoyuan/staging

chore: sync staging → main
2026-06-15 16:49:47 +08:00
parent cd6c004589 dc08f64ec1
commit 272e940cf2
312 changed files with 5225 additions and 1071 deletions
@@ -3,18 +3,22 @@
 # Recommended setup: Xiaomi MiMo Token Plan for TEXT / VISION / TTS
 # (one API key covers all three) + Runware for IMAGE (FLUX.2 [klein]).
 #
-# TEXT / VISION default to any OpenAI-compatible endpoint, and can switch to
-# native Anthropic or Google Gemini via TEXT_PROVIDER / VISION_PROVIDER.
+# TEXT / VISION always speak the OpenAI wire format; IMAGE does too unless
+# it uses Runware (see below). Anthropic Claude and Google Gemini are
+# reachable through their own OpenAI-compatible endpoints (see TEXT_PROVIDER
+# notes below) — no native protocol switch is needed.
 # TTS uses Xiaomi MiMo's own voice design / clone protocol
 # (not OpenAI-compatible; appends -voicedesign / -voiceclone).
 #
-# IMAGE supports Runware (its own task-array protocol), OpenAI (gpt-image),
-# and Google Gemini (Nano Banana) via IMAGE_PROVIDER.
+# IMAGE supports Runware (its own task-array protocol) and OpenAI (gpt-image)
+# via IMAGE_PROVIDER.
 #
 # *_PROVIDER (optional) selects the wire protocol; leave unset for the
-# OpenAI-compatible default (image is auto-detected from the URL). Base URLs
-# tolerate a missing or extra /v1 (or a trailing /chat/completions) — the
-# engine normalizes them.
+# OpenAI-compatible default (image is auto-detected from the URL). Valid
+# values are openai_compatible / openai / runware — native "anthropic" /
+# "google" protocols were removed when the Vercel AI SDK was dropped.
+# Base URLs tolerate a missing or extra /v1 (or a trailing /chat/completions)
+# — the engine normalizes them.
 # =============================================================

 # ---- 1. Text LLM · scene director ----------------------------------
@@ -30,9 +34,11 @@
 TEXT_BASE_URL=https://api.deepseek.com/v1
 TEXT_API_KEY=sk-xxx
 TEXT_MODEL=deepseek-v4-flash
-# TEXT_PROVIDER: openai_compatible (default) | anthropic | google
-#   anthropic → TEXT_BASE_URL=https://api.anthropic.com  TEXT_MODEL=claude-sonnet-4-6
-#   google    → TEXT_BASE_URL=https://generativelanguage.googleapis.com  TEXT_MODEL=gemini-3.5-flash
+# TEXT_PROVIDER: openai_compatible (default). This is the ONLY supported text
+# protocol. To use Claude or Gemini, leave TEXT_PROVIDER unset and point at
+# their OpenAI-compatible endpoints:
+#   Claude  → TEXT_BASE_URL=https://api.anthropic.com/v1  TEXT_MODEL=claude-sonnet-4-6
+#   Gemini  → TEXT_BASE_URL=https://generativelanguage.googleapis.com/v1beta/openai  TEXT_MODEL=gemini-3.5-flash
 # TEXT_PROVIDER=openai_compatible

 # ---- 2. Image generator (renders the scene background) -------------
@@ -44,32 +50,54 @@ TEXT_MODEL=deepseek-v4-flash
 IMAGE_BASE_URL=https://api.runware.ai/v1
 IMAGE_API_KEY=runware-xxx
 IMAGE_MODEL=runware:400@6
-# IMAGE_PROVIDER: runware (auto-detected for runware.ai) | openai_compatible
-#                 | openai | google
+# IMAGE_PROVIDER: runware (auto-detected for runware.ai) | openai_compatible | openai
 #   openai → gpt-image, supports referenceImages (character/scene continuity).
 #            IMAGE_BASE_URL=https://api.openai.com  IMAGE_MODEL=gpt-image-1
-#   google → Gemini "Nano Banana" (Imagen is EOL 2026-06-24, do not use it).
-#            IMAGE_BASE_URL=https://generativelanguage.googleapis.com
-#            IMAGE_MODEL=gemini-2.5-flash-image
-# NOTE: openai/google return raw bytes → inlined as a data: URI for the session
+# NOTE: openai returns raw bytes → inlined as a data: URI for the session
 # (heavier per-call transport than Runware's UUID re-reference loop). Runware
 # stays fastest + cheapest for the scene-by-scene flow.
 # IMAGE_PROVIDER=runware

+# Optional image-latency guards. BOTH default to OFF when unset — leaving
+# them blank keeps the exact historical behavior, so self-hosted deploys are
+# unaffected unless they opt in.
+#   IMAGE_TIMEOUT_MS — per-attempt hard deadline for image requests; a timed
+#     out attempt is retried like a 5xx. Recommended 30000 for Runware
+#     (healthy-day p99 is ~26-37s; Runware's own gateway 504s at ~55s).
+#   IMAGE_HEDGE_MS — scene-paint hedging: if the referenced scene paint has
+#     not finished after this many ms, race a second identical request and
+#     keep whichever finishes first (the loser is aborted, but the provider
+#     may still bill it). Rescues straggler tasks; never fires when the first
+#     attempt already failed (e.g. 429/503 saturation). Recommended 15000 for
+#     Runware (healthy-day p95). Do NOT set thresholds this low for providers
+#     that are normally slow (e.g. gpt-image takes 20-60s per image).
+# IMAGE_TIMEOUT_MS=30000
+# IMAGE_HEDGE_MS=15000
+
 # ---- 3. Vision model · multimodal click interpretation -------------
 # Recommended: MiMo V2.5 — multimodal, accepts image_url content parts.
 VISION_BASE_URL=https://token-plan-sgp.xiaomimimo.com/v1
 VISION_API_KEY=tp-xxx
 VISION_MODEL=mimo-v2.5
-# VISION_PROVIDER: openai_compatible (default) | anthropic | google
-#   anthropic → VISION_BASE_URL=https://api.anthropic.com  VISION_MODEL=claude-sonnet-4-6
-#   google    → VISION_BASE_URL=https://generativelanguage.googleapis.com  VISION_MODEL=gemini-3.5-flash
+# VISION_PROVIDER: openai_compatible (default). Only openai_compatible is
+# supported — reach Claude/Gemini via their OpenAI-compatible endpoints
+# (same base URLs as TEXT above). Leave unset to use the default.
 # VISION_PROVIDER=openai_compatible

-# ---- 4. TTS · Xiaomi MiMo (optional — leave blank to disable) ------
-# Per-character voice design → clone, with per-line delivery direction.
-# Voice identity = the reference audio kept in the session (no server expiry).
-# The adapter appends -voicedesign / -voiceclone to TTS_SPEECH_MODEL.
+# ---- 4. TTS (optional — leave blank to disable) --------------------
+# Provider is auto-detected from TTS_BASE_URL host:
+#   *stepfun.com  → StepFun (preset voices, keyword-scored selection)
+#   otherwise     → Xiaomi MiMo (voicedesign + voiceclone)
+#
+# Xiaomi MiMo — per-character voice design → clone, with per-line delivery.
+#   TTS_BASE_URL=https://token-plan-sgp.xiaomimimo.com/v1
+#   TTS_API_KEY=tp-xxx
+#   TTS_SPEECH_MODEL=mimo-v2.5-tts
+#
+# StepFun — 32 preset voices, auto-selected by gender + age + tone scoring.
+#   TTS_BASE_URL=https://api.stepfun.com/v1
+#   TTS_API_KEY=sk-xxx
+#   TTS_SPEECH_MODEL=step-tts-mini          # or step-tts-2 / stepaudio-2.5-tts
 TTS_BASE_URL=https://token-plan-sgp.xiaomimimo.com/v1
 TTS_API_KEY=tp-xxx
 TTS_SPEECH_MODEL=mimo-v2.5-tts
@@ -135,3 +163,12 @@ NEXT_PUBLIC_UMAMI_DOMAINS=
 # WARNING: rotating this secret invalidates every share file ever issued
 # (decryption will fail with "文件校验失败"). Only change when you're OK with that.
 GALLERY_SECRET=
+
+# ---- 8. Auth · Supabase (optional — leave blank to disable) -------
+# Sign up at https://supabase.com, create a project, copy the URL and
+# publishable key (starts with sb_publishable_ or eyJ…).
+# Both blank → login UI is completely absent, all API routes run unguarded,
+# and the app behaves exactly as before this feature existed.
+# NEXT_PUBLIC_ vars are inlined at BUILD time.
+NEXT_PUBLIC_SUPABASE_URL=
+NEXT_PUBLIC_SUPABASE_PUBLISHABLE_KEY=
@@ -16,6 +16,7 @@ out

 .open-next
 .wrangler
+.opendeploy

 .DS_Store
 *.log
@@ -26,3 +27,9 @@ repomix-output.xml

 users.md
 .dev.vars
+
+pitch/
+
+# OpenDeploy-only build with hardcoded public proxy URL — local, never commit
+# (a public fork would route image traffic through our Cloudflare Worker).
+Dockerfile.opendeploy
@@ -21,7 +21,7 @@ InfiPlot is a Next.js 16 / React 19 / TypeScript app for AI-driven interactive v
 - `lib/engine/agents/`: Architect, Writer, CharacterDesigner, Cinematographer, Painter.
 - `lib/engine/prompts.ts`: Agent prompts and prompt-cache-sensitive message builders.
 - `lib/ai-client/`: Text, image, vision, and retry wrappers.
- `lib/tts-client/`: TTS integration.
+- `lib/tts-client/`: TTS integration. `stepfun-voices.json` is the single source of truth for the 32 StepFun preset voices (shared by the scorer, CharacterDesigner prompt, `/api/tts-provider`, and the enrich script).
 - `lib/config.ts`: Server-side provider/environment loading.
 - `lib/presets.ts`, `lib/ttsPresets.ts`, `lib/options.ts`: Home-page presets and selectable options.
 - `scripts/`: Asset and preset generation helpers.
@@ -79,7 +79,7 @@ Maintain graceful degradation. Existing flows tolerate malformed AI JSON, failed

 `sceneKey` identifies a physical space such as `"classroom-dusk"`. If a new scene shares a key with prior history, the prior scene image should be reused as a reference. Character portraits are also references.

-Runware allows at most 4 references. Preserve the priority: style reference image, prior scene, speaker portrait, then other NPCs. Prefer image URLs for `referenceImages` when needed because Runware can fail to recognize UUIDs. The OpenAI/Gemini image paths can also accept references through the AI SDK, but they return data URIs and synthetic UUIDs, so repeated session transport is heavier than Runware's URL/UUID loop.
+Runware allows at most 4 references. Preserve the priority: style reference image, prior scene, speaker portrait, then other NPCs. Prefer image URLs for `referenceImages` when needed because Runware can fail to recognize UUIDs. The native OpenAI image path (gpt-image) can also accept references via `images.edit`, but returns data URIs and synthetic UUIDs, so repeated session transport is heavier than Runware's URL/UUID loop.

 Writer prompt caching depends on `buildWriterPlanUserMessage()` and `buildWriterBeatsUserMessage()` keeping their stable prefixes intact: world, style, story spine, archived history, known scene keys, and character list. The dynamic suffix contains current state, last beat, exit hint, and the current plan. Do not reorder or reformat stable prefix sections casually; it can destroy cache hit rates.

@@ -91,8 +91,9 @@ Common routes live under `app/api/`:
 - `POST /api/scene`: generates the next scene from an existing session.
 - `POST /api/vision`: interprets scene-image clicks.
 - `POST /api/insert-beat`: creates a transient beat without image generation.
- `POST /api/beat-audio`: lazy TTS for a displayed beat; returns binary audio, or `204` when silent.
+- `POST /api/beat-audio`: lazy TTS for a displayed beat; returns binary audio, or `204` when silent. `voice` is now OPTIONAL — when the server runs StepFun, the client omits the ~220KB Xiaomi reference audio and sends `stepfunVoiceId` / `voiceDescription` instead (saves Fast Origin Transfer bandwidth). The engine re-provisions on a provider mismatch before synthesizing.
 - `POST /api/parse-style-image`: extracts a style prompt from uploaded reference art.
+- `GET /api/tts-provider`: returns `{ provider: "stepfun" | "xiaomi" | null }` (the server's TTS provider, inferred from `TTS_BASE_URL`). Probed once at `/play` mount (non-BYO) so `fetchBeatAudio` can shape its request body — skip the ~220KB Xiaomi reference audio when the server runs StepFun. BYO client TTS takes precedence over this signal.
 - `POST /api/story-pack` / `POST /api/story-unpack`: stateless AES-GCM packing/unpacking for playable story share `.infiplot` files; uses `GALLERY_SECRET`.

 When changing public types or route payloads, update all route callers and client consumers in the same change.
@@ -114,6 +115,7 @@ Use pnpm with Node >=22. `pnpm-lock.yaml` is the source of truth; `package-lock.
 - `pnpm start`: run production server after building.
 - `pnpm lint`: Next.js built-in lint.
 - `pnpm typecheck`: `tsc --noEmit`.
+- `pnpm enrich:firstacts`: one-off enrichment of `public/home/firstact{,-portrait}/*.json` — adds `characters[i].stepfunVoiceId` via a TEXT-provider LLM call per character (uses `.env.local`). Idempotent; `--force` re-picks, `--only=f0,f1` filters, `--portrait` targets the portrait set.
 - `pnpm build:cf`: Cloudflare Workers build through OpenNext.
 - `pnpm preview:cf`: local Cloudflare preview.
 - `pnpm deploy:cf`: Cloudflare deploy.
@@ -136,9 +138,10 @@ Comment only non-obvious sequencing, provider quirks, fallback behavior, or arch

 Use `.env.example` as the source of truth. Never commit `.env.local`, API keys, uploaded user content, or generated secrets.

- Text and Vision use `TEXT_*` and `VISION_*`; default protocol is `openai_compatible`, with native `anthropic` and `google` available via `TEXT_PROVIDER` / `VISION_PROVIDER`.
- Image uses `IMAGE_*`; supported protocols are `runware`, `openai_compatible`, native `openai`, and native `google`. When `IMAGE_PROVIDER` is unset, Runware is inferred from `*.runware.ai` URLs and otherwise falls back to OpenAI-compatible image generations.
- TTS uses Xiaomi MiMo protocol and is optional: blank config means silent mode.
+- Text and Vision use `TEXT_*` and `VISION_*` over the `openai_compatible` protocol (the only supported text/vision protocol); Claude and Gemini are reached via their own OpenAI-compatible endpoints with the `*_PROVIDER` var unset.
+- Image uses `IMAGE_*`; supported protocols are `runware`, `openai_compatible`, and native `openai`. When `IMAGE_PROVIDER` is unset, Runware is inferred from `*.runware.ai` URLs and otherwise falls back to OpenAI-compatible image generations.
+- `IMAGE_TIMEOUT_MS` (per-attempt hard deadline) and `IMAGE_HEDGE_MS` (Painter scene-paint hedging: race a second request when the first is still pending after the threshold) are both OFF when unset — the default path must stay byte-identical to historical behavior. Hedging applies only to the Tier-A scene paint, never to portraits, and never fires after a fast failure (saturation guard). Client-side engine configs (`resolveEngineConfig`) intentionally do not set these fields.
+- TTS supports Xiaomi MiMo (voicedesign + voiceclone) or StepFun (preset voices), inferred from `TTS_BASE_URL` (host containing `stepfun.com` → StepFun, otherwise → MiMo). `CharacterVoice` is a discriminated union on `provider`; synth dispatches on the voice's own tag so a session may carry both shapes through a provider switch. Blank config means silent mode. StepFun voice selection: the CharacterDesigner LLM picks a preset id directly from the 32-entry catalog (`lib/tts-client/stepfun-voices.json`, rendered by `formatStepfunCatalogForPrompt`) when `config.tts` is StepFun — zero extra LLM call. `pickStepfunVoiceId` (keyword scorer) is the fallback for orphan speakers / invalid picks. Prebaked homepage cards are enriched with `Character.stepfunVoiceId` via `scripts/enrich-firstacts-stepfun.mjs` so a card works under either server provider.
 - `MOCK_IMAGE=true` skips image generation and returns a placeholder for cheap local iteration.
 - `NEXT_PUBLIC_IMAGE_PROXY_URL` and `NEXT_PUBLIC_IMAGE_PROXY_ALLOWED_HOSTS` opt into browser-side image proxying for allowed hosts.
 - Analytics uses optional Umami `NEXT_PUBLIC_UMAMI_*` values and must stay content-free/privacy-preserving.
@@ -147,7 +150,7 @@ Use `.env.example` as the source of truth. Never commit `.env.local`, API keys,

 ## File Dependency Map

-If modifying Writer, also check `director.ts`, `prompts.ts`, WriterPlan/StoryState types, and Cinematographer/Painter consumers. If modifying CharacterDesigner, check Director scheduling/merge logic, portrait prompts, voice provisioning, and Painter reference collection. If modifying Cinematographer or Painter, check Director, prompt builders, provider image options, orientation handling, and reference priority. If modifying Architect, check `orchestrator.ts`, `prompts.ts`, and StoryState patch rules. If modifying `lib/types/index.ts`, check all agents, Director, Orchestrator, API routes, and client consumers in `app/page.tsx`, `app/play/page.tsx`, and `components/PlayCanvas.tsx`. If modifying TTS, check server `beat-audio`, BYO client TTS, voice stripping/merging, and payload privacy. If modifying image delivery, check Painter, `lib/ai-client/image.ts`, mock images, orientation dimensions, preload/proxy logic, and style-reference validation.
+If modifying Writer, also check `director.ts`, `prompts.ts`, WriterPlan/StoryState types, and Cinematographer/Painter consumers. If modifying CharacterDesigner, check Director scheduling/merge logic, portrait prompts, voice provisioning, Painter reference collection, and (StepFun path) the `buildCharacterDesignerSystem` catalog injection + `stepfunVoiceId` validation. If modifying the StepFun voice catalog (`lib/tts-client/stepfun-voices.json`), also check `formatStepfunCatalogForPrompt`, `isValidStepfunVoiceId`, the CharacterDesigner system prompt, and the enrich script. If modifying Cinematographer or Painter, check Director, prompt builders, provider image options, orientation handling, and reference priority. If modifying Architect, check `orchestrator.ts`, `prompts.ts`, and StoryState patch rules. If modifying `lib/types/index.ts`, check all agents, Director, Orchestrator, API routes, and client consumers in `app/page.tsx`, `app/play/page.tsx`, and `components/PlayCanvas.tsx`. If modifying TTS, check server `beat-audio` (including the `resolveVoice` provider-mismatch normalization), `/api/tts-provider`, BYO client TTS, voice stripping/merging, payload privacy, and the StepFun voice-id flow (CharacterDesigner → provision → synth). If modifying image delivery, check Painter, `lib/ai-client/image.ts`, mock images, orientation dimensions, preload/proxy logic, and style-reference validation.

 ## Guide Maintenance

@@ -197,15 +197,24 @@ See the [Bring-your-own voice Key guide](docs/xiaomi-tts-key.md) for how to obta

 ## Roadmap

- [ ] Make generation latency imperceptible
- [ ] Compatibility with more model providers
- [ ] Free-form player input mid-story
- [ ] Mobile browser support
- [ ] User accounts and login
- [ ] Upgrade from static images to motion video
- [ ] Voice interaction
- [ ] Share the story you're playing
- [ ] Mobile app
+**Completed**
+
+- [x] Latency optimized to ~10s
+- [x] Vision-based image interaction
+- [x] One-click deploy & custom model config
+- [x] Frontend API Key & model setup
+- [x] Mobile web support
+- [x] Story sharing (`.infiplot` format)
+
+**To Do**
+
+- [ ] Mobile app & creator platform
+- [ ] ComfyUI custom image generation
+- [ ] Open Deploy quick deployment
+- [ ] Reduce latency to under 5s
+- [ ] Story save & resume
+- [ ] Custom character cards & world settings
+- [ ] Prompt cache hit-rate optimization

 ---

@@ -196,15 +196,24 @@ Xiaomi は TTS モデルに RPM/TPM 制限を設けています。公開デプ

 ## Roadmap

- [ ] 生成遅延を体感できないレベルまで下げる
- [ ] より多くのモデルプロバイダに対応
- [ ] プレイ中の自由入力対応
- [ ] モバイルブラウザ対応
- [ ] ユーザー登録・ログイン機能
- [ ] 静止画から動画へのアップグレード
- [ ] 音声インタラクション
- [ ] プレイ中のストーリーを共有
- [ ] モバイルアプリ
+**実装済み**
+
+- [x] レイテンシを約 10 秒に最適化
+- [x] ビジョンベース画像インタラクション
+- [x] ワンクリックデプロイ＆カスタムモデル設定
+- [x] フロントエンドで API Key・モデル設定
+- [x] モバイル Web 対応
+- [x] ストーリー共有（`.infiplot` 形式）
+
+**未実装**
+
+- [ ] モバイルアプリ＆クリエイタープラットフォーム
+- [ ] ComfyUI カスタム画像生成対応
+- [ ] Open Deploy クイックデプロイ
+- [ ] レイテンシを 5 秒以内に短縮
+- [ ] ストーリーの保存・再開
+- [ ] カスタムキャラクターカード＆世界観設定
+- [ ] プロンプトキャッシュヒット率の最適化

 ---

@@ -132,7 +132,7 @@ docker compose up -d

 ## 团队与愿景

-我们是一群来自清华大学、兰州大学、西安交通大学等高校的年轻人。
+我们是一群来自清华大学、兰州大学等高校的年轻人。

 一方面，我们本来就是galgame、乙女游戏、FMV、AI角色扮演游戏这类游戏的深度用户，在享受游戏体验的同时，也会想象如果能选择不被预设的剧情选项，或者和对话的AI角色深度互动而不只是通过聊天软件聊天，该是多么愉快刺激的体验。

@@ -145,9 +145,9 @@ docker compose up -d

 联系方式：hi@infiplot.com

-欢迎扫码加入 **InfiPlot 内测交流群**（QQ 群号 `575404333`），一起反馈体验、参与共建：
+欢迎扫码加入 **InfiPlot 公测交流群**（QQ 群号 `575404333`），一起反馈体验、参与共建：

-<img src="public/qq-group.webp" alt="InfiPlot 内测交流群 QQ 二维码" width="200" />
+<img src="public/qq-group.webp" alt="InfiPlot 公测交流群 QQ 二维码" width="200" />

 ---

@@ -208,15 +208,24 @@ InfiPlot 会与四类模型供应商通信。**文本（Text）和视觉（Visio

 ## Roadmap

- [ ] 让用户感知不到生成延迟
- [ ] 兼容更多模型 provider
- [ ] 游玩过程中支持用户自定义输入
- [ ] 移动端浏览器适配
- [ ] 用户注册登录系统
- [ ] 由静态图升级为动态视频
- [ ] 语音交互
- [ ] 分享正在游玩的故事
- [ ] 移动端 app
+**已实现**
+
+- [x] 延迟优化至约 10 秒
+- [x] 视觉识图交互
+- [x] 一键部署与自定义模型配置
+- [x] 前端直配 API Key 与模型
+- [x] 移动端 Web 适配
+- [x] 剧情分享（`.infiplot` 格式）
+
+**未实现**
+
+- [ ] 移动端 App 与创作平台
+- [ ] 兼容 ComfyUI 自定义生图
+- [ ] Open Deploy 快速部署
+- [ ] 延迟压缩至 5 秒以内
+- [ ] 剧情存档与续玩
+- [ ] 自定义角色卡与世界观
+- [ ] Prompt 缓存命中率优化

 ---

@@ -2,10 +2,14 @@ import { requestBeatAudio } from "@infiplot/engine";
 import type { BeatAudioRequest } from "@infiplot/types";
 import { NextResponse } from "next/server";
 import { loadEngineConfig } from "@/lib/config";
+import { requireUser } from "@/lib/supabase/guard";

 export const runtime = "nodejs";

 export async function POST(req: Request) {
+  const auth = await requireUser();
+  if (auth instanceof NextResponse) return auth;
+
  let body: BeatAudioRequest;
  try {
    body = (await req.json()) as BeatAudioRequest;
@@ -13,9 +17,26 @@ export async function POST(req: Request) {
    return NextResponse.json({ error: "Invalid JSON" }, { status: 400 });
  }

-  if (!body.beat?.id || !body.beat?.line || !body.voice?.referenceAudioBase64) {
+  // Voice is now optional — when the server runs StepFun, the client omits
+  // the ~220KB Xiaomi reference audio and sends stepfunVoiceId /
+  // voiceDescription instead (saves Fast Origin Transfer bandwidth). The
+  // engine's resolveVoice re-provisions on a provider mismatch. We only
+  // require the beat text + SOMETHING to synthesize from.
+  const VALID_TTS_PROVIDERS = ["xiaomi", "stepfun"];
+  const hasInvalidVoiceProvider =
+    !!body.voice?.provider && !VALID_TTS_PROVIDERS.includes(body.voice.provider);
+  const hasVoice =
+    !!body.voice?.provider && VALID_TTS_PROVIDERS.includes(body.voice.provider);
+  const hasFallback =
+    !!body.stepfunVoiceId || !!body.voiceDescription;
+  if (
+    !body.beat?.id ||
+    !body.beat?.line ||
+    hasInvalidVoiceProvider ||
+    (!hasVoice && !hasFallback)
+  ) {
    return NextResponse.json(
-      { error: "beat.id, beat.line and voice.referenceAudioBase64 are required" },
+      { error: "beat.id and beat.line are required, plus either voice.provider (xiaomi|stepfun) or stepfunVoiceId/voiceDescription" },
      { status: 400 },
    );
  }
@@ -2,10 +2,14 @@ import { classifyFreeform } from "@infiplot/engine";
 import type { FreeformClassifyRequest } from "@infiplot/types";
 import { NextResponse } from "next/server";
 import { loadEngineConfig } from "@/lib/config";
+import { requireUser } from "@/lib/supabase/guard";

 export const runtime = "nodejs";

 export async function POST(req: Request) {
+  const auth = await requireUser();
+  if (auth instanceof NextResponse) return auth;
+
  let body: FreeformClassifyRequest;
  try {
    body = (await req.json()) as FreeformClassifyRequest;
@@ -4,8 +4,8 @@ export const runtime = "nodejs";

 // Cap a bit above pack's MAX_DOC_BYTES — ciphertext adds the 16-byte GCM tag
 // and the 17-byte header; some slack accommodates near-cap docs without
-// rejecting them at unpack time.
-const MAX_FILE_BYTES = 6_000_000;
+// rejecting them at unpack time. Bumped to fit pre-baked beat audio.
+const MAX_FILE_BYTES = 13_000_000;

 // Decrypt a `.infiplot` share file back to its doc JSON string. Returns the
 // plaintext as a JSON field (not raw bytes) so the client can chain it through
@@ -2,10 +2,14 @@ import { requestInsertBeat } from "@infiplot/engine";
 import type { InsertBeatRequest } from "@infiplot/types";
 import { NextResponse } from "next/server";
 import { loadEngineConfig } from "@/lib/config";
+import { requireUser } from "@/lib/supabase/guard";

 export const runtime = "nodejs";

 export async function POST(req: Request) {
+  const auth = await requireUser();
+  if (auth instanceof NextResponse) return auth;
+
  let body: InsertBeatRequest;
  try {
    body = (await req.json()) as InsertBeatRequest;
@@ -5,6 +5,7 @@ import type {
 } from "@infiplot/types";
 import { NextResponse } from "next/server";
 import { loadEngineConfig } from "@/lib/config";
+import { requireUser } from "@/lib/supabase/guard";

 export const runtime = "nodejs";

@@ -26,6 +27,9 @@ Do NOT describe the characters, objects, or scene contents. Output exactly one J
 {"stylePrompt": "<comma-separated English visual-style attributes, ~30-60 words>"}`;

 export async function POST(req: Request) {
+  const auth = await requireUser();
+  if (auth instanceof NextResponse) return auth;
+
  let body: ParseStyleImageRequest;
  try {
    body = (await req.json()) as ParseStyleImageRequest;
@@ -2,6 +2,7 @@ import { requestScene } from "@infiplot/engine";
 import type { Character, SceneRequest } from "@infiplot/types";
 import { NextResponse } from "next/server";
 import { loadEngineConfig } from "@/lib/config";
+import { requireUser } from "@/lib/supabase/guard";

 function stripKnownVoices(
  characters: Character[],
@@ -15,6 +16,9 @@ function stripKnownVoices(
 export const runtime = "nodejs";

 export async function POST(req: Request) {
+  const auth = await requireUser();
+  if (auth instanceof NextResponse) return auth;
+
  let body: SceneRequest;
  try {
    body = (await req.json()) as SceneRequest;
@@ -2,6 +2,7 @@ import { startSession } from "@infiplot/engine";
 import type { StartRequest } from "@infiplot/types";
 import { NextResponse } from "next/server";
 import { loadEngineConfig } from "@/lib/config";
+import { requireUser } from "@/lib/supabase/guard";

 export const runtime = "nodejs";

@@ -11,6 +12,9 @@ export const runtime = "nodejs";
 const MAX_STYLE_REF_BYTES = 3 * 1024 * 1024;

 export async function POST(req: Request) {
+  const auth = await requireUser();
+  if (auth instanceof NextResponse) return auth;
+
  let body: StartRequest;
  try {
    body = (await req.json()) as StartRequest;
@@ -0,0 +1,25 @@
+import type { TtsProviderResponse } from "@infiplot/types";
+import { inferTtsProvider } from "@infiplot/tts-client";
+import { NextResponse } from "next/server";
+import { loadEngineConfig } from "@/lib/config";
+import { requireUser } from "@/lib/supabase/guard";
+
+export const runtime = "nodejs";
+
+// GET /api/tts-provider — tells the client which TTS provider the server is
+// configured for, so the play page can shape /api/beat-audio request bodies
+// accordingly (skip the ~220KB Xiaomi reference audio when the server runs
+// StepFun → saves Fast Origin Transfer bandwidth; the response itself is a
+// few dozen bytes). Runs once at /play mount; same auth as other routes so
+// the provider (a server-config fact, not user data) isn't leaked publicly.
+// BYO client TTS (clientTts:true) takes precedence and bypasses this signal.
+//
+// Responds with JSON `{ provider }`. `provider` is null when the loaded
+// engine config has no `tts` entry; otherwise it is whatever
+// inferTtsProvider() returns for that entry.
+export async function GET() {
+  // requireUser() may resolve to a NextResponse (presumably an auth
+  // rejection — confirm in lib/supabase/guard); if so, return it unchanged.
+  const auth = await requireUser();
+  if (auth instanceof NextResponse) return auth;
+
+  const cfg = loadEngineConfig();
+  // No TTS configured → report null instead of calling inferTtsProvider.
+  const provider = cfg.tts ? inferTtsProvider(cfg.tts) : null;
+
+  const body: TtsProviderResponse = { provider };
+  return NextResponse.json(body);
+}
@@ -2,6 +2,7 @@ import { visionDecide } from "@infiplot/engine";
 import type { VisionRequest } from "@infiplot/types";
 import { NextResponse } from "next/server";
 import { loadEngineConfig } from "@/lib/config";
+import { requireUser } from "@/lib/supabase/guard";

 export const runtime = "nodejs";

@@ -11,6 +12,9 @@ export const runtime = "nodejs";
 const MAX_ANNOTATED_BYTES = 3 * 1024 * 1024;

 export async function POST(req: Request) {
+  const auth = await requireUser();
+  if (auth instanceof NextResponse) return auth;
+
  let body: VisionRequest;
  try {
    body = (await req.json()) as VisionRequest;
@@ -0,0 +1,40 @@
+import { type NextRequest, NextResponse } from "next/server";
+import { AUTH_ENABLED } from "@/lib/supabase/config";
+import { createClient } from "@/lib/supabase/server";
+
+// Only allow same-origin relative paths. Rejects `//evil.com`, `/\evil.com`,
+// and absolute URLs that would otherwise turn `${origin}${next}` into an
+// open redirect (CWE-601).
+//
+// Returns `raw` unchanged when it passes every check below; returns "/" for
+// null/empty input or for any value that fails a check.
+function safeNext(raw: string | null): string {
+  // Must be present and start with a single "/" (i.e. a relative path).
+  if (!raw || !raw.startsWith("/")) return "/";
+  // "//host" and "/\host" are rejected here — see the open-redirect note above.
+  if (raw.startsWith("//") || raw.startsWith("/\\")) return "/";
+  // Reject control chars (CR/LF etc.) — defense-in-depth against header
+  // injection if `next` ever reaches a context that doesn't re-encode it.
+  for (let i = 0; i < raw.length; i++) {
+    const code = raw.charCodeAt(i);
+    // C0 control range (< 0x20) and DEL (0x7f).
+    if (code < 0x20 || code === 0x7f) return "/";
+  }
+  return raw;
+}
+
+// GET handler that exchanges the `code` query parameter for a Supabase
+// session and then redirects the browser:
+//   - auth disabled (AUTH_ENABLED false)  → `${origin}/`
+//   - exchange succeeded                  → `${origin}${next}`, where `next`
+//                                           is the `next` query parameter
+//                                           filtered through safeNext()
+//   - `code` missing or exchange errored  → `${origin}/?auth_error=1`
+export async function GET(request: NextRequest) {
+  const { searchParams, origin } = request.nextUrl;
+
+  // Auth not configured: nothing can legitimately hit this route, so just
+  // bounce home instead of constructing a Supabase client from blank env vars.
+  if (!AUTH_ENABLED) {
+    return NextResponse.redirect(`${origin}/`);
+  }
+
+  const code = searchParams.get("code");
+  // Sanitize before use so the redirect target stays on this origin.
+  const next = safeNext(searchParams.get("next"));
+
+  if (code) {
+    const supabase = await createClient();
+    const { error } = await supabase.auth.exchangeCodeForSession(code);
+    if (!error) {
+      return NextResponse.redirect(`${origin}${next}`);
+    }
+  }
+  // Falls through when there is no code or the exchange reported an error.
+  return NextResponse.redirect(`${origin}/?auth_error=1`);
+}
@@ -57,8 +57,11 @@ export type GalleryScene = {
 };

 export type GalleryDoc = {
-  /** v1 = scenes only (initial export). v2 = + alternates + characters. */
-  v: 1 | 2;
+  /** v1 = scenes only (initial export). v2 = + alternates + characters.
+   *  v3 = + beat audio (stored in a sidecar localStorage key so the main
+   *  doc stays small and the first paint isn't blocked by JSON.parse-ing
+   *  several MB of base64). */
+  v: 1 | 2 | 3;
  id: string;
  createdAt: number;
  orientation: Orientation;
@@ -71,13 +74,18 @@ export type GalleryDoc = {
 };

 const STORAGE_PREFIX = "infiplot:gallery:";
+const AUDIO_SUFFIX = ":audio";
+const MUTED_STORAGE_KEY = "infiplot:gallery:muted";

 function readDoc(id: string): GalleryDoc | null {
  try {
    const raw = window.localStorage.getItem(STORAGE_PREFIX + id);
    if (!raw) return null;
    const parsed = JSON.parse(raw) as GalleryDoc;
-    if ((parsed.v !== 1 && parsed.v !== 2) || !Array.isArray(parsed.scenes)) {
+    if (
+      (parsed.v !== 1 && parsed.v !== 2 && parsed.v !== 3) ||
+      !Array.isArray(parsed.scenes)
+    ) {
      return null;
    }
    return parsed;
@@ -86,6 +94,23 @@ function readDoc(id: string): GalleryDoc | null {
  }
 }

+// Reads the audio sidecar for gallery doc `id` from localStorage (key
+// STORAGE_PREFIX + id + AUDIO_SUFFIX) and returns a filtered copy of it.
+// Only entries whose value is a string starting with "data:" are kept.
+// Returns {} when the key is absent/empty or when reading or parsing throws.
+// NOTE(review): keys are presumably `${sceneId}:${beatId}` to match the
+// lookup in Slide — confirm against the code that writes the sidecar.
+function readSidecarAudio(id: string): Record<string, string> {
+  try {
+    const raw = window.localStorage.getItem(
+      STORAGE_PREFIX + id + AUDIO_SUFFIX,
+    );
+    if (!raw) return {};
+    // The cast is unchecked; the loop below validates each value at runtime.
+    const parsed = JSON.parse(raw) as Record<string, string>;
+    const out: Record<string, string> = {};
+    for (const [k, v] of Object.entries(parsed)) {
+      if (typeof v === "string" && v.startsWith("data:")) out[k] = v;
+    }
+    return out;
+  } catch {
+    // Best-effort: storage access or JSON errors are treated as "no audio".
+    return {};
+  }
+}
+
 function detectOrientation(): Orientation {
  if (typeof window === "undefined") return "landscape";
  const portrait = window.matchMedia("(orientation: portrait)").matches;
@@ -352,6 +377,8 @@ function Slide({
  beatId,
  orientation,
  alternates,
+  audioByBeatId,
+  muted,
  dialogueOpen,
  setDialogueOpen,
  onAdvanceBeat,
@@ -361,6 +388,8 @@ function Slide({
  beatId: string;
  orientation: Orientation;
  alternates: Record<string, GalleryScene>;
+  audioByBeatId: Record<string, string>;
+  muted: boolean;
  dialogueOpen: boolean;
  setDialogueOpen: (b: boolean) => void;
  onAdvanceBeat: (nextBeatId: string) => void;
@@ -372,6 +401,24 @@ function Slide({

  const beat = findBeat(scene, beatId) ?? findBeat(scene, scene.entryBeatId);

+  const audioSrc =
+    beat && scene.id && !muted
+      ? (audioByBeatId[`${scene.id}:${beat.id}`] ?? null)
+      : null;
+  const audioRef = useRef<HTMLAudioElement | null>(null);
+  useEffect(() => {
+    const el = audioRef.current;
+    if (!el) return;
+    if (!audioSrc) {
+      el.pause();
+      return;
+    }
+    el.currentTime = 0;
+    void el.play().catch(() => {
+      // Browsers can refuse autoplay until user interacts — silent fail is fine.
+    });
+  }, [audioSrc]);
+
  const choices: BeatChoice[] =
    beat?.next.type === "choice"
      ? (beat.next as { type: "choice"; choices: BeatChoice[] }).choices
@@ -533,6 +580,16 @@ function Slide({
          onClose={() => setDialogueOpen(false)}
        />
      )}
+
+      {audioSrc && (
+        <audio
+          ref={audioRef}
+          src={audioSrc}
+          autoPlay
+          preload="auto"
+          className="hidden"
+        />
+      )}
    </div>
  );
 }
@@ -561,6 +618,20 @@ function GalleryInner() {
  const [downloadingPortraits, setDownloadingPortraits] = useState(false);
  const [orientation, setOrientation] = useState<Orientation>("landscape");
  const [presentation, setPresentation] = useState(false);
+  // Audio map keyed by `${sceneId}:${beatId}`. Loaded in two phases: the
+  // sidecar localStorage key (gallery export path) is read lazily after first
+  // paint so the multi-MB JSON.parse doesn't block the first scene image's
+  // progressive paint. Imports from `.infiplot` files set this synchronously
+  // since the data is already in memory.
+  const [audioByBeatId, setAudioByBeatId] = useState<Record<string, string>>({});
+  const [muted, setMuted] = useState<boolean>(() => {
+    if (typeof window === "undefined") return false;
+    try {
+      return window.localStorage.getItem(MUTED_STORAGE_KEY) === "1";
+    } catch {
+      return false;
+    }
+  });
  // Top toolbar auto-hide while in fullscreen — it shows briefly on entry,
  // retracts upward, and pops back down when the cursor approaches the top
  // edge. Outside presentation mode the bar is always visible.
@@ -609,6 +680,17 @@ function GalleryInner() {
    setOrientation(d.orientation ?? detectOrientation());
    const first = d.scenes[0]!;
    setStack([{ scene: first, beatId: first.entryBeatId, mainIdx: 0 }]);
+
+    // Lazy-load the audio sidecar AFTER first paint so its JSON.parse (~MBs
+    // of base64) doesn't block the main thread, which would make the first
+    // image paint row-by-row. setTimeout(0) yields to the renderer first.
+    if (d.v === 3) {
+      const t = window.setTimeout(() => {
+        const audio = readSidecarAudio(id);
+        if (Object.keys(audio).length > 0) setAudioByBeatId(audio);
+      }, 0);
+      return () => window.clearTimeout(t);
+    }
  }, []);

  // Prefer the doc's stored orientation; fall back to the device.
@@ -1035,6 +1117,8 @@ function GalleryInner() {
        beatId={top.beatId}
        orientation={orientation}
        alternates={alternates}
+        audioByBeatId={audioByBeatId}
+        muted={muted}
        dialogueOpen={dialogueOpen}
        setDialogueOpen={setDialogueOpen}
        onAdvanceBeat={onAdvanceBeat}
@@ -1080,6 +1164,27 @@ function GalleryInner() {
        </div>

        <div className="pointer-events-auto flex items-center gap-2">
+          {Object.keys(audioByBeatId).length > 0 && (
+            <button
+              type="button"
+              onClick={() => {
+                const next = !muted;
+                setMuted(next);
+                try {
+                  window.localStorage.setItem(MUTED_STORAGE_KEY, next ? "1" : "0");
+                } catch {
+                  // ignore
+                }
+              }}
+              className="flex h-9 w-9 items-center justify-center rounded-full bg-black/40 text-white/80 backdrop-blur-sm transition-colors hover:text-white"
+              aria-label={muted ? "取消静音" : "静音"}
+              title={muted ? "取消静音" : "静音"}
+            >
+              <i
+                className={`fa-solid ${muted ? "fa-volume-xmark" : "fa-volume-high"} text-[12px]`}
+              />
+            </button>
+          )}
          <button
            type="button"
            onClick={() => void togglePresentation()}
@@ -88,6 +88,30 @@
  .vn-scrollbar::-webkit-scrollbar-corner {
    background: transparent;
  }
+
+  /* Ultra-thin scrollbar with a transparent (invisible) track */
+  .thin-scrollbar {
+    scrollbar-width: thin;
+    scrollbar-color: rgba(195, 155, 75, 0.5) transparent;
+  }
+
+  .thin-scrollbar::-webkit-scrollbar {
+    width: 4px;
+    height: 4px;
+  }
+
+  .thin-scrollbar::-webkit-scrollbar-track {
+    background: transparent;
+  }
+
+  .thin-scrollbar::-webkit-scrollbar-thumb {
+    background: rgba(195, 155, 75, 0.45);
+    border-radius: 999px;
+  }
+
+  .thin-scrollbar::-webkit-scrollbar-thumb:hover {
+    background: rgba(220, 180, 95, 0.7);
+  }
 }

@keyframes infiplot-ripple {
@@ -12,7 +12,14 @@ import {
 } from "@/lib/options";
 import { readStoredTtsConfig } from "@/lib/clientTtsConfig";
 import { SettingsModal, readStoredPlayerName, readStoredVisionClick } from "@/components/SettingsModal";
+import { analyzeImageDataUrl } from "@infiplot/ai-client";
+import { readStoredModelConfig, resolveEngineConfig } from "@/lib/clientModelConfig";
+import { STYLE_EXTRACTION_PROMPT } from "@/lib/styleExtraction";
 import { STORY_SHARE_STORAGE_KEY, parseStoryShareDoc } from "@/lib/storyShare";
+import { AUTH_ENABLED } from "@/lib/supabase/config";
+import { isAuthed, writeResumeSnapshot } from "@/lib/authResume";
+import { AuthModal } from "@/components/AuthModal";
+import { UserChip } from "@/components/UserChip";

 /* ============================================================================
   InfiPlot · 首页（编辑式视觉风格 · 居中构图，呼应低保真原型）
@@ -821,7 +828,7 @@ function CategorySelect({
        />
      </button>
      {open && (
-        <div className="absolute left-0 top-full mt-2 z-30 min-w-[150px] py-1.5 bg-cream-50 border border-clay-900/15 rounded-sm shadow-xl shadow-clay-900/10">
+        <div className="absolute left-0 top-full mt-2 z-30 min-w-[150px] max-w-[calc(100vw-2rem)] py-1.5 bg-cream-50 border border-clay-900/15 rounded-sm shadow-xl shadow-clay-900/10">
          {items.map((it, i) => (
            <button
              key={i}
@@ -844,6 +851,42 @@ function CategorySelect({

 /* ---------- style picker modal ---------- */

+const PENDING_START_KEY = "infiplot:pending-start";
+const PENDING_PARSE_KEY = "infiplot:pending-parse";
+
+// Shared by the StyleModal uploader and the post-login resume path: turns a
+// resized data URL into a trimmed style prompt ("" if none came back), via the
+// browser engine when a BYO model config is stored, otherwise the server
+// route. Throws when the server route answers with a non-OK status.
+async function extractStylePromptFromImage(resized: string): Promise<string> {
+  // Normalizes a model/server payload: `{ stylePrompt }` objects and bare
+  // strings yield text; null, arrays, numbers, or a non-string field yield "".
+  const pick = (v: unknown): string => {
+    const s = v !== null && typeof v === "object" ? (v as { stylePrompt?: unknown }).stylePrompt : v;
+    return typeof s === "string" ? s.trim() : "";
+  };
+  const modelCfg = readStoredModelConfig();
+  if (modelCfg) {
+    const { vision } = resolveEngineConfig(modelCfg, null);
+    const raw = await analyzeImageDataUrl(vision, resized, STYLE_EXTRACTION_PROMPT);
+    try {
+      return pick(JSON.parse(raw));
+    } catch {
+      return pick(raw); // not JSON — treat the whole reply as the prompt
+    }
+  }
+  const r = await fetch("/api/parse-style-image", {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: JSON.stringify({ imageDataUrl: resized }),
+  });
+  if (!r.ok) {
+    const data = (await r.json().catch(() => ({}))) as { error?: string };
+    throw new Error(data.error || `HTTP ${r.status}`);
+  }
+  return pick(await r.json());
+}
+
 function StyleModal({
  items,
  value,
@@ -853,6 +896,7 @@ function StyleModal({
  setCustomStyleGuide,
  customStyleRefImage,
  setCustomStyleRefImage,
+  onRequireAuth,
 }: {
  items: string[];
  value: number;
@@ -862,6 +906,7 @@ function StyleModal({
  setCustomStyleGuide: (s: string) => void;
  customStyleRefImage: string;
  setCustomStyleRefImage: (s: string) => void;
+  onRequireAuth: () => void;
 }) {
  const [q, setQ] = useState("");
  const [shown, setShown] = useState(false);
@@ -870,7 +915,7 @@ function StyleModal({
  const [parsing, setParsing] = useState(false);
  const [parseError, setParseError] = useState<string | null>(null);
  const fileInputRef = useRef<HTMLInputElement>(null);
-  const thumbV = "v5";
+  const thumbV = "v6";
  const STYLE_THUMB: Record<string, string> = {
    "自动": `/home/styles/auto.webp?${thumbV}`,
    "自定义风格": `/home/styles/custom.webp?${thumbV}`,
@@ -976,17 +1021,20 @@ function StyleModal({
    setParsing(true);
    try {
      const resized = await resizeImageToDataUrl(file);
-      const res = await fetch("/api/parse-style-image", {
-        method: "POST",
-        headers: { "Content-Type": "application/json" },
-        body: JSON.stringify({ imageDataUrl: resized }),
-      });
-      if (!res.ok) {
-        const j = (await res.json().catch(() => ({}))) as { error?: string };
-        throw new Error(j.error ?? `${res.status}`);
+      // The parse is a paid vision call, so require login first. The resize is
+      // already done — stash it so login can auto-resume the parse on return.
+      if (!(await isAuthed())) {
+        try {
+          sessionStorage.setItem(PENDING_PARSE_KEY, resized);
+        } catch {
+          /* too big to stash — user re-uploads after login */
+        }
+        onRequireAuth();
+        return;
      }
-      const data = (await res.json()) as { stylePrompt: string };
-      setDraft(data.stylePrompt);
+      const stylePrompt = await extractStylePromptFromImage(resized);
+      if (!stylePrompt) throw new Error("视觉模型返回了空的风格描述");
+      setDraft(stylePrompt);
      setCustomStyleRefImage(resized);
      track("style_image_upload", { ok: true });
    } catch (err) {
@@ -1023,7 +1071,7 @@ function StyleModal({
          (shown ? "opacity-100 scale-100" : "opacity-0 scale-95")
        }
      >
-        <div className="flex items-center gap-5 px-6 md:px-8 py-5 border-b border-clay-900/10">
+        <div className="flex items-center gap-3 md:gap-5 px-5 md:px-8 py-4 md:py-5 border-b border-clay-900/10">
          {view === "custom" ? (
            <div className="flex flex-1 items-center gap-3">
              <button
@@ -1040,11 +1088,11 @@ function StyleModal({
            <>
              <div className="flex flex-1 flex-col">
                <span className="font-serif text-xl md:text-2xl text-clay-900">选择绘画风格</span>
-                <span className="text-[11px] text-clay-500 mt-1 tracking-wide">
+                <span className="hidden md:block text-[11px] text-clay-500 mt-1 tracking-wide">
                  默认「自动」· 由 AI 根据故事自动匹配画风；选择「自定义风格」可输入描述或上传参考图
                </span>
              </div>
-              <div className="relative w-[280px] max-w-[46vw]">
+              <div className="relative w-[150px] max-w-[40vw] md:w-[280px] md:max-w-[46vw]">
                <input
                  value={q}
                  onChange={(e) => setQ(e.target.value)}
@@ -1093,7 +1141,7 @@ function StyleModal({
                {parseError}
              </span>
            )}
-            <div className="flex items-center gap-2">
+            <div className="flex flex-wrap items-center gap-2">
              {customStyleRefImage ? (
                <div className="flex items-center gap-2">
                  {/* eslint-disable-next-line @next/next/no-img-element */}
@@ -1149,7 +1197,7 @@ function StyleModal({
                  const v = e.target.value;
                  if (v && STYLE_MAP[v]) setDraft(STYLE_MAP[v]);
                }}
-                className="h-8 w-44 rounded-sm border border-clay-900/15 bg-cream-50 px-2 font-sans text-[12px] text-clay-700 outline-none transition-colors focus:border-ember-500"
+                className="h-8 w-36 md:w-44 rounded-sm border border-clay-900/15 bg-cream-50 px-2 font-sans text-[12px] text-clay-700 outline-none transition-colors focus:border-ember-500"
              >
                <option value="">从预设风格导入…</option>
                {Object.keys(STYLE_MAP).map((s) => (
@@ -1256,11 +1304,15 @@ export default function HomePage() {
  // 顶部使用提示：默认展示，用户可点 × 永久关闭（localStorage:infiplot:hintClosed）。
  const [hintClosed, setHintClosed] = useState(false);

-  // 统一设置弹窗（名字 + 识图 + TTS Key）：可选增强，数据只存浏览器。
+  // 统一设置弹窗（通用 + 模型）：可选增强，数据只存浏览器。
  const [settingsOpen, setSettingsOpen] = useState(false);
+  const [settingsTab, setSettingsTab] = useState<"general" | "models">("general");
  const [ttsConfigured, setTtsConfigured] = useState(false);
  const [playerName, setPlayerName] = useState("");
  const [visionClickEnabled, setVisionClickEnabled] = useState(true);
+  const [authModalOpen, setAuthModalOpen] = useState(false);
+  const [pendingAction, setPendingAction] = useState<"start" | null>(null);
+

  const styleRow = OPTS.findIndex((o) => o.modal);
  const voiceRow = OPTS.findIndex((o) => o.label === "语音配音");
@@ -1330,7 +1382,114 @@ export default function HomePage() {
    }
  };

-  const start = () => {
+  // ── Auth-gated resume (OAuth round-trips lose all React state) ──────────
+  // An OAuth login unmounts the homepage and discards everything the user
+  // typed. We snapshot the form before redirecting and replay it on return.
+  // The email-OTP path keeps state in place and resumes synchronously via
+  // AuthModal.onSuccess instead.
+  const [autoStartPending, setAutoStartPending] = useState(false);
+
+  const persistPendingStart = () => {
+    const snap = { prompt, sel, customStyleGuide, customStyleRefImage, playerName };
+    // Quota fallback: the data-URL style ref (~100KB) is the usual culprit —
+    // drop it first; text-only form still resumes the start.
+    writeResumeSnapshot(PENDING_START_KEY, snap, [
+      { ...snap, customStyleRefImage: "" },
+    ]);
+  };
+
+  // Consumes the stashed style-reference image (if any) and replays the parse.
+  const resumePendingParse = async () => {
+    const stashed = sessionStorage.getItem(PENDING_PARSE_KEY);
+    if (!stashed) return;
+    sessionStorage.removeItem(PENDING_PARSE_KEY);
+    try {
+      const guide = await extractStylePromptFromImage(stashed);
+      if (!guide) return;
+      setCustomStyleGuide(guide);
+      setCustomStyleRefImage(stashed);
+      const customIdx = ART_STYLES.indexOf("自定义风格");
+      const canSelect = styleRow >= 0 && customIdx >= 0;
+      if (canSelect) setSel((prev) => prev.map((cur, row) => (row === styleRow ? customIdx : cur)));
+      track("style_image_upload", { ok: true });
+    } catch {
+      /* resume parse failed — stay silent, user can re-upload */
+    }
+  };
+
+  // Replays the form snapshot saved before an OAuth redirect, then queues start().
+  const resumePendingStart = () => {
+    const serialized = sessionStorage.getItem(PENDING_START_KEY);
+    if (!serialized) return;
+    sessionStorage.removeItem(PENDING_START_KEY);
+    try {
+      const saved = JSON.parse(serialized) as Partial<{
+        prompt: string;
+        sel: number[];
+        customStyleGuide: string;
+        customStyleRefImage: string;
+        playerName: string;
+      }>;
+      setPrompt(saved.prompt ?? "");
+      if (Array.isArray(saved.sel)) setSel(saved.sel);
+      setCustomStyleGuide(saved.customStyleGuide ?? "");
+      setCustomStyleRefImage(saved.customStyleRefImage ?? "");
+      if (saved.playerName) setPlayerName(saved.playerName);
+      setAutoStartPending(true); // start() runs next render, seeing restored state
+    } catch {
+      /* corrupt snapshot — ignore */
+    }
+  };
+
+  // On mount after an OAuth redirect: if a pending action was left and the user
+  // is now signed in, restore and continue; otherwise clear stale snapshots.
+  useEffect(() => {
+    if (!AUTH_ENABLED) return;
+    const hasStart = sessionStorage.getItem(PENDING_START_KEY) !== null;
+    const hasParse = sessionStorage.getItem(PENDING_PARSE_KEY) !== null;
+    if (!hasStart && !hasParse) return;
+    let cancelled = false;
+    void (async () => {
+      // Gate BOTH snapshots on auth: a stale leftover from an abandoned login
+      // must not resurrect a half-flow. The parse key stores a raw data URL
+      // with its own restore path (resumePendingParse), so both are gated
+      // manually here rather than via consumeResumeSnapshot.
+      if (!(await isAuthed())) {
+        sessionStorage.removeItem(PENDING_START_KEY);
+        sessionStorage.removeItem(PENDING_PARSE_KEY);
+        return;
+      }
+      if (cancelled) return;
+      await resumePendingParse();
+      resumePendingStart();
+    })();
+    return () => {
+      cancelled = true;
+    };
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, []);
+
+  // Run the resumed start() only after restored form state has committed.
+  useEffect(() => {
+    if (!autoStartPending) return;
+    setAutoStartPending(false);
+    void start();
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, [autoStartPending]);
+
+  const start = async () => {
+    if (AUTH_ENABLED) {
+      if (!(await isAuthed())) {
+        // Don't snapshot here — persistPendingStart fires via
+        // AuthModal.onBeforeOAuth at redirect time, so the form is captured
+        // for BOTH OAuth and (harmlessly) OTP paths at the single source of
+        // truth. OTP's onSuccess resumes in-place without needing the snapshot.
+        setPendingAction("start");
+        setAuthModalOpen(true);
+        return;
+      }
+    }
+
    // 空输入时落回 Typewriter 当前闪动的示例——用户看到啥就玩啥，
    // 不会再出现「点开始 → 剧情和占位文字毫无关系」的体验断层。
    const userPrompt =
@@ -1467,17 +1626,22 @@ export default function HomePage() {
    router.push(`/play?card=${imgPrefix}${idx}`);
  };

+  // overflow-x-hidden 在 wrapper 层兜底：body 的 overflow-x-hidden 在移动端会因
+  // 规范的 overflow 传播而失效，wrapper 是最靠近溢出源（右下操作集群）的块级剪裁点。
  return (
-    <div className="min-h-screen flex flex-col">
+    <div className="min-h-screen flex flex-col overflow-x-hidden">
      {/* ================== HEADER ================== */}
      <header className="mx-auto w-full max-w-[1640px] px-6 md:px-16 pt-7 md:pt-10 flex items-center justify-between">
        <span className="font-serif text-2xl md:text-[34px] leading-none tracking-tight text-clay-900">
          Infi<em className="italic font-light text-ember-500">Plot</em>
        </span>
-        <div className="flex items-center gap-5">
+        <div className="flex items-center gap-4 md:gap-5">
          <button
            type="button"
-            onClick={() => setSettingsOpen(true)}
+            onClick={() => {
+              setSettingsTab("general");
+              setSettingsOpen(true);
+            }}
            aria-label="设置"
            title="设置"
            className="text-base text-clay-500 hover:text-ember-500 transition-colors"
@@ -1489,7 +1653,7 @@ export default function HomePage() {
            target="_blank"
            rel="noopener noreferrer"
            aria-label="GitHub"
-            className="text-lg text-clay-500 hover:text-ember-500 transition-colors"
+            className="inline-flex text-lg text-clay-500 hover:text-ember-500 transition-colors"
          >
            <i className="fa-brands fa-github" />
          </a>
@@ -1498,15 +1662,16 @@ export default function HomePage() {
            target="_blank"
            rel="noopener noreferrer"
            aria-label="X / Twitter"
-            className="text-base text-clay-500 hover:text-ember-500 transition-colors"
+            className="inline-flex text-base text-clay-500 hover:text-ember-500 transition-colors"
          >
            <i className="fa-brands fa-x-twitter" />
          </a>
+          <UserChip onLoginClick={() => setAuthModalOpen(true)} />
        </div>
      </header>

      {/* ================== HERO 控制区（居中，呼应原型布局） ================== */}
-      <section className="px-6 md:px-16 pt-16 md:pt-24 pb-10 md:pb-14">
+      <section className="px-6 md:px-16 pt-12 md:pt-24 pb-10 md:pb-14">
        <div className="mx-auto max-w-[1100px] text-center">
          <h1 className="font-serif font-light text-[32px] md:text-[56px] leading-[1.12] tracking-tight text-clay-900">
            今天想体验什么故事？
@@ -1534,10 +1699,10 @@ export default function HomePage() {
                rows={1}
                placeholder=" "
                spellCheck={false}
-                className="block w-full resize-none overflow-hidden border-b border-clay-900/25 bg-transparent py-3 md:py-4 pr-28 font-serif text-lg md:text-2xl lining-nums text-clay-900 outline-none transition-colors focus:border-ember-500"
+                className="block w-full resize-none overflow-hidden border-b border-clay-900/25 bg-transparent py-3 md:py-4 pr-36 font-serif text-lg md:text-2xl lining-nums text-clay-900 outline-none transition-colors focus:border-ember-500"
              />
              {!prompt && (
-                <div className="pointer-events-none absolute left-0 right-0 top-0 overflow-hidden whitespace-nowrap py-3 md:py-4 pr-28 font-serif text-lg md:text-2xl text-clay-400">
+                <div className="pointer-events-none absolute left-0 right-0 top-0 overflow-hidden whitespace-nowrap py-3 md:py-4 pr-36 font-serif text-lg md:text-2xl text-clay-400">
                  <Typewriter
                    phrase={phrases[phraseIdx] ?? ""}
                    onCycle={() =>
@@ -1546,13 +1711,6 @@ export default function HomePage() {
                  />
                </div>
              )}
-              <button
-                type="submit"
-                className="absolute right-0 bottom-2 md:bottom-3 inline-flex items-center gap-2 rounded-sm bg-clay-900 px-5 py-2 md:py-2.5 font-sans text-sm md:text-[15px] text-cream-50 transition-colors hover:bg-ember-500"
-              >
-                开始
-                <i className="fa-solid fa-arrow-right text-xs" />
-              </button>
              <input
                ref={storyImportRef}
                type="file"
@@ -1560,16 +1718,27 @@ export default function HomePage() {
                className="hidden"
                onChange={(e) => void handleStoryImport(e.target.files?.[0])}
              />
-              <button
-                type="button"
-                onClick={() => storyImportRef.current?.click()}
-                className="group absolute right-[-2.25rem] bottom-2 md:bottom-3 inline-flex items-center justify-center rounded-sm border border-clay-900/20 px-2 py-2 md:py-2.5 text-clay-400 transition-colors hover:border-ember-500 hover:text-ember-500"
-              >
-                <i className="fa-solid fa-file-import text-sm" />
-                <span className="pointer-events-none absolute -bottom-8 left-1/2 -translate-x-1/2 whitespace-nowrap rounded bg-clay-900 px-2 py-1 font-sans text-[11px] text-cream-50 opacity-0 transition-opacity group-hover:opacity-100">
-                  载入剧情
-                </span>
-              </button>
+              {/* 右下操作集群：载入剧情 + 开始，统一锚定 right-0，杜绝 right-[-...]
+                  负偏移导致的移动端横向溢出。 */}
+              <div className="absolute right-0 bottom-2 md:bottom-3 flex items-center gap-2">
+                <button
+                  type="button"
+                  onClick={() => storyImportRef.current?.click()}
+                  className="group relative inline-flex items-center justify-center rounded-sm border border-clay-900/15 bg-cream-50/70 backdrop-blur-sm px-2 py-2 md:py-2.5 text-clay-400 transition-colors hover:border-ember-500 hover:bg-cream-50/90 hover:text-ember-500"
+                >
+                  <i className="fa-solid fa-file-import text-sm" />
+                  <span className="pointer-events-none absolute -bottom-8 left-1/2 -translate-x-1/2 whitespace-nowrap rounded bg-clay-900 px-2 py-1 font-sans text-[11px] text-cream-50 opacity-0 transition-opacity group-hover:opacity-100">
+                    载入剧情
+                  </span>
+                </button>
+                <button
+                  type="submit"
+                  className="inline-flex items-center gap-2 rounded-sm bg-clay-900 px-5 py-2 md:py-2.5 font-sans text-sm md:text-[15px] text-cream-50 transition-colors hover:bg-ember-500"
+                >
+                  开始
+                  <i className="fa-solid fa-arrow-right text-xs" />
+                </button>
+              </div>
            </div>
            {storyImportError && (
              <p className="mt-2 text-right text-xs leading-relaxed text-ember-500">
@@ -1608,14 +1777,16 @@ export default function HomePage() {
            ))}
          </div>

+
+
          {/* 使用提示：可被用户永久关闭（localStorage:infiplot:hintClosed） */}
          {!hintClosed && (
-            <div className="relative mx-auto mt-10 md:mt-12 max-w-[640px] rounded-sm border border-clay-900/10 bg-cream-100/50 px-8 py-3.5">
+            <div className="relative mx-auto mt-10 md:mt-12 max-w-[640px] rounded-sm border border-clay-900/10 bg-cream-100/50 px-5 md:px-8 py-3.5">
              <p className="font-serif text-[13px] md:text-sm leading-relaxed text-clay-500">
-                输入你的想象、配置风格，点击「开始」即可游玩；也可以从下方的精选故事集，挑一篇快速体验{" "}
+                输入想法、配置风格，点击「开始」即可游玩{AUTH_ENABLED && "（测试期间，登录即可免费畅玩）"}；也可以从下方精选故事集挑一篇快速体验{" "}
                <em className="not-italic text-ember-500">InfiPlot</em>。
-                点击「<span className="text-ember-500">设置</span>」可以配置你的名字和配音
-                API Key，让角色以你的名字称呼你，配音体验也更稳定。
+                点击「<span className="inline-flex items-center gap-1 text-ember-500"><i className="fa-solid fa-gear text-[10px]" />设置</span>」还能填入你的名字，以及你自己的文本、绘图、识图模型和配音
+                Key——全部只存在本地浏览器，体验更稳定。
              </p>
              <button
                type="button"
@@ -1671,7 +1842,7 @@ export default function HomePage() {
          <div>
            <p className="text-[10px] smallcaps text-clay-500 mb-3">团 队</p>
            <p className="font-serif italic text-clay-700 text-base leading-relaxed">
-              我们来自清华大学、兰州大学、西安交通大学等高校，希望探索多模态模型在「直接生成图片、视频」这类 <span className="not-italic">one-shot</span> 能力之外，更多的可能性。本项目目前仍处于早期阶段，我们还在招募成员，如果你也感兴趣，欢迎联系我们，期待你的加入。
+              我们来自清华大学、兰州大学等高校，希望探索多模态模型在「直接生成图片、视频」这类 <span className="not-italic">oneshot</span> 能力之外，更多的可能性。本项目目前仍处于早期阶段，我们还在招募成员，如果你也感兴趣，欢迎联系我们，期待你的加入。
            </p>
          </div>

@@ -1713,7 +1884,7 @@ export default function HomePage() {
            <p className="text-[10px] smallcaps text-clay-500 mb-3">内 测 用 户 群</p>
            <img
              src="/qq-group.webp"
-              alt="InfiPlot 内测交流群 QQ 群二维码（群号 575404333）"
+              alt="InfiPlot 公测交流群 QQ 群二维码（群号 575404333）"
              width={760}
              height={760}
              loading="lazy"
@@ -1728,9 +1899,9 @@ export default function HomePage() {

        <div className="hairline-full w-full mt-14 md:mt-20 mb-12 md:mb-16" />
        <p className="mx-auto max-w-3xl text-center font-sans text-xs md:text-[13px] leading-[1.85] text-clay-500">
-          内测期间本产品可免费使用，但稳定性可能会随并发用户数量而有波动。寻找算力赞助商ing，欢迎联系^-^
+          公测期间本产品可免费使用，但稳定性可能会随并发用户数量而有波动。
          <br />
-          目前，内测期间生成的内容不会被保存，如有需要，请通过录屏或截图等方式保存游玩体验，并记录下生成故事时的提示词与风格选项等。
+          公测期间生成的内容不会在服务器上保存。如需留存，请在游玩结束后使用导出图集或分享剧情功能保存您的游玩体验。
          <br />
          AI 生成的内容不代表本团队立场。
          {analyticsOn && (
@@ -1753,8 +1924,13 @@ export default function HomePage() {

      <footer className="mx-auto w-full max-w-[1640px] px-6 md:px-16 pb-10 mt-auto">
        <div className="hairline-full w-full mb-5" />
-        <div className="flex flex-col items-center text-[10px] smallcaps text-clay-500">
+        <div className="flex flex-col items-center gap-2 text-[10px] smallcaps text-clay-500">
          <span>© 2026 InfiPlot. All rights reserved.</span>
+          <span className="flex items-center gap-3 normal-case tracking-normal text-[11px]">
+            <a href="/privacy" className="hover:text-ember-500 transition-colors">隐私政策</a>
+            <span className="text-clay-300">·</span>
+            <a href="/terms" className="hover:text-ember-500 transition-colors">服务条款</a>
+          </span>
        </div>
      </footer>

@@ -1771,16 +1947,18 @@ export default function HomePage() {
          setCustomStyleGuide={setCustomStyleGuide}
          customStyleRefImage={customStyleRefImage}
          setCustomStyleRefImage={setCustomStyleRefImage}
+          onRequireAuth={() => setAuthModalOpen(true)}
        />
      )}
      {settingsOpen && (
        <SettingsModal
+          initialTab={settingsTab}
          initialVisionClickEnabled={visionClickEnabled}
          onClose={() => setSettingsOpen(false)}
          onSaved={(settings) => {
-            setTtsConfigured(settings.ttsConfigured);
            setPlayerName(settings.playerName);
            setVisionClickEnabled(settings.visionClickEnabled);
+            setTtsConfigured(settings.ttsConfigured);
            if (settings.ttsConfigured && voiceRow >= 0) {
              const onIdx = OPTS[voiceRow]!.items.indexOf("开启");
              if (onIdx >= 0)
@@ -1789,6 +1967,46 @@ export default function HomePage() {
          }}
        />
      )}
+      {authModalOpen && (
+        <AuthModal
+          onClose={() => {
+            setAuthModalOpen(false);
+            setPendingAction(null);
+            try {
+              sessionStorage.removeItem(PENDING_START_KEY);
+              sessionStorage.removeItem(PENDING_PARSE_KEY);
+            } catch {
+              /* ignore */
+            }
+          }}
+          onSuccess={() => {
+            setAuthModalOpen(false);
+            // Email-OTP stays on the page, so resume inline: parse first (it
+            // reads its own snapshot), then the pending start. OTP never
+            // triggers onBeforeOAuth, so no PENDING_START snapshot was written.
+            void resumePendingParse();
+            if (pendingAction === "start") {
+              setPendingAction(null);
+              try {
+                sessionStorage.removeItem(PENDING_START_KEY);
+              } catch {
+                /* ignore */
+              }
+              start();
+            }
+          }}
+          //
+          // Only snapshot when the user is mid-start: the OAuth redirect also
+          // fires for bare logins (UserChip / StyleModal onRequireAuth), where
+          // the user just wants to sign in — not kick off a game. Guarding on
+          // pendingAction keeps bare logins from auto-starting a session on
+          // return. (start() sets pendingAction="start" right before opening
+          // this modal.)
+          onBeforeOAuth={() => {
+            if (pendingAction === "start") persistPendingStart();
+          }}
+        />
+      )}
    </div>
  );
 }
@@ -0,0 +1,217 @@
+import type { Metadata } from "next";
+import Link from "next/link";
+
+export const metadata: Metadata = { // Next.js `Metadata` export: page title and description for the privacy policy
+  title: "隐私政策 — InfiPlot",
+  description: "InfiPlot 隐私政策：了解我们如何收集、使用和保护您的个人信息。",
+};
+
+export default function PrivacyPage() { // Static privacy-policy page: fixed copy only, no props, state, or data fetching.
+  return (
+    <main className="mx-auto w-full max-w-3xl px-6 md:px-16 py-16 md:py-24">
+      <Link
+        href="/"
+        className="inline-flex items-center gap-2 text-clay-500 hover:text-ember-500 transition-colors text-sm mb-12"
+      >
+        <i className="fa-solid fa-arrow-left text-xs" aria-hidden="true" />
+        <span>返回首页</span>
+      </Link>
+
+      <h1 className="font-serif text-3xl md:text-4xl text-clay-900 mb-4">
+        隐私政策
+      </h1>
+      <p className="text-sm text-clay-500 mb-12">
+        生效日期：2026 年 6 月 14 日 &nbsp;|&nbsp; 最后更新：2026 年 6 月 14 日
+      </p>
+
+      <div className="hairline-full w-full mb-12" />
+
+      <div className="space-y-10 text-clay-800 text-[15px] leading-[1.85]">
+        <section>
+          <h2 className="font-serif text-xl text-clay-900 mb-3">概述</h2>
+          <p>
+            InfiPlot（以下简称"我们"）是一款用 AI
+            实时生成内容的交互式剧情游戏。我们重视您的隐私，并致力于以透明的方式处理您的个人信息。本隐私政策说明了我们在您使用
+            InfiPlot 服务时如何收集、使用、存储和保护您的数据。
+          </p>
+        </section>
+
+        <section>
+          <h2 className="font-serif text-xl text-clay-900 mb-3">
+            我们收集的信息
+          </h2>
+          <p className="mb-3">
+            当您通过第三方账号（Google 或 GitHub）登录时，我们会接收以下信息：
+          </p>
+          <ul className="list-disc pl-6 space-y-1">
+            <li>您的电子邮件地址</li>
+            <li>您的显示名称</li>
+            <li>您的头像图片 URL</li>
+          </ul>
+          <p className="mt-3">
+            当您通过电子邮件验证码登录时，我们仅收集您的电子邮件地址。
+          </p>
+          <p className="mt-3">
+            您在游戏中输入的故事提示词和对话选择会被传输至服务器以供 AI 模型实时处理，但不会在我们的服务器上持久存储。游戏会话结束后，相关数据不会被保留。
+          </p>
+        </section>
+
+        <section>
+          <h2 className="font-serif text-xl text-clay-900 mb-3">
+            我们如何使用您的信息
+          </h2>
+          <p>我们仅将收集到的信息用于以下目的：</p>
+          <ul className="list-disc pl-6 space-y-1 mt-3">
+            <li>
+              <strong>身份验证</strong>
+              ：验证您的身份并维持登录状态。
+            </li>
+            <li>
+              <strong>个性化显示</strong>
+              ：在界面中展示您的用户名和头像。
+            </li>
+            <li>
+              <strong>服务沟通</strong>
+              ：使用您的电子邮件地址向您发送与服务相关的重要通知，例如产品更新、功能变更或运营信息。
+            </li>
+          </ul>
+          <p className="mt-3">
+            我们不会将您的个人信息用于广告投放、用户画像、行为分析或任何其他未在本政策中明确说明的用途。
+          </p>
+        </section>
+
+        <section>
+          <h2 className="font-serif text-xl text-clay-900 mb-3">
+            数据存储与安全
+          </h2>
+          <p>
+            您的账户信息存储在{" "}
+            <a
+              href="https://supabase.com/"
+              target="_blank"
+              rel="noopener noreferrer"
+              className="text-ember-500 hover:text-ember-400 transition-colors underline decoration-clay-900/20 underline-offset-2"
+            >
+              Supabase
+            </a>{" "}
+            提供的托管数据库中。Supabase
+            采用行业标准的安全措施来保护数据，包括传输加密（TLS）和静态加密。
+          </p>
+        </section>
+
+        <section>
+          <h2 className="font-serif text-xl text-clay-900 mb-3">
+            第三方共享
+          </h2>
+          <p>
+            我们不会出售、出租或以其他方式向第三方共享您的个人信息。我们不会将您的数据用于第三方广告或营销目的。
+          </p>
+        </section>
+
+        <section>
+          <h2 className="font-serif text-xl text-clay-900 mb-3">
+            数据保留与删除
+          </h2>
+          <p>
+            我们在您持有有效账户期间保留您的账户信息。您可以随时通过发送邮件至{" "}
+            <a
+              href="mailto:hi@infiplot.com"
+              className="text-ember-500 hover:text-ember-400 transition-colors"
+            >
+              hi@infiplot.com
+            </a>{" "}
+            请求删除您的账户及所有相关数据。我们将在收到请求后的 30
+            个自然日内完成数据删除。
+          </p>
+        </section>
+
+        <section>
+          <h2 className="font-serif text-xl text-clay-900 mb-3">
+            匿名统计分析
+          </h2>
+          <p>
+            本站可能使用开源的{" "}
+            <a
+              href="https://umami.is/"
+              target="_blank"
+              rel="noopener noreferrer"
+              className="text-ember-500 hover:text-ember-400 transition-colors underline decoration-clay-900/20 underline-offset-2"
+            >
+              Umami
+            </a>{" "}
+            进行隐私友好的匿名访问与交互统计。该分析工具不使用
+            Cookie、不收集个人信息、不发送任何您输入的内容、不做跨站追踪。此功能为可选配置，可能不会在所有部署中启用。
+          </p>
+        </section>
+
+        <section>
+          <h2 className="font-serif text-xl text-clay-900 mb-3">
+            Google API 服务用户数据政策
+          </h2>
+          <p>
+            InfiPlot 对通过 Google API
+            获取的用户数据的使用和转移，遵守{" "}
+            <a
+              href="https://developers.google.com/terms/api-services-user-data-policy"
+              target="_blank"
+              rel="noopener noreferrer"
+              className="text-ember-500 hover:text-ember-400 transition-colors underline decoration-clay-900/20 underline-offset-2"
+            >
+              Google API Services User Data Policy
+            </a>
+            ，包括有限使用（Limited Use）要求。具体而言：
+          </p>
+          <ul className="list-disc pl-6 space-y-1 mt-3">
+            <li>我们仅请求提供服务所必需的最小权限范围（电子邮件、个人资料）。</li>
+            <li>我们不会将 Google 用户数据用于广告投放或再营销。</li>
+            <li>我们不会将 Google 用户数据出售给第三方。</li>
+            <li>我们不会将 Google 用户数据用于信用评估或贷款。</li>
+            <li>我们不会将 Google 用户数据用于训练通用 AI/ML 模型。</li>
+          </ul>
+        </section>
+
+        <section>
+          <h2 className="font-serif text-xl text-clay-900 mb-3">
+            儿童隐私
+          </h2>
+          <p>
+            InfiPlot
+            不面向 13 岁以下的儿童。我们不会有意收集 13
+            岁以下儿童的个人信息。如果您认为我们无意中收集了儿童的信息，请联系我们，我们将立即采取措施删除相关数据。
+          </p>
+        </section>
+
+        <section>
+          <h2 className="font-serif text-xl text-clay-900 mb-3">
+            隐私政策的变更
+          </h2>
+          <p>
+            我们可能会不时更新本隐私政策。如有重大变更，我们将通过在网站上发布更新后的政策并修改"最后更新"日期来通知您。继续使用我们的服务即表示您接受更新后的隐私政策。
+          </p>
+        </section>
+
+        <section>
+          <h2 className="font-serif text-xl text-clay-900 mb-3">联系我们</h2>
+          <p>
+            如果您对本隐私政策有任何疑问或希望行使您的数据权利，请通过以下方式联系我们：
+          </p>
+          <p className="mt-3">
+            邮箱：{" "}
+            <a
+              href="mailto:hi@infiplot.com"
+              className="text-ember-500 hover:text-ember-400 transition-colors"
+            >
+              hi@infiplot.com
+            </a>
+          </p>
+        </section>
+      </div>
+
+      <div className="hairline-full w-full mt-16 mb-8" />
+
+      <footer className="text-center text-[10px] smallcaps text-clay-500 pb-10">
+        <span>© 2026 InfiPlot. All rights reserved.</span>
+      </footer>
+    </main>
+  );
+}
@@ -0,0 +1,196 @@
+import type { Metadata } from "next";
+import Link from "next/link";
+
+export const metadata: Metadata = { // Next.js `Metadata` export: page title and description for the terms of service
+  title: "服务条款 — InfiPlot",
+  description: "InfiPlot 服务条款：使用 InfiPlot 服务前请阅读本条款。",
+};
+
+export default function TermsPage() { // Static terms-of-service page: fixed copy only, no props, state, or data fetching.
+  return (
+    <main className="mx-auto w-full max-w-3xl px-6 md:px-16 py-16 md:py-24">
+      <Link
+        href="/"
+        className="inline-flex items-center gap-2 text-clay-500 hover:text-ember-500 transition-colors text-sm mb-12"
+      >
+        <i className="fa-solid fa-arrow-left text-xs" aria-hidden="true" />
+        <span>返回首页</span>
+      </Link>
+
+      <h1 className="font-serif text-3xl md:text-4xl text-clay-900 mb-4">
+        服务条款
+      </h1>
+      <p className="text-sm text-clay-500 mb-12">
+        生效日期：2026 年 6 月 14 日 &nbsp;|&nbsp; 最后更新：2026 年 6 月 14 日
+      </p>
+
+      <div className="hairline-full w-full mb-12" />
+
+      <div className="space-y-10 text-clay-800 text-[15px] leading-[1.85]">
+        <section>
+          <h2 className="font-serif text-xl text-clay-900 mb-3">
+            服务说明
+          </h2>
+          <p>
+            InfiPlot（以下简称"我们"或"本服务"）是一款用 AI
+            实时生成图片、语音与剧情分支的交互式剧情游戏。本服务目前处于公测阶段，功能和可用性可能随时发生变化。
+          </p>
+          <p className="mt-3">
+            使用本服务即表示您同意遵守本服务条款。如果您不同意本条款的任何部分，请停止使用本服务。
+          </p>
+        </section>
+
+        <section>
+          <h2 className="font-serif text-xl text-clay-900 mb-3">
+            账户与登录
+          </h2>
+          <p>
+            您可以通过 Google、GitHub 账户或电子邮件验证码登录本服务。您有责任保管好自己的账户凭证，并对通过您的账户进行的所有活动负责。
+          </p>
+        </section>
+
+        <section>
+          <h2 className="font-serif text-xl text-clay-900 mb-3">
+            用户行为准则
+          </h2>
+          <p>使用本服务时，您同意不会：</p>
+          <ul className="list-disc pl-6 space-y-1 mt-3">
+            <li>利用本服务生成违反法律法规的内容。</li>
+            <li>尝试对服务进行逆向工程、攻击或以非正常方式使用 API。</li>
+            <li>干扰或破坏服务的正常运行，或对服务基础设施造成不合理的负担。</li>
+            <li>冒充他人或虚假陈述您与任何个人或实体的关系。</li>
+          </ul>
+        </section>
+
+        <section>
+          <h2 className="font-serif text-xl text-clay-900 mb-3">
+            AI 生成内容
+          </h2>
+          <p>
+            本服务中的图片、文字、语音等内容均由 AI
+            实时生成。AI 生成的内容不代表本团队的观点或立场。我们无法完全控制 AI
+            生成内容的准确性、适当性或完整性。
+          </p>
+          <p className="mt-3">
+            您理解并同意，AI
+            生成的内容可能存在不准确、不恰当或令人不适的情况。您应自行判断和承担使用
+            AI 生成内容的风险。
+          </p>
+        </section>
+
+        <section>
+          <h2 className="font-serif text-xl text-clay-900 mb-3">
+            知识产权
+          </h2>
+          <p>
+            InfiPlot 的源代码基于{" "}
+            <a
+              href="https://www.gnu.org/licenses/agpl-3.0.html"
+              target="_blank"
+              rel="noopener noreferrer"
+              className="text-ember-500 hover:text-ember-400 transition-colors underline decoration-clay-900/20 underline-offset-2"
+            >
+              AGPL-3.0
+            </a>{" "}
+            许可证开源。
+          </p>
+          <p className="mt-3">
+            通过本服务生成的游戏内容（包括故事文本、图片和语音）由您在游戏会话期间创造性地引导产生。我们不主张对您个人游戏会话中生成的内容拥有所有权。
+          </p>
+        </section>
+
+        <section>
+          <h2 className="font-serif text-xl text-clay-900 mb-3">
+            数据与隐私
+          </h2>
+          <p>
+            公测期间生成的游戏内容不会被持久保存在我们的服务器上。为提供 AI 生成服务，相关内容会在请求处理过程中临时传输和处理，处理完成后不会被保留。有关我们如何处理您的个人信息，请参阅我们的{" "}
+            <Link
+              href="/privacy"
+              className="text-ember-500 hover:text-ember-400 transition-colors underline decoration-clay-900/20 underline-offset-2"
+            >
+              隐私政策
+            </Link>
+            。
+          </p>
+        </section>
+
+        <section>
+          <h2 className="font-serif text-xl text-clay-900 mb-3">
+            服务可用性
+          </h2>
+          <p>
+            本服务目前处于公测阶段，免费提供使用。我们不保证服务的持续可用性、稳定性或性能。服务可能会因维护、升级或不可抗力因素而中断。
+          </p>
+          <p className="mt-3">
+            我们保留随时修改、暂停或终止服务（或其任何部分）的权利，无论是否通知。
+          </p>
+        </section>
+
+        <section>
+          <h2 className="font-serif text-xl text-clay-900 mb-3">
+            免责声明
+          </h2>
+          <p>
+            本服务按"现状"和"可用"的基础提供，不附带任何明示或暗示的保证。在法律允许的最大范围内，我们明确否认所有保证，包括但不限于对适销性、特定用途适用性和非侵权性的暗示保证。
+          </p>
+        </section>
+
+        <section>
+          <h2 className="font-serif text-xl text-clay-900 mb-3">
+            责任限制
+          </h2>
+          <p>
+            在法律允许的最大范围内，InfiPlot
+            团队及其成员在任何情况下均不对因使用或无法使用本服务而产生的任何间接、附带、特殊、后果性或惩罚性损害负责。
+          </p>
+        </section>
+
+        <section>
+          <h2 className="font-serif text-xl text-clay-900 mb-3">
+            账户终止
+          </h2>
+          <p>
+            我们保留在以下情况下暂停或终止您的账户的权利：
+          </p>
+          <ul className="list-disc pl-6 space-y-1 mt-3">
+            <li>您违反了本服务条款。</li>
+            <li>您的行为对服务或其他用户构成风险。</li>
+            <li>法律法规要求我们这样做。</li>
+          </ul>
+        </section>
+
+        <section>
+          <h2 className="font-serif text-xl text-clay-900 mb-3">
+            条款变更
+          </h2>
+          <p>
+            我们可能会不时更新本服务条款。如有重大变更，我们将通过在网站上发布更新后的条款来通知您。继续使用本服务即表示您接受更新后的条款。
+          </p>
+        </section>
+
+        <section>
+          <h2 className="font-serif text-xl text-clay-900 mb-3">联系我们</h2>
+          <p>
+            如果您对本服务条款有任何疑问，请通过以下方式联系我们：
+          </p>
+          <p className="mt-3">
+            邮箱：{" "}
+            <a
+              href="mailto:hi@infiplot.com"
+              className="text-ember-500 hover:text-ember-400 transition-colors"
+            >
+              hi@infiplot.com
+            </a>
+          </p>
+        </section>
+      </div>
+
+      <div className="hairline-full w-full mt-16 mb-8" />
+
+      <footer className="text-center text-[10px] smallcaps text-clay-500 pb-10">
+        <span>© 2026 InfiPlot. All rights reserved.</span>
+      </footer>
+    </main>
+  );
+}
@@ -0,0 +1,257 @@
+"use client";
+
+import { useCallback, useEffect, useState } from "react";
+import { createClient } from "@/lib/supabase/client";
+import { track } from "@/lib/analytics";
+
+type AuthStep = "pick" | "email-input" | "otp-verify"; // modal screens: method picker, email entry, OTP code entry
+
+/**
+ * Sign-in dialog: Google/GitHub OAuth (full-page redirect) or a two-step
+ * email OTP flow. `onSuccess` fires only after OTP verification succeeds.
+ */
+export function AuthModal({
+  onClose,
+  onSuccess,
+  onBeforeOAuth,
+}: {
+  onClose: () => void;
+  onSuccess: () => void;
+  // Fires synchronously before the OAuth full-page redirect (signInWithOAuth
+  // navigates the browser away, unmounting the whole React tree). Hosts that
+  // need to survive the round-trip (e.g. play page carrying in-memory game
+  // state) snapshot into sessionStorage here — sessionStorage.setItem is
+  // synchronous, so it completes before the navigation begins.
+  onBeforeOAuth?: () => void;
+}) {
+  const [step, setStep] = useState<AuthStep>("pick");
+  const [email, setEmail] = useState("");
+  const [otp, setOtp] = useState("");
+  const [loading, setLoading] = useState(false);
+  const [error, setError] = useState("");
+
+  useEffect(() => {
+    function onKeyDown(e: KeyboardEvent) {
+      if (e.key === "Escape") onClose(); // Escape dismisses the dialog
+    }
+    window.addEventListener("keydown", onKeyDown);
+    return () => window.removeEventListener("keydown", onKeyDown);
+  }, [onClose]);
+
+  const handleOAuth = useCallback(
+    async (provider: "google" | "github") => {
+      setLoading(true);
+      setError("");
+      // Snapshot before navigating away: the redirect below unmounts the app,
+      // so host state must be persisted *now*. A failed snapshot (e.g. blocked
+      // sessionStorage) is non-fatal — in-progress state just isn't restored.
+      try {
+        onBeforeOAuth?.();
+      } catch {
+        /* snapshot failure is non-fatal */
+      }
+      const supabase = createClient();
+      const { error: oauthError } = await supabase.auth.signInWithOAuth({
+        provider,
+        options: {
+          redirectTo: `${window.location.origin}/auth/callback?next=${encodeURIComponent(window.location.pathname + window.location.search)}`,
+        },
+      });
+      if (oauthError) {
+        setError(oauthError.message);
+        setLoading(false);
+      }
+    },
+    [onBeforeOAuth],
+  );
+
+  const handleSendOtp = useCallback(async () => {
+    const trimmed = email.trim();
+    if (!trimmed || loading) return; // Enter bypasses the disabled button
+    setLoading(true);
+    setError("");
+    const { error: otpError } = await createClient().auth.signInWithOtp({
+      email: trimmed,
+    });
+    setLoading(false);
+    if (otpError) {
+      setError(otpError.message);
+    } else {
+      setStep("otp-verify");
+    }
+  }, [email, loading]);
+
+  const handleVerifyOtp = useCallback(async () => {
+    const trimmedOtp = otp.trim();
+    if (loading || trimmedOtp.length < 6) return; // same gate as the button
+    setLoading(true);
+    setError("");
+    const { error: verifyError } = await createClient().auth.verifyOtp({
+      email: email.trim(),
+      token: trimmedOtp,
+      type: "email",
+    });
+    setLoading(false);
+    if (verifyError) {
+      setError(verifyError.message);
+    } else {
+      track("login_success", { provider: "email" });
+      onSuccess();
+    }
+  }, [email, otp, loading, onSuccess]);
+
+  return (
+    <div
+      className="fixed inset-0 z-50 flex items-center justify-center px-4"
+      style={{ background: "rgba(0,0,0,0.55)" }}
+      onClick={onClose}
+    >
+      <div
+        className="w-full max-w-sm overflow-hidden"
+        onClick={(e) => e.stopPropagation()}
+        style={{
+          background: "rgba(14, 10, 6, 0.92)",
+          border: "1.5px solid rgba(175, 138, 72, 0.72)",
+          borderRadius: "8px",
+          backdropFilter: "blur(14px)",
+          WebkitBackdropFilter: "blur(14px)",
+          boxShadow:
+            "0 10px 42px rgba(0,0,0,0.62), inset 0 1px 0 rgba(200,165,90,0.12)",
+        }}
+        role="dialog"
+        aria-modal="true"
+        aria-label="登录"
+      >
+        {/* header */}
+        <div className="flex items-center justify-between border-b border-cream-50/10 px-5 py-3.5">
+          <div className="flex items-center gap-2 text-[11px] smallcaps text-cream-50/70">
+            <i className="fa-solid fa-right-to-bracket text-[11px]" aria-hidden="true" />
+            {step === "pick" && "登录以继续"}
+            {step === "email-input" && "邮箱登录"}
+            {step === "otp-verify" && "验证码"}
+          </div>
+          <button
+            type="button"
+            onClick={onClose}
+            className="flex h-7 w-7 items-center justify-center text-cream-50/60 transition-colors hover:text-cream-50"
+            aria-label="关闭"
+          >
+            <i className="fa-solid fa-xmark text-[12px]" aria-hidden="true" />
+          </button>
+        </div>
+
+        <div className="px-5 py-5 space-y-3">
+          {error && (
+            <p className="text-[12px] text-red-400/90 leading-snug">{error}</p>
+          )}
+
+          {step === "pick" && (
+            <>
+              <button
+                type="button"
+                disabled={loading}
+                onClick={() => handleOAuth("google")}
+                className="flex w-full items-center justify-center gap-2.5 rounded-md border border-cream-50/15 bg-cream-50/[0.06] px-4 py-2.5 text-[13px] text-cream-50/90 transition-colors hover:bg-cream-50/[0.12] disabled:opacity-50"
+              >
+                <i className="fa-brands fa-google text-[14px]" aria-hidden="true" />
+                Google 登录
+              </button>
+              <button
+                type="button"
+                disabled={loading}
+                onClick={() => handleOAuth("github")}
+                className="flex w-full items-center justify-center gap-2.5 rounded-md border border-cream-50/15 bg-cream-50/[0.06] px-4 py-2.5 text-[13px] text-cream-50/90 transition-colors hover:bg-cream-50/[0.12] disabled:opacity-50"
+              >
+                <i className="fa-brands fa-github text-[14px]" aria-hidden="true" />
+                GitHub 登录
+              </button>
+              <div className="flex items-center gap-3 py-1">
+                <div className="h-px flex-1 bg-cream-50/10" />
+                <span className="text-[10px] text-cream-50/40">或</span>
+                <div className="h-px flex-1 bg-cream-50/10" />
+              </div>
+              <button
+                type="button"
+                onClick={() => setStep("email-input")}
+                className="flex w-full items-center justify-center gap-2.5 rounded-md border border-cream-50/15 bg-cream-50/[0.06] px-4 py-2.5 text-[13px] text-cream-50/90 transition-colors hover:bg-cream-50/[0.12]"
+              >
+                <i className="fa-solid fa-envelope text-[13px]" aria-hidden="true" />
+                邮箱验证码登录
+              </button>
+            </>
+          )}
+
+          {step === "email-input" && (
+            <>
+              <input
+                type="email"
+                value={email}
+                onChange={(e) => setEmail(e.target.value)}
+                onKeyDown={(e) => e.key === "Enter" && handleSendOtp()}
+                placeholder="your@email.com"
+                autoFocus
+                className="w-full rounded-md border border-cream-50/15 bg-cream-50/[0.06] px-3.5 py-2.5 text-[13px] text-cream-50/90 placeholder:text-cream-50/30 outline-none focus:border-[rgba(175,138,72,0.6)]"
+              />
+              <button
+                type="button"
+                disabled={loading || !email.trim()}
+                onClick={handleSendOtp}
+                className="w-full rounded-md bg-[rgba(175,138,72,0.85)] px-4 py-2.5 text-[13px] font-medium text-cream-50 transition-colors hover:bg-[rgba(175,138,72,1)] disabled:opacity-50"
+              >
+                {loading ? "发送中..." : "发送验证码"}
+              </button>
+              <button
+                type="button"
+                onClick={() => {
+                  setStep("pick");
+                  setError("");
+                }}
+                className="w-full text-center text-[12px] text-cream-50/50 transition-colors hover:text-cream-50/80"
+              >
+                返回
+              </button>
+            </>
+          )}
+
+          {step === "otp-verify" && (
+            <>
+              <p className="text-[12px] text-cream-50/60 leading-snug">
+                验证码已发送至 <span className="text-cream-50/90">{email.trim()}</span>
+              </p>
+              <input
+                type="text"
+                inputMode="numeric"
+                maxLength={6}
+                value={otp}
+                onChange={(e) => setOtp(e.target.value.replace(/\D/g, ""))}
+                onKeyDown={(e) => e.key === "Enter" && handleVerifyOtp()}
+                placeholder="6 位验证码"
+                autoFocus
+                className="w-full rounded-md border border-cream-50/15 bg-cream-50/[0.06] px-3.5 py-2.5 text-center text-[16px] tracking-[0.35em] text-cream-50/90 placeholder:text-cream-50/30 placeholder:tracking-normal outline-none focus:border-[rgba(175,138,72,0.6)]"
+              />
+              <button
+                type="button"
+                disabled={loading || otp.length < 6}
+                onClick={handleVerifyOtp}
+                className="w-full rounded-md bg-[rgba(175,138,72,0.85)] px-4 py-2.5 text-[13px] font-medium text-cream-50 transition-colors hover:bg-[rgba(175,138,72,1)] disabled:opacity-50"
+              >
+                {loading ? "验证中..." : "确认"}
+              </button>
+              <button
+                type="button"
+                onClick={() => {
+                  setStep("email-input");
+                  setOtp("");
+                  setError("");
+                }}
+                className="w-full text-center text-[12px] text-cream-50/50 transition-colors hover:text-cream-50/80"
+              >
+                重新发送
+              </button>
+            </>
+          )}
+        </div>
+      </div>
+    </div>
+  );
+}
@@ -19,9 +19,6 @@ const SHADOW =

 const DEFAULT_CHAR_MS = 28;
 const MIN_CHAR_MS = 30;
-// Voice playback speed multiplier. >1 speeds up the (somewhat slow) MiMo voice
-// while preserving pitch. Typewriter pacing is divided by the same factor.
-const SPEECH_RATE = 1.2;
 // If audio metadata never arrives within this window, give up waiting and
 // let the typewriter run at default speed.
 const AUDIO_WAIT_TIMEOUT_MS = 2500;
@@ -183,6 +180,7 @@ export function PlayCanvas({
  playerName,
  visionClickEnabled = true,
  onOpenSettings,
+  onImageReady,
  aboveCanvas,
  aboveCanvasLeft,
  belowCanvas,
@@ -207,6 +205,7 @@ export function PlayCanvas({
  // 选择节点点击背景是否触发识图。关闭时背景点击保持静默，用户只能点选项。
  visionClickEnabled?: boolean;
  onOpenSettings?: () => void;
+  onImageReady?: () => void;
  // 渲染在图片正上方、右对齐的 slot（画面外、紧贴右上角）。
  aboveCanvas?: ReactNode;
  // 渲染在图片正上方、左对齐的 slot（画面外、紧贴左上角），与 aboveCanvas 水平镜像。
@@ -259,7 +258,6 @@ export function PlayCanvas({
    const el = audioRef.current;
    if (!el) return;
    el.muted = muted;
-    el.playbackRate = SPEECH_RATE;
    if (!muted && audioSrc && el.paused) {
      el.play().catch(() => {
        // autoplay blocked — silent until next interaction
@@ -270,11 +268,7 @@ export function PlayCanvas({
  function handleAudioMetadata() {
    const el = audioRef.current;
    if (!el) return;
-    el.playbackRate = SPEECH_RATE;
-    // Effective playback time is shorter once sped up — keep the typewriter in sync.
-    const ms = Number.isFinite(el.duration)
-      ? (el.duration * 1000) / SPEECH_RATE
-      : 0;
+    const ms = Number.isFinite(el.duration) ? el.duration * 1000 : 0;
    setAudioDurationMs(ms > 0 ? ms : 0);
    if (!muted) {
      el.play().catch(() => {
@@ -349,10 +343,20 @@ export function PlayCanvas({
  // the 9:16 image matches the exact device/window — no letterbox. Landscape
  // keeps the prior contain-style sizing so the full 16:9 frame stays visible.
  const sizeStyle: React.CSSProperties = portrait
-    ? { width: "100vw", height: "100dvh", objectFit: "cover" }
+    ? { width: "100%", height: "100%", objectFit: "cover" }
    : fullViewport
-      ? { maxWidth: "100vw", maxHeight: "100dvh" }
-      : { maxWidth: "96vw", maxHeight: "calc(100dvh - 200px)" };
+      ? { width: "100%", height: "100%", objectFit: "contain" }
+      : { width: "100%", height: "100%" };
+
+  const canvasStyle: React.CSSProperties = portrait
+    ? { width: "100vw", height: "100dvh" }
+    : {
+        width: fullViewport
+          ? "min(100vw, calc(100dvh * 16 / 9))"
+          : "min(96vw, calc((100dvh - 200px) * 16 / 9))",
+        aspectRatio: "16 / 9",
+        maxHeight: fullViewport ? "100dvh" : "calc(100dvh - 200px)",
+      };

  const placeholderStyle: React.CSSProperties = portrait
    ? { width: "100vw", height: "100dvh" }
@@ -382,17 +386,15 @@ export function PlayCanvas({

      {imageUrl ? (
        <div
-          className="relative inline-block"
-          style={{ boxShadow: fullViewport ? "none" : SHADOW }}
+          className="relative"
+          style={{ ...canvasStyle, boxShadow: fullViewport ? "none" : SHADOW }}
        >
-          {/* Background image — Runware CDN URL or data URI (mock mode).
-              The width/height attributes give the browser the intrinsic aspect
-              ratio (1792:1024 landscape / 1024:1792 portrait) so that, while the
-              bytes are still arriving from the CDN, the <img> reserves the right
-              box instead of collapsing to a one-pixel sliver — fixes the
-              "等很久 → 一根线 → 突然出图" jank. Landscape uses w-auto/h-auto +
-              maxWidth/maxHeight (contain); portrait switches sizeStyle to
-              100vw×100dvh with object-fit:cover (full-bleed, no letterbox). */}
+          {/* The stable wrapper owns the frame size. Keeping overlay geometry
+              independent of <img> decode/source swaps prevents controls from
+              jumping when a newly generated image is committed. The key uses
+              a short high-entropy slice (matching the <audio> element) so data
+              URIs from the gpt-image/mock paths — which can be several MB —
+              don't become React's reconciliation key. */}
          <img
            key={imageUrl.slice(-48)}
            ref={imgRef}
@@ -402,7 +404,14 @@ export function PlayCanvas({
            alt="Generated scene"
            onClick={handleImageClick}
            draggable={false}
-            className={`block ${portrait ? "" : "w-auto h-auto"} select-none animate-fade-in transition-opacity duration-700 ease-out ${
+            onLoad={() => {
+              if (!onImageReady) return;
+              const el = imgRef.current;
+              if (!el) { onImageReady(); return; }
+              const notify = () => { if (imgRef.current === el) onImageReady(); };
+              el.decode().then(notify, notify);
+            }}
+            className={`block select-none animate-fade-in transition-opacity duration-700 ease-out ${
              imageClickable ? "cursor-pointer" : interactive ? "cursor-default" : "cursor-wait"
            } ${dimmed ? "opacity-40" : "opacity-100"}`}
            style={sizeStyle}
@@ -1,6 +1,12 @@
 "use client";

 import { type ReactNode, useEffect, useState } from "react";
+import type { ProviderProtocol } from "@infiplot/types";
+import {
+  clearStoredModelConfig,
+  readStoredModelConfig,
+  writeStoredModelConfig,
+} from "@/lib/clientModelConfig";
 import {
  clearStoredTtsConfig,
  readStoredTtsConfig,
@@ -44,17 +50,81 @@ export function readStoredVisionClick(): boolean {
  }
 }

+const PROVIDER_OPTIONS: { value: ProviderProtocol | ""; label: string }[] = [ // selectable provider protocols; presumably rendered as a per-model dropdown (usage is outside this chunk)
+  { value: "", label: "自动推断（推荐）" }, // "" = no explicit protocol; label reads "auto-infer (recommended)"
+  { value: "openai_compatible", label: "OpenAI Compatible" },
+  { value: "runware", label: "Runware" },
+];
+
+type ModelGroup = { // form state for one model endpoint (text / image / vision) in SettingsModal
+  key: "text" | "image" | "vision";
+  label: string; // display name of the group
+  icon: string; // icon class list, e.g. "fa-solid fa-pen-nib"
+  baseUrl: string;
+  apiKey: string;
+  model: string;
+  provider: string; // initialised to "" when no provider is stored
+};
+
+type TabKey = "general" | "models"; // identifiers for the SettingsModal tabs
+
 export function SettingsModal({
+  initialTab = "general",
  initialVisionClickEnabled = true,
  onClose,
  onSaved,
  footerNote,
 }: {
+  initialTab?: TabKey;
  initialVisionClickEnabled?: boolean;
  onClose: () => void;
-  onSaved: (settings: { ttsConfigured: boolean; playerName: string; visionClickEnabled: boolean }) => void;
+  onSaved: (settings: {
+    playerName: string;
+    visionClickEnabled: boolean;
+    ttsConfigured: boolean;
+  }) => void;
  footerNote?: ReactNode;
 }) {
+  const [activeTab, setActiveTab] = useState<TabKey>(initialTab);
+
+  // ── General tab state ──
+  const [playerName, setPlayerName] = useState(() => readStoredPlayerName());
+  const [visionClick, setVisionClick] = useState(initialVisionClickEnabled);
+
+  // ── Models tab state ──
+  const initial = readStoredModelConfig();
+  const [groups, setGroups] = useState<ModelGroup[]>([
+    {
+      key: "text",
+      label: "文本模型",
+      icon: "fa-solid fa-pen-nib",
+      baseUrl: initial?.textBaseUrl ?? "",
+      apiKey: initial?.textApiKey ?? "",
+      model: initial?.textModel ?? "",
+      provider: initial?.textProvider ?? "",
+    },
+    {
+      key: "image",
+      label: "绘图模型",
+      icon: "fa-solid fa-palette",
+      baseUrl: initial?.imageBaseUrl ?? "",
+      apiKey: initial?.imageApiKey ?? "",
+      model: initial?.imageModel ?? "",
+      provider: initial?.imageProvider ?? "",
+    },
+    {
+      key: "vision",
+      label: "识图模型",
+      icon: "fa-solid fa-eye",
+      baseUrl: initial?.visionBaseUrl ?? "",
+      apiKey: initial?.visionApiKey ?? "",
+      model: initial?.visionModel ?? "",
+      provider: initial?.visionProvider ?? "",
+    },
+  ]);
+  const [showKeys, setShowKeys] = useState<Record<string, boolean>>({});
+
+  // TTS state
  const [initialTts] = useState(() => readStoredTtsConfig());
  const initialKind = findTtsPreset(initialTts?.presetId)?.kind ?? "payg";
  const [keyType, setKeyType] = useState<"token-plan" | "payg">(initialKind);
@@ -63,61 +133,130 @@ export function SettingsModal({
      ? (initialTts?.presetId ?? TTS_REGION_PRESETS[0]!.id)
      : TTS_REGION_PRESETS[0]!.id,
  );
-  const [apiKey, setApiKey] = useState<string>(initialTts?.apiKey ?? "");
-  const [showKey, setShowKey] = useState(false);
-  const ttsAlreadyConfigured = initialTts != null;
-
-  const [playerName, setPlayerName] = useState(() => readStoredPlayerName());
-  const [visionClick, setVisionClick] = useState(initialVisionClickEnabled);
-
-  const [shown, setShown] = useState(false);
+  const [ttsApiKey, setTtsApiKey] = useState<string>(initialTts?.apiKey ?? "");
+  const [showTtsKey, setShowTtsKey] = useState(false);

  const expectedPrefix = keyType === "payg" ? "sk-" : "tp-";
  const prefixMismatch =
-    apiKey.trim().length > 0 && !apiKey.trim().startsWith(expectedPrefix);
+    ttsApiKey.trim().length > 0 && !ttsApiKey.trim().startsWith(expectedPrefix);

+  // ── Animation ──
+  const [shown, setShown] = useState(false);
  useEffect(() => {
    const id = requestAnimationFrame(() => setShown(true));
    return () => cancelAnimationFrame(id);
  }, []);

+  useEffect(() => { // Close the modal when Escape is pressed anywhere in the window.
+    const handler = (e: KeyboardEvent) => {
+      if (e.key === "Escape") close(); // NOTE(review): deps are [], so this is the first-render `close` and therefore the `onClose` prop captured at mount — confirm a stale callback is acceptable.
+    };
+    window.addEventListener("keydown", handler);
+    return () => window.removeEventListener("keydown", handler); // Detach the listener when the modal unmounts.
+  }, []);
+
  const close = () => {
    setShown(false);
    setTimeout(onClose, 280);
  };

-  const save = () => {
+  // ── General actions ──
+  const saveGeneral = () => {
    const name = playerName.trim();
    writeStoredPlayerName(name);
-
    try {
      localStorage.setItem(VISION_CLICK_STORAGE_KEY, visionClick ? "1" : "0");
    } catch { /* ignore */ }
+  };

-    const key = apiKey.trim();
-    let ttsConfigured = false;
+  const clearGeneral = () => { // Erase the persisted General-tab settings and reset the matching form state.
+    writeStoredPlayerName(""); // An empty name is written rather than calling a dedicated clear helper.
+    try { localStorage.removeItem(VISION_CLICK_STORAGE_KEY); } catch { /* ignore */ } // Best effort: a storage failure must not block clearing.
+    setPlayerName("");
+    setVisionClick(true); // Vision click resets to enabled.
+  };
+
+  const hasGeneralSetting = readStoredPlayerName().length > 0;
+
+  // ── Models actions ──
+  const updateGroup = ( // Immutably set one editable field on the model group whose `key` matches.
+    key: string, // Group identifier to match against ModelGroup.key.
+    field: keyof Omit<ModelGroup, "key" | "label" | "icon">, // Only the user-editable fields; key/label/icon are excluded by the type.
+    value: string, // New value, stored as typed (no trimming here).
+  ) => {
+    setGroups((prev) =>
+      prev.map((g) => (g.key === key ? { ...g, [field]: value } : g)), // Non-matching groups keep their object identity.
+    );
+  };
+
+  const saveModels = () => { // Persist the three model groups and the TTS key from the current form state.
+    const [text, image, vision] = groups; // Positional: relies on the text / image / vision order used when `groups` is initialised.
+    if (text && image && vision) { // Skip the model write entirely if any of the three entries is missing.
+      writeStoredModelConfig({
+        textBaseUrl: text.baseUrl,
+        textApiKey: text.apiKey,
+        textModel: text.model,
+        textProvider: (text.provider as ProviderProtocol) || undefined, // An empty provider string is stored as undefined; the cast itself performs no runtime validation.
+        imageBaseUrl: image.baseUrl,
+        imageApiKey: image.apiKey,
+        imageModel: image.model,
+        imageProvider: (image.provider as ProviderProtocol) || undefined,
+        visionBaseUrl: vision.baseUrl,
+        visionApiKey: vision.apiKey,
+        visionModel: vision.model,
+        visionProvider: (vision.provider as ProviderProtocol) || undefined,
+      });
+    }
+
+    const key = ttsApiKey.trim(); // A blank (whitespace-only) key clears the stored TTS config below instead of writing it.
    if (key) {
      const presetId = keyType === "payg" ? PAYG_PRESET_ID : regionId;
      writeStoredTtsConfig({ presetId, apiKey: key });
-      ttsConfigured = true;
    } else {
      clearStoredTtsConfig();
-      ttsConfigured = false;
    }
+  };

-    onSaved({ ttsConfigured, playerName: name, visionClickEnabled: visionClick });
+  const clearModels = () => { // Remove the stored model and TTS configs, then blank the corresponding form fields.
+    clearStoredModelConfig();
+    clearStoredTtsConfig();
+    setGroups((prev) =>
+      prev.map((g) => ({ ...g, baseUrl: "", apiKey: "", model: "", provider: "" })), // key / label / icon are preserved via the spread.
+    );
+    setTtsApiKey(""); // Only the key text is reset here; keyType and regionId keep their current selection.
+  };
+
+  const hasModelSetting = // Truthy when some group has base URL, API key and model all non-blank after trimming, or a TTS config was found at mount.
+    groups.some((g) => g.baseUrl.trim() && g.apiKey.trim() && g.model.trim()) ||
+    initialTts != null; // `initialTts` comes from a lazy useState initializer, so it reflects storage at mount and is not updated by clearModels().
+
+  // ── Global save / clear ──
+  const save = () => { // Persist both tabs, report the saved values to the parent, then start the close animation.
+    saveGeneral();
+    saveModels();
+
+    const ttsConfigured = ttsApiKey.trim().length > 0; // Same non-blank test saveModels uses to decide between writing and clearing the TTS config.
+    onSaved({
+      playerName: playerName.trim(), // Trimmed, matching what saveGeneral wrote to storage.
+      visionClickEnabled: visionClick,
+      ttsConfigured,
+    });
    close();
  };

  const clearAll = () => {
-    clearStoredTtsConfig();
-    writeStoredPlayerName("");
-    try { localStorage.removeItem(VISION_CLICK_STORAGE_KEY); } catch { /* ignore */ }
-    onSaved({ ttsConfigured: false, playerName: "", visionClickEnabled: true });
+    clearGeneral();
+    clearModels();
+    onSaved({ playerName: "", visionClickEnabled: true, ttsConfigured: false });
    close();
  };

-  const hasAnySetting = ttsAlreadyConfigured || readStoredPlayerName().length > 0;
+  const hasAnySetting = hasGeneralSetting || hasModelSetting;
+
+  const tabs: { key: TabKey; label: string; icon: string }[] = [
+    { key: "general", label: "通用", icon: "fa-solid fa-sliders" },
+    { key: "models", label: "模型", icon: "fa-solid fa-microchip" },
+  ];

  return (
    <div
@@ -132,7 +271,7 @@ export function SettingsModal({
      <div
        onMouseDown={(e) => e.stopPropagation()}
        className={
-          "flex w-[560px] max-w-[94vw] max-h-[88vh] flex-col overflow-hidden rounded-sm border border-clay-900/15 bg-cream-50 shadow-2xl shadow-clay-900/25 transition-all duration-300 " +
+          "flex w-[640px] max-w-[96vw] max-h-[90vh] flex-col overflow-hidden rounded-sm border border-clay-900/15 bg-cream-50 shadow-2xl shadow-clay-900/25 transition-all duration-300 " +
          (shown ? "opacity-100 scale-100" : "opacity-0 scale-95")
        }
      >
@@ -156,226 +295,368 @@ export function SettingsModal({
          </button>
        </div>

-        <div className="flex flex-col gap-0 overflow-y-auto">
-          {/* ── Player Name Section ── */}
-          <div className="flex flex-col gap-3 px-6 md:px-8 py-5">
-            <div className="flex items-center gap-2.5">
-              <span className="flex h-7 w-7 items-center justify-center rounded-sm border border-clay-900/10 bg-cream-100 text-clay-400">
-                <i className="fa-solid fa-user-pen text-[11px]" />
-              </span>
-              <span className="font-serif text-base text-clay-900">
-                玩家名字
-              </span>
-            </div>
-            <input
-              value={playerName}
-              onChange={(e) => setPlayerName(e.target.value)}
-              type="text"
-              maxLength={20}
-              autoComplete="off"
-              spellCheck={false}
-              placeholder="不填则使用「你」"
-              className="h-11 w-full rounded-sm border border-clay-900/15 bg-cream-100 px-4 font-sans text-sm text-clay-900 outline-none transition-colors focus:border-ember-500 placeholder:text-clay-400"
-            />
-            <span className="text-[11px] text-clay-400">
-              NPC 会在对话中用这个名字称呼你。不填则默认以「你」称呼。
-            </span>
-          </div>
+        {/* Tab bar */}
+        <div className="flex border-b border-clay-900/8 px-6 md:px-8">
+          {tabs.map((t) => {
+            const active = activeTab === t.key;
+            return (
+              <button
+                key={t.key}
+                type="button"
+                onClick={() => setActiveTab(t.key)}
+                className={
+                  "flex items-center gap-2 px-4 py-3 text-[13px] font-sans transition-colors border-b-2 -mb-px " +
+                  (active
+                    ? "border-ember-500 text-clay-900"
+                    : "border-transparent text-clay-500 hover:text-clay-700")
+                }
+              >
+                <i className={`${t.icon} text-[11px]`} />
+                {t.label}
+              </button>
+            );
+          })}
+        </div>

-          <div className="border-t border-clay-900/8 mx-6 md:mx-8" />
-
-          {/* ── Vision Click Section ── */}
-          <div className="flex flex-col gap-3 px-6 md:px-8 py-5">
-            <div className="flex items-center gap-2.5">
-              <span className="flex h-7 w-7 items-center justify-center rounded-sm border border-clay-900/10 bg-cream-100 text-clay-400">
-                <i className="fa-solid fa-eye text-[11px]" />
-              </span>
-              <span className="font-serif text-base text-clay-900">
-                点击画面识别
-              </span>
-            </div>
-            <div className="grid grid-cols-2 gap-2">
-              {(
-                [
-                  { on: true, label: "开启", icon: "fa-solid fa-wand-magic-sparkles" },
-                  { on: false, label: "关闭", icon: "fa-solid fa-ban" },
-                ] as const
-              ).map((t) => {
-                const active = visionClick === t.on;
-                return (
-                  <button
-                    key={String(t.on)}
-                    type="button"
-                    onClick={() => setVisionClick(t.on)}
-                    className={
-                      "flex items-center justify-center gap-2 rounded-sm border px-3 py-2.5 text-[13px] transition-all " +
-                      (active
-                        ? "border-ember-500 bg-ember-500/5 text-clay-900"
-                        : "border-clay-900/12 text-clay-600 hover:border-clay-900/35 hover:bg-cream-100")
-                    }
-                  >
-                    <i className={t.icon + " text-[11px]"} />
-                    {t.label}
-                  </button>
-                );
-              })}
-            </div>
-            <span className="text-[11px] text-clay-400">
-              开启后，在选择节点点击画面会触发 AI 识图并生成新的剧情分支。
-            </span>
-          </div>
-
-          <div className="border-t border-clay-900/8 mx-6 md:mx-8" />
-
-          {/* ── TTS Key Section ── */}
-          <div className="flex flex-col gap-3 px-6 md:px-8 pt-5 pb-5">
-            <div className="flex items-center gap-2.5">
-              <span className="flex h-7 w-7 items-center justify-center rounded-sm border border-clay-900/10 bg-cream-100 text-clay-400">
-                <i className="fa-solid fa-key text-[11px]" />
-              </span>
-              <span className="font-serif text-base text-clay-900">
-                自带配音 Key
-              </span>
-              <span className="text-[10px] text-clay-400">可选</span>
-            </div>
-            <p className="text-[12px] leading-relaxed text-clay-500">
-              填入你自己的
-              <span className="text-clay-800"> 小米 MiMo API Key</span>
-              ，配音将在浏览器本地合成，Key 只保存在本地、绝不经过服务器。MiMo
-              TTS 目前
-              <span className="text-clay-800">限时免费</span>
-              ，申请即可使用。
-            </p>
-
-            <div className="flex flex-col gap-2">
-              <span className="text-[10px] smallcaps text-clay-500">
-                K e y · 类 型
-              </span>
-              <div className="grid grid-cols-2 gap-2">
-                {(
-                  [
-                    {
-                      kind: "payg",
-                      label: "按量付费 Pay-as-you-go",
-                      sub: "sk- 开头",
-                    },
-                    {
-                      kind: "token-plan",
-                      label: "套餐 Token Plan",
-                      sub: "tp- 开头",
-                    },
-                  ] as const
-                ).map((t) => {
-                  const active = keyType === t.kind;
-                  return (
-                    <button
-                      key={t.kind}
-                      type="button"
-                      onClick={() => setKeyType(t.kind)}
-                      className={
-                        "flex flex-col gap-0.5 rounded-sm border px-3 py-2.5 text-left transition-all " +
-                        (active
-                          ? "border-ember-500 bg-ember-500/5 text-clay-900"
-                          : "border-clay-900/12 text-clay-600 hover:border-clay-900/35 hover:bg-cream-100")
-                      }
-                    >
-                      <span className="text-[13px]">{t.label}</span>
-                      <span className="text-[10px] text-clay-400">
-                        {t.sub}
-                      </span>
-                    </button>
-                  );
-                })}
-              </div>
-            </div>
-
-            {keyType === "token-plan" && (
-              <div className="flex flex-col gap-2">
-                <span className="text-[10px] smallcaps text-clay-500">
-                  区 域 节 点
+        {/* Content */}
+        <div className="thin-scrollbar flex flex-col gap-0 overflow-y-auto flex-1">
+          {activeTab === "general" && (
+            <>
+              {/* ── Player Name Section ── */}
+              <div className="flex flex-col gap-3 px-6 md:px-8 py-5">
+                <div className="flex items-center gap-2.5">
+                  <span className="flex h-7 w-7 items-center justify-center rounded-sm border border-clay-900/10 bg-cream-100 text-clay-400">
+                    <i className="fa-solid fa-user-pen text-[11px]" />
+                  </span>
+                  <span className="font-serif text-base text-clay-900">
+                    玩家名字
+                  </span>
+                </div>
+                <input
+                  value={playerName}
+                  onChange={(e) => setPlayerName(e.target.value)}
+                  type="text"
+                  maxLength={20}
+                  autoComplete="off"
+                  spellCheck={false}
+                  placeholder="不填则使用「你」"
+                  className="h-11 w-full rounded-sm border border-clay-900/15 bg-cream-100 px-4 font-sans text-sm text-clay-900 outline-none transition-colors focus:border-ember-500 placeholder:text-clay-400"
+                />
+                <span className="text-[11px] text-clay-400">
+                  NPC 会在对话中用这个名字称呼你。不填则默认以「你」称呼。
                </span>
-                <div className="grid grid-cols-1 gap-2 sm:grid-cols-3">
-                  {TTS_REGION_PRESETS.map((p) => {
-                    const active = p.id === regionId;
+              </div>
+
+              <div className="border-t border-clay-900/8 mx-6 md:mx-8" />
+
+              {/* ── Vision Click Section ── */}
+              <div className="flex flex-col gap-3 px-6 md:px-8 py-5">
+                <div className="flex items-center gap-2.5">
+                  <span className="flex h-7 w-7 items-center justify-center rounded-sm border border-clay-900/10 bg-cream-100 text-clay-400">
+                    <i className="fa-solid fa-eye text-[11px]" />
+                  </span>
+                  <span className="font-serif text-base text-clay-900">
+                    点击画面识别
+                  </span>
+                </div>
+                <div className="grid grid-cols-2 gap-2">
+                  {(
+                    [
+                      { on: true, label: "开启", icon: "fa-solid fa-wand-magic-sparkles" },
+                      { on: false, label: "关闭", icon: "fa-solid fa-ban" },
+                    ] as const
+                  ).map((t) => {
+                    const active = visionClick === t.on;
                    return (
                      <button
-                        key={p.id}
+                        key={String(t.on)}
                        type="button"
-                        onClick={() => setRegionId(p.id)}
+                        onClick={() => setVisionClick(t.on)}
                        className={
-                          "rounded-sm border px-3 py-2.5 text-left text-[13px] transition-all " +
+                          "flex items-center justify-center gap-2 rounded-sm border px-3 py-2.5 text-[13px] transition-all " +
                          (active
                            ? "border-ember-500 bg-ember-500/5 text-clay-900"
                            : "border-clay-900/12 text-clay-600 hover:border-clay-900/35 hover:bg-cream-100")
                        }
                      >
-                        {p.label}
+                        <i className={t.icon + " text-[11px]"} />
+                        {t.label}
                      </button>
                    );
                  })}
                </div>
                <span className="text-[11px] text-clay-400">
-                  选择与你的套餐订阅地区一致的节点（通常也是延迟最低的那个）。
+                  开启后，在选择节点点击画面会触发 AI 识图并生成新的剧情分支。
                </span>
              </div>
-            )}

-            <div className="flex flex-col gap-2">
-              <span className="text-[10px] smallcaps text-clay-500">
-                A P I · K e y
-              </span>
-              <div className="relative">
-                <input
-                  value={apiKey}
-                  onChange={(e) => setApiKey(e.target.value)}
-                  type={showKey ? "text" : "password"}
-                  autoComplete="off"
-                  spellCheck={false}
-                  placeholder={
-                    keyType === "payg"
-                      ? "粘贴 sk- 开头的按量 Key"
-                      : "粘贴 tp- 开头的套餐 Key"
-                  }
-                  className="h-11 w-full rounded-sm border border-clay-900/15 bg-cream-100 pl-4 pr-11 font-sans text-sm text-clay-900 outline-none transition-colors focus:border-ember-500 placeholder:text-clay-400"
-                />
-                <button
-                  type="button"
-                  onClick={() => setShowKey((v) => !v)}
-                  aria-label={showKey ? "隐藏" : "显示"}
-                  className="absolute right-3 top-1/2 -translate-y-1/2 text-clay-400 hover:text-clay-700 transition-colors"
-                >
-                  <i
-                    className={`fa-solid ${showKey ? "fa-eye-slash" : "fa-eye"} text-sm`}
-                  />
-                </button>
-              </div>
-              {prefixMismatch && (
-                <span className="flex items-start gap-1.5 text-[11px] leading-relaxed text-ember-500">
-                  <i className="fa-solid fa-triangle-exclamation mt-0.5 text-[10px]" />
-                  此 Key 不是 {expectedPrefix} 开头，可能与所选「
-                  {keyType === "payg"
-                    ? "按量付费 Pay-as-you-go"
-                    : "套餐 Token Plan"}
-                  」类型不符，请确认是否填错。
-                </span>
+              {footerNote && (
+                <div className="px-6 md:px-8 pb-5">
+                  <p className="text-[11px] leading-relaxed text-clay-400">
+                    {footerNote}
+                  </p>
+                </div>
              )}
-              <a
-                href={TTS_KEY_DOC_URL}
-                target="_blank"
-                rel="noopener noreferrer"
-                className="inline-flex items-center gap-1.5 text-[11px] text-ember-500 hover:text-ember-400 transition-colors"
-              >
-                <i className="fa-brands fa-github text-[11px]" />
-                如何免费申请 Key？查看图文教程
-              </a>
-            </div>
+            </>
+          )}

-            {footerNote && (
-              <p className="text-[11px] leading-relaxed text-clay-400">
-                {footerNote}
-              </p>
-            )}
-          </div>
+          {activeTab === "models" && (
+            <>
+              <div className="px-6 md:px-8 py-4">
+                <p className="text-[11px] leading-relaxed text-clay-400">
+                  <i className="fa-solid fa-circle-info mr-1.5" />
+                  请确保你的 API 端点支持浏览器跨域请求（CORS）。大多数主流提供商（OpenAI、Anthropic、Gemini、Runware 等）已默认支持。
+                </p>
+              </div>
+
+              <div className="border-t border-clay-900/8 mx-6 md:mx-8" />
+
+              {groups.map((g, idx) => (
+                <div key={g.key}>
+                  {idx > 0 && (
+                    <div className="border-t border-clay-900/8 mx-6 md:mx-8" />
+                  )}
+                  <div className="flex flex-col gap-3 px-6 md:px-8 py-5">
+                    <div className="flex items-center gap-2.5">
+                      <span className="flex h-7 w-7 items-center justify-center rounded-sm border border-clay-900/10 bg-cream-100 text-clay-400">
+                        <i className={`${g.icon} text-[11px]`} />
+                      </span>
+                      <span className="font-serif text-base text-clay-900">
+                        {g.label}
+                      </span>
+                    </div>
+
+                    <div className="flex flex-col gap-2">
+                      <span className="text-[10px] smallcaps text-clay-500">
+                        BASE URL
+                      </span>
+                      <input
+                        value={g.baseUrl}
+                        onChange={(e) => updateGroup(g.key, "baseUrl", e.target.value)}
+                        type="text"
+                        autoComplete="off"
+                        spellCheck={false}
+                        placeholder="https://api.example.com/v1"
+                        className="h-10 w-full rounded-sm border border-clay-900/15 bg-cream-100 px-4 font-sans text-sm text-clay-900 outline-none transition-colors focus:border-ember-500 placeholder:text-clay-400"
+                      />
+                    </div>
+
+                    <div className="flex flex-col gap-2">
+                      <span className="text-[10px] smallcaps text-clay-500">
+                        API Key
+                      </span>
+                      <div className="relative">
+                        <input
+                          value={g.apiKey}
+                          onChange={(e) => updateGroup(g.key, "apiKey", e.target.value)}
+                          type={showKeys[g.key] ? "text" : "password"}
+                          autoComplete="off"
+                          spellCheck={false}
+                          placeholder="sk-..."
+                          className="h-10 w-full rounded-sm border border-clay-900/15 bg-cream-100 pl-4 pr-11 font-sans text-sm text-clay-900 outline-none transition-colors focus:border-ember-500 placeholder:text-clay-400"
+                        />
+                        <button
+                          type="button"
+                          onClick={() =>
+                            setShowKeys((prev) => ({
+                              ...prev,
+                              [g.key]: !prev[g.key],
+                            }))
+                          }
+                          aria-label={showKeys[g.key] ? "隐藏" : "显示"}
+                          className="absolute right-3 top-1/2 -translate-y-1/2 text-clay-400 hover:text-clay-700 transition-colors"
+                        >
+                          <i
+                            className={`fa-solid ${showKeys[g.key] ? "fa-eye-slash" : "fa-eye"} text-sm`}
+                          />
+                        </button>
+                      </div>
+                    </div>
+
+                    <div className="flex flex-col gap-2">
+                      <span className="text-[10px] smallcaps text-clay-500">
+                        Model
+                      </span>
+                      <input
+                        value={g.model}
+                        onChange={(e) => updateGroup(g.key, "model", e.target.value)}
+                        type="text"
+                        autoComplete="off"
+                        spellCheck={false}
+                        placeholder="gpt-4o / claude-3-5-sonnet / flux-1-dev ..."
+                        className="h-10 w-full rounded-sm border border-clay-900/15 bg-cream-100 px-4 font-sans text-sm text-clay-900 outline-none transition-colors focus:border-ember-500 placeholder:text-clay-400"
+                      />
+                    </div>
+
+                    <div className="flex flex-col gap-2">
+                      <span className="text-[10px] smallcaps text-clay-500">
+                        Provider（可选）
+                      </span>
+                      <select
+                        value={g.provider}
+                        onChange={(e) => updateGroup(g.key, "provider", e.target.value)}
+                        className="h-10 w-full rounded-sm border border-clay-900/15 bg-cream-100 px-4 font-sans text-sm text-clay-900 outline-none transition-colors focus:border-ember-500"
+                      >
+                        {PROVIDER_OPTIONS.map((opt) => (
+                          <option key={opt.value || "auto"} value={opt.value}>
+                            {opt.label}
+                          </option>
+                        ))}
+                      </select>
+                      <span className="text-[11px] text-clay-400">
+                        留空时系统会根据 Base URL 自动推断协议。
+                      </span>
+                    </div>
+                  </div>
+                </div>
+              ))}
+
+              <div className="border-t border-clay-900/8 mx-6 md:mx-8" />
+
+              {/* ── TTS Key Section ── */}
+              <div className="flex flex-col gap-3 px-6 md:px-8 pt-5 pb-5">
+                <div className="flex items-center gap-2.5">
+                  <span className="flex h-7 w-7 items-center justify-center rounded-sm border border-clay-900/10 bg-cream-100 text-clay-400">
+                    <i className="fa-solid fa-volume-high text-[11px]" />
+                  </span>
+                  <span className="font-serif text-base text-clay-900">
+                    配音模型
+                  </span>
+                </div>
+                <p className="text-[12px] leading-relaxed text-clay-500">
+                  填入你自己的
+                  <span className="text-clay-800"> 小米 MiMo API Key</span>
+                  ，配音将在浏览器本地合成，Key 只保存在本地、绝不经过服务器。MiMo
+                  TTS 目前
+                  <span className="text-clay-800">限时免费</span>
+                  ，申请即可使用。
+                </p>
+
+                <div className="flex flex-col gap-2">
+                  <span className="text-[10px] smallcaps text-clay-500">
+                    Key 类型
+                  </span>
+                  <div className="grid grid-cols-2 gap-2">
+                    {(
+                      [
+                        {
+                          kind: "payg",
+                          label: "按量付费 Pay-as-you-go",
+                          sub: "sk- 开头",
+                        },
+                        {
+                          kind: "token-plan",
+                          label: "套餐 Token Plan",
+                          sub: "tp- 开头",
+                        },
+                      ] as const
+                    ).map((t) => {
+                      const active = keyType === t.kind;
+                      return (
+                        <button
+                          key={t.kind}
+                          type="button"
+                          onClick={() => setKeyType(t.kind)}
+                          className={
+                            "flex flex-col gap-0.5 rounded-sm border px-3 py-2.5 text-left transition-all " +
+                            (active
+                              ? "border-ember-500 bg-ember-500/5 text-clay-900"
+                              : "border-clay-900/12 text-clay-600 hover:border-clay-900/35 hover:bg-cream-100")
+                          }
+                        >
+                          <span className="text-[13px]">{t.label}</span>
+                          <span className="text-[10px] text-clay-400">
+                            {t.sub}
+                          </span>
+                        </button>
+                      );
+                    })}
+                  </div>
+                </div>
+
+                {keyType === "token-plan" && (
+                  <div className="flex flex-col gap-2">
+                    <span className="text-[10px] smallcaps text-clay-500">
+                      区域节点
+                    </span>
+                    <div className="grid grid-cols-1 gap-2 sm:grid-cols-3">
+                      {TTS_REGION_PRESETS.map((p) => {
+                        const active = p.id === regionId;
+                        return (
+                          <button
+                            key={p.id}
+                            type="button"
+                            onClick={() => setRegionId(p.id)}
+                            className={
+                              "rounded-sm border px-3 py-2.5 text-left text-[13px] transition-all " +
+                              (active
+                                ? "border-ember-500 bg-ember-500/5 text-clay-900"
+                                : "border-clay-900/12 text-clay-600 hover:border-clay-900/35 hover:bg-cream-100")
+                            }
+                          >
+                            {p.label}
+                          </button>
+                        );
+                      })}
+                    </div>
+                    <span className="text-[11px] text-clay-400">
+                      选择与你的套餐订阅地区一致的节点（通常也是延迟最低的那个）。
+                    </span>
+                  </div>
+                )}
+
+                <div className="flex flex-col gap-2">
+                  <span className="text-[10px] smallcaps text-clay-500">
+                    API Key
+                  </span>
+                  <div className="relative">
+                    <input
+                      value={ttsApiKey}
+                      onChange={(e) => setTtsApiKey(e.target.value)}
+                      type={showTtsKey ? "text" : "password"}
+                      autoComplete="off"
+                      spellCheck={false}
+                      placeholder={
+                        keyType === "payg"
+                          ? "粘贴 sk- 开头的按量 Key"
+                          : "粘贴 tp- 开头的套餐 Key"
+                      }
+                      className="h-11 w-full rounded-sm border border-clay-900/15 bg-cream-100 pl-4 pr-11 font-sans text-sm text-clay-900 outline-none transition-colors focus:border-ember-500 placeholder:text-clay-400"
+                    />
+                    <button
+                      type="button"
+                      onClick={() => setShowTtsKey((v) => !v)}
+                      aria-label={showTtsKey ? "隐藏" : "显示"}
+                      className="absolute right-3 top-1/2 -translate-y-1/2 text-clay-400 hover:text-clay-700 transition-colors"
+                    >
+                      <i
+                        className={`fa-solid ${showTtsKey ? "fa-eye-slash" : "fa-eye"} text-sm`}
+                      />
+                    </button>
+                  </div>
+                  {prefixMismatch && (
+                    <span className="flex items-start gap-1.5 text-[11px] leading-relaxed text-ember-500">
+                      <i className="fa-solid fa-triangle-exclamation mt-0.5 text-[10px]" />
+                      此 Key 不是 {expectedPrefix} 开头，可能与所选「
+                      {keyType === "payg"
+                        ? "按量付费 Pay-as-you-go"
+                        : "套餐 Token Plan"}
+                      」类型不符，请确认是否填错。
+                    </span>
+                  )}
+                  <a
+                    href={TTS_KEY_DOC_URL}
+                    target="_blank"
+                    rel="noopener noreferrer"
+                    className="inline-flex items-center gap-1.5 text-[11px] text-ember-500 hover:text-ember-400 transition-colors"
+                  >
+                    <i className="fa-brands fa-github text-[11px]" />
+                    如何免费申请 Key？查看图文教程
+                  </a>
+                </div>
+              </div>
+            </>
+          )}
        </div>

        {/* Footer */}
@@ -0,0 +1,106 @@
+"use client";
+
+import { useCallback, useEffect, useState } from "react";
+import { AUTH_ENABLED } from "@/lib/supabase/config";
+import { createClient } from "@/lib/supabase/client";
+import type { AuthChangeEvent, Session, User } from "@supabase/supabase-js";
+
+/**
+ * Header chip reflecting the Supabase auth state: a "登录" button for
+ * signed-out visitors, or an avatar button with a sign-out menu once a user is
+ * present. Renders nothing when AUTH_ENABLED is false.
+ *
+ * @param onLoginClick Invoked when the signed-out button is clicked.
+ */
+export function UserChip({
+  onLoginClick,
+}: {
+  onLoginClick: () => void;
+}) {
+  const [user, setUser] = useState<User | null>(null);
+  const [menuOpen, setMenuOpen] = useState(false);
+
+  useEffect(() => {
+    if (!AUTH_ENABLED) return;
+    // Guards the one-shot getUser() lookup: once this effect is torn down, a
+    // late response must not write state any more.
+    let active = true;
+    const supabase = createClient();
+    supabase.auth.getUser().then(({ data }: { data: { user: User | null } }) => {
+      if (active) setUser(data.user);
+    });
+    const {
+      data: { subscription },
+    } = supabase.auth.onAuthStateChange((_event: AuthChangeEvent, session: Session | null) => {
+      setUser(session?.user ?? null);
+    });
+    return () => {
+      active = false;
+      subscription.unsubscribe();
+    };
+  }, []);
+
+  // Signs out, then clears local state immediately instead of waiting for the
+  // auth-state subscription to report the change.
+  const handleLogout = useCallback(async () => {
+    const supabase = createClient();
+    await supabase.auth.signOut();
+    setUser(null);
+    setMenuOpen(false);
+  }, []);
+
+  if (!AUTH_ENABLED) return null;
+
+  if (!user) {
+    return (
+      <button
+        type="button"
+        onClick={onLoginClick}
+        className="flex items-center gap-1.5 rounded-full border border-cream-50/15 bg-cream-50/[0.06] px-3 py-1.5 text-[11px] text-cream-50/70 transition-colors hover:bg-cream-50/[0.12] hover:text-cream-50/90"
+      >
+        <i className="fa-solid fa-right-to-bracket text-[10px]" />
+        登录
+      </button>
+    );
+  }
+
+  // `??` only skips null/undefined, so a blank full_name (or an empty email
+  // local part) would yield an empty tooltip and avatar initial, and a
+  // non-string full_name would break `charAt`. Accept only non-blank strings
+  // and fall through to the next candidate otherwise.
+  const rawName: unknown = user.user_metadata?.full_name;
+  const fullName = typeof rawName === "string" ? rawName.trim() : "";
+  const label = fullName || user.email?.split("@")[0] || "User";
+  const avatarUrl = user.user_metadata?.avatar_url as string | undefined;
+  const initial = label.charAt(0).toUpperCase();
+
+  return (
+    <div className="relative">
+      <button
+        type="button"
+        onClick={() => setMenuOpen((v) => !v)}
+        className="flex items-center justify-center rounded-full border border-cream-50/15 bg-cream-50/[0.06] p-0.5 text-cream-50/80 transition-colors hover:bg-cream-50/[0.12]"
+        title={label}
+        aria-expanded={menuOpen}
+      >
+        {avatarUrl ? (
+          <img
+            src={avatarUrl}
+            alt=""
+            className="h-4 w-4 rounded-full object-cover"
+            referrerPolicy="no-referrer"
+          />
+        ) : (
+          <span className="flex h-4 w-4 items-center justify-center rounded-full bg-[rgba(175,138,72,0.6)] text-[9px] font-medium text-cream-50">
+            {initial}
+          </span>
+        )}
+      </button>
+      {menuOpen && (
+        <>
+          {/* Full-viewport click catcher: any click outside closes the menu. */}
+          <div
+            className="fixed inset-0 z-40"
+            onClick={() => setMenuOpen(false)}
+          />
+          <div
+            className="absolute right-0 top-full z-50 mt-1 min-w-[120px] overflow-hidden rounded-md"
+            style={{
+              background: "rgba(14, 10, 6, 0.92)",
+              border: "1px solid rgba(175, 138, 72, 0.5)",
+              backdropFilter: "blur(12px)",
+              WebkitBackdropFilter: "blur(12px)",
+            }}
+          >
+            <button
+              type="button"
+              onClick={handleLogout}
+              className="flex w-full items-center gap-2 px-3.5 py-2.5 text-[12px] text-cream-50/70 transition-colors hover:bg-cream-50/[0.08] hover:text-cream-50/90"
+            >
+              <i className="fa-solid fa-right-from-bracket text-[11px]" />
+              退出登录
+            </button>
+          </div>
+        </>
+      )}
+    </div>
+  );
+}
@@ -1,29 +1,29 @@
-import { generateText } from "ai";
-import type { LanguageModelUsage, ModelMessage } from "ai";
+import OpenAI from "openai";
 import type { ProviderConfig } from "@infiplot/types";
-import { createLanguageModel, resolveProtocol } from "./model";
+import { normalizeBaseUrl } from "./normalizeUrl";

 export type ChatMessage = {
  role: "system" | "user" | "assistant";
  content: string;
 };

-// AI SDK 6 unifies cache stats across providers into usage.inputTokenDetails,
-// so a single shape covers Anthropic, Gemini, and OpenAI-compatible providers.
+// Cache observability for the prompt-prefix caching that the Writer stable
+// prefix relies on. The OpenAI usage object reports only cached READS
+// (prompt_tokens_details.cached_tokens) and has no field for cache WRITES
+// (tokens written to the cache on a cold pass), so unlike the old AI SDK
+// path we can show the hit rate but not the create cost. cached_tokens lives
+// directly on the SDK's CompletionUsage type — no cast needed.
 function summarizeSdkUsage(
  tag: string,
-  usage: LanguageModelUsage | undefined,
+  usage: OpenAI.Completions.CompletionUsage | undefined,
 ): string {
  if (!usage) return `[cache] ${tag} no-usage`;
-  const input = usage.inputTokens ?? 0;
-  const output = usage.outputTokens ?? 0;
-  const read = usage.inputTokenDetails?.cacheReadTokens;
-  const write = usage.inputTokenDetails?.cacheWriteTokens;
-  if (typeof read === "number" || typeof write === "number") {
-    const hit = read ?? 0;
-    const create = write ?? 0;
-    const rate = input > 0 ? ((hit / input) * 100).toFixed(1) : "n/a";
-    return `[cache] ${tag} hit=${hit} create=${create} input=${input} rate=${rate}% completion=${output}`;
+  const input = usage.prompt_tokens ?? 0;
+  const output = usage.completion_tokens ?? 0;
+  const cached = usage.prompt_tokens_details?.cached_tokens;
+  if (typeof cached === "number") {
+    const rate = input > 0 ? ((cached / input) * 100).toFixed(1) : "n/a";
+    return `[cache] ${tag} hit=${cached} input=${input} rate=${rate}% completion=${output}`;
  }
  return `[cache] ${tag} input=${input} completion=${output} (provider didn't report cache stats)`;
 }
@@ -36,28 +36,28 @@ export async function chat(
    tag?: string;
  },
 ): Promise<string> {
-  const protocol = resolveProtocol(config);
-  const model = createLanguageModel(config, protocol);
-
-  const system = messages.find((m) => m.role === "system")?.content;
-  const convo: ModelMessage[] = messages
-    .filter((m) => m.role !== "system")
-    .map((m) => ({
-      role: m.role as "user" | "assistant",
-      content: m.content,
-    }));
-
-  const { text, usage } = await generateText({
-    model,
-    system,
-    messages: convo,
-    temperature: opts?.temperature ?? 0.9,
+  const client = new OpenAI({
+    apiKey: config.apiKey,
+    baseURL: normalizeBaseUrl(config.baseUrl, "openai_compatible"),
+    maxRetries: 0,
+    dangerouslyAllowBrowser: true,
  });

-  console.log(summarizeSdkUsage(opts?.tag ?? "chat", usage));
+  const completion = await client.chat.completions.create({
+    model: config.model,
+    messages: messages.map((m) => ({
+      role: m.role as "system" | "user" | "assistant",
+      content: m.content,
+    })),
+    temperature: opts?.temperature ?? 0.9,
+    stream: false,
+  });

-  if (typeof text !== "string" || text.length === 0) {
-    throw new Error(`Chat API (AI SDK ${protocol}) returned no content.`);
+  const text = completion.choices[0]?.message?.content ?? "";
+  console.log(summarizeSdkUsage(opts?.tag ?? "chat", completion.usage ?? undefined));
+
+  if (text.length === 0) {
+    throw new Error(`Chat API returned no content.`);
  }
  return text;
 }
@@ -1,16 +1,35 @@
-type RetryInit = RequestInit & { retries?: number; retryDelayMs?: number };
+type RetryInit = RequestInit & {
+  retries?: number;
+  retryDelayMs?: number;
+  /**
+   * Per-attempt hard deadline. A timed-out attempt counts as a retryable
+   * failure (it consumes retry budget like a 5xx). Unset → no client-side
+   * timeout, preserving the historical behavior.
+   */
+  timeoutMs?: number;
+};

 export async function fetchWithRetry(
  url: string,
  init: RetryInit,
 ): Promise<Response> {
-  const { retries = 2, retryDelayMs = 1500, ...fetchInit } = init;
+  const { retries = 2, retryDelayMs = 1500, timeoutMs, ...fetchInit } = init;
  if (!fetchInit.redirect) fetchInit.redirect = "manual";
+  // Caller-supplied signal (e.g. a hedge loser being cancelled) must abort
+  // immediately and permanently — it is NOT retryable, unlike our own
+  // per-attempt timeout below.
+  const externalSignal = fetchInit.signal ?? undefined;

  let lastError: unknown;
  for (let attempt = 0; attempt <= retries; attempt++) {
+    if (externalSignal?.aborted) throw abortError(externalSignal);
+    const attemptSignal = timeoutMs
+      ? externalSignal
+        ? AbortSignal.any([externalSignal, AbortSignal.timeout(timeoutMs)])
+        : AbortSignal.timeout(timeoutMs)
+      : externalSignal;
    try {
-      const res = await fetch(url, fetchInit);
+      const res = await fetch(url, { ...fetchInit, signal: attemptSignal });
      if (res.ok) return res;
      // Don't retry 4xx (client errors won't fix themselves)
      if (res.status >= 400 && res.status < 500) return res;
@@ -22,9 +41,10 @@ export async function fetchWithRetry(
      return res;
    } catch (err) {
      lastError = err;
-      const isAbort =
-        err instanceof DOMException && err.name === "AbortError";
+      if (externalSignal?.aborted) throw err;
+      const isAbort = err instanceof DOMException && err.name === "AbortError";
      if (isAbort) throw err;
+      // TimeoutError (from AbortSignal.timeout) falls through as retryable.
      if (attempt < retries) {
        await sleep(retryDelayMs * (attempt + 1));
        continue;
@@ -35,6 +55,12 @@ export async function fetchWithRetry(
  throw lastError;
 }

+/**
+ * Picks the value to throw for an already-aborted signal: the signal's own
+ * reason when that is an Error, otherwise a fresh "AbortError" DOMException.
+ */
+function abortError(signal: AbortSignal): unknown {
+  if (signal.reason instanceof Error) {
+    return signal.reason;
+  }
+  return new DOMException("This operation was aborted", "AbortError");
+}
+
 function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
 }
@@ -1,6 +1,4 @@
-import { generateImage as generateImageSdk } from "ai";
-import { createOpenAI } from "@ai-sdk/openai";
-import { createGoogleGenerativeAI } from "@ai-sdk/google";
+import OpenAI, { toFile, type Uploadable } from "openai";
 import type { Orientation, ProviderConfig, ProviderProtocol } from "@infiplot/types";
 import { fetchWithRetry } from "./fetchWithRetry";
 import { normalizeBaseUrl } from "./normalizeUrl";
@@ -48,8 +46,8 @@ export type GenerateImageOptions = {
  /**
   * Reference images (UUIDs, URLs, or base64) to condition generation on —
   * typically character portraits + the prior scene image. Runware caps at 4;
-   * we silently truncate beyond that. On the OpenAI/Gemini AI SDK paths these
-   * map to `prompt.images` (the SDK accepts public URLs or data URLs).
+   * we silently truncate beyond that. On the native OpenAI path these are
+   * fetched/decoded and sent to `images.edit`.
   */
  referenceImages?: string[];
  /** 0–1, FLUX needs ≥ 0.8 to actually have an effect. Runware-only. */
@@ -58,16 +56,25 @@ export type GenerateImageOptions = {
   * Output aspect, locked per session. "portrait" → 9:16 vertical for mobile;
   * default/"landscape" → 16:9 widescreen. Mapped to each provider's nearest
   * supported size: Runware 1024×1792, OpenAI-compatible REST 1024x1792,
-   * native gpt-image 1024x1536, Gemini aspectRatio 9:16.
+   * native gpt-image 1024x1536.
   */
  orientation?: Orientation;
+  /**
+   * Per-attempt hard deadline (ms). A timed-out attempt is retryable.
+   * Unset → no client-side timeout (historical behavior).
+   */
+  timeoutMs?: number;
+  /** Retry-attempt override for this call (default 2). 0 = single attempt. */
+  retries?: number;
+  /** External cancellation, e.g. aborting the losing leg of a hedged race. */
+  signal?: AbortSignal;
 };

 export type GenerateImageResult = {
  /**
   * Image the client can render directly. A Runware CDN URL on the Runware
-   * path; a `data:<mime>;base64,...` URI on the AI SDK paths (OpenAI/Gemini
-   * return raw bytes, not a hosted URL).
+   * path; a `data:<mime>;base64,...` URI on the native OpenAI path when GPT
+   * image models return raw bytes instead of a hosted URL.
   */
  imageUrl: string;
  /**
@@ -117,63 +124,135 @@ export async function generateImage(
  const protocol = resolveImageProtocol(config);
  switch (protocol) {
    case "openai":
-    case "google":
-      return generateImageViaAiSdk(config, prompt, options, protocol);
+      return generateImageOpenAi(config, prompt, options);
    case "runware":
      return generateImageRunware(config, prompt, options);
-    case "anthropic":
-      throw new Error(
-        'IMAGE_PROVIDER "anthropic" does not generate images. Use "openai", "google", "runware", or "openai_compatible".',
-      );
    case "openai_compatible":
    default:
      return generateImageOpenAiCompatible(config, prompt, options);
  }
 }

-// Native OpenAI (gpt-image) / Gemini (Nano Banana) via the Vercel AI SDK.
-// Unlike the fetch path, this supports reference-image editing via
-// `prompt.images`. The SDK returns raw bytes (no hosted URL), so we hand the
-// client a data URI and synthesize a UUID; continuity references reuse the
-// data URI rather than a provider UUID.
-async function generateImageViaAiSdk(
+// Native OpenAI (gpt-image) via the official OpenAI SDK. Unlike the compatible
+// fetch path, this supports reference-image editing through `images.edit`.
+// GPT image models return raw bytes, so we hand the client a data URI and
+// synthesize a UUID; continuity references reuse the data URI rather than a
+// provider UUID.
+async function generateImageOpenAi(
  config: ProviderConfig,
  prompt: string,
-  options: GenerateImageOptions | undefined,
-  protocol: "openai" | "google",
+  options?: GenerateImageOptions,
 ): Promise<GenerateImageResult> {
-  const baseURL = normalizeBaseUrl(config.baseUrl, protocol);
-  const imageModel =
-    protocol === "openai"
-      ? createOpenAI({ apiKey: config.apiKey, baseURL }).image(config.model)
-      : createGoogleGenerativeAI({ apiKey: config.apiKey, baseURL }).image(
-          config.model,
+  const client = new OpenAI({
+    apiKey: config.apiKey,
+    baseURL: normalizeBaseUrl(config.baseUrl, "openai"),
+    maxRetries: 2,
+    dangerouslyAllowBrowser: true,
+  });
+  const refs = (options?.referenceImages ?? []).slice(0, MAX_REFERENCE_IMAGES);
+  const portrait = options?.orientation === "portrait";
+  const size = portrait ? "1024x1536" : "1536x1024";
+  const requestOptions = {
+    signal: options?.signal,
+    timeout: options?.timeoutMs,
+    ...(options?.retries !== undefined ? { maxRetries: options.retries } : {}),
+  };
+
+  const response =
+    refs.length > 0
+      ? await client.images.edit(
+          {
+            model: config.model,
+            prompt,
+            image: await Promise.all(refs.map(referenceImageToUploadable)),
+            n: 1,
+            size,
+          },
+          requestOptions,
+        )
+      : await client.images.generate(
+          {
+            model: config.model,
+            prompt,
+            n: 1,
+            size,
+          },
+          requestOptions,
        );

-  const refs = (options?.referenceImages ?? []).slice(0, MAX_REFERENCE_IMAGES);
-  const promptArg =
-    refs.length > 0 ? { text: prompt, images: refs } : prompt;
+  return imageResponseToResult(response);
+}

-  // Session-locked aspect. gpt-image takes an explicit `size` (portrait /
-  // landscape options are 1024x1536 / 1536x1024); Gemini takes an `aspectRatio`.
-  const portrait = options?.orientation === "portrait";
-  const { image } = await generateImageSdk({
-    model: imageModel,
-    prompt: promptArg,
-    ...(protocol === "openai"
-      ? { size: (portrait ? "1024x1536" : "1536x1024") as `${number}x${number}` }
-      : { aspectRatio: (portrait ? "9:16" : "16:9") as `${number}:${number}` }),
-  });
+async function referenceImageToUploadable(ref: string): Promise<Uploadable> {
+  if (ref.startsWith("data:")) {
+    const response = await fetch(ref);
+    if (!response.ok) {
+      throw new Error(`Failed to read data URL reference image.`);
+    }
+    const mediaType = response.headers.get("content-type") ?? "image/png";
+    return toFile(response, `reference.${extensionFromMediaType(mediaType)}`, {
+      type: mediaType,
+    });
+  }

-  return {
-    imageUrl: `data:${image.mediaType};base64,${image.base64}`,
-    imageUuid: crypto.randomUUID(),
-  };
+  if (/^https?:\/\//i.test(ref)) {
+    const response = await fetch(ref);
+    if (!response.ok) {
+      throw new Error(
+        `Failed to fetch reference image ${ref}: HTTP ${response.status}`,
+      );
+    }
+    const mediaType = response.headers.get("content-type") ?? "image/png";
+    return toFile(response, filenameFromUrl(ref, mediaType), {
+      type: mediaType,
+    });
+  }
+
+  throw new Error(
+    `Native OpenAI image editing requires reference image URLs or data URLs; got "${ref.slice(0, 32)}...".`,
+  );
+}
+
+/**
+ * Converts an Images API response into a GenerateImageResult using the first
+ * entry only. Inline base64 wins and is wrapped in a data URI (format taken
+ * from `output_format`, "png" when absent); otherwise a hosted URL is passed
+ * through. A fresh UUID is synthesized in both cases.
+ *
+ * @throws Error when the first entry carries neither base64 data nor a URL.
+ */
+function imageResponseToResult(
+  response: OpenAI.Images.ImagesResponse,
+): GenerateImageResult {
+  const first = response.data?.[0];
+
+  const encoded = first?.b64_json;
+  if (encoded) {
+    const mime = `image/${response.output_format ?? "png"}`;
+    return {
+      imageUrl: `data:${mime};base64,${encoded}`,
+      imageUuid: crypto.randomUUID(),
+    };
+  }
+
+  const hosted = first?.url;
+  if (hosted) {
+    return { imageUrl: hosted, imageUuid: crypto.randomUUID() };
+  }
+
+  throw new Error(`No image data in OpenAI response.`);
+}
+
+/**
+ * Chooses an upload filename for a reference image: the last path segment of
+ * `url` when it ends in an extension, otherwise `reference.<ext>` with the
+ * extension derived from `mediaType`. An unparseable URL takes the fallback.
+ */
+function filenameFromUrl(url: string, mediaType: string): string {
+  let lastSegment: string | undefined;
+  try {
+    const segments = new URL(url).pathname.split("/").filter(Boolean);
+    lastSegment = segments[segments.length - 1];
+  } catch {
+    // Not a parseable URL — use the media-type fallback below.
+    lastSegment = undefined;
+  }
+  if (lastSegment && /\.[a-z0-9]+$/i.test(lastSegment)) {
+    return lastSegment;
+  }
+  return `reference.${extensionFromMediaType(mediaType)}`;
+}
+
+/**
+ * Maps a Content-Type value to the file extension used for uploaded reference
+ * images: "jpg" for JPEG, "webp" for WebP, and "png" for everything else.
+ * Media type names are case-insensitive, so the value is lowercased before
+ * matching — `IMAGE/JPEG` must not be mislabelled as PNG.
+ */
+function extensionFromMediaType(mediaType: string): string {
+  const normalized = mediaType.toLowerCase();
+  if (normalized.includes("jpeg") || normalized.includes("jpg")) return "jpg";
+  if (normalized.includes("webp")) return "webp";
+  return "png";
 }

 // OpenAI-compatible REST route (GPTGod, DALL-E proxies, etc.). Basic
 // text-to-image only — no reference images on this path; for editing/anchoring
-// set IMAGE_PROVIDER=openai (or google) to take the AI SDK path above.
+// set IMAGE_PROVIDER=openai to take the native OpenAI path above.
 async function generateImageOpenAiCompatible(
  config: ProviderConfig,
  prompt: string,
@@ -198,6 +277,9 @@ async function generateImageOpenAiCompatible(
      // Session-locked aspect (16:9 default, 9:16 portrait for mobile).
      size: options?.orientation === "portrait" ? "1024x1792" : "1792x1024",
    }),
+    retries: options?.retries,
+    timeoutMs: options?.timeoutMs,
+    signal: options?.signal,
  });

  const text = await res.text();
@@ -267,6 +349,9 @@ async function generateImageRunware(
      Authorization: `Bearer ${config.apiKey}`,
    },
    body: JSON.stringify([task]),
+    retries: options?.retries,
+    timeoutMs: options?.timeoutMs,
+    signal: options?.signal,
  });

  const text = await res.text();
@@ -1,23 +0,0 @@
-import { createAnthropic } from "@ai-sdk/anthropic";
-import { createGoogleGenerativeAI } from "@ai-sdk/google";
-import { createOpenAI } from "@ai-sdk/openai";
-import type { ProviderConfig, ProviderProtocol } from "@infiplot/types";
-import { normalizeBaseUrl } from "./normalizeUrl";
-
-export function resolveProtocol(config: ProviderConfig): ProviderProtocol {
-  return config.provider ?? "openai_compatible";
-}
-
-export function createLanguageModel(config: ProviderConfig, protocol: ProviderProtocol) {
-  const baseURL = normalizeBaseUrl(config.baseUrl, protocol);
-  switch (protocol) {
-    case "anthropic":
-      return createAnthropic({ apiKey: config.apiKey, baseURL })(config.model);
-    case "google":
-      return createGoogleGenerativeAI({ apiKey: config.apiKey, baseURL })(config.model);
-    case "openai_compatible":
-    case "openai":
-    default:
-      return createOpenAI({ apiKey: config.apiKey, baseURL }).chat(config.model);
-  }
-}
@@ -31,8 +31,6 @@ const ENDPOINT_SUFFIX =
 const DEFAULT_VERSION_SEGMENT: Record<ProviderProtocol, string | null> = {
  openai_compatible: "v1",
  openai: "v1",
-  anthropic: "v1",
-  google: "v1beta",
  // Runware posts to the bare base URL with no version-pathed sub-resource,
  // so never inject a segment for it.
  runware: null,
@@ -1,7 +1,6 @@
-import { generateText } from "ai";
-import type { ModelMessage } from "ai";
+import OpenAI from "openai";
 import type { ProviderConfig } from "@infiplot/types";
-import { createLanguageModel, resolveProtocol } from "./model";
+import { normalizeBaseUrl } from "./normalizeUrl";

 const VISION_TIMEOUT_MS = 60_000;

@@ -22,34 +21,32 @@ export async function analyzeImageDataUrl(
  imageDataUrl: string,
  prompt: string,
 ): Promise<string> {
-  const protocol = resolveProtocol(config);
-  const model = createLanguageModel(config, protocol);
+  const client = new OpenAI({
+    apiKey: config.apiKey,
+    baseURL: normalizeBaseUrl(config.baseUrl, "openai_compatible"),
+    maxRetries: 0,
+    timeout: VISION_TIMEOUT_MS,
+    dangerouslyAllowBrowser: true,
+  });

-  const messages: ModelMessage[] = [
-    {
-      role: "user",
-      content: [
-        { type: "text", text: prompt },
-        { type: "image", image: imageDataUrl },
-      ],
-    },
-  ];
+  const completion = await client.chat.completions.create({
+    model: config.model,
+    messages: [
+      {
+        role: "user",
+        content: [
+          { type: "text", text: prompt },
+          { type: "image_url", image_url: { url: imageDataUrl } },
+        ],
+      },
+    ],
+    temperature: 0.2,
+    stream: false,
+  });

-  const timeoutCtrl = new AbortController();
-  const timeoutId = setTimeout(() => timeoutCtrl.abort(), VISION_TIMEOUT_MS);
-  try {
-    const { text } = await generateText({
-      model,
-      messages,
-      temperature: 0.2,
-      maxRetries: 0,
-      abortSignal: timeoutCtrl.signal,
-    });
-    if (typeof text !== "string" || text.length === 0) {
-      throw new Error(`Vision API (AI SDK ${protocol}) returned no content.`);
-    }
-    return text;
-  } finally {
-    clearTimeout(timeoutId);
+  const text = completion.choices[0]?.message?.content ?? "";
+  if (text.length === 0) {
+    throw new Error(`Vision API returned no content.`);
  }
+  return text;
 }
@@ -53,7 +53,22 @@ type AnalyticsEventData = {
  tts_toggle: { muted: boolean };
  fullscreen_toggle: { on: boolean };
  play_heartbeat: never;
-  gallery_export: { scene_count: number };
+  gallery_export: { scene_count: number; audio_count: number };
+  login_success: { provider: "google" | "github" | "email" };
+  play_error: {
+    source: "scene" | "start" | "vision" | "insert_beat" | "freeform" | "prefetch";
+    kind: "network" | "timeout" | "http_5xx" | "http_4xx" | "abort" | "unknown";
+    http_status: number;
+    orientation: "portrait" | "landscape";
+    connection: "4g" | "3g" | "2g" | "slow-2g" | "unknown";
+    was_hidden: boolean;
+    scene_index: number;
+    elapsed_bucket: "<5s" | "5-30s" | "30-60s" | "60-120s" | "120s+";
+  };
+  play_visibility_lost: {
+    phase: "loading-first" | "ready" | "transitioning" | "vision-thinking" | "inserting-beat";
+    had_pending_fetch: boolean;
+  };
 };

 export type AnalyticsEvent = keyof AnalyticsEventData;
@@ -0,0 +1,89 @@
+// Shared primitives for surviving an OAuth full-page round-trip.
+//
+// Google / GitHub OAuth is a full-page redirect: it unmounts the React tree
+// and discards all in-memory state (the server is stateless, so the client
+// carries everything). To resume where the user left off after the redirect,
+// a page snapshots its domain state into sessionStorage just before navigating
+// away, then consumes the snapshot on the next mount — but only if the user is
+// now actually signed in.
+//
+// Email-OTP login never redirects (it resolves in-page), so it bypasses this
+// machinery entirely and resumes synchronously via AuthModal.onSuccess.
+//
+// This module holds the three page-agnostic pieces: the login check, a
+// quota-safe sessionStorage write (heavy data-URL fields are stripped on
+// QuotaExceededError), and the consume-once resume gate. Each page keeps its
+// own snapshot shape and restore side effects — only the plumbing is shared.
+
+import { AUTH_ENABLED } from "@/lib/supabase/config";
+import { createClient as createSupabaseClient } from "@/lib/supabase/client";
+
+// True when auth is disabled (self-host with blank Supabase env) or the visitor
+// already has a session. Gates any auth-required action (and the resume path).
+/**
+ * Resolves to true when auth is switched off entirely, or when Supabase
+ * reports a user for the current visitor; false otherwise.
+ */
+export async function isAuthed(): Promise<boolean> {
+  if (AUTH_ENABLED) {
+    const { data } = await createSupabaseClient().auth.getUser();
+    return Boolean(data.user);
+  }
+  return true;
+}
+
+// Write a resume snapshot to sessionStorage with a quota-safe fallback.
+// `fallbacks` is an ordered list of progressively-lighter payloads to try if
+// the primary write fails (typically QuotaExceededError from a data-URL image).
+// Each fallback drops some non-essential heavy field while keeping the data
+// needed to resume. A dropped field only affects *future* generation (e.g. the
+// painter on later scenes), never the scene being resumed, so degrading is
+// graceful. Returns true if any write succeeded.
+export function writeResumeSnapshot<T>(
+  key: string,
+  primary: T,
+  fallbacks: readonly T[] = [],
+): boolean {
+  // Full payload first, then each lighter fallback in the order given; `some`
+  // stops at the first candidate that storage accepts.
+  return [primary, ...fallbacks].some((candidate) => {
+    try {
+      sessionStorage.setItem(key, JSON.stringify(candidate));
+    } catch {
+      // QuotaExceededError (or disabled storage) — try the next candidate.
+      return false;
+    }
+    return true;
+  });
+}
+
+// Consume-once resume gate. Returns the parsed snapshot if one exists at `key`
+// AND the user is now signed in (so a stale snapshot from a failed/abandoned
+// login doesn't resurrect a half-flow). Always removes the entry — either it's
+// consumed here, or it's stale and must not linger. Returns null when there's
+// nothing to resume, the user isn't signed in, or the payload is corrupt.
+//
+// `removeItem` intentionally runs before `isAuthed()` so that a network error
+// during the auth check does not leave a zombie snapshot behind. Without this
+// ordering, callers that guard on the snapshot's presence (play-page bootstrap)
+// would re-enter this path on every effect cycle, producing an infinite retry
+// loop. Dropping the snapshot on a transient network glitch is an acceptable
+// trade-off — the worst case is the user lands on the first scene instead of
+// resuming mid-story, which is the same experience as before this feature.
+/**
+ * Reads, removes, and (if the visitor is signed in) parses the snapshot stored
+ * at `key`.
+ *
+ * @param key sessionStorage key the snapshot was written under.
+ * @returns The parsed snapshot, or null when nothing is stored, storage is
+ *   unavailable, the auth check fails or reports signed-out, or the payload
+ *   is not valid JSON. Never rejects.
+ */
+export async function consumeResumeSnapshot<T>(key: string): Promise<T | null> {
+  let raw: string;
+  try {
+    const stored = sessionStorage.getItem(key);
+    if (!stored) return null;
+    // Remove BEFORE the auth check so a failing check cannot leave a snapshot
+    // behind for the caller to retry on.
+    sessionStorage.removeItem(key);
+    raw = stored;
+  } catch {
+    // Touching sessionStorage can itself throw when storage is disabled or
+    // blocked. writeResumeSnapshot tolerates that case, so treat it here as
+    // "nothing to resume" instead of rejecting into the caller's bootstrap.
+    return null;
+  }
+  let authed: boolean;
+  try {
+    authed = await isAuthed();
+  } catch {
+    // Network / unexpected error during auth check. Snapshot already removed
+    // (prevents the caller's retry loop); return null so callers fall back to
+    // their default path (normal bootstrap).
+    return null;
+  }
+  if (!authed) return null;
+  try {
+    return JSON.parse(raw) as T;
+  } catch {
+    return null; // corrupt snapshot — ignore
+  }
+}
@@ -0,0 +1,160 @@
+import type { EngineConfig, ProviderProtocol } from "@infiplot/types";
+
+// Bring-your-own model keys — stored CLIENT-SIDE ONLY.
+//
+// When a user supplies their own text/image/vision API credentials, we persist
+// them in localStorage and the browser talks to providers directly. The keys
+// are therefore never sent to our server: no request body, no header, no log.
+
+const STORAGE_KEY = "infiplot:model";
+
+const VALID_PROTOCOLS: ProviderProtocol[] = [
+  "openai_compatible",
+  "openai",
+  "runware",
+];
+
+export type StoredModelConfig = {
+  textBaseUrl: string;
+  textApiKey: string;
+  textModel: string;
+  textProvider?: ProviderProtocol;
+  imageBaseUrl: string;
+  imageApiKey: string;
+  imageModel: string;
+  imageProvider?: ProviderProtocol;
+  visionBaseUrl: string;
+  visionApiKey: string;
+  visionModel: string;
+  visionProvider?: ProviderProtocol;
+};
+
+/** Type guard: true when `p` is one of the entries in VALID_PROTOCOLS. */
+function isValidProtocol(p: string): p is ProviderProtocol {
+  return VALID_PROTOCOLS.some((known) => known === p);
+}
+
+/** Narrows an untrusted stored value to a known protocol; anything else
+ *  (non-string or unrecognized) becomes undefined. */
+function readProtocol(raw: unknown): ProviderProtocol | undefined {
+  return typeof raw === "string" && isValidProtocol(raw) ? raw : undefined;
+}
+
+/** Load the persisted model config from localStorage, trimming every string
+ *  field. Yields null on the server, when nothing is stored, when the payload
+ *  cannot be parsed, or when any required field is missing or blank. Provider
+ *  fields are optional and are dropped when unrecognized. */
+export function readStoredModelConfig(): StoredModelConfig | null {
+  if (typeof window === "undefined") return null;
+  // Non-string values collapse to "" so they fail the required-field check.
+  const text = (value: unknown): string =>
+    typeof value === "string" ? value.trim() : "";
+  try {
+    const raw = window.localStorage.getItem(STORAGE_KEY);
+    if (!raw) return null;
+    const parsed = JSON.parse(raw) as Partial<StoredModelConfig>;
+
+    const required = {
+      textBaseUrl: text(parsed.textBaseUrl),
+      textApiKey: text(parsed.textApiKey),
+      textModel: text(parsed.textModel),
+      imageBaseUrl: text(parsed.imageBaseUrl),
+      imageApiKey: text(parsed.imageApiKey),
+      imageModel: text(parsed.imageModel),
+      visionBaseUrl: text(parsed.visionBaseUrl),
+      visionApiKey: text(parsed.visionApiKey),
+      visionModel: text(parsed.visionModel),
+    };
+    const incomplete = Object.values(required).some((field) => field.length === 0);
+    if (incomplete) return null;
+
+    return {
+      textBaseUrl: required.textBaseUrl,
+      textApiKey: required.textApiKey,
+      textModel: required.textModel,
+      textProvider: readProtocol(parsed.textProvider),
+      imageBaseUrl: required.imageBaseUrl,
+      imageApiKey: required.imageApiKey,
+      imageModel: required.imageModel,
+      imageProvider: readProtocol(parsed.imageProvider),
+      visionBaseUrl: required.visionBaseUrl,
+      visionApiKey: required.visionApiKey,
+      visionModel: required.visionModel,
+      visionProvider: readProtocol(parsed.visionProvider),
+    };
+  } catch {
+    // Unparseable payload or inaccessible storage — behave as "nothing stored".
+    return null;
+  }
+}
+
+/** Save the model config to localStorage. Every base URL, API key, and model
+ *  name is trimmed first so whitespace from a paste never reaches a header;
+ *  provider fields are stored as given. A no-op on the server, and a silent
+ *  no-op when storage rejects the write. */
+export function writeStoredModelConfig(config: StoredModelConfig): void {
+  if (typeof window === "undefined") return;
+  const tidy = (value: string): string => value.trim();
+  try {
+    const payload: StoredModelConfig = {
+      textBaseUrl: tidy(config.textBaseUrl),
+      textApiKey: tidy(config.textApiKey),
+      textModel: tidy(config.textModel),
+      textProvider: config.textProvider,
+      imageBaseUrl: tidy(config.imageBaseUrl),
+      imageApiKey: tidy(config.imageApiKey),
+      imageModel: tidy(config.imageModel),
+      imageProvider: config.imageProvider,
+      visionBaseUrl: tidy(config.visionBaseUrl),
+      visionApiKey: tidy(config.visionApiKey),
+      visionModel: tidy(config.visionModel),
+      visionProvider: config.visionProvider,
+    };
+    const serialized = JSON.stringify(payload);
+    window.localStorage.setItem(STORAGE_KEY, serialized);
+  } catch {
+    // Storage disabled / quota / private mode — BYO simply stays off.
+  }
+}
+
+/** Remove the persisted model config. A no-op on the server; storage errors
+ *  are deliberately ignored. */
+export function clearStoredModelConfig(): void {
+  if (typeof window !== "undefined") {
+    try {
+      window.localStorage.removeItem(STORAGE_KEY);
+    } catch {
+      // Storage unavailable — nothing to clear.
+    }
+  }
+}
+
+/** Assemble an EngineConfig from the stored model config and an optional TTS
+ *  config. `mockImage` is always false, and a null `tts` becomes undefined.
+ *
+ *  @throws Error when `model` is missing, so callers can surface a friendly
+ *  "please configure" message. */
+export function resolveEngineConfig(
+  model: StoredModelConfig | null,
+  tts: import("@infiplot/types").TtsConfig | null,
+): EngineConfig {
+  if (!model) {
+    throw new Error("模型配置未设置。请返回首页，点击「模型设置」配置 API 参数。");
+  }
+
+  const text = {
+    baseUrl: model.textBaseUrl,
+    apiKey: model.textApiKey,
+    model: model.textModel,
+    provider: model.textProvider,
+  };
+  const image = {
+    baseUrl: model.imageBaseUrl,
+    apiKey: model.imageApiKey,
+    model: model.imageModel,
+    provider: model.imageProvider,
+  };
+  const vision = {
+    baseUrl: model.visionBaseUrl,
+    apiKey: model.visionApiKey,
+    model: model.visionModel,
+    provider: model.visionProvider,
+  };
+
+  return { text, image, vision, tts: tts ?? undefined, mockImage: false };
+}
@@ -6,8 +6,6 @@ import type {

 const VALID_PROTOCOLS = [
  "openai_compatible",
-  "anthropic",
-  "google",
  "openai",
  "runware",
 ] as const;
@@ -23,6 +21,15 @@ function readOptionalVar(name: string): string | undefined {
  return v && v.length > 0 ? v : undefined;
 }

+// Read an optional positive-integer tuning knob from the variable `name`.
+//
+// Invalid/non-positive values are treated as unset (feature stays off) rather
+// than failing boot — these knobs are tuning aids, not required config.
+// The value is floored BEFORE the positivity check so a fractional input in
+// (0, 1) such as "0.5" is rejected as unset instead of being returned as 0.
+//
+// Returns the floored integer (always >= 1), or undefined when the variable
+// is unset, empty, non-numeric, non-finite, or floors to a value <= 0.
+function readOptionalPositiveInt(name: string): number | undefined {
+  const v = readOptionalVar(name);
+  if (!v) return undefined;
+  // Math.floor keeps NaN / Infinity as-is, so the isFinite guard still applies.
+  const n = Math.floor(Number(v));
+  return Number.isFinite(n) && n > 0 ? n : undefined;
+}
+
 // Optional *_PROVIDER selector. Unset → undefined, and each ai-client adapter
 // applies its own default (text/vision → openai_compatible; image → inferred
 // from the base URL). Validated eagerly so a typo fails fast at boot rather
@@ -33,8 +40,14 @@ function readProvider(name: string): ProviderProtocol | undefined {
  if ((VALID_PROTOCOLS as readonly string[]).includes(v)) {
    return v as ProviderProtocol;
  }
+  // anthropic/google were removed with the Vercel AI SDK — nudge users who
+  // still set them toward the OpenAI-compatible endpoints (see .env.example).
+  const hint =
+    v === "anthropic" || v === "google"
+      ? ` — use openai_compatible with their OpenAI-compatible endpoint instead`
+      : "";
  throw new Error(
-    `Invalid ${name}: "${v}". Must be one of: ${VALID_PROTOCOLS.join(", ")}`,
+    `Invalid ${name}: "${v}". Must be one of: ${VALID_PROTOCOLS.join(", ")}${hint}`,
  );
 }

@@ -71,5 +84,7 @@ export function loadEngineConfig(): EngineConfig {
    },
    tts: loadTtsConfig(),
    mockImage: readOptionalVar("MOCK_IMAGE") === "true",
+    imageTimeoutMs: readOptionalPositiveInt("IMAGE_TIMEOUT_MS"),
+    imageHedgeMs: readOptionalPositiveInt("IMAGE_HEDGE_MS"),
  };
 }
@@ -1,5 +1,10 @@
 import { chat, generateImage } from "@infiplot/ai-client";
-import { provisionVoice } from "@infiplot/tts-client";
+import {
+  isStepfun,
+  isValidStepfunVoiceId,
+  provisionVoice,
+  type ProvisionVoiceOptions,
+} from "@infiplot/tts-client";
 import type {
  Character,
  CharacterVoice,
@@ -9,7 +14,7 @@ import type {
 import { parseJsonLoose } from "../jsonParser";
 import { mockImageDataUri } from "../mockImage";
 import {
-  CHARACTER_DESIGNER_SYSTEM,
+  buildCharacterDesignerSystem,
  buildCharacterDesignerUserMessage,
  buildCharacterPortraitPrompt,
 } from "../prompts";
@@ -34,6 +39,10 @@ import {
 type CharacterDesignOutput = {
  visualDescription?: string;
  voiceDescription?: string;
+  /** Only present on the StepFun path (the system prompt asks for it when
+   *  stepfun:true). Hallucinated / out-of-catalog ids are dropped before
+   *  they reach provisioning, falling back to pickStepfunVoiceId. */
+  stepfunVoiceId?: string;
 };

 // TEMP: per-phase timing for latency diagnosis. Same convention as the
@@ -50,7 +59,7 @@ async function runDesignLLM(
  const raw = await chat(
    config.text,
    [
-      { role: "system", content: CHARACTER_DESIGNER_SYSTEM },
+      { role: "system", content: buildCharacterDesignerSystem({ stepfun: stepfunEnabled(config) }) },
      {
        role: "user",
        content: buildCharacterDesignerUserMessage(charName, session),
@@ -61,6 +70,13 @@ async function runDesignLLM(
  return parseJsonLoose<CharacterDesignOutput>(raw);
 }

+/**
+ * True when the server's TTS config points at StepFun (so the CharacterDesigner
+ * should also pick a preset voice id). Returns false when TTS is off or on the
+ * Xiaomi path — keeping the Xiaomi prompt byte-identical to history.
+ *
+ * @param config Engine configuration; only `config.tts` is inspected.
+ * @returns `true` only if `config.tts` is set and `isStepfun` accepts it.
+ */
+function stepfunEnabled(config: EngineConfig): boolean {
+  return !!config.tts && isStepfun(config.tts);
+}
+
 // Generate the per-character base portrait. The portrait is a "concept
 // sheet" — single character, neutral pose, plain background — so it works
 // well as a Runware referenceImages anchor for later scenes.
@@ -87,7 +103,12 @@ export async function renderCharacterPortrait(
      visualDescription,
      styleGuide,
    );
-    const { imageUrl, imageUuid } = await generateImage(config.image, prompt);
+    // Portraits get the hard timeout but are never hedged — a scene already
+    // runs several portrait paints in parallel, and hedging those would push
+    // burst concurrency past Runware's recommended 2-4 in-flight requests.
+    const { imageUrl, imageUuid } = await generateImage(config.image, prompt, {
+      timeoutMs: config.imageTimeoutMs,
+    });
    return { basePortraitUrl: imageUrl, basePortraitUuid: imageUuid };
  } catch (err) {
    const msg = err instanceof Error ? err.message : String(err);
@@ -100,10 +121,11 @@ export async function provisionCharacterVoice(
  config: EngineConfig,
  voiceDescription: string,
  charName: string,
+  opts?: ProvisionVoiceOptions,
 ): Promise<CharacterVoice | undefined> {
  if (!config.tts) return undefined;
  try {
-    return await provisionVoice(config.tts, voiceDescription);
+    return await provisionVoice(config.tts, voiceDescription, charName, opts);
  } catch (err) {
    const msg = err instanceof Error ? err.message : String(err);
    console.error(`[characterDesigner] voice provision failed for ${charName}: ${msg}`);
@@ -115,10 +137,18 @@ export async function provisionCharacterVoice(
 // call. The director then schedules renderCharacterPortrait /
 // provisionCharacterVoice around the Painter. Multiple new characters in the
 // same scene run this stage in parallel at the director level.
+//
+// On the StepFun path the same call ALSO yields stepfunVoiceId (the model
+// picks from the 32-preset catalog it sees in the system prompt). An invalid
+// pick is dropped here so the downstream provision falls back to the keyword
+// scorer — never trust an LLM-hallucinated id at the synth boundary.
 export type CharacterCard = {
  name: string;
  visualDescription?: string;
  voiceDescription: string;
+  /** Only set on the StepFun path AND only when the LLM picked a valid catalog
+   *  id. Threads through provisionCharacterVoice → stepfunProvision. */
+  stepfunVoiceId?: string;
 };

 export async function designCharacterCard(
@@ -130,12 +160,19 @@ export async function designCharacterCard(
  const design = await runDesignLLM(config, session, charName);
  tlog(`[charDesigner ${charName}] design LLM`, tDesign);

+  // Drop invalid catalog picks before they reach provision/synth. A hallucinated
+  // id would 4xx at synth time; better to fall back to pickStepfunVoiceId now.
+  const stepfunVoiceId = isValidStepfunVoiceId(design.stepfunVoiceId)
+    ? design.stepfunVoiceId
+    : undefined;
+
  return {
    name: charName,
    visualDescription: design.visualDescription?.trim() || undefined,
    voiceDescription:
      design.voiceDescription?.trim() ||
      `请根据角色名「${charName}」推断其性别、年龄与气质，生成最贴合的音色。所属世界观：${session.worldSetting}`,
+    stepfunVoiceId,
  };
 }

@@ -123,6 +123,10 @@ export function collectReferenceImages(
  return refs.slice(0, MAX_REFERENCE_IMAGES);
 }

+/**
+ * Extract a loggable message from a caught value: the `message` of an Error
+ * instance, otherwise the value coerced with `String(...)`.
+ */
+function errMsg(err: unknown): string {
+  return err instanceof Error ? err.message : String(err);
+}
+
 async function tryGenerate(
  config: ProviderConfig,
  prompt: string,
@@ -132,12 +136,93 @@ async function tryGenerate(
  try {
    return await generateImage(config, prompt, options);
  } catch (err) {
-    const msg = err instanceof Error ? err.message : String(err);
-    console.warn(`[painter] ${label} failed: ${msg}`);
+    console.warn(`[painter] ${label} failed: ${errMsg(err)}`);
    return null;
  }
 }

+// Hedged Tier-A: fire leg 1; if it hasn't settled after hedgeMs, race an
+// identical leg 2 and take whichever finishes first. This rescues straggler
+// paints (a single task stuck on a slow worker) without waiting out the
+// provider's own gateway limit (Runware kills tasks at ~55s with a 504).
+//
+// Deliberately NOT retry-on-error: if leg 1 fails BEFORE the hedge timer
+// fires (429/503 queue saturation, 4xx) this returns null immediately so the
+// caller can fall through to Tier B — hedging into a saturated queue only
+// adds load. Once the hedge has fired, a failing leg does not end the
+// attempt: the surviving leg is awaited to completion. Each leg runs with
+// retries=0 so the hedge itself is the only retry layer (no retry×retry
+// multiplication).
+//
+// Returns the winning leg's result, or null (after a console.warn) when leg 1
+// fails before the hedge fires or when both legs fail. generateImage
+// rejections are converted into values, so this function does not reject on
+// them. Any `signal` / `retries` present in `options` is overridden per leg.
+async function tryGenerateHedged(
+  config: ProviderConfig,
+  prompt: string,
+  options: GenerateImageOptions,
+  label: string,
+  hedgeMs: number,
+): Promise<GenerateImageResult | null> {
+  // Outcome of one leg, tagged with the leg number so the other leg can be
+  // identified afterwards.
+  type Settled =
+    | { leg: 1 | 2; ok: GenerateImageResult }
+    | { leg: 1 | 2; err: unknown };
+
+  const t0 = Date.now();
+  // One AbortController slot per leg (index = leg - 1) so the loser can be
+  // cancelled once a winner is known.
+  const controllers: (AbortController | undefined)[] = [undefined, undefined];
+  const fire = (leg: 1 | 2): Promise<Settled> => {
+    const ac = new AbortController();
+    controllers[leg - 1] = ac;
+    return generateImage(config, prompt, {
+      ...options,
+      retries: 0,
+      signal: ac.signal,
+    }).then(
+      // Map both outcomes to a value: the returned promise never rejects,
+      // which keeps the Promise.race calls below from throwing.
+      (ok) => ({ leg, ok }) as Settled,
+      (err) => ({ leg, err }) as Settled,
+    );
+  };
+
+  const leg1 = fire(1);
+  let timer: ReturnType<typeof setTimeout> | undefined;
+  const hedgeTimer = new Promise<"hedge">((resolve) => {
+    timer = setTimeout(() => resolve("hedge"), hedgeMs);
+  });
+
+  const first = await Promise.race([leg1, hedgeTimer]);
+  if (first !== "hedge") {
+    // Leg 1 settled inside the hedge window: cancel the pending timer and
+    // report its outcome directly — no second leg is ever started.
+    clearTimeout(timer);
+    if ("ok" in first) return first.ok;
+    console.warn(`[painter] ${label} failed: ${errMsg(first.err)}`);
+    return null;
+  }
+
+  console.warn(
+    `[painter] hedge fired: ${label} still pending after ${hedgeMs}ms`,
+  );
+  const leg2 = fire(2);
+
+  let result = await Promise.race([leg1, leg2]);
+  if ("err" in result) {
+    // First settler failed — give the survivor its full chance.
+    console.warn(
+      `[painter] hedge leg${result.leg} failed: ${errMsg(result.err)}`,
+    );
+    result = await (result.leg === 1 ? leg2 : leg1);
+  }
+
+  if ("ok" in result) {
+    // Abort the other leg. If that leg already settled (it failed first and
+    // we waited for the survivor), the abort has nothing left to cancel.
+    const loserIdx = result.leg === 1 ? 1 : 0;
+    controllers[loserIdx]?.abort();
+    const loser = result.leg === 1 ? leg2 : leg1;
+    // Fire-and-forget observer: logs at debug level when the other leg ended
+    // with an error (which includes a leg that failed before the winner
+    // finished, not only one cancelled by the abort above).
+    loser.then(
+      (s) => "err" in s && console.debug(`[painter] hedge loser leg${s.leg} aborted`),
+      () => {},
+    );
+    console.log(
+      `[painter] hedge won by leg${result.leg} in ${Date.now() - t0}ms`,
+    );
+    return result.ok;
+  }
+  // Reaching here means the survivor failed as well.
+  console.warn(
+    `[painter] ${label} failed (both hedge legs): ${errMsg(result.err)}`,
+  );
+  return null;
+}
+
 export type PainterResult =
  | { kind: "real"; imageUrl: string; imageUuid: string }
  | { kind: "mock"; imageUrl: string };
@@ -167,14 +252,25 @@ export async function runPainter(

  // Tier A — with referenceImages (priorSceneImage + character portraits).
  // FLUX.2 [klein] 9B KV's KV cache accelerates this multi-reference path
-  // ~2.5× compared to the non-KV variant.
+  // ~2.5× compared to the non-KV variant. When IMAGE_HEDGE_MS is configured,
+  // the scene paint is hedged (see tryGenerateHedged); portraits are not.
  if (refs.length > 0) {
-    const r = await tryGenerate(
-      config.image,
-      prompt,
-      { referenceImages: refs, orientation: input.orientation },
-      `referenceImages (${refs.length})`,
-    );
+    const tierAOptions: GenerateImageOptions = {
+      referenceImages: refs,
+      orientation: input.orientation,
+      timeoutMs: config.imageTimeoutMs,
+    };
+    const label = `referenceImages (${refs.length})`;
+    const r =
+      config.imageHedgeMs && config.imageHedgeMs > 0
+        ? await tryGenerateHedged(
+            config.image,
+            prompt,
+            tierAOptions,
+            label,
+            config.imageHedgeMs,
+          )
+        : await tryGenerate(config.image, prompt, tierAOptions, label);
    if (r) return { kind: "real", imageUrl: r.imageUrl, imageUuid: r.imageUuid };
  }

@@ -183,6 +279,7 @@ export async function runPainter(
  // Errors here propagate to the caller.
  const r = await generateImage(config.image, prompt, {
    orientation: input.orientation,
+    timeoutMs: config.imageTimeoutMs,
  });
  return { kind: "real", imageUrl: r.imageUrl, imageUuid: r.imageUuid };
 }
@@ -305,12 +305,21 @@ export async function directScene(
  }

  // Kick off voice provisioning for every NEW char (never on the paint path).
+  // On the StepFun path, thread the LLM-selected stepfunVoiceId from the card
+  // into provision — it lets stepfunProvision honor the catalog pick instead
+  // of falling back to the keyword scorer (same network cost: still zero).
+  // ALSO persist it onto the Character so the client can echo it back on a
+  // StepFun server (where it skips the ~220KB voice payload) and the server
+  // resolveVoice honors the LLM pick at synth time instead of re-scoring.
  const voicePromises = cards.map((card) =>
-    provisionCharacterVoice(config, card.voiceDescription, card.name).then(
+    provisionCharacterVoice(config, card.voiceDescription, card.name, {
+      stepfunVoiceId: card.stepfunVoiceId,
+    }).then(
      (voice): Character => ({
        name: card.name,
        voiceDescription: card.voiceDescription,
        voice,
+        stepfunVoiceId: card.stepfunVoiceId,
      }),
    ),
  );
@@ -3,8 +3,9 @@ import { jsonrepair, JSONRepairError } from "jsonrepair";
 // Strict-then-forgiving JSON parser for LLM output. Tries in order:
 //   1. Direct JSON.parse on the trimmed text.
 //   2. Extract from ```json``` fenced block.
-//   3. Slice between first { and last } and parse.
-//   4. Apply targeted regex pre-repairs (see preRepair) and try jsonrepair.
+//   3. Parse the first complete JSON value prefix (handles duplicated objects).
+//   4. Slice between first { and last } and parse.
+//   5. Apply targeted regex pre-repairs (see preRepair) and try jsonrepair.
 //
 // On final failure, logs the first 800 chars of the raw model output so we
 // can diagnose the actual syntax error without flooding logs or leaking
@@ -40,6 +41,67 @@ function preRepair(s: string): string {
  return s.replace(/"([^"\n:]+):(\s+)"/g, '"$1":$2"');
 }

+/**
+ * Index of the earliest `{` or `[` in `s`, i.e. where a JSON object or array
+ * could start. Returns -1 when the string contains neither character.
+ */
+function firstJsonStart(s: string): number {
+  const objectStart = s.indexOf("{");
+  const arrayStart = s.indexOf("[");
+  // If one kind is absent, the other's index (possibly -1) is the answer.
+  if (objectStart === -1) return arrayStart;
+  if (arrayStart === -1) return objectStart;
+  return Math.min(objectStart, arrayStart);
+}
+
+/**
+ * Extract the first bracket-balanced `{...}` / `[...]` span from `s`.
+ *
+ * Scans from the first opening bracket, keeping a stack of the closers it
+ * expects, and stops at the position where that stack empties. Brackets
+ * inside double-quoted strings are ignored, and backslash escapes inside
+ * strings are honoured so an escaped quote does not end the string.
+ *
+ * Only bracket balance is checked — the returned text is not guaranteed to
+ * be valid JSON; the caller still has to parse it.
+ *
+ * @returns The balanced substring, or undefined when there is no opening
+ *          bracket, a closer does not match the expected one, or the input
+ *          ends before the first value is closed.
+ */
+function firstCompleteJsonValue(s: string): string | undefined {
+  const start = firstJsonStart(s);
+  if (start === -1) return undefined;
+
+  // Stack of closing characters still owed, innermost last.
+  const stack: string[] = [];
+  let inString = false;
+  let escaped = false;
+
+  for (let i = start; i < s.length; i += 1) {
+    const ch = s[i]!;
+
+    if (inString) {
+      // Inside a string literal only escapes and the closing quote matter.
+      if (escaped) {
+        escaped = false;
+      } else if (ch === "\\") {
+        escaped = true;
+      } else if (ch === "\"") {
+        inString = false;
+      }
+      continue;
+    }
+
+    if (ch === "\"") {
+      inString = true;
+      continue;
+    }
+
+    if (ch === "{") {
+      stack.push("}");
+      continue;
+    }
+
+    if (ch === "[") {
+      stack.push("]");
+      continue;
+    }
+
+    if (ch === "}" || ch === "]") {
+      // A closer that is not the one expected means the span is malformed.
+      if (stack.at(-1) !== ch) return undefined;
+      stack.pop();
+      // Stack empty: the value opened at `start` has just been closed.
+      if (stack.length === 0) return s.slice(start, i + 1);
+    }
+  }
+
+  // Ran out of input with brackets (or a string) still open.
+  return undefined;
+}
+
+/**
+ * Parse the first bracket-balanced JSON value found in `s`.
+ *
+ * @returns The parsed value (cast to T without validation), or undefined when
+ *          `firstCompleteJsonValue` finds no balanced span.
+ * @throws Whatever `JSON.parse` throws when the balanced span is not valid
+ *         JSON — callers wrap this in try/catch.
+ */
+function parseFirstCompleteJsonValue<T>(s: string): T | undefined {
+  const value = firstCompleteJsonValue(s);
+  if (!value) return undefined;
+  return JSON.parse(value) as T;
+}
+
 export function parseJsonLoose<T>(raw: string): T {
  const trimmed = raw.trim();

@@ -54,10 +116,22 @@ export function parseJsonLoose<T>(raw: string): T {
    try {
      return JSON.parse(fenced[1]) as T;
    } catch {
-      // fall through
+      try {
+        const parsed = parseFirstCompleteJsonValue<T>(fenced[1]);
+        if (parsed !== undefined) return parsed;
+      } catch {
+        // fall through
+      }
    }
  }

+  try {
+    const parsed = parseFirstCompleteJsonValue<T>(trimmed);
+    if (parsed !== undefined) return parsed;
+  } catch {
+    // fall through
+  }
+
  const first = trimmed.indexOf("{");
  const last = trimmed.lastIndexOf("}");
  const slice =
@@ -1,6 +1,7 @@
 import type {
  BeatAudioRequest,
  BeatAudioResponse,
+  CharacterVoice,
  EngineConfig,
  FreeformClassify,
  FreeformClassifyRequest,
@@ -17,6 +18,7 @@ import type {
 } from "@infiplot/types";
 import { coerceOrientation } from "@infiplot/types";
 import { chat } from "@infiplot/ai-client";
+import { isStepfun, isValidStepfunVoiceId, provisionVoice } from "@infiplot/tts-client";
 import { runArchitect } from "./agents/architect";
 import { selectStyle } from "./agents/styleSelector";
 import { directInsertBeat, directScene } from "./director";
@@ -241,11 +243,73 @@ export async function requestInsertBeat(
 //  timeout / failure / TTS disabled, so the client just plays silent.
 // ──────────────────────────────────────────────────────────────────────

+// Resolve a synth-ready voice for the request, normalizing provider
+// mismatches. The client usually sends a voice whose provider matches the
+// server's TTS (the common case). The mismatch case is mainly prebaked
+// homepage cards: they ship a Xiaomi voice baked at build time, but the
+// server may now run StepFun — so the client skips the ~220KB reference
+// audio (saving FOT) and sends stepfunVoiceId / voiceDescription instead.
+// We re-provision against the SERVER's provider so the right voice synth runs.
+// Returns undefined when there's nothing to synthesize from (caller plays
+// silent).
+//
+// Decision order:
+//   1. req.voice present and its provider matches the server → return as-is.
+//   2. No server TTS config → undefined.
+//   3. StepFun server → provision from a valid req.stepfunVoiceId, else from
+//      req.voiceDescription, else undefined.
+//   4. Otherwise (non-StepFun server) → provision from req.voiceDescription,
+//      else undefined.
+// Rejections from provisionVoice are not caught here and propagate to the
+// caller.
+async function resolveVoice(
+  config: EngineConfig,
+  req: BeatAudioRequest,
+): Promise<CharacterVoice | undefined> {
+  const serverStepfun = !!config.tts && isStepfun(config.tts);
+  const voiceProvider = req.voice?.provider;
+  // A voice "matches" only when its provider tag is exactly "stepfun" or
+  // "xiaomi" and agrees with the server; any other tag counts as a mismatch.
+  const voiceMatchesServer =
+    (voiceProvider === "stepfun" && serverStepfun) ||
+    (voiceProvider === "xiaomi" && !serverStepfun);
+
+  // Fast path: the client sent a matching voice. (Also covers the legacy
+  // xiaomi card + xiaomi server case where the 220KB was unavoidable anyway.)
+  if (req.voice && voiceMatchesServer) {
+    return req.voice;
+  }
+
+  // Mismatch (or voice omitted). Re-provision against the server's provider.
+  if (!config.tts) return undefined;
+
+  // StepFun server: prefer an LLM-picked / prebaked id (zero-cost), else
+  // fall back to the keyword scorer over the voiceDescription.
+  if (serverStepfun) {
+    if (isValidStepfunVoiceId(req.stepfunVoiceId)) {
+      // The description is optional on this branch; an empty string is passed
+      // when the client did not send one.
+      return provisionVoice(config.tts, req.voiceDescription ?? "", req.characterName, {
+        stepfunVoiceId: req.stepfunVoiceId,
+      });
+    }
+    if (req.voiceDescription) {
+      return provisionVoice(config.tts, req.voiceDescription, req.characterName);
+    }
+    return undefined;
+  }
+
+  // Xiaomi server but client sent a StepFun voice (or nothing). Re-design via
+  // voicedesign using the description; no description → can't synthesize.
+  //
+  // NOTE: this re-provision runs OUTSIDE synthesizeBeat's 15s withTimeout — a
+  // hung MiMo voicedesign tail (~30-70s) could hang /api/beat-audio until the
+  // platform timeout. Accepted because: (1) this path only fires on a rare
+  // cross-provider replay (.infiplot carrying a stepfun voice, opened on a
+  // Xiaomi-server deploy) or a mid-session provider flip — NOT the common
+  // prebaked-card + stepfun-server case, which is a pure-function provision
+  // with no network; (2) it degrades to silence rather than crashing. If it
+  // ever bites in practice, wrap resolve+synth in one withTimeout in voice.ts
+  // (requires threading an AbortSignal through provisionVoice → xiaomiProvision).
+  if (req.voiceDescription) {
+    return provisionVoice(config.tts, req.voiceDescription, req.characterName);
+  }
+  return undefined;
+}
+
 export async function requestBeatAudio(
  config: EngineConfig,
  req: BeatAudioRequest,
 ): Promise<BeatAudioResponse> {
  if (!config.tts) return { audio: null };
-  const audio = await synthesizeBeat(config.tts, req.voice, req.beat);
+  const voice = await resolveVoice(config, req);
+  if (!voice) return { audio: null };
+  const audio = await synthesizeBeat(config.tts, voice, req.beat);
  return { audio };
 }
@@ -7,6 +7,7 @@ import type {
  StoryState,
  WriterPlan,
 } from "@infiplot/types";
+import { formatStepfunCatalogForPrompt } from "@infiplot/tts-client";

 // ══════════════════════════════════════════════════════════════════════
 //  Multi-agent scene generation pipeline:
@@ -599,7 +600,14 @@ function collectPriorSceneKeys(session: Session): string[] {
 //  (e.g., gentle-looking character with energetic voice).
 // ──────────────────────────────────────────────────────────────────────

-export const CHARACTER_DESIGNER_SYSTEM = `你是视觉小说的「角色设定师」。给你一个**新登场角色的名字**，你要为这个角色同时设计两份卡片：
+// CHARACTER_DESIGNER_SYSTEM is split into a provider-agnostic CORE (visual +
+// voice-text rules) and a provider-specific TAIL (the JSON contract). When the
+// server runs StepFun, the tail additionally asks the model to pick a preset
+// voice id from the 32-entry catalog — so the SAME LLM call that designs the
+// character also selects its voice, at zero extra latency. When StepFun is
+// off (Xiaomi / no TTS), the tail is byte-identical to the historical prompt
+// (Xiaomi path is cache- and behavior-preserving).
+const CHARACTER_DESIGNER_SYSTEM_CORE = `你是视觉小说的「角色设定师」。给你一个**新登场角色的名字**，你要为这个角色同时设计两份卡片：
 1. **视觉设定卡（英文）**——给生图模型 FLUX 用，遵循 prompt engineering 风格
 2. **音色设定卡（中文）**——给小米 MiMo 配音设计用

@@ -608,18 +616,56 @@ export const CHARACTER_DESIGNER_SYSTEM = `你是视觉小说的「角色设定
 视觉设定卡 visualDescription 规则：
 - **必须完全用英文**
 - 风格：用形容词 + 短语，**英文逗号分隔**，符合 FLUX/Stable Diffusion prompt 习惯
- 包含：年龄段、发型发色、眼睛 / 神情基调、面部特征、标志性服饰（款式 + 配色 + 花纹）、整体气质
- **不要写瞬时姿势或表情**（这些由编剧/分镜每帧实时控制）
 - **必须融入全局画风** styleGuide 的美术指向（比如 styleGuide 是「赛博朋克」时，服饰要赛博朋克化）
- 长度：80–150 个英文词为宜
+- **不要写瞬时姿势或表情**（这些由编剧/分镜每帧实时控制）
 - 不要包含背景环境（这不是场景图，是角色立绘卡）
+- 长度：100–180 个英文词为宜
+
+**必须覆盖的 6 大要素 — 缺一项都会让角色撞脸：**
+1. **HAIR（头发）** — 同时写明四点：
+   ① 发色 hair color（具体到色相 + 明度，例 "platinum blonde" / "deep navy blue" / "warm chestnut brown"，不要只写 "dark hair"）
+   ② 发型 hairstyle（具体款式：twin tails / side ponytail / hime cut / undercut / messy bob / long straight / wolf cut...）
+   ③ 头发长度 hair length（chin-length / shoulder-length / waist-length / cropped 等明确量级）
+   ④ 发饰或刘海特征（blunt bangs / curtain bangs / side-swept / hair ribbon / hairpin，可省但建议有一项）
+2. **EYES（眼睛）** — 同时写明：
+   ① 瞳色 eye color（具体色相，例 "amber" / "violet" / "icy blue"，不要只写 "dark eyes"）
+   ② 眼型 eye shape（almond / round / sharp upturned / droopy / hooded）
+   ③ 神情基调 default gaze tone（gentle / piercing / sleepy / mischievous，不写瞬时表情）
+3. **FACE & BUILD（脸型 + 体格）** — 写 1–2 条标志性特征：
+   - 脸型轮廓（oval / heart-shaped / sharp jawline / soft round）
+   - 身高与体型相对感（tall and slim / petite / athletic build / broad shoulders）
+   - 一个独特识别点（small mole below left eye / faint freckles / round glasses / fang teeth / scar across brow），用来在画面里第一眼区分
+4. **OUTFIT（服饰）** — 同时写明：
+   ① 主体款式（school uniform / casual streetwear / formal suit / kimono / lab coat / military / cyberpunk jacket...）
+   ② 配色（主色 + 强调色，例 "navy blazer with crimson tie"，不要只写 "dark uniform"）
+   ③ 至少一个标志性细节（collar shape / asymmetric hem / layered scarf / fingerless gloves / chunky boots / accessory like a pendant or earring）
+   ④ 必须与 styleGuide 美术指向一致
+5. **PERSONALITY-DRIVEN VIBE（性格→气质映射）** — 一句话：
+   - 用 2–3 个性格关键词（gentle and reserved / sharp and aloof / cheerful and brash / cool and analytical / lazy and easygoing）
+   - 说明这个性格如何投射到整体气场与氛围（approachable warmth / intimidating presence / quiet confidence / carefree aura / scholarly composure），不要写具体姿势动作
+6. **OVERALL SILHOUETTE & VIBE TAG（整体剪影 + 一句气质标签）** — 一句话总结这个角色"远远一看就能认出来"的剪影特征
+
+**差异化硬规则 — 避免与已设定角色撞型：**
+你会收到「已设定角色清单」，每个条目包含 name + visualDescription。在落笔前**先在心里扫一遍**清单，提取每个角色的 hair color / hair length / eye color / outfit style，然后为新角色挑选**明显对比**的属性组合：
+- **发色不能撞**：已有黑发 → 新角色避免黑、深棕；已有金发 → 新角色避免银、浅栗；至少跨一个色系（黑/棕/金/红/橙/银/灰/蓝/紫/绿）
+- **瞳色不能撞**：同发色规则，跨色系挑选
+- **剪影不能撞**：已有长直发 → 新角色用短发 / 双马尾 / 卷发 / 扎发；用"发长 × 发型"两个维度造差异
+- **服饰风格至少一处明显差异**：款式（制服 vs 便服 vs 正装）、主色（暖 vs 冷）、轮廓（紧身 vs 宽松 / 长 vs 短）三者中至少一项明显不同
+- 若剧情强制视觉相似（如双胞胎），必须在配饰或配色上做一处显著识别点
+
+落笔顺序建议：先决定 personality keywords → 由性格反推合适的发色 / 服饰倾向 → 再与已有角色对照确认差异 → 最后写成英文 tag 串。

 音色设定卡 voiceDescription 规则：
 - **必须以明确性别开头**："女性，…" / "男性，…"
 - 随后描述：年龄段（如「约17岁少女」「30 出头男性」）、音色质感、性格情绪基调、语速节奏、人设腔调、口音方言
 - 用中文，整段连续描述，不分段
 - 长度：50–80 个中文字为宜
- 例："女性，约17岁少女，音色清亮带点稚嫩甜美，性格开朗外向但容易害羞，语速偏快，标准普通话"
+- 例："女性，约17岁少女，音色清亮带点稚嫩甜美，性格开朗外向但容易害羞，语速偏快，标准普通话"`;
+
+// JSON-contract tail for the NON-stepfun path (Xiaomi voicedesign / no TTS).
+// Byte-identical to the historical prompt so the Xiaomi path keeps its cache
+// hit rate and voice quality unchanged.
+const CHARACTER_DESIGNER_TAIL_DEFAULT = `

 必须输出严格 JSON：
 {
@@ -629,6 +675,43 @@ export const CHARACTER_DESIGNER_SYSTEM = `你是视觉小说的「角色设定

 不要输出 JSON 以外的任何文本。`;

+// JSON-contract tail for the StepFun path. Same core output, plus the model
+// picks a preset voice id from the catalog. The id must match the SAME person
+// the voiceDescription describes (gender / age / vibe) — designed together so
+// appearance and voice stay coherent (the same invariant the CORE enforces).
+//
+// The catalog text is interpolated via formatStepfunCatalogForPrompt() inside
+// a module-level template literal, so it is computed once when this module is
+// first evaluated, not on every prompt build.
+const CHARACTER_DESIGNER_TAIL_STEPFUN = `
+
+**StepFun 预设音色选择（必做）：**
+除 voiceDescription 外，你还必须从下列 StepFun 预设音色清单中，为本角色挑选一个与 voiceDescription 描绘的「同一个人」（性别 / 年龄段 / 气质都要一致）最贴合的预设，并把它的 id 填入 stepfunVoiceId。清单：
+${formatStepfunCatalogForPrompt()}
+
+挑选原则：
+- stepfunVoiceId 必须是上表里某个 id，原样复制（拼写、大小写、连字符都不能变）。
+- 必须与 voiceDescription 的性别一致（男声选 male 行，女声选 female 行）。
+- 年龄段尽量一致；拿不准时优先气质匹配（例如“冷艳御姐”选 lengyanyujie、“软萌萝莉”选 ruanmengnvsheng）。
+- 不允许编造清单外的 id，也不允许留空。
+
+必须输出严格 JSON：
+{
+  "visualDescription": "English visual card, comma-separated tags...",
+  "voiceDescription": "中文音色卡，以性别开头...",
+  "stepfunVoiceId": "清单内某个 id"
+}
+
+不要输出 JSON 以外的任何文本。`;
+
+/**
+ * Build the CharacterDesigner system prompt, provider-aware.
+ *
+ * The result is always CHARACTER_DESIGNER_SYSTEM_CORE followed by one of two
+ * JSON-contract tails:
+ *  - stepfun:false → CHARACTER_DESIGNER_TAIL_DEFAULT, identical to the
+ *    historical Xiaomi/no-TTS prompt.
+ *  - stepfun:true  → CHARACTER_DESIGNER_TAIL_STEPFUN, which additionally asks
+ *    the model to pick a StepFun preset voice id from the 32-entry catalog
+ *    (see formatStepfunCatalogForPrompt).
+ *
+ * @param opts.stepfun Whether the StepFun tail should be used.
+ * @returns The concatenated system prompt string.
+ */
+export function buildCharacterDesignerSystem(opts: {
+  stepfun: boolean;
+}): string {
+  return opts.stepfun
+    ? CHARACTER_DESIGNER_SYSTEM_CORE + CHARACTER_DESIGNER_TAIL_STEPFUN
+    : CHARACTER_DESIGNER_SYSTEM_CORE + CHARACTER_DESIGNER_TAIL_DEFAULT;
+}
+
 export function buildCharacterDesignerUserMessage(
  charName: string,
  session: Session,
@@ -640,14 +723,19 @@ export function buildCharacterDesignerUserMessage(

  const others = session.characters.filter((c) => c.visualDescription);
  if (others.length > 0) {
-    parts.push("\n已设定角色（外貌应与他们有区分）：");
+    parts.push(
+      "\n已设定角色清单（**新角色的发色 / 瞳色 / 发型轮廓 / 服饰必须与下方每一位都形成明显视觉对比，不允许撞型**）：",
+    );
    for (const c of others) {
      parts.push(`- ${c.name}: ${c.visualDescription}`);
    }
+    parts.push(
+      "\n落笔前先逐个扫一遍上方角色的 hair color / hair length+style / eye color / outfit style，再为新角色挑选有明显跨色系或跨剪影对比的属性组合。",
+    );
  }

  parts.push(
-    "\n请为该角色同时设计 visualDescription（英文）和 voiceDescription（中文），严格以 JSON 格式返回。",
+    "\n请为该角色同时设计 visualDescription（英文，必须覆盖 system 中的 6 大要素清单）和 voiceDescription（中文），严格以 JSON 格式返回。",
  );
  return parts.join("\n");
 }
@@ -0,0 +1,199 @@
+import {
+  startSession as startSessionClient,
+  requestScene as requestSceneClient,
+  visionDecide as visionDecideClient,
+  classifyFreeform as classifyFreeformClient,
+  requestInsertBeat as requestInsertBeatClient,
+} from "@infiplot/engine";
+import {
+  readStoredModelConfig,
+  resolveEngineConfig,
+} from "@/lib/clientModelConfig";
+import { loadClientTtsConfig } from "@/lib/clientTtsConfig";
+import type {
+  Character,
+  FreeformClassifyRequest,
+  FreeformClassifyResponse,
+  EngineConfig,
+  InsertBeatRequest,
+  InsertBeatResponse,
+  SceneRequest,
+  SceneResponse,
+  Session,
+  StartRequest,
+  StartResponse,
+  TtsProvider,
+  VisionRequest,
+  VisionResponse,
+} from "@infiplot/types";
+
+/**
+ * Assemble the browser-side EngineConfig from the stored model config and
+ * the client TTS config.
+ *
+ * @returns The resolved config, or null when no model config is stored
+ *          (the signal used below to fall back to the server API routes).
+ */
+function getClientConfig(): EngineConfig | null {
+  const modelCfg = readStoredModelConfig();
+  const ttsCfg = loadClientTtsConfig();
+  // Checked before resolveEngineConfig, which throws on a missing model config.
+  if (!modelCfg) return null;
+  return resolveEngineConfig(modelCfg, ttsCfg);
+}
+
+/**
+ * Error thrown by the fetch helpers in this module when an API route answers
+ * with HTTP 401. It has a fixed message ("Unauthorized") and a distinct
+ * `name`, so callers can detect it with `instanceof` and handle it
+ * separately from other failures.
+ */
+export class AuthRequiredError extends Error {
+  constructor() {
+    super("Unauthorized");
+    this.name = "AuthRequiredError";
+  }
+}
+
+/**
+ * POST `body` as JSON to `path` and return the parsed JSON response.
+ *
+ * @param path Request URL, passed to `fetch` unchanged.
+ * @param body Value serialized with JSON.stringify as the request body.
+ * @returns The response body parsed as JSON, cast to T without validation.
+ * @throws AuthRequiredError on HTTP 401.
+ * @throws Error on any other non-OK status; the message is the `error` field
+ *         of the JSON response body when present, otherwise `HTTP <status>`.
+ */
+async function postJson<T>(path: string, body: unknown): Promise<T> {
+  const res = await fetch(path, {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: JSON.stringify(body),
+  });
+  if (!res.ok) {
+    if (res.status === 401) throw new AuthRequiredError();
+    let message = `HTTP ${res.status}`;
+    try {
+      // Prefer the server-provided error text over the bare status code.
+      const data = (await res.json()) as { error?: string };
+      if (data.error) message = data.error;
+    } catch {
+      // ignore parse failure, keep HTTP status message
+    }
+    throw new Error(message);
+  }
+  return res.json() as Promise<T>;
+}
+
+// GET variant of postJson — same 401 → AuthRequiredError mapping. Used by
+// getTtsProvider (a tiny config probe, no body).
+//
+// Unlike postJson, a non-OK response body is not inspected: every non-401
+// failure throws Error(`HTTP <status>`). The parsed JSON is cast to T without
+// validation.
+async function getJson<T>(path: string): Promise<T> {
+  const res = await fetch(path, { method: "GET" });
+  if (!res.ok) {
+    if (res.status === 401) throw new AuthRequiredError();
+    throw new Error(`HTTP ${res.status}`);
+  }
+  return res.json() as Promise<T>;
+}
+
+// ── FOT reduction helpers (server-fallback path only) ─────────────────
+// The server-fallback POSTs send the whole Session over the wire. Voice
+// data is bulky (~160KB/character via referenceAudioBase64) and the
+// scene-generation / vision / classify pipelines never need it — voices
+// are only consumed by /api/beat-audio, which receives them directly, not
+// via the session. So strip voices before transport.
+//
+// Returns a new Session object with a new characters array; the session
+// passed in and its character objects are left unmodified.
+function stripVoicesForTransport(session: Session): Session {
+  return {
+    ...session,
+    // Destructure voice out so the serialized payload drops the field
+    // entirely (voice is optional on Character), rather than serializing
+    // it as undefined/null. This is the ~160KB/character referenceAudioBase64
+    // we want off the wire on the server-fallback path.
+    characters: session.characters.map(({ voice: _voice, ...rest }) => rest),
+  };
+}
+
+// The server strips voice from already-known characters before responding
+// (see /api/scene stripKnownVoices and /api/insert-beat's blanket strip) to
+// save bandwidth, so only NEW characters carry voice in the response. For
+// existing characters, re-attach the voice the client already holds locally.
+//
+// Characters are matched by `name`. The result follows `remote` in order and
+// membership: entries present only in `local` are not carried over, and a
+// voice supplied by the server takes precedence over the local one.
+function mergeCharactersPreserveVoice(
+  local: Character[],
+  remote: Character[],
+): Character[] {
+  const localByName = new Map(local.map((c) => [c.name, c]));
+  return remote.map((c) => {
+    const prev = localByName.get(c.name);
+    // Unknown locally → a new character; keep the server's version untouched.
+    if (!prev) return c;
+    return { ...c, voice: c.voice ?? prev.voice };
+  });
+}
+
+// ── Unified entry points ───────────────────────────────────────────────
+// When the browser has a BYO model config in localStorage, these call the
+// client-side engine directly (talking to providers from the browser).
+// Otherwise they fall back to the server-side API routes, which read
+// environment variables — useful for Vercel deploys that already supply keys.
+
+// Probe the server's TTS provider so fetchBeatAudio can shape its request body
+// (skip the ~220KB Xiaomi reference audio when the server runs StepFun).
+//
+// BYO precedence: when the browser has a client model config (BYO mode),
+// voice synthesis always runs locally against the user's own Xiaomi key, so
+// the server provider is irrelevant — resolve to "xiaomi" right away without
+// a network round-trip. Non-BYO → GET /api/tts-provider. Errors degrade to
+// null (the caller then sends voice fields defensively and the server
+// normalizes), except a 401, which is rethrown as AuthRequiredError.
+export async function getTtsProvider(): Promise<TtsProvider> {
+  if (getClientConfig()) return "xiaomi";
+  try {
+    const data = await getJson<{ provider: TtsProvider }>("/api/tts-provider");
+    return data.provider;
+  } catch (e) {
+    // AuthRequiredError (401) propagates so the caller's handleAuthError can
+    // surface the login modal; other errors (network, 5xx) → null = unknown,
+    // and fetchBeatAudio falls back to sending everything + server normalizes.
+    if (e instanceof AuthRequiredError) throw e;
+    console.warn("[getTtsProvider] probe failed, assuming unknown:", e);
+    return null;
+  }
+}
+
+/**
+ * Start a new session.
+ *
+ * Runs the in-browser engine when a BYO model config is stored; otherwise
+ * POSTs the request unchanged to /api/start.
+ *
+ * @throws AuthRequiredError when the server route answers 401 (fallback path).
+ */
+export async function startSession(req: StartRequest): Promise<StartResponse> {
+  const config = getClientConfig();
+  if (config) {
+    return startSessionClient(config, req);
+  }
+  return postJson<StartResponse>("/api/start", req);
+}
+
+/**
+ * Request the next scene.
+ *
+ * Runs the in-browser engine when a BYO model config is stored. Otherwise
+ * POSTs to /api/scene with character voices stripped from the session, then
+ * overwrites `characters` on the response object with a merged list that
+ * re-attaches the voices already held in `req.session`.
+ *
+ * @throws AuthRequiredError when the server route answers 401 (fallback path).
+ */
+export async function requestScene(req: SceneRequest): Promise<SceneResponse> {
+  const config = getClientConfig();
+  if (config) {
+    return requestSceneClient(config, req);
+  }
+  const data = await postJson<SceneResponse>("/api/scene", {
+    ...req,
+    session: stripVoicesForTransport(req.session),
+  });
+  // Server stripped known-character voices for bandwidth — re-attach the
+  // voices we already hold so fetchBeatAudio can synth them.
+  data.characters = mergeCharactersPreserveVoice(req.session.characters, data.characters);
+  return data;
+}
+
+/**
+ * Run the vision decision step.
+ *
+ * Runs the in-browser engine when a BYO model config is stored; otherwise
+ * POSTs to /api/vision with character voices stripped from the session. The
+ * response is returned as received — no voices are merged back.
+ *
+ * @throws AuthRequiredError when the server route answers 401 (fallback path).
+ */
+export async function visionDecide(req: VisionRequest): Promise<VisionResponse> {
+  const config = getClientConfig();
+  if (config) {
+    return visionDecideClient(config, req);
+  }
+  return postJson<VisionResponse>("/api/vision", {
+    ...req,
+    session: stripVoicesForTransport(req.session),
+  });
+}
+
+/**
+ * Classify a freeform player input.
+ *
+ * Runs the in-browser engine when a BYO model config is stored; otherwise
+ * POSTs to /api/classify-freeform with character voices stripped from the
+ * session. The response is returned as received.
+ *
+ * @throws AuthRequiredError when the server route answers 401 (fallback path).
+ */
+export async function classifyFreeform(
+  req: FreeformClassifyRequest,
+): Promise<FreeformClassifyResponse> {
+  const config = getClientConfig();
+  if (config) {
+    return classifyFreeformClient(config, req);
+  }
+  return postJson<FreeformClassifyResponse>("/api/classify-freeform", {
+    ...req,
+    session: stripVoicesForTransport(req.session),
+  });
+}
+
+/**
+ * Request an inserted beat.
+ *
+ * Runs the in-browser engine when a BYO model config is stored. Otherwise
+ * POSTs to /api/insert-beat with character voices stripped from the session,
+ * then overwrites `characters` on the response object with a merged list
+ * that re-attaches the voices already held in `req.session`.
+ *
+ * @throws AuthRequiredError when the server route answers 401 (fallback path).
+ */
+export async function requestInsertBeat(
+  req: InsertBeatRequest,
+): Promise<InsertBeatResponse> {
+  const config = getClientConfig();
+  if (config) {
+    return requestInsertBeatClient(config, req);
+  }
+  const data = await postJson<InsertBeatResponse>("/api/insert-beat", {
+    ...req,
+    session: stripVoicesForTransport(req.session),
+  });
+  // /api/insert-beat strips voice from ALL characters before responding —
+  // re-attach every voice the client already holds so audio keeps working.
+  data.characters = mergeCharactersPreserveVoice(req.session.characters, data.characters);
+  return data;
+}
@@ -0,0 +1,199 @@
+// ──────────────────────────────────────────────────────────────────────
+//  Audio collection for the gallery / .infiplot share exports.
+//
+//  Walks every speaking beat across `session.history` and produces a
+//  Record keyed by `${sceneId}:${beatId}` whose values are inline
+//  data: URIs (base64). Data URIs are the only audio form that survives
+//  transport through localStorage, AES-GCM ciphertext, and a fresh
+//  browser tab — blob: URLs from /api/beat-audio are tied to the document
+//  that created them.
+//
+//  Three sources, in priority order:
+//    1. prebaked  — audio that came in through a .infiplot share file.
+//                   Already a data URI, so just copied through.
+//    2. current beatAudioMap — the play page's per-beat audio for the
+//                   scene the player is on right now. Blob URLs get
+//                   converted to data URIs; data URIs pass through.
+//    3. fresh synth — BYO client TTS (browser-direct Xiaomi/StepFun) when
+//                   a key is configured, otherwise /api/beat-audio.
+//
+//  Concurrency 4 to keep TTS providers happy when a long session has
+//  dozens of speaking beats. Errors are silently skipped — a missing beat
+//  just plays without voice; we never block the export on a TTS hiccup.
+// ──────────────────────────────────────────────────────────────────────
+
+import { provisionVoice, synthesize } from "@infiplot/tts-client";
+import type {
+  Beat,
+  Character,
+  CharacterVoice,
+  Session,
+  TtsConfig,
+} from "@infiplot/types";
+
+const CONCURRENCY = 4;
+
+/** Inputs for `collectBeatAudioForExport`. */
+export type CollectBeatAudioOptions = {
+  /** Session whose `history` scenes are walked for speaking beats and whose
+   *  `characters` are looked up by speaker name. */
+  session: Session;
+  /** Current-scene audio already loaded by the play page (keyed by bare beat id). */
+  beatAudioMap: Record<string, string>;
+  /** Scene id `beatAudioMap` belongs to (so we can promote its entries into the full key). */
+  currentSceneId: string | null;
+  /** BYO TTS config when the user supplied their own key; null for server-side TTS. */
+  byoTts: TtsConfig | null;
+  /** Cache of in-flight BYO voice provisions, keyed by character name. Reused across calls. */
+  byoVoiceCache: Map<string, Promise<CharacterVoice>>;
+  /** Audio carried in from a `.infiplot` share file (already keyed by `sceneId:beatId`). */
+  prebakedAudio?: Record<string, string>;
+  /** Progress callback (done/total). Fired after every beat (success or failure). */
+  onProgress?: (done: number, total: number) => void;
+  /** Optional abort signal: checked before each synth job is started and
+   *  forwarded to the synthesis calls. */
+  signal?: AbortSignal;
+};
+
+/** One speaking beat that still needs audio for the export. */
+type Job = {
+  /** Output key, `${scene.id}:${beat.id}`. */
+  key: string;
+  /** Scene (from `session.history`) that contains the beat. */
+  scene: Session["history"][number]["scene"];
+  /** The beat to voice; jobs are only created for beats with a speaker and a line. */
+  beat: Beat;
+};
+
+/**
+ * Builds the `${sceneId}:${beatId}` → data-URI audio map for an export.
+ *
+ * Sources are consulted in order: prebaked audio, the current scene's
+ * already-loaded audio, then fresh synthesis through a small worker pool.
+ * Per-beat failures are swallowed, so a beat that cannot be voiced is simply
+ * absent from the result.
+ *
+ * @param opts See `CollectBeatAudioOptions`.
+ * @returns A record of data URIs keyed by `${sceneId}:${beatId}`.
+ */
+export async function collectBeatAudioForExport(
+  opts: CollectBeatAudioOptions,
+): Promise<Record<string, string>> {
+  const audioByKey: Record<string, string> = {};
+
+  // Source 1: prebaked audio — only genuine data URIs are copied through.
+  for (const [key, uri] of Object.entries(opts.prebakedAudio ?? {})) {
+    if (typeof uri === "string" && uri.startsWith("data:")) audioByKey[key] = uri;
+  }
+
+  // Every speaking beat in the history that the prebaked audio did not cover.
+  const jobs: Job[] = opts.session.history.flatMap(({ scene }) =>
+    scene.beats
+      .filter((beat) => beat.speaker && beat.line)
+      .map((beat) => ({ key: `${scene.id}:${beat.id}`, scene, beat }))
+      .filter((job) => !audioByKey[job.key]),
+  );
+
+  // Source 2: reuse what the play page already synthesized for the current
+  // scene instead of paying for it again. Blob URLs only live in this
+  // document, so they are converted to base64 data URIs here.
+  if (opts.currentSceneId) {
+    const currentSceneJobs = jobs.filter(
+      (job) => job.scene.id === opts.currentSceneId,
+    );
+    for (const job of currentSceneJobs) {
+      const localUrl = opts.beatAudioMap[job.beat.id];
+      if (!localUrl) continue;
+      try {
+        audioByKey[job.key] = await urlToDataUri(localUrl);
+      } catch {
+        // ignore — the beat is picked up by the synth pass below
+      }
+    }
+  }
+
+  // Source 3: synthesize whatever is still missing.
+  const pending = jobs.filter((job) => !audioByKey[job.key]);
+  const total = jobs.length;
+  let done = total - pending.length;
+  opts.onProgress?.(done, total);
+
+  const charByName = new Map(opts.session.characters.map((c) => [c.name, c]));
+
+  // Shared index into `pending`; each worker claims the next unclaimed job.
+  let nextIndex = 0;
+  const runWorker = async (): Promise<void> => {
+    for (;;) {
+      if (nextIndex >= pending.length) return;
+      if (opts.signal?.aborted) return;
+      const job = pending[nextIndex++]!;
+      try {
+        const audio = await synthesizeBeatForExport(
+          job.beat,
+          charByName.get(job.beat.speaker!),
+          opts.byoTts,
+          opts.byoVoiceCache,
+          opts.signal,
+        );
+        if (audio) audioByKey[job.key] = audio;
+      } catch {
+        // silent — beat will play without voice
+      }
+      done++;
+      opts.onProgress?.(done, total);
+    }
+  };
+
+  const poolSize = Math.min(CONCURRENCY, Math.max(1, pending.length));
+  await Promise.all(Array.from({ length: poolSize }, () => runWorker()));
+  return audioByKey;
+}
+
+/**
+ * Synthesizes one beat's line and returns it as a base64 data URI.
+ *
+ * With a BYO config the voice is resolved (cached per character name, from
+ * the character's existing voice or provisioned from its voice description)
+ * and synthesized through the TTS client. Without one, the request goes to
+ * `/api/beat-audio` and needs an already-provisioned voice.
+ *
+ * @returns The data URI, or `null` when the beat cannot be voiced (no
+ *   speaker, no line, no usable voice, or a 204 / non-OK server response).
+ */
+async function synthesizeBeatForExport(
+  beat: Beat,
+  speaker: Character | undefined,
+  byo: TtsConfig | null,
+  voiceCache: Map<string, Promise<CharacterVoice>>,
+  signal?: AbortSignal,
+): Promise<string | null> {
+  if (!speaker || !beat.line) return null;
+
+  // Server-side path: POST the beat and the speaker's voice to the API route.
+  if (!byo) {
+    if (!speaker.voice) return null;
+    const response = await fetch("/api/beat-audio", {
+      method: "POST",
+      headers: { "Content-Type": "application/json" },
+      body: JSON.stringify({
+        beat: { id: beat.id, line: beat.line, lineDelivery: beat.lineDelivery },
+        voice: speaker.voice,
+      }),
+      signal,
+    });
+    if (response.status === 204 || !response.ok) return null;
+    const audioBlob = await response.blob();
+    return await blobToDataUri(audioBlob);
+  }
+
+  // BYO path: resolve the voice once per character name, sharing the promise.
+  let pendingVoice = voiceCache.get(speaker.name);
+  if (!pendingVoice) {
+    if (speaker.voice) {
+      pendingVoice = Promise.resolve(speaker.voice);
+    } else if (speaker.voiceDescription) {
+      pendingVoice = provisionVoice(byo, speaker.voiceDescription, speaker.name);
+    } else {
+      return null;
+    }
+    voiceCache.set(speaker.name, pendingVoice);
+  }
+
+  let voice: CharacterVoice;
+  try {
+    voice = await pendingVoice;
+  } catch {
+    // Drop the failed provision so a later beat can try again.
+    voiceCache.delete(speaker.name);
+    return null;
+  }
+
+  const { mimeType, audioBase64 } = await synthesize(
+    byo,
+    voice,
+    beat.line,
+    beat.lineDelivery,
+    signal,
+  );
+  return `data:${mimeType};base64,${audioBase64}`;
+}
+
+/**
+ * Converts an audio URL into a base64 `data:` URI.
+ *
+ * Strings that already start with `data:` are returned unchanged. Anything
+ * else is fetched and its body read into a data URI via `blobToDataUri`.
+ *
+ * @param url A `data:` URI or a fetchable URL (e.g. a `blob:` URL).
+ * @returns The audio as a data URI.
+ * @throws Error when the fetch resolves with a non-OK status, so an error
+ *   body is never embedded in an export as if it were audio. Network and
+ *   FileReader failures also reject.
+ */
+async function urlToDataUri(url: string): Promise<string> {
+  if (url.startsWith("data:")) return url;
+  const res = await fetch(url);
+  // A non-OK response carries an error payload, not audio — refuse to embed it.
+  if (!res.ok) {
+    throw new Error(`Audio fetch failed with status ${res.status}`);
+  }
+  const blob = await res.blob();
+  return await blobToDataUri(blob);
+}
+
+/**
+ * Reads a Blob into a `data:` URI using `FileReader.readAsDataURL`.
+ *
+ * Rejects if the reader reports an error or finishes with a non-string
+ * result.
+ */
+function blobToDataUri(blob: Blob): Promise<string> {
+  return new Promise<string>((resolve, reject) => {
+    const reader = new FileReader();
+    reader.onerror = () => reject(reader.error ?? new Error("FileReader failed"));
+    reader.onloadend = () => {
+      const { result } = reader;
+      if (typeof result !== "string") {
+        reject(new Error("FileReader produced non-string result"));
+        return;
+      }
+      resolve(result);
+    };
+    reader.readAsDataURL(blob);
+  });
+}
@@ -11,7 +11,7 @@ import type {
 export const STORY_SHARE_STORAGE_KEY = "infiplot:story-import";

 export type StoryShareDoc = {
-  v: 1;
+  v: 1 | 2;
  kind: "infiplot-story";
  exportedAt: number;
  current: {
@@ -19,6 +19,11 @@ export type StoryShareDoc = {
    beatId?: string;
  };
  session: Session;
+  /** Pre-synthesized per-beat audio (data:audio/...;base64,...). Keyed by
+   *  `${sceneId}:${beatId}`. v2+ only — older files just have no audio and
+   *  play silent on replay. Embedding keeps the share file self-contained
+   *  so a friend can hear the recorded voices without their own TTS key. */
+  audioByBeatId?: Record<string, string>;
 };

 type JsonRecord = Record<string, unknown>;
@@ -133,13 +138,16 @@ function sanitizeSessionForShare(session: Session): Session {
 export function createStoryShareDoc(
  session: Session,
  current: { sceneIndex: number; beatId?: string },
+  audioByBeatId?: Record<string, string>,
 ): StoryShareDoc {
+  const hasAudio = !!audioByBeatId && Object.keys(audioByBeatId).length > 0;
  return {
-    v: 1,
+    v: hasAudio ? 2 : 1,
    kind: "infiplot-story",
    exportedAt: Date.now(),
    current,
    session: sanitizeSessionForShare(session),
+    ...(hasAudio ? { audioByBeatId } : {}),
  };
 }

@@ -149,7 +157,7 @@ export function storyShareFilename(doc: StoryShareDoc): string {

 export function parseStoryShareDoc(value: unknown): StoryShareDoc {
  if (!isRecord(value)) throw new Error("这不是有效的剧情分享文件");
-  if (value.kind !== "infiplot-story" || value.v !== 1) {
+  if (value.kind !== "infiplot-story" || (value.v !== 1 && value.v !== 2)) {
    throw new Error("剧情分享文件格式不支持");
  }
  if (typeof value.exportedAt !== "number" || !Number.isFinite(value.exportedAt)) {
@@ -211,9 +219,22 @@ export function parseStoryShareDoc(value: unknown): StoryShareDoc {
    }
  }

+  let audioByBeatId: Record<string, string> | undefined;
+  if (value.audioByBeatId !== undefined) {
+    if (!isRecord(value.audioByBeatId)) {
+      throw new Error("剧情分享文件配音数据不合法");
+    }
+    const cleaned: Record<string, string> = {};
+    for (const [k, v] of Object.entries(value.audioByBeatId)) {
+      if (typeof v === "string" && v.startsWith("data:")) cleaned[k] = v;
+    }
+    if (Object.keys(cleaned).length > 0) audioByBeatId = cleaned;
+  }
+
  const doc = value as StoryShareDoc;
  return {
    ...doc,
    session: sanitizeSessionForShare(doc.session),
+    ...(audioByBeatId ? { audioByBeatId } : {}),
  };
 }
@@ -0,0 +1,11 @@
+// Prompt text asking a vision-capable model to describe ONLY the visual style
+// of an attached image. The prompt requests exactly one JSON object of the
+// form {"stylePrompt": "..."} in reply.
+export const STYLE_EXTRACTION_PROMPT = `You are a senior concept artist helping describe an image's visual style so that a text-to-image diffusion model (FLUX) can reproduce the same aesthetic on different subjects.
+
+Look at the attached image and produce a single English style-prompt string that captures ONLY its visual style — NOT its subject matter. Focus on:
+- Medium / technique (e.g., watercolor, oil painting, cel-shaded anime, 3D render, pixel art)
+- Line work and rendering (sharp ink outlines, soft shading, painterly brushstrokes, flat colors)
+- Color palette and lighting (pastel, saturated, monochrome, warm golden-hour, cool neon, high contrast)
+- Mood and atmosphere (dreamy, melancholic, cinematic, nostalgic, gritty)
+- Any recognizable artistic influence (Ghibli, Makoto Shinkai, ukiyo-e, vaporwave, cyberpunk anime, etc.)
+
+Do NOT describe the characters, objects, or scene contents. Output exactly one JSON object:
+{"stylePrompt": "<comma-separated English visual-style attributes, ~30-60 words>"}`;
@@ -0,0 +1,12 @@
+import { createBrowserClient } from "@supabase/ssr";
+
+// Module-level cache so every caller shares one browser client instance.
+let client: ReturnType<typeof createBrowserClient> | null = null;
+
+/**
+ * Returns the shared browser Supabase client, creating it on the first call
+ * from the NEXT_PUBLIC_SUPABASE_URL / NEXT_PUBLIC_SUPABASE_PUBLISHABLE_KEY
+ * environment variables.
+ */
+export function createClient() {
+  if (!client) {
+    client = createBrowserClient(
+      process.env.NEXT_PUBLIC_SUPABASE_URL!,
+      process.env.NEXT_PUBLIC_SUPABASE_PUBLISHABLE_KEY!,
+    );
+  }
+  return client;
+}
@@ -0,0 +1,3 @@
+/** True only when both Supabase public environment variables are non-empty. */
+export const AUTH_ENABLED =
+  Boolean(process.env.NEXT_PUBLIC_SUPABASE_URL) &&
+  Boolean(process.env.NEXT_PUBLIC_SUPABASE_PUBLISHABLE_KEY);
@@ -0,0 +1,15 @@
+import { NextResponse } from "next/server";
+import { AUTH_ENABLED } from "./config";
+import { createClient } from "./server";
+
+/**
+ * Resolves the caller's user id for a route handler.
+ *
+ * @returns `{ userId: "anonymous" }` when auth is disabled, `{ userId }` from
+ *   the `sub` claim when the Supabase claims lookup succeeds, or a 401 JSON
+ *   `NextResponse` when the lookup errors or carries no `sub`.
+ */
+export async function requireUser(): Promise<
+  { userId: string } | NextResponse
+> {
+  if (!AUTH_ENABLED) return { userId: "anonymous" };
+
+  const supabase = await createClient();
+  const { data, error } = await supabase.auth.getClaims();
+  const subject = data?.claims?.sub;
+  if (error || !subject) {
+    return NextResponse.json({ error: "Unauthorized" }, { status: 401 });
+  }
+  return { userId: subject };
+}
@@ -0,0 +1,26 @@
+import { createServerClient } from "@supabase/ssr";
+import { cookies } from "next/headers";
+
+/**
+ * Creates a Supabase server client whose cookie adapter reads from and writes
+ * to the Next.js request cookie store.
+ */
+export async function createClient() {
+  const cookieStore = await cookies();
+  const supabaseUrl = process.env.NEXT_PUBLIC_SUPABASE_URL!;
+  const publishableKey = process.env.NEXT_PUBLIC_SUPABASE_PUBLISHABLE_KEY!;
+
+  return createServerClient(supabaseUrl, publishableKey, {
+    cookies: {
+      getAll() {
+        return cookieStore.getAll();
+      },
+      setAll(cookiesToSet) {
+        try {
+          cookiesToSet.forEach(({ name, value, options }) => {
+            cookieStore.set(name, value, options);
+          });
+        } catch {
+          // Called from a Server Component the cookie store is read-only and
+          // throws. Safe to ignore — the proxy middleware refreshes the
+          // session on the next request.
+        }
+      },
+    },
+  });
+}
@@ -1 +1,61 @@
-export { xiaomiProvision as provisionVoice, xiaomiSynthesize as synthesize } from "./xiaomi";
+import type { CharacterVoice, TtsConfig, TtsProvider } from "@infiplot/types";
+import {
+  formatStepfunCatalogForPrompt,
+  isStepfun,
+  isValidStepfunVoiceId,
+  stepfunProvision,
+  type StepfunProvisionOptions,
+  stepfunSynthesize,
+} from "./stepfun";
+import { xiaomiProvision, xiaomiSynthesize } from "./xiaomi";
+
+// Re-export so /api/tts-provider, orchestrator, CharacterDesigner prompt, and
+// the client all share ONE provider-detection rule + ONE catalog rendering +
+// ONE validity check with the synth path.
+export { isStepfun, isValidStepfunVoiceId, formatStepfunCatalogForPrompt };
+
+/**
+ * Maps a configured TtsConfig to its provider tag: "stepfun" when `isStepfun`
+ * accepts the config, otherwise "xiaomi". Kept as the single place this rule
+ * lives so callers cannot drift apart when another provider is added.
+ *
+ * A present TtsConfig always yields a concrete provider; the `null` case (no
+ * TTS configured) is left to the caller.
+ */
+export function inferTtsProvider(cfg: TtsConfig): Exclude<TtsProvider, null> {
+  if (isStepfun(cfg)) return "stepfun";
+  return "xiaomi";
+}
+
+// `opts.stepfunVoiceId` threads the CharacterDesigner's LLM-selected preset
+// down to stepfunProvision. Xiaomi ignores it. See StepfunProvisionOptions.
+export type ProvisionVoiceOptions = StepfunProvisionOptions;
+
+/**
+ * Provisions a voice for a character with whichever provider `cfg` targets.
+ *
+ * @param cfg TTS configuration; `isStepfun(cfg)` selects the provider.
+ * @param description Voice description used to design or pick the voice.
+ * @param salt Optional per-character salt (typically the character name).
+ *   Only forwarded on the StepFun path, where it spreads similar characters
+ *   across presets; the Xiaomi call does not receive it.
+ * @param opts Extra provisioning options, forwarded only to StepFun.
+ */
+export async function provisionVoice(
+  cfg: TtsConfig,
+  description: string,
+  salt?: string,
+  opts?: ProvisionVoiceOptions,
+): Promise<CharacterVoice> {
+  if (isStepfun(cfg)) {
+    return stepfunProvision(cfg, description, salt, opts);
+  }
+  return xiaomiProvision(cfg, description);
+}
+
+/**
+ * Synthesizes `text` with the given voice.
+ *
+ * The backend is chosen from the voice's own `provider` tag rather than from
+ * the current config: a session can outlive a provider switch, and a voice
+ * has to be synthesized by the protocol that minted it. `cfg` must still
+ * point at the matching provider's endpoint.
+ */
+export async function synthesize(
+  cfg: TtsConfig,
+  voice: CharacterVoice,
+  text: string,
+  delivery?: string,
+  signal?: AbortSignal,
+): Promise<{ audioBase64: string; mimeType: string }> {
+  const backend =
+    voice.provider === "stepfun" ? stepfunSynthesize : xiaomiSynthesize;
+  return backend(cfg, voice, text, delivery, signal);
+}
@@ -0,0 +1,34 @@
+[
+  { "id": "cixingnansheng", "gender": "male", "age": "young", "tones": ["磁性", "成熟", "narrative"], "desc": "磁性成熟男声，沉稳有厚度，适合旁白/叙事/解说" },
+  { "id": "wenrounansheng", "gender": "male", "age": "young", "tones": ["温柔", "gentle", "supportive"], "desc": "温柔男声，暖系治愈，适合陪伴/安抚/暖男主" },
+  { "id": "wenrougongzi", "gender": "male", "age": "young", "tones": ["温柔", "公子", "tender"], "desc": "温柔公子型男声，清润书卷气，适合古风公子/儒雅青年" },
+  { "id": "yuanqinansheng", "gender": "male", "age": "teen", "tones": ["元气", "energetic", "阳光"], "desc": "元气阳光少年男声，明亮有活力，适合少年/阳光系男主" },
+  { "id": "zhengpaiqingnian", "gender": "male", "age": "young", "tones": ["正派", "正气", "earnest"], "desc": "正派正气青年男声，端庄坚定，适合正剧男主/英雄" },
+  { "id": "shuangkuainansheng", "gender": "male", "age": "young", "tones": ["爽快", "干脆", "brisk"], "desc": "爽快干脆男声，利落不拖沓，适合热血/爽文男主" },
+  { "id": "boyinnansheng", "gender": "male", "age": "middle", "tones": ["播音", "broadcast", "稳重"], "desc": "播音腔稳重男声，字正腔圆，适合新闻/旁白/中年男主" },
+  { "id": "ruyananshi", "gender": "male", "age": "middle", "tones": ["儒雅", "斯文", "refined"], "desc": "儒雅斯文中年男声，文气内敛，适合学者/师者/儒雅男性" },
+  { "id": "shenchennanyin", "gender": "male", "age": "middle", "tones": ["深沉", "低沉", "deep"], "desc": "深沉低沉男声，厚重磁性，适合成熟/权威/反派男主" },
+  { "id": "qingniandaxuesheng", "gender": "male", "age": "young", "tones": ["大学生", "青年", "student"], "desc": "大学生青年男声，自然清爽，适合校园男主/学生" },
+  { "id": "zixinnansheng", "gender": "male", "age": "young", "tones": ["自信", "confident"], "desc": "自信青年男声，有底气不张扬，适合精英/自信男主" },
+  { "id": "elegantgentle-female", "gender": "female", "age": "young", "tones": ["气质", "温婉", "professional"], "desc": "气质温婉女声，得体大方，适合职业女性/气质女主" },
+  { "id": "livelybreezy-female", "gender": "female", "age": "teen", "tones": ["活力", "轻快", "upbeat"], "desc": "活力轻快少女声，明快有节奏，适合元气少女" },
+  { "id": "jingdiannvsheng", "gender": "female", "age": "middle", "tones": ["经典", "classic", "成熟"], "desc": "经典成熟女声，圆润端庄，适合旁白/成熟女性" },
+  { "id": "wenroushunv", "gender": "female", "age": "middle", "tones": ["温柔", "熟女", "mature"], "desc": "温柔熟女声，成熟柔润，适合熟女/姐姐型角色" },
+  { "id": "tianmeinvsheng", "gender": "female", "age": "young", "tones": ["甜美", "sweet"], "desc": "甜美女声，甜润可爱，适合甜系女主/甜妹" },
+  { "id": "qingchunshaonv", "gender": "female", "age": "teen", "tones": ["清纯", "少女", "pure"], "desc": "清纯少女声，干净清澈，适合清纯少女/初恋感" },
+  { "id": "yuanqishaonv", "gender": "female", "age": "teen", "tones": ["元气", "少女", "活力", "energetic"], "desc": "元气活力少女声，明亮张扬，适合元气少女/活泼女主" },
+  { "id": "linjiajiejie", "gender": "female", "age": "young", "tones": ["邻家", "姐姐"], "desc": "邻家姐姐声，亲切自然，适合邻家姐姐/青梅竹马" },
+  { "id": "jilingshaonv", "gender": "female", "age": "teen", "tones": ["机灵", "灵动", "少女"], "desc": "机灵灵动少女声，俏皮跳脱，适合机灵少女/鬼马角色" },
+  { "id": "ruanmengnvsheng", "gender": "female", "age": "teen", "tones": ["软萌", "可爱", "稚嫩", "甜软"], "desc": "软萌可爱稚嫩女声，甜软奶气，适合萝莉/软萌角色" },
+  { "id": "youyanvsheng", "gender": "female", "age": "young", "tones": ["优雅", "elegant"], "desc": "优雅女声，从容矜持，适合优雅/淑女型角色" },
+  { "id": "lengyanyujie", "gender": "female", "age": "middle", "tones": ["冷艳", "御姐", "高冷"], "desc": "冷艳御姐声，高冷有气场，适合御姐/女王/高冷女主" },
+  { "id": "shuangkuaijiejie", "gender": "female", "age": "young", "tones": ["爽快", "姐姐", "干脆"], "desc": "爽快干脆姐姐声，利落飒爽，适合飒爽女主/大姐大" },
+  { "id": "wenjingxuejie", "gender": "female", "age": "young", "tones": ["文静", "学姐", "安静"], "desc": "文静学姐声，安静内敛，适合文静/学姐/内向女主" },
+  { "id": "linjiameimei", "gender": "female", "age": "teen", "tones": ["邻家", "妹妹"], "desc": "邻家妹妹声，稚气天真，适合妹妹型/天真少女" },
+  { "id": "zhixingjiejie", "gender": "female", "age": "young", "tones": ["知性", "姐姐", "聪慧"], "desc": "知性聪慧姐姐声，沉稳理性，适合知性女性/学姐" },
+  { "id": "ganliannvsheng", "gender": "female", "age": "middle", "tones": ["干练", "sharp", "professional"], "desc": "干练职业女声，利落专业，适合职场女性/女强人" },
+  { "id": "qinhenvsheng", "gender": "female", "age": "young", "tones": ["亲和", "warm", "亲切"], "desc": "亲和温暖女声，亲切易接近，适合亲和型/治愈系女主" },
+  { "id": "huolinvsheng", "gender": "female", "age": "young", "tones": ["活力", "lively", "活泼"], "desc": "活力活泼女声，热情外放，适合活泼女主/开朗角色" },
+  { "id": "qinqienvsheng", "gender": "female", "age": "middle", "tones": ["亲切", "温暖"], "desc": "亲切温暖中年女声，温厚母性，适合阿姨/母亲/温暖长辈" },
+  { "id": "wenrounvsheng", "gender": "female", "age": "young", "tones": ["温柔", "tender", "柔和"], "desc": "温柔柔和女声，轻柔不张扬，适合温柔女主/治愈系" }
+]
@@ -0,0 +1,230 @@
+import type { CharacterVoice, TtsConfig } from "@infiplot/types";
+import catalogData from "./stepfun-voices.json";
+
+// Preset voice record. The 32 presets live in stepfun-voices.json (the single
+// source of truth — shared with the CharacterDesigner prompt, /api/tts-provider
+// validity check, and the offline enrich script). gender/age are string-literal
+// unions so detectGender / detectAge scoring stays type-safe.
+export type PresetVoice = {
+  id: string;
+  gender: "male" | "female";
+  age: "teen" | "young" | "middle";
+  /** Keywords (Chinese or English) that, when present in the LLM's voice
+   *  description, boost this preset's score. Drawn from StepFun's published
+   *  voice name + recommended scenario. */
+  tones: string[];
+  /** Short Chinese persona phrase that helps the LLM (CharacterDesigner prompt /
+   *  enrich script) understand which character types each preset suits when
+   *  choosing a voice. The scorer (pickStepfunVoiceId) still uses only tones. */
+  desc: string;
+};
+
+// JSON literals widen gender/age to `string`; cast back to the discriminant
+// unions. The catalog is a build-time-checked asset (touched rarely), and
+// pickStepfunVoiceId / isValidStepfunVoiceId tolerate anything we ship, so a
+// wrong entry surfaces as a bad voice pick rather than a crash.
+const PRESET_VOICES = catalogData as unknown as PresetVoice[];
+
+// StepFun TTS uses an OpenAI-compatible /v1/audio/speech endpoint with PRESET
+// voice IDs only — there is no "design a new voice from text description"
+// equivalent to Xiaomi MiMo's voicedesign. We therefore translate the LLM's
+// Chinese voiceDescription into a preset voice ID by keyword matching
+// (gender + age + tone), with a deterministic hash-based spread across the
+// top-N candidates so multiple similar characters don't collapse onto the
+// same voice. Provision is a pure function — no network call needed.
+
+// Matches `stepfun.com` at the start of the string or right after a `.` or `/`.
+const STEPFUN_HOST_PATTERN = /(^|[./])stepfun\.com\b/i;
+
+/**
+ * Provider detection shared by every caller: a config counts as StepFun when
+ * its base URL contains a `stepfun.com` host segment (case-insensitive).
+ */
+export function isStepfun(cfg: TtsConfig): boolean {
+  return STEPFUN_HOST_PATTERN.test(cfg.baseUrl);
+}
+
+/** Base64-encodes the raw bytes of an ArrayBuffer via `btoa`. */
+function arrayBufferToBase64(buffer: ArrayBuffer): string {
+  // btoa needs a "binary string": one character per byte.
+  const binary = Array.from(new Uint8Array(buffer), (byte) =>
+    String.fromCharCode(byte),
+  ).join("");
+  return btoa(binary);
+}
+
+const OUTPUT_FORMAT = "mp3";
+const OUTPUT_MIME = "audio/mpeg";
+
+// Full catalog from StepFun's docs (32 presets across step-tts-mini /
+// step-tts-2 / stepaudio-2.5-tts). The JSON is the single source of truth —
+// shared by the scorer here, the CharacterDesigner prompt (via
+// formatStepfunCatalogForPrompt), the /api/tts-provider route's validity
+// check, and the offline enrich script. Adding more later is safe — the
+// scorer degrades gracefully when an unknown id is picked.
+// (catalogData is cast to PresetVoice[] at the import above; kept as
+// PRESET_VOICES so existing references stay unchanged.)
+
+/** All valid preset voice ids — for validation by the CharacterDesigner
+ *  (discard an out-of-catalog LLM pick) and the enrich script. */
+export const STEPFUN_PRESET_VOICE_IDS: string[] = PRESET_VOICES.map(
+  (v) => v.id,
+);
+
+const STEPFUN_ID_SET = new Set(STEPFUN_PRESET_VOICE_IDS);
+
+/**
+ * Reports whether `id` is a non-empty string present in the preset catalog.
+ * Used to discard LLM-hallucinated ids before they reach StepFun.
+ */
+export function isValidStepfunVoiceId(id: string | null | undefined): boolean {
+  if (typeof id !== "string" || id.length === 0) return false;
+  return STEPFUN_ID_SET.has(id);
+}
+
+/** Render the catalog as a prompt-friendly list (descriptions are in Chinese),
+ *  one line per preset, so the CharacterDesigner and the enrich script can ask
+ *  the LLM to pick a matching voice id. Each line has the form
+ *  `- id：desc（gender/age）` — full-width colon and parentheses; lines are
+ *  joined with "\n". */
+export function formatStepfunCatalogForPrompt(): string {
+  return PRESET_VOICES.map(
+    (v) => `- ${v.id}：${v.desc}（${v.gender}/${v.age}）`,
+  ).join("\n");
+}
+
+/**
+ * Cheap deterministic 32-bit string hash (seed 5381, multiply-by-33 per UTF-16
+ * code unit, wrapped to int32), returned as a non-negative number. Only used
+ * to spread similar descriptions across the top candidate voices.
+ */
+function hashStr(s: string): number {
+  let acc = 5381;
+  let pos = 0;
+  while (pos < s.length) {
+    // Math.imul(acc, 33) is the int32-wrapped form of (acc << 5) + acc.
+    acc = (Math.imul(acc, 33) + s.charCodeAt(pos)) | 0;
+    pos += 1;
+  }
+  return Math.abs(acc);
+}
+
+/**
+ * Guesses the speaker's gender from a Chinese voice description.
+ *
+ * Rules run in order and the first match wins: explicit female words,
+ * explicit male words, then the single-character pronouns as weak signals.
+ * Falls back to "female" when nothing matches.
+ */
+function detectGender(desc: string): "male" | "female" {
+  const rules: Array<[RegExp, "male" | "female"]> = [
+    [/女性|女声|少女|姐姐|妹妹|熟女|御姐|阿姨|奶奶|女孩|姑娘|大妈|女子|女生|女士|小姐/, "female"],
+    [/男性|男声|少年|青年|大叔|哥哥|弟弟|男人|男孩|大爷|爷爷|男子|男生|先生|公子|师傅/, "male"],
+    // Weak signals: bare pronouns are consulted last because they also occur
+    // inside unrelated compounds such as "其他".
+    [/她/, "female"],
+    [/他/, "male"],
+  ];
+  for (const [pattern, gender] of rules) {
+    if (pattern.test(desc)) return gender;
+  }
+  return "female";
+}
+
+/**
+ * Guesses an age bracket from a Chinese voice description. "middle" keywords
+ * are checked before "teen" keywords; anything else is "young".
+ */
+function detectAge(desc: string): "teen" | "young" | "middle" {
+  const middleWords = /中年|熟女|大叔|大妈|阿姨|奶奶|爷爷|老师|师傅|御姐|经理|总监|教授|博士|总裁|长辈|父亲|母亲|爸爸|妈妈/;
+  const teenWords = /少女|少年|学生|高中|初中|妹妹|弟弟|小学|童年|稚嫩|十几岁|十六|十七|十八|未成年/;
+  return middleWords.test(desc)
+    ? "middle"
+    : teenWords.test(desc)
+      ? "teen"
+      : "young";
+}
+
+/**
+ * Maps an LLM-written Chinese voice description to a StepFun preset voice id.
+ *
+ * Presets of the detected gender are scored (+4 for a matching age bracket,
+ * +2 per tone keyword found in the description), ranked by score, and one of
+ * the top three is chosen by hashing the lowercased description and salt.
+ * Pure function — no network access.
+ *
+ * @param description Voice description text; matched case-insensitively.
+ * @param salt Optional extra hash input (e.g. the character name) so that
+ *   characters sharing archetype keywords need not get the same preset.
+ * @returns The chosen preset id.
+ */
+export function pickStepfunVoiceId(description: string, salt = ""): string {
+  const desc = description.toLowerCase();
+  const wantedGender = detectGender(desc);
+  const wantedAge = detectAge(desc);
+
+  const ranked: Array<{ v: PresetVoice; score: number }> = [];
+  for (const v of PRESET_VOICES) {
+    if (v.gender !== wantedGender) continue;
+    const toneHits = v.tones.filter((tone) =>
+      desc.includes(tone.toLowerCase()),
+    ).length;
+    const ageBonus = v.age === wantedAge ? 4 : 0;
+    ranked.push({ v, score: ageBonus + 2 * toneHits });
+  }
+  ranked.sort((a, b) => b.score - a.score);
+
+  // The shipped catalog always has presets for both genders; this only guards
+  // against a future edit that prunes the table too far.
+  if (ranked.length === 0) return PRESET_VOICES[0]!.id;
+
+  // Choose among the top three (or fewer) by hash. The lowercased description
+  // is hashed so that case differences alone cannot change the pick.
+  const finalists = ranked.slice(0, 3);
+  const pick = hashStr(`${desc}|${salt.toLowerCase()}`) % finalists.length;
+  return finalists[pick]!.v.id;
+}
+
+// Provision is synchronous / no network — StepFun has no voicedesign equivalent.
+// We mirror xiaomiProvision's async signature so the router stays uniform.
+// The optional `salt` (character name) spreads two characters that share
+// archetype keywords across the top-N candidate presets.
+//
+// `opts.stepfunVoiceId` — when the CharacterDesigner already picked a preset
+// (it sees the same catalog via formatStepfunCatalogForPrompt), honor it if
+// valid; otherwise fall back to the keyword scorer. This keeps StepFun
+// provisioning a pure function (zero network cost) while lifting voice-id
+// selection quality to LLM-grade on the live path.
+export type StepfunProvisionOptions = {
+  /** LLM-selected preset id from the CharacterDesigner; validated against the
+   *  catalog and ignored when out of range (hallucination guard). */
+  stepfunVoiceId?: string;
+};
+
+/**
+ * Produces a StepFun voice record without any network call.
+ *
+ * `opts.stepfunVoiceId` is used when it passes `isValidStepfunVoiceId`;
+ * otherwise the id comes from `pickStepfunVoiceId(description, salt)`.
+ * Async only so its signature lines up with the Xiaomi provisioner.
+ */
+export async function stepfunProvision(
+  cfg: TtsConfig,
+  description: string,
+  salt?: string,
+  opts?: StepfunProvisionOptions,
+): Promise<CharacterVoice> {
+  const preselected = opts?.stepfunVoiceId;
+  const voiceId =
+    preselected && isValidStepfunVoiceId(preselected)
+      ? preselected
+      : pickStepfunVoiceId(description, salt);
+
+  return {
+    provider: "stepfun",
+    voiceId,
+    model: cfg.speechModel,
+    mimeType: OUTPUT_MIME,
+  };
+}
+
+/**
+ * Synthesizes `text` by POSTing to `${baseUrl}/audio/speech` with a preset
+ * voice id.
+ *
+ * @param cfg Supplies the base URL, API key and fallback speech model.
+ * @param voice Must be a StepFun voice; its `model` wins over `cfg.speechModel`
+ *   when non-empty.
+ * @param text The line to speak.
+ * @param _delivery Accepted for signature parity; not sent to StepFun.
+ * @param signal Optional abort signal passed to `fetch`.
+ * @returns Base64 audio plus its MIME type.
+ * @throws Error for a non-StepFun voice or a non-OK HTTP response (the
+ *   message carries the status and up to 300 characters of the body).
+ */
+export async function stepfunSynthesize(
+  cfg: TtsConfig,
+  voice: CharacterVoice,
+  text: string,
+  _delivery?: string,
+  signal?: AbortSignal,
+): Promise<{ audioBase64: string; mimeType: string }> {
+  if (voice.provider !== "stepfun") {
+    throw new Error(
+      `stepfunSynthesize received non-stepfun voice (provider="${voice.provider}")`,
+    );
+  }
+
+  // Drop one trailing slash so the joined path has no doubled separator.
+  const endpoint = `${cfg.baseUrl.replace(/\/$/, "")}/audio/speech`;
+
+  const payload = JSON.stringify({
+    model: voice.model || cfg.speechModel,
+    input: text,
+    voice: voice.voiceId,
+    response_format: OUTPUT_FORMAT,
+  });
+
+  const response = await fetch(endpoint, {
+    method: "POST",
+    headers: {
+      "Content-Type": "application/json",
+      Authorization: `Bearer ${cfg.apiKey}`,
+    },
+    body: payload,
+    signal,
+  });
+
+  if (!response.ok) {
+    const detail = await response.text();
+    throw new Error(`StepFun TTS ${response.status}: ${detail.slice(0, 300)}`);
+  }
+
+  const audioBytes = await response.arrayBuffer();
+  return { audioBase64: arrayBufferToBase64(audioBytes), mimeType: OUTPUT_MIME };
+}
@@ -79,6 +79,11 @@ export async function xiaomiSynthesize(
  delivery?: string,
  signal?: AbortSignal,
 ): Promise<{ audioBase64: string; mimeType: string }> {
+  if (voice.provider !== "xiaomi") {
+    throw new Error(
+      `xiaomiSynthesize received non-xiaomi voice (provider="${voice.provider}")`,
+    );
+  }
  const url = joinUrl(cfg.baseUrl, "/chat/completions");

  // The free-form delivery direction rides in the `user` (director) message,
@@ -160,12 +160,24 @@ export type WriterPlan = {
 //  Characters & voices (TTS)
 // ──────────────────────────────────────────────────────────────────────

-export type CharacterVoice = {
-  provider: "xiaomi";
-  /** Xiaomi MiMo design output stored as reference audio for later clones. */
-  referenceAudioBase64: string;
-  mimeType: string;
-};
+export type CharacterVoice =
+  | {
+      provider: "xiaomi";
+      /** Xiaomi MiMo design output stored as reference audio for later clones. */
+      referenceAudioBase64: string;
+      mimeType: string;
+    }
+  | {
+      provider: "stepfun";
+      /** StepFun preset voice ID (e.g. "cixingnansheng"). Selected by keyword
+       *  matching against the LLM-written voiceDescription — no network call
+       *  on provision (StepFun has no voicedesign endpoint), so this carries
+       *  only the picked preset, not a clip. */
+      voiceId: string;
+      /** TTS model used at synth time (step-tts-mini / step-tts-2 / stepaudio-2.5-tts). */
+      model: string;
+      mimeType: string;
+    };

 export type Character = {
  name: string;
@@ -196,6 +208,13 @@ export type Character = {
  basePortraitUrl?: string;
  /** Xiaomi MiMo voice reference audio. */
  voice?: CharacterVoice;
+  /** StepFun preset voice id (e.g. "cixingnansheng"). Only present on
+   *  characters designed while the server ran StepFun, OR on prebaked
+   *  homepage cards enriched with a StepFun voice id. Lets the client send a
+   *  lightweight beat-audio request (no ~220KB Xiaomi reference audio) when the
+   *  server runs StepFun, and lets the server normalize an off-provider voice
+   *  without a fresh provision. Validated against the catalog at synth time. */
+  stepfunVoiceId?: string;
 };

 /** A single beat's synthesized audio, attached to the response. */
@@ -315,19 +334,15 @@ export type VisionClassify = "insert-beat" | "change-scene";
 *   openai_compatible  text / vision / image  — OpenAI Chat Completions +
 *                      `/images/generations` (self-implemented fetch; the
 *                      default for text/vision when unset)
- *   anthropic          text / vision          — native Anthropic Messages (AI SDK)
- *   google             text / vision / image  — native Gemini (AI SDK); image
- *                      uses the Nano Banana family
- *   openai             image only             — OpenAI gpt-image via AI SDK,
- *                      unlocks reference-image editing (for text/vision use
- *                      openai_compatible, which already speaks OpenAI's format)
+ *   openai             image only             — OpenAI gpt-image via the
+ *                      official OpenAI SDK, unlocks reference-image editing
+ *                      (for text/vision use openai_compatible, which already
+ *                      speaks OpenAI's format)
 *   runware            image only             — Runware task-array protocol
 *                      (self-implemented; the default for runware.ai URLs)
 */
 export type ProviderProtocol =
  | "openai_compatible"
-  | "anthropic"
-  | "google"
  | "openai"
  | "runware";

@@ -351,6 +366,22 @@ export type TtsConfig = {
  speechModel: string;
 };

+/** Which TTS provider the server is configured for (inferred from TtsConfig's
+ *  base URL by lib/tts-client's isStepfun). Exposed to the client via the
+ *  /api/tts-provider route so the play page can send only the voice fields
+ *  the server actually needs — e.g. skip the ~220KB Xiaomi reference audio
+ *  when the server runs StepFun (saving Fast Origin Transfer bandwidth).
+ *  `null` means no server-side TTS (silent). BYO client TTS takes precedence
+ *  over this signal. */
+export type TtsProvider = "stepfun" | "xiaomi" | null;
+
+/** /api/tts-provider — lightweight GET returning the server's TTS provider so
+ *  the client can shape beat-audio request bodies accordingly (see fetchBeatAudio
+ *  in app/play/page.tsx). Response is a few dozen bytes; runs once per session. */
+export type TtsProviderResponse = {
+  provider: TtsProvider;
+};
+
 export type EngineConfig = {
  text: ProviderConfig;
  image: ProviderConfig;
@@ -359,6 +390,19 @@ export type EngineConfig = {
  tts?: TtsConfig;
  /** When true the renderer returns a placeholder PNG instead of calling the image API. */
  mockImage?: boolean;
+  /**
+   * Per-attempt hard timeout (ms) for image-generation requests. Unset → no
+   * client-side timeout (only the provider's own gateway limits apply, e.g.
+   * Runware kills tasks at ~55s with a 504).
+   */
+  imageTimeoutMs?: number;
+  /**
+   * Painter scene-paint hedge threshold (ms). When the Tier-A (referenced)
+   * paint hasn't completed after this long, a second identical request races
+   * the first and the earlier result wins. Unset/0 → hedging disabled.
+   * Derived from healthy-day Runware p95 (~14s); recommended 15000.
+   */
+  imageHedgeMs?: number;
 };

 // ──────────────────────────────────────────────────────────────────────
@@ -440,7 +484,23 @@ export type BeatAudioRequest = {
    line: string;
    lineDelivery?: string;
  };
-  voice: CharacterVoice;
+  /** The speaker's already-provisioned voice. Optional: when the server
+   *  runs a DIFFERENT provider than `voice.provider` (e.g. the client holds a
+   *  Xiaomi voice from a prebaked card but the server runs StepFun), the
+   *  client may omit `voice` and send `voiceDescription` + `stepfunVoiceId`
+   *  instead to save the ~220KB reference-audio transfer. The server then
+   *  re-provisions against its own provider before synthesizing. */
+  voice?: CharacterVoice;
+  /** Voice-design card (中文). Used by the server to re-provision when
+   *  `voice` is absent or its provider doesn't match the server's TTS. */
+  voiceDescription?: string;
+  /** Speaker name — used as the StepFun provision salt for archetype spreading
+   *  when the server falls back to pickStepfunVoiceId. */
+  characterName?: string;
+  /** Pre-selected StepFun preset id (from a live CharacterDesigner pick or a
+   *  prebaked card). Honored directly when the server runs StepFun, skipping
+   *  both the keyword scorer and a network provision. */
+  stepfunVoiceId?: string;
 };

 export type BeatAudioResponse = {
@@ -15,18 +15,18 @@
    "start": "next start",
    "lint": "next lint",
    "typecheck": "tsc --noEmit",
+    "enrich:firstacts": "node scripts/enrich-firstacts-stepfun.mjs",
    "build:cf": "opennextjs-cloudflare build",
    "preview:cf": "opennextjs-cloudflare preview",
    "deploy:cf": "opennextjs-cloudflare deploy"
  },
  "dependencies": {
-    "@ai-sdk/anthropic": "^3.0.81",
-    "@ai-sdk/google": "^3.0.80",
-    "@ai-sdk/openai": "^3.0.67",
-    "ai": "^6.0.196",
+    "@supabase/ssr": "^0.12",
+    "@supabase/supabase-js": "^2.108",
    "jsonrepair": "^3.14.0",
    "jszip": "^3.10.1",
    "next": "^16.0.0",
+    "openai": "^6.42.0",
    "react": "^19.0.0",
    "react-dom": "^19.0.0"
  },
@@ -8,18 +8,12 @@ importers:

  .:
    dependencies:
-      '@ai-sdk/anthropic':
-        specifier: ^3.0.81
-        version: 3.0.81(zod@4.4.3)
-      '@ai-sdk/google':
-        specifier: ^3.0.80
-        version: 3.0.80(zod@4.4.3)
-      '@ai-sdk/openai':
-        specifier: ^3.0.67
-        version: 3.0.67(zod@4.4.3)
-      ai:
-        specifier: ^6.0.196
-        version: 6.0.196(zod@4.4.3)
+      '@supabase/ssr':
+        specifier: ^0.12
+        version: 0.12.0(@supabase/supabase-js@2.108.1)
+      '@supabase/supabase-js':
+        specifier: ^2.108
+        version: 2.108.1
      jsonrepair:
        specifier: ^3.14.0
        version: 3.14.0
@@ -29,6 +23,9 @@ importers:
      next:
        specifier: ^16.0.0
        version: 16.2.7(@opentelemetry/api@1.9.1)(react-dom@19.2.7(react@19.2.7))(react@19.2.7)
+      openai:
+        specifier: ^6.42.0
+        version: 6.42.0(ws@8.20.1)(zod@4.4.3)
      react:
        specifier: ^19.0.0
        version: 19.2.7
@@ -69,40 +66,6 @@ importers:

 packages:

-  '@ai-sdk/anthropic@3.0.81':
-    resolution: {integrity: sha512-B1JDd9Ugq9R5AgIaW3674lhGCMMYJcPUxnrZh8fzbGojgg4QvHFRv6eZahGQAUsmGHbcf74G9bdSBDLWQGY2GA==}
-    engines: {node: '>=18'}
-    peerDependencies:
-      zod: ^3.25.76 || ^4.1.8
-
-  '@ai-sdk/gateway@3.0.124':
-    resolution: {integrity: sha512-h8CrmbSG+8X0C+M/E1M4oiDHYevqwbzAPN+uLRHS0eJaatF2MZ+juNtOHXNOjk7Bsk9mD2RjYMjJO9dFkb9I7Q==}
-    engines: {node: '>=18'}
-    peerDependencies:
-      zod: ^3.25.76 || ^4.1.8
-
-  '@ai-sdk/google@3.0.80':
-    resolution: {integrity: sha512-5ORbm/yFUPO0MEvZsxBMN0cdKw2+lwU/wVn5KN3KF8Dmk1LughuDuUohMh/7iU/XFTiyB0OvmTW/tdV/J7O9zg==}
-    engines: {node: '>=18'}
-    peerDependencies:
-      zod: ^3.25.76 || ^4.1.8
-
-  '@ai-sdk/openai@3.0.67':
-    resolution: {integrity: sha512-oAiGC9eWG7IgtdsdS74bOCnAAHarAfTJhWN9x5INwnWPekL802AvF+0I5DvLzIF1MIRmNw4N8mPSL/GUVbX9Mw==}
-    engines: {node: '>=18'}
-    peerDependencies:
-      zod: ^3.25.76 || ^4.1.8
-
-  '@ai-sdk/provider-utils@4.0.27':
-    resolution: {integrity: sha512-ubkAJ+xODouwtmN1tYlvTPphH1hPOBfZaEQe8U7skGvFAnIRs9PPpsq57bC2+Ky/MB4yzhd6YOsxTAx9sGpazw==}
-    engines: {node: '>=18'}
-    peerDependencies:
-      zod: ^3.25.76 || ^4.1.8
-
-  '@ai-sdk/provider@3.0.10':
-    resolution: {integrity: sha512-Q3BZ27qfpYqnCYGvE3vt+Qi6LGOF9R5Nmzn+9JoM1lCRsD9mYaIhfJLkSunN48nfGXJ6n+XNV0J/XVpqGQl7Dw==}
-    engines: {node: '>=18'}
-
  '@alloc/quick-lru@5.2.0':
    resolution: {integrity: sha512-UrcABB+4bUrFABwbluTIBErXwvbsU/V7TZWfmbgJfbkwiBuziS9gxdODUyuiecfdGQ85jglMW6juS3+z5TsKLw==}
    engines: {node: '>=10'}
@@ -1257,8 +1220,37 @@ packages:
  '@speed-highlight/core@1.2.15':
    resolution: {integrity: sha512-BMq1K3DsElxDWawkX6eLg9+CKJrTVGCBAWVuHXVUV2u0s2711qiChLSId6ikYPfxhdYocLNt3wWwSvDiTvFabw==}

-  '@standard-schema/spec@1.1.0':
-    resolution: {integrity: sha512-l2aFy5jALhniG5HgqrD6jXLi/rUWrKvqN/qJx6yoJsgKhblVd+iqqU4RCXavm/jPityDo5TCvKMnpjKnOriy0w==}
+  '@supabase/auth-js@2.108.1':
+    resolution: {integrity: sha512-Lle5rKU8f9LF3K5dDd8Or8mkkG+ptzRZZWKPVMm9B9UuovH65Ss2+iFnQqRsCqaGouvJEcTWyl0cj2riNrrDLQ==}
+    engines: {node: '>=20.0.0'}
+
+  '@supabase/functions-js@2.108.1':
+    resolution: {integrity: sha512-fxBRW/A4IG7ADQztVt0NaEy5ysiO1WJ2pbldsnBchrkHuyepX0Krek9qA9T4gUQBVVTCE9Ea4pdsM5hfn3nc4A==}
+    engines: {node: '>=20.0.0'}
+
+  '@supabase/phoenix@0.4.2':
+    resolution: {integrity: sha512-YSAGnmDAfuleFCVt3CeurQZAhxRfXWeZIIkwp7NhYzQ1UwW6ePSnzsFAiUm/mbCkfoCf70QQHKW/K6RKh52a4A==}
+
+  '@supabase/postgrest-js@2.108.1':
+    resolution: {integrity: sha512-9lj2MCPPMgSTaJ5y+amnhb3TWPtMFVlbDn2hmX/VV91xQU4j0AauwfMaBErHBJ+zzsSwjc0jLU+zLIZFLQzfig==}
+    engines: {node: '>=20.0.0'}
+
+  '@supabase/realtime-js@2.108.1':
+    resolution: {integrity: sha512-mHGGqOjwd1XTydcoffUqEMsbFQHUi6A3uhQ0EXr3iqzpLqItxKA9nbN6gIQxrZ7JRRnuUe/iOFPUkYV9Tdc5lg==}
+    engines: {node: '>=20.0.0'}
+
+  '@supabase/ssr@0.12.0':
+    resolution: {integrity: sha512-d9XV5XzJvzzZbeAIM7fWTCUYxQJZ2Ru6ny3dJHmHGp/LIrJ+o9FpD7N9Rf/UhhWEvHXSoDe8SI32Z2ouOdMjBg==}
+    peerDependencies:
+      '@supabase/supabase-js': ^2.108.0
+
+  '@supabase/storage-js@2.108.1':
+    resolution: {integrity: sha512-Er0SGGt85iT6ye+SSh98Az6L2CesoZJuyzEZYH2oBOAnIxa9Nn4CtwUC3veGxYggoT56X+3tVuuQeDBP8kR8sg==}
+    engines: {node: '>=20.0.0'}
+
+  '@supabase/supabase-js@2.108.1':
+    resolution: {integrity: sha512-V/1hRKLSCJ0zEL+9QFRBUtivvePfOsaAYQmC0HhFNSHC2F3xFs4jSF3YhkLmzex6E4V4FGvmBDOP72D/53NnZA==}
+    engines: {node: '>=20.0.0'}

  '@swc/helpers@0.5.15':
    resolution: {integrity: sha512-JQ5TuMi45Owi4/BIMAJBoSQoOJu12oOk/gADqlcUL9JEdHB8vyjUSsxqeNXnmXHjYKMi2WcYtezGEEhqUI/E2g==}
@@ -1283,10 +1275,6 @@ packages:
  '@types/react@19.2.16':
    resolution: {integrity: sha512-esJiCAnl0kfpNdE69f3So4WJUXy95dLZydX0KwK46riIHDzHM7O9Vtf9xCHW0PXIqvgqNrswl522kA/5yx+F4w==}

-  '@vercel/oidc@3.2.0':
-    resolution: {integrity: sha512-UycprH3T6n3jH0k44NHMa7pnFHGu/N05MjojYr+Mc6I7obkoLIJujSWwin1pCvdy/eOxrI/l3uDLQsmcrOb4ug==}
-    engines: {node: '>= 20'}
-
  abort-controller@3.0.0:
    resolution: {integrity: sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==}
    engines: {node: '>=6.5'}
@@ -1304,12 +1292,6 @@ packages:
    resolution: {integrity: sha512-kja8j7PjmncONqaTsB8fQ+wE2mSU2DJ9D4XKoJ5PFWIdRMa6SLSN1ff4mOr4jCbfRSsxR4keIiySJU0N9T5hIQ==}
    engines: {node: '>= 8.0.0'}

-  ai@6.0.196:
-    resolution: {integrity: sha512-2T45UeqKL4a11KQ14I5i1YYHOvCFrMF478E1k6PVjlQSGUvXSv4xrxIaQbUL4qgv91DADSbddwv3oR49pPAK3g==}
-    engines: {node: '>=18'}
-    peerDependencies:
-      zod: ^3.25.76 || ^4.1.8
-
  ansi-colors@4.1.3:
    resolution: {integrity: sha512-/6w/C21Pm1A7aZitlI5Ni/2J6FFQN8i1Cvz3kHABAAbw93v/NlvKdVOqz7CCWz/3iv/JplRSEEZ83XION15ovw==}
    engines: {node: '>=6'}
@@ -1618,10 +1600,6 @@ packages:
    resolution: {integrity: sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==}
    engines: {node: '>=6'}

-  eventsource-parser@3.1.0:
-    resolution: {integrity: sha512-kJezFj9YFAMLeORyi7aCLxLbD5/qWMQnoMVlVPyHIll7lgRJCc3JVln9Vgl9nwQi0YkMnhdGTMNn7CkRRAptMg==}
-    engines: {node: '>=18.0.0'}
-
  execa@5.1.1:
    resolution: {integrity: sha512-8uSpZZocAZRBAPIEINJj3Lo9HyGitllczc27Eh5YYojjMFMn8yHMDMaUHE2Jqfq05D/wucwI4JGURyXt1vchyg==}
    engines: {node: '>=10'}
@@ -1767,6 +1745,10 @@ packages:
  humanize-ms@1.2.1:
    resolution: {integrity: sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ==}

+  iceberg-js@0.8.1:
+    resolution: {integrity: sha512-1dhVQZXhcHje7798IVM+xoo/1ZdVfzOMIc8/rgVSijRK38EDqOJoGula9N/8ZI5RD8QTxNQtK/Gozpr+qUqRRA==}
+    engines: {node: '>=20.0.0'}
+
  iconv-lite@0.7.2:
    resolution: {integrity: sha512-im9DjEDQ55s9fL4EYzOAv0yMqmMBSZp6G0VvFyTMPKWxiSBHUj9NW/qqLmXUwXrrM7AvqSlTCfvqRb0cM8yYqw==}
    engines: {node: '>=0.10.0'}
@@ -1833,9 +1815,6 @@ packages:
    resolution: {integrity: sha512-/imKNG4EbWNrVjoNC/1H5/9GFy+tqjGBHCaSsN+P2RnPqjsLmv6UD3Ej+Kj8nBWaRAwyk7kK5ZUc+OEatnTR3A==}
    hasBin: true

-  json-schema@0.4.0:
-    resolution: {integrity: sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA==}
-
  jsonrepair@3.14.0:
    resolution: {integrity: sha512-tWPGKMZf/8UPim+fcW2EfcQ/d/7aKUrP6IECz9G3Tu6Q5dX0orSleqJ9z6sSw7qrQkjF8/Edo4DvsWBZ8H+HNg==}
    hasBin: true
@@ -2028,6 +2007,17 @@ packages:
    resolution: {integrity: sha512-kbpaSSGJTWdAY5KPVeMOKXSrPtr8C8C7wodJbcsd51jRnmD+GZu8Y0VoU6Dm5Z4vWr0Ig/1NKuWRKf7j5aaYSg==}
    engines: {node: '>=6'}

+  openai@6.42.0:
+    resolution: {integrity: sha512-1WFEt/uXMXOLhYRNkgJWo08Y2YNvNwpVU72K7ibrWgWpNOXd4VojXLbe6SQ4bLiUQ3Y8jz4IiyVkylJCL1DtZg==}
+    peerDependencies:
+      ws: ^8.18.0
+      zod: ^3.25 || ^4.0
+    peerDependenciesMeta:
+      ws:
+        optional: true
+      zod:
+        optional: true
+
  package-json-from-dist@1.0.1:
    resolution: {integrity: sha512-UEZIS3/by4OC8vL3P2dTXRETpebLI2NiI5vIrjaD/5UtrkFX/tNbwjTSRAGC/+7CAo2pIcBaRgWmcBBHcsaCIw==}

@@ -2495,42 +2485,6 @@ packages:

 snapshots:

-  '@ai-sdk/anthropic@3.0.81(zod@4.4.3)':
-    dependencies:
-      '@ai-sdk/provider': 3.0.10
-      '@ai-sdk/provider-utils': 4.0.27(zod@4.4.3)
-      zod: 4.4.3
-
-  '@ai-sdk/gateway@3.0.124(zod@4.4.3)':
-    dependencies:
-      '@ai-sdk/provider': 3.0.10
-      '@ai-sdk/provider-utils': 4.0.27(zod@4.4.3)
-      '@vercel/oidc': 3.2.0
-      zod: 4.4.3
-
-  '@ai-sdk/google@3.0.80(zod@4.4.3)':
-    dependencies:
-      '@ai-sdk/provider': 3.0.10
-      '@ai-sdk/provider-utils': 4.0.27(zod@4.4.3)
-      zod: 4.4.3
-
-  '@ai-sdk/openai@3.0.67(zod@4.4.3)':
-    dependencies:
-      '@ai-sdk/provider': 3.0.10
-      '@ai-sdk/provider-utils': 4.0.27(zod@4.4.3)
-      zod: 4.4.3
-
-  '@ai-sdk/provider-utils@4.0.27(zod@4.4.3)':
-    dependencies:
-      '@ai-sdk/provider': 3.0.10
-      '@standard-schema/spec': 1.1.0
-      eventsource-parser: 3.1.0
-      zod: 4.4.3
-
-  '@ai-sdk/provider@3.0.10':
-    dependencies:
-      json-schema: 0.4.0
-
  '@alloc/quick-lru@5.2.0': {}

  '@ast-grep/napi-darwin-arm64@0.40.5':
@@ -3632,7 +3586,8 @@ snapshots:
      - encoding
      - supports-color

-  '@opentelemetry/api@1.9.1': {}
+  '@opentelemetry/api@1.9.1':
+    optional: true

  '@poppinss/colors@4.1.6':
    dependencies:
@@ -3844,7 +3799,42 @@ snapshots:

  '@speed-highlight/core@1.2.15': {}

-  '@standard-schema/spec@1.1.0': {}
+  '@supabase/auth-js@2.108.1':
+    dependencies:
+      tslib: 2.8.1
+
+  '@supabase/functions-js@2.108.1':
+    dependencies:
+      tslib: 2.8.1
+
+  '@supabase/phoenix@0.4.2': {}
+
+  '@supabase/postgrest-js@2.108.1':
+    dependencies:
+      tslib: 2.8.1
+
+  '@supabase/realtime-js@2.108.1':
+    dependencies:
+      '@supabase/phoenix': 0.4.2
+      tslib: 2.8.1
+
+  '@supabase/ssr@0.12.0(@supabase/supabase-js@2.108.1)':
+    dependencies:
+      '@supabase/supabase-js': 2.108.1
+      cookie: 1.1.1
+
+  '@supabase/storage-js@2.108.1':
+    dependencies:
+      iceberg-js: 0.8.1
+      tslib: 2.8.1
+
+  '@supabase/supabase-js@2.108.1':
+    dependencies:
+      '@supabase/auth-js': 2.108.1
+      '@supabase/functions-js': 2.108.1
+      '@supabase/postgrest-js': 2.108.1
+      '@supabase/realtime-js': 2.108.1
+      '@supabase/storage-js': 2.108.1

  '@swc/helpers@0.5.15':
    dependencies:
@@ -3873,8 +3863,6 @@ snapshots:
    dependencies:
      csstype: 3.2.3

-  '@vercel/oidc@3.2.0': {}
-
  abort-controller@3.0.0:
    dependencies:
      event-target-shim: 5.0.1
@@ -3890,14 +3878,6 @@ snapshots:
    dependencies:
      humanize-ms: 1.2.1

-  ai@6.0.196(zod@4.4.3):
-    dependencies:
-      '@ai-sdk/gateway': 3.0.124(zod@4.4.3)
-      '@ai-sdk/provider': 3.0.10
-      '@ai-sdk/provider-utils': 4.0.27(zod@4.4.3)
-      '@opentelemetry/api': 1.9.1
-      zod: 4.4.3
-
  ansi-colors@4.1.3: {}

  ansi-regex@5.0.1: {}
@@ -4213,8 +4193,6 @@ snapshots:

  event-target-shim@5.0.1: {}

-  eventsource-parser@3.1.0: {}
-
  execa@5.1.1:
    dependencies:
      cross-spawn: 7.0.6
@@ -4414,6 +4392,8 @@ snapshots:
    dependencies:
      ms: 2.1.3

+  iceberg-js@0.8.1: {}
+
  iconv-lite@0.7.2:
    dependencies:
      safer-buffer: 2.1.2
@@ -4460,8 +4440,6 @@ snapshots:

  jiti@1.21.7: {}

-  json-schema@0.4.0: {}
-
  jsonrepair@3.14.0: {}

  jszip@3.10.1:
@@ -4617,6 +4595,11 @@ snapshots:
    dependencies:
      mimic-fn: 2.1.0

+  openai@6.42.0(ws@8.20.1)(zod@4.4.3):
+    optionalDependencies:
+      ws: 8.20.1
+      zod: 4.4.3
+
  package-json-from-dist@1.0.1: {}

  pako@1.0.11: {}
@@ -5132,4 +5115,5 @@ snapshots:
      cookie: 1.1.1
      youch-core: 0.3.3

-  zod@4.4.3: {}
+  zod@4.4.3:
+    optional: true
@@ -0,0 +1,31 @@
+import { type NextRequest, NextResponse } from "next/server";
+import { createServerClient } from "@supabase/ssr";
+
+export async function proxy(request: NextRequest) {
+  const supabaseUrl = process.env.NEXT_PUBLIC_SUPABASE_URL;
+  const supabaseKey = process.env.NEXT_PUBLIC_SUPABASE_PUBLISHABLE_KEY;
+  if (!supabaseUrl || !supabaseKey) return NextResponse.next(); // Supabase env not set: pass the request through untouched.
+
+  let response = NextResponse.next({ request }); // `let` because setAll below replaces it with a rebuilt response.
+  const supabase = createServerClient(supabaseUrl, supabaseKey, {
+    cookies: {
+      getAll: () => request.cookies.getAll(),
+      setAll: (cookiesToSet) => {
+        for (const { name, value } of cookiesToSet) {
+          request.cookies.set(name, value); // Update the request's cookies first...
+        }
+        response = NextResponse.next({ request }); // ...then rebuild the response from the updated request.
+        for (const { name, value, options } of cookiesToSet) {
+          response.cookies.set(name, value, options); // Also set each cookie on the outgoing response, keeping its options.
+        }
+      },
+    },
+  });
+
+  // Must await: getUser() triggers the token refresh, and the refreshed
+  // cookies are written to `response` via the setAll callback above. Returning
+  // before it resolves can drop the refreshed session cookie.
+  await supabase.auth.getUser(); // Return value is intentionally unused; the call is made for its cookie side effect.
+
+  return response;
+}
--- a/Show More
+++ b/Show More