diff --git a/.env.example b/.env.example index ae1980e..6d04fa6 100644 --- a/.env.example +++ b/.env.example @@ -3,14 +3,18 @@ # Recommended setup: Xiaomi MiMo Token Plan for TEXT / VISION / TTS # (one API key covers all three) + Runware for IMAGE (FLUX.2 [klein]). # -# TEXT / VISION use any OpenAI-compatible endpoint (any OpenAI- -# compatible host works: OpenRouter, OpenAI, Anthropic via proxy, -# Gemini, DeepSeek, Ollama, ...). +# TEXT / VISION default to any OpenAI-compatible endpoint, and can switch to +# native Anthropic or Google Gemini via TEXT_PROVIDER / VISION_PROVIDER. # TTS uses Xiaomi MiMo's own voice design / clone protocol # (not OpenAI-compatible; appends -voicedesign / -voiceclone). # -# IMAGE uses Runware's own task-array protocol (not OpenAI-compatible); -# the adapter posts an `imageInference` task to IMAGE_BASE_URL. +# IMAGE supports Runware (its own task-array protocol), OpenAI (gpt-image), +# and Google Gemini (Nano Banana) via IMAGE_PROVIDER. +# +# *_PROVIDER (optional) selects the wire protocol; leave unset for the +# OpenAI-compatible default (image is auto-detected from the URL). Base URLs +# tolerate a missing or extra /v1 (or a trailing /chat/completions) — the +# engine normalizes them. # ============================================================= # ---- 1. Text LLM · scene director ---------------------------------- @@ -26,6 +30,10 @@ TEXT_BASE_URL=https://api.deepseek.com/v1 TEXT_API_KEY=sk-xxx TEXT_MODEL=deepseek-v4-flash +# TEXT_PROVIDER: openai_compatible (default) | anthropic | google +# anthropic → TEXT_BASE_URL=https://api.anthropic.com TEXT_MODEL=claude-sonnet-4-6 +# google → TEXT_BASE_URL=https://generativelanguage.googleapis.com TEXT_MODEL=gemini-3.5-flash +# TEXT_PROVIDER=openai_compatible # ---- 2. Image generator (renders the scene background) ------------- # Recommended: Runware + FLUX.2 [klein] 9B KV — distilled 4-step model, @@ -36,12 +44,27 @@ TEXT_MODEL=deepseek-v4-flash IMAGE_BASE_URL=https://api.runware.ai/v1 IMAGE_API_KEY=runware-xxx IMAGE_MODEL=runware:400@6 +# IMAGE_PROVIDER: runware (auto-detected for runware.ai) | openai_compatible +# | openai | google +# openai → gpt-image, supports referenceImages (character/scene continuity). +# IMAGE_BASE_URL=https://api.openai.com IMAGE_MODEL=gpt-image-1 +# google → Gemini "Nano Banana" (Imagen is EOL 2026-06-24, do not use it). +# IMAGE_BASE_URL=https://generativelanguage.googleapis.com +# IMAGE_MODEL=gemini-2.5-flash-image +# NOTE: openai/google return raw bytes → inlined as a data: URI for the session +# (heavier per-call transport than Runware's UUID re-reference loop). Runware +# stays fastest + cheapest for the scene-by-scene flow. +# IMAGE_PROVIDER=runware # ---- 3. Vision model · multimodal click interpretation ------------- # Recommended: MiMo V2.5 — multimodal, accepts image_url content parts. VISION_BASE_URL=https://token-plan-sgp.xiaomimimo.com/v1 VISION_API_KEY=tp-xxx VISION_MODEL=mimo-v2.5 +# VISION_PROVIDER: openai_compatible (default) | anthropic | google +# anthropic → VISION_BASE_URL=https://api.anthropic.com VISION_MODEL=claude-sonnet-4-6 +# google → VISION_BASE_URL=https://generativelanguage.googleapis.com VISION_MODEL=gemini-3.5-flash +# VISION_PROVIDER=openai_compatible # ---- 4. TTS · Xiaomi MiMo (optional — leave blank to disable) ------ # Per-character voice design → clone, with per-line delivery direction. diff --git a/README.en.md b/README.en.md index e600b6d..93fac18 100644 --- a/README.en.md +++ b/README.en.md @@ -159,6 +159,12 @@ With the recommended trio, each scene's cost comes mainly from the image generat By default the browser fetches images directly from the provider — no setup needed; leave `NEXT_PUBLIC_IMAGE_PROXY_URL` blank and you're completely unaffected. You only want this if you hit progressive "top-to-bottom" image loading (Chrome's `ERR_QUIC_PROTOCOL_ERROR` on some networks paints partial PNGs row by row): deploy a tiny Cloudflare Worker that re-fetches images server-side and serves them atomically over HTTP/2. One-click deploy at **[infiplot-image-proxy](https://github.com/zonghaoyuan/infiplot-image-proxy)**, then paste the `workers.dev` URL it prints into `NEXT_PUBLIC_IMAGE_PROXY_URL`. +**5. Let players bring their own voice Key (optional, recommended)** + +Xiaomi rate-limits the TTS model by RPM/TPM. When a public deployment has many people playing at once through a single shared `TTS_API_KEY`, those limits are easy to hit — the symptom is **story and visuals work fine, but there's no audio**. To fix this, players can optionally enter **their own** Xiaomi MiMo key on the homepage (free to obtain). Synthesis then runs **browser-direct to Xiaomi**, the **key stays in the player's browser and never touches your server**, and they get stable voice with lower latency. It's purely additive: leave it blank and playback falls back to your server key exactly as before. + +See the [Bring-your-own voice Key guide](docs/xiaomi-tts-key.md) for how to obtain and enter one. + --- ## Roadmap diff --git a/README.ja.md b/README.ja.md index ee2fdcd..bb536ad 100644 --- a/README.ja.md +++ b/README.ja.md @@ -158,6 +158,12 @@ InfiPlot は 4 種類のモデルプロバイダと通信します。**テキス デフォルトではブラウザが画像プロバイダーに直接アクセスするため、設定は不要です —— `NEXT_PUBLIC_IMAGE_PROXY_URL` を空欄のままにすれば、まったく影響ありません。画像が「上から順に」表示される現象(一部のネットワークで Chrome の `ERR_QUIC_PROTOCOL_ERROR` により PNG が行ごとに描画される)に遭遇した場合のみ必要です。小さな Cloudflare Worker をデプロイすると、画像をサーバー側で再取得し HTTP/2 で一括返却します。ワンクリックデプロイは **[infiplot-image-proxy](https://github.com/zonghaoyuan/infiplot-image-proxy)** を参照し、出力された `workers.dev` の URL を `NEXT_PUBLIC_IMAGE_PROXY_URL` に設定してください。 +**5. プレイヤー自身の音声 Key(任意・推奨)** + +Xiaomi は TTS モデルに RPM/TPM 制限を設けています。公開デプロイで多数のプレイヤーが単一の `TTS_API_KEY` を共有して同時にプレイすると、この制限に達しやすく、**ストーリーも画像も正常なのに音声だけ出ない**という症状になります。対策として、プレイヤーはトップページで**自分の** Xiaomi MiMo Key(無料で取得可)を任意で入力できます。合成は**ブラウザから Xiaomi へ直接**行われ、**Key はプレイヤーのブラウザ内にのみ保存され、あなたのサーバーを一切経由しません**。これにより安定した音声と低遅延が得られます。完全な追加機能であり、未入力ならこれまで通りサーバー側の Key にフォールバックします。 + +取得・入力の手順は [音声 Key 持ち込みガイド](docs/xiaomi-tts-key.md) を参照してください。 + --- ## Roadmap diff --git a/README.md b/README.md index f597861..edafdea 100644 --- a/README.md +++ b/README.md @@ -125,7 +125,7 @@ InfiPlot 同时支持部署到 Vercel 与 Cloudflare Workers。Cloudflare 部署 ## 配置教程 -InfiPlot 会与四类模型供应商通信。**文本(Text)和视觉(Vision)都使用 OpenAI 兼容的接口**,可以自由搭配。**图像(Image)**目前接入 **Runware**(其自有的 task-array 协议,并非 OpenAI 兼容)。**语音(TTS)**使用**小米 MiMo** 自有的音色设计/克隆协议——支持角色级音色设计、克隆与逐行演绎指导。 +InfiPlot 会与四类模型供应商通信。**文本(Text)和视觉(Vision)** 默认使用 OpenAI 兼容接口,也可原生切换到 **Anthropic** 或 **Google Gemini**。**图像(Image)** 支持 **Runware**(其自有 task-array 协议)、**OpenAI**(`gpt-image`)与 **Google Gemini**(Nano Banana)。**语音(TTS)**使用**小米 MiMo** 自有的音色设计/克隆协议——支持角色级音色设计、克隆与逐行演绎指导。 **1. 选择你的供应商** @@ -136,6 +136,18 @@ InfiPlot 会与四类模型供应商通信。**文本(Text)和视觉(Visio | Vision · 点击解读 | `VISION_BASE_URL` `VISION_API_KEY` `VISION_MODEL` | ✅ | Google 的 `gemini-3.5-flash` | | TTS · 角色配音 | `TTS_BASE_URL` `TTS_API_KEY` `TTS_SPEECH_MODEL` | 可选 —— 留空则静音运行 | 小米 MiMo 的 `mimo-v2.5-tts` | +> **可选 · 指定接口协议**:每类模型都可加一个 `*_PROVIDER` 变量(`TEXT_PROVIDER` / `VISION_PROVIDER` / `IMAGE_PROVIDER`)显式选择接口协议。**不设则保持向后兼容**——文本/视觉默认走 OpenAI 兼容接口,图像按 `*_BASE_URL` 自动判断(`runware.ai` → Runware,否则 OpenAI 兼容;个别在 `runware.ai` 上以 OpenAI 协议提供的模型——如 `image-2-vip`——会按 OpenAI 兼容处理,需要时用 `IMAGE_PROVIDER` 显式覆盖即可)。 +> +> | 取值 | 适用 | 说明 | +> |---|---|---| +> | `openai_compatible`(默认) | Text · Vision · Image | OpenAI Chat Completions / `/images/generations` | +> | `anthropic` | Text · Vision | 原生 Anthropic Messages 接口 | +> | `google` | Text · Vision · Image | 原生 Gemini;图像用 Nano Banana 系(如 `gemini-2.5-flash-image`,**勿用 Imagen(已废弃,2026-06-24 停服)**) | +> | `openai` | Image | OpenAI `gpt-image`,支持参考图编辑 | +> | `runware` | Image | Runware task-array 协议 | +> +> 此外,`*_BASE_URL` 带不带 `/v1`(甚至末尾多写了 `/chat/completions`)都能正常工作——引擎会自动规范化。 + **2. 填写环境变量** 九个变量为必填;TTS 可选(留空则静音运行)。此外还有一个用于低成本测试的开关: @@ -158,6 +170,12 @@ InfiPlot 会与四类模型供应商通信。**文本(Text)和视觉(Visio 默认浏览器直连图片供应商,无需任何配置 —— 留空 `NEXT_PUBLIC_IMAGE_PROXY_URL` 即可,完全不受影响。只有当你遇到图片「层层加载」(Chrome 在某些网络下 `ERR_QUIC_PROTOCOL_ERROR` 导致 PNG 逐行渲染)时才需要它:部署一个极小的 Cloudflare Worker,把图片改为服务端转发 + HTTP/2 原子返回。一键部署见 **[infiplot-image-proxy](https://github.com/zonghaoyuan/infiplot-image-proxy)**,然后把它给出的 `workers.dev` 地址填进 `NEXT_PUBLIC_IMAGE_PROXY_URL`。 +**5. 玩家自带配音 Key(可选,推荐)** + +小米对 TTS 模型有 RPM/TPM 限额。当你的公共部署有多人同时游玩、共用同一把 `TTS_API_KEY` 时,很容易撞到限额,表现为**剧情、画面都正常,唯独没有声音**。为此,玩家可以在首页可选地填入**自己的**小米 MiMo Key(免费申请)——配音请求由**浏览器直连小米**完成,**Key 只存在玩家本地、绝不经过你的服务器**,从而获得稳定配音与更低延迟。这是纯增强:不填则照常使用你部署的服务器 Key,行为不变。 + +申请与填写步骤见 [自带配音 Key 教程](docs/xiaomi-tts-key.md)。 + --- ## Roadmap diff --git a/app/api/beat-audio/route.ts b/app/api/beat-audio/route.ts index bc84417..7815d9b 100644 --- a/app/api/beat-audio/route.ts +++ b/app/api/beat-audio/route.ts @@ -4,9 +4,6 @@ import { NextResponse } from "next/server"; import { loadEngineConfig } from "@/lib/config"; export const runtime = "nodejs"; -// The synth itself has a 15s per-call ceiling in the engine. 30s here just -// covers JSON parsing + outbound network buffer. -export const maxDuration = 30; export async function POST(req: Request) { let body: BeatAudioRequest; @@ -26,7 +23,11 @@ export async function POST(req: Request) { try { const config = loadEngineConfig(); const result = await requestBeatAudio(config, body); - return NextResponse.json(result); + if (!result.audio) return new Response(null, { status: 204 }); + const binary = Buffer.from(result.audio.base64, "base64"); + return new Response(binary, { + headers: { "Content-Type": result.audio.mime }, + }); } catch (err) { // Engine already swallows synth errors and returns audio:null. Anything // that reaches here is config-level — surface so the client can log it. diff --git a/app/api/insert-beat/route.ts b/app/api/insert-beat/route.ts index 467392c..820b514 100644 --- a/app/api/insert-beat/route.ts +++ b/app/api/insert-beat/route.ts @@ -4,7 +4,6 @@ import { NextResponse } from "next/server"; import { loadEngineConfig } from "@/lib/config"; export const runtime = "nodejs"; -export const maxDuration = 60; export async function POST(req: Request) { let body: InsertBeatRequest; @@ -22,9 +21,14 @@ export async function POST(req: Request) { } try { - const config = loadEngineConfig(); + const base = loadEngineConfig(); + // See StartRequest.clientTts — BYO clients synth in-browser, so drop server TTS. + const config = body.clientTts === true ? { ...base, tts: undefined } : base; const result = await requestInsertBeat(config, body); - return NextResponse.json(result); + return NextResponse.json({ + ...result, + characters: result.characters.map((c) => ({ ...c, voice: undefined })), + }); } catch (err) { const message = err instanceof Error ? err.message : "Unknown error"; return NextResponse.json({ error: message }, { status: 500 }); diff --git a/app/api/parse-style-image/route.ts b/app/api/parse-style-image/route.ts index 02d165e..6fb2d4d 100644 --- a/app/api/parse-style-image/route.ts +++ b/app/api/parse-style-image/route.ts @@ -7,7 +7,6 @@ import { NextResponse } from "next/server"; import { loadEngineConfig } from "@/lib/config"; export const runtime = "nodejs"; -export const maxDuration = 60; // Same rationale as /api/vision: the client resizes to 512px max-dim webp // (~30-80KB base64 typical) before upload, so 3 MB is generous headroom diff --git a/app/api/scene/route.ts b/app/api/scene/route.ts index 2fc432f..ed25b0f 100644 --- a/app/api/scene/route.ts +++ b/app/api/scene/route.ts @@ -1,14 +1,18 @@ import { requestScene } from "@infiplot/engine"; -import type { SceneRequest } from "@infiplot/types"; +import type { Character, SceneRequest } from "@infiplot/types"; import { NextResponse } from "next/server"; import { loadEngineConfig } from "@/lib/config"; +function stripKnownVoices( + characters: Character[], + knownNames: Set, +): Character[] { + return characters.map((c) => + knownNames.has(c.name) ? { ...c, voice: undefined } : c, + ); +} + export const runtime = "nodejs"; -// Capped at 60 for Vercel Hobby (300 allowed on Pro). The scene pipeline is -// Writer + CharDesigner×N + Cinematographer + Painter — happy path 9–12s; the -// tail (cold provider, multiple new characters) can push 30–45s, so 60 is a -// reasonable headroom on Hobby. -export const maxDuration = 60; export async function POST(req: Request) { let body: SceneRequest; @@ -23,9 +27,17 @@ export async function POST(req: Request) { } try { - const config = loadEngineConfig(); + const base = loadEngineConfig(); + // See StartRequest.clientTts — BYO clients synth in-browser, so drop server TTS. + const config = body.clientTts === true ? { ...base, tts: undefined } : base; const result = await requestScene(config, body); - return NextResponse.json(result); + const knownNames = new Set( + (body.session.characters ?? []).map((c) => c.name), + ); + return NextResponse.json({ + ...result, + characters: stripKnownVoices(result.characters, knownNames), + }); } catch (err) { const message = err instanceof Error ? err.message : "Unknown error"; return NextResponse.json({ error: message }, { status: 500 }); diff --git a/app/api/start/route.ts b/app/api/start/route.ts index ecd5312..3ce3169 100644 --- a/app/api/start/route.ts +++ b/app/api/start/route.ts @@ -4,7 +4,6 @@ import { NextResponse } from "next/server"; import { loadEngineConfig } from "@/lib/config"; export const runtime = "nodejs"; -export const maxDuration = 60; // Matches /api/vision and /api/parse-style-image — the user's resized 512px // webp is ~30-80 KB; this caps pathological direct-API payloads (which would @@ -41,7 +40,11 @@ export async function POST(req: Request) { } try { - const config = loadEngineConfig(); + const base = loadEngineConfig(); + // BYO key: the browser provisions + synths voices directly against Xiaomi + // (key never reaches us), so strip server-side TTS so the engine skips all + // provisioning + synth. See StartRequest.clientTts. + const config = body.clientTts === true ? { ...base, tts: undefined } : base; const result = await startSession(config, body); return NextResponse.json(result); } catch (err) { diff --git a/app/api/vision/route.ts b/app/api/vision/route.ts index 6f294df..3bd2d59 100644 --- a/app/api/vision/route.ts +++ b/app/api/vision/route.ts @@ -4,7 +4,6 @@ import { NextResponse } from "next/server"; import { loadEngineConfig } from "@/lib/config"; export const runtime = "nodejs"; -export const maxDuration = 60; // Browser annotator resizes to 768 wide → typically 200-800 KB base64. // 3 MB caps abusive direct-API payloads (which would inflate upstream diff --git a/app/layout.tsx b/app/layout.tsx index ec8a719..f76e561 100644 --- a/app/layout.tsx +++ b/app/layout.tsx @@ -1,4 +1,4 @@ -import type { Metadata } from "next"; +import type { Metadata, Viewport } from "next"; import { Cormorant_Garamond, Inter } from "next/font/google"; import { Analytics } from "@/components/Analytics"; import "./globals.css"; @@ -25,6 +25,15 @@ export const metadata: Metadata = { description: "InfiPlot 是一款用 AI 实时生成图片、语音与剧情分支的交互式剧情游戏 Demo。", }; +// viewportFit:cover lets the immersive /play portrait layout extend under the +// iOS notch / home-indicator and exposes env(safe-area-inset-*) to the +// floating controls. device-width + initialScale keep mobile rendering 1:1. +export const viewport: Viewport = { + width: "device-width", + initialScale: 1, + viewportFit: "cover", +}; + export default function RootLayout({ children, }: { diff --git a/app/page.tsx b/app/page.tsx index e39ab0d..10e2c58 100644 --- a/app/page.tsx +++ b/app/page.tsx @@ -10,14 +10,8 @@ import { PLOT_STYLES, type Gender, } from "@/lib/options"; - -/* ============================================================================ - InfiPlot · 首页(编辑式视觉风格 · 居中构图,呼应低保真原型) - - 顶部 Header:左上角衬线 wordmark logo -"use client"; - -import { useRouter } from "next/navigation"; -import { useEffect, useRef, useState } from "react"; +import { readStoredTtsConfig } from "@/lib/clientTtsConfig"; +import { TtsKeyModal } from "@/components/TtsKeyModal"; /* ============================================================================ InfiPlot · 首页(编辑式视觉风格 · 居中构图,呼应低保真原型) @@ -1394,7 +1388,12 @@ export default function HomePage() { // 顶部使用提示:默认展示,用户可点 × 永久关闭(localStorage:infiplot:hintClosed)。 const [hintClosed, setHintClosed] = useState(false); + // 自带 TTS Key 弹窗:可选增强,Key 只存浏览器、绝不经过服务器。 + const [ttsOpen, setTtsOpen] = useState(false); + const [ttsConfigured, setTtsConfigured] = useState(false); + const styleRow = OPTS.findIndex((o) => o.modal); + const voiceRow = OPTS.findIndex((o) => o.label === "语音配音"); const genderIndex = sel[0] ?? 0; const gender = (OPTS[0]!.items[genderIndex] as Gender) ?? "男性向"; const phrases = EXAMPLE_PHRASES[gender]; @@ -1436,6 +1435,11 @@ export default function HomePage() { } }, []); + // 启动时回填「已启用」徽标——读 localStorage 判断用户是否已存过 Key。 + useEffect(() => { + setTtsConfigured(readStoredTtsConfig() != null); + }, []); + // 输入框随内容自动增高:长文本整段可见(打字与点卡片填入都覆盖)。 useEffect(() => { const el = inputRef.current; @@ -1661,6 +1665,30 @@ export default function HomePage() { ))} + {/* 自带 TTS Key 入口:公共语音模型有 RPM/TPM 限额,高并发易静音; + 填自己的小米 MiMo Key(免费)→ 稳定配音、延迟更低,且 Key 只存本地。 */} +
+ +
+ {/* 使用提示:可被用户永久关闭(localStorage:infiplot:hintClosed) */} {!hintClosed && (
@@ -1826,6 +1854,21 @@ export default function HomePage() { setCustomStyleRefImage={setCustomStyleRefImage} /> )} + {ttsOpen && ( + setTtsOpen(false)} + onSaved={(configured) => { + setTtsConfigured(configured); + // 启用自带 Key 时顺手把「语音配音」拨到「开启」——否则用户配了 Key + // 却还是静音,体验自相矛盾。停用时不动其选择,尊重用户原本的偏好。 + if (configured && voiceRow >= 0) { + const onIdx = OPTS[voiceRow]!.items.indexOf("开启"); + if (onIdx >= 0) + setSel((s) => s.map((v, j) => (j === voiceRow ? onIdx : v))); + } + }} + /> + )}
); } diff --git a/app/play/page.tsx b/app/play/page.tsx index e5a62fa..528da77 100644 --- a/app/play/page.tsx +++ b/app/play/page.tsx @@ -6,30 +6,87 @@ import { Suspense, useCallback, useEffect, + useLayoutEffect, useMemo, useRef, useState, } from "react"; import { PlayCanvas, type Phase } from "@/components/PlayCanvas"; +import { TtsKeyModal } from "@/components/TtsKeyModal"; import { annotateClick } from "@/lib/annotateClient"; +import { loadClientTtsConfig } from "@/lib/clientTtsConfig"; import { PRESETS } from "@/lib/presets"; +import { provisionVoice, synthesize } from "@infiplot/tts-client"; import type { Beat, - BeatAudio, - BeatAudioResponse, BeatChoice, + Character, + CharacterVoice, InsertBeatResponse, + Orientation, Scene, SceneExit, SceneResponse, Session, StartResponse, + TtsConfig, VisionResponse, } from "@infiplot/types"; import { track } from "@/lib/analytics"; const MUTED_STORAGE_KEY = "infiplot:muted"; +// ── FOT reduction helpers ────────────────────────────────────────────── +// Strip bulky voice.referenceAudioBase64 from the session before sending it to +// the server. The engine only needs character names + visualDescriptions for +// scene generation; voice data is only used by /api/beat-audio (which receives +// the voice directly, not via session). The client retains voices locally and +// re-merges them from the response via mergeCharactersPreserveVoice. +function stripVoicesForTransport(session: Session): Session { + return { + ...session, + characters: session.characters.map((c) => ({ ...c, voice: undefined })), + }; +} + +// Merge server-returned characters with locally-held voices. The server strips +// voice from already-known characters (P0), so only NEW characters carry voice. +// For existing characters, re-attach the voice the client already holds. +function mergeCharactersPreserveVoice( + local: Character[], + remote: Character[], +): Character[] { + const localByName = new Map(local.map((c) => [c.name, c])); + return remote.map((c) => { + const prev = localByName.get(c.name); + if (!prev) return c; + return { ...c, voice: c.voice ?? prev.voice }; + }); +} + +// Consecutive silent (no-audio) beats before we surface the BYO-key nudge to a +// non-BYO, unmuted player. Set high enough that one transient miss won't trip +// it, low enough to catch a scene that's clearly being rate-limited. +const SILENCE_NUDGE_THRESHOLD = 3; + +// Mobile-portrait users get a 9:16 scene image painted for them; everyone else +// (desktop, tablet, mobile-landscape) keeps the 16:9 landscape image. Only a +// touch device (coarse pointer) held upright counts as "portrait" — a mouse +// device is always landscape. Detected once and locked for the whole session. +function detectOrientation(): Orientation { + if (typeof window === "undefined") return "landscape"; + const portrait = window.matchMedia("(orientation: portrait)").matches; + const coarse = window.matchMedia("(pointer: coarse)").matches; + return portrait && coarse ? "portrait" : "landscape"; +} + +// Runs before the browser paints (so it can correct first-frame state without a +// visible flash), but useLayoutEffect warns when called during SSR. PlayInner +// only ever renders on the client (/play prerenders the Suspense fallback), yet +// fall back to useEffect on the server anyway to keep the warning out. +const useIsomorphicLayoutEffect = + typeof window !== "undefined" ? useLayoutEffect : useEffect; + // Cap how long we wait for the browser to download + decode a scene image // before giving up and rendering anyway. Runware's CDN is usually <2s for a // 1792×1024 PNG, but over slow links / VPN / strict corp networks the same @@ -257,6 +314,7 @@ function prefetchScenePath( baseSession: Session, steps: ScenePathStep[], depth: number, + clientTts: boolean, ): void { if (depth >= PREFETCH_MAX_DEPTH) return; const key = pathKey(steps); @@ -267,8 +325,10 @@ function prefetchScenePath( const promise = (async () => { const res = await fetch("/api/scene", { method: "POST", - headers: { "Content-Type": "application/json" }, - body: JSON.stringify({ session: specSession }), + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify({ session: stripVoicesForTransport(specSession), clientTts }), signal: abort.signal, }); if (!res.ok) { @@ -283,6 +343,12 @@ function prefetchScenePath( // transition path awaits the same cached promise via getOrCreateBlobUrl. void getOrCreateBlobUrl(data.imageUrl); + // Re-attach locally-held voices the server stripped from known characters. + data.characters = mergeCharactersPreserveVoice( + baseSession.characters, + data.characters, + ); + // Recursive: if the resulting scene has exactly one change-scene exit, // it is a must-pass node — prefetch its child too. if (depth + 1 < PREFETCH_MAX_DEPTH) { @@ -307,7 +373,13 @@ function prefetchScenePath( characters: data.characters, storyState: data.storyState, }; - prefetchScenePath(pool, carriedBase, [...steps, nextStep], depth + 1); + prefetchScenePath( + pool, + carriedBase, + [...steps, nextStep], + depth + 1, + clientTts, + ); } } @@ -342,6 +414,44 @@ function clearPool(pool: Map): void { pool.clear(); } +// ────────────────────────────────────────────────────────────────────── +// BYO voice resolution (client-direct Xiaomi TTS). +// +// In BYO mode the server skips all TTS (clientTts:true), so the browser must +// obtain each speaker's reference audio itself. `cache` is keyed by character +// NAME and persists for the whole session, so a voice locked in on a +// character's first speaking beat stays identical across every later scene — +// even though /api/scene returns its characters without `.voice`. Storing the +// in-flight Promise (not the resolved value) dedupes the burst of concurrent +// beats by the same speaker into ONE voicedesign call, which matters because +// Xiaomi rate-limits voicedesign hard. +// ────────────────────────────────────────────────────────────────────── + +async function resolveByoVoice( + cache: Map>, + cfg: TtsConfig, + speaker: Character, +): Promise { + const cached = cache.get(speaker.name); + if (cached) return cached; + // Prebaked cards ship baked reference audio — reuse it directly (cross-key + // synth with the user's key works), keeping the prebaked voice identical. + if (speaker.voice) { + const ready = Promise.resolve(speaker.voice); + cache.set(speaker.name, ready); + return ready; + } + if (!speaker.voiceDescription) return null; + const p = provisionVoice(cfg, speaker.voiceDescription); + cache.set(speaker.name, p); + try { + return await p; + } catch (e) { + cache.delete(speaker.name); // failed provision — let a later beat retry + throw e; + } +} + // ────────────────────────────────────────────────────────────────────── // Component // ────────────────────────────────────────────────────────────────────── @@ -355,7 +465,7 @@ function PlayInner() { const [currentScene, setCurrentScene] = useState(null); const [currentBeatId, setCurrentBeatId] = useState(null); const [imageUrl, setImageUrl] = useState(null); - const [beatAudioMap, setBeatAudioMap] = useState>({}); + const [beatAudioMap, setBeatAudioMap] = useState>({}); // Lazy-initialize 优先级:本局选择(homepage 的「语音配音」存到 sessionStorage:infiplot:custom) // > 上次会话的粘性偏好(localStorage:infiplot:muted) > 默认非静音。 // 这样首页选了「关闭」开始游戏,进来就是静音;选「开启」就不是静音;进入 play 页后用户自己 @@ -381,7 +491,20 @@ function PlayInner() { } | null>(null); const [error, setError] = useState(null); const [presentation, setPresentation] = useState(false); + // Session-locked image orientation (see detectOrientation). "portrait" makes + // the whole play surface render full-bleed vertical on phones. + const [orientation, setOrientation] = useState("landscape"); const [lastExitLabel, setLastExitLabel] = useState(null); + // Consecutive server-side TTS misses (null audio / failed /api/beat-audio). + // Climbs when the shared server key is rate-limited by MiMo — the exact pain + // BYO fixes — so the play page can nudge non-BYO users to add their own key. + // Reset to 0 on any successful synth. Only the server path touches it. + const [silenceStrikes, setSilenceStrikes] = useState(0); + // Once the player dismisses the silence nudge, keep it gone for this session. + const [nudgeDismissed, setNudgeDismissed] = useState(false); + // The in-place BYO-key modal, opened from the silence nudge so the player can + // add a key without leaving the play page. + const [ttsModalOpen, setTtsModalOpen] = useState(false); const startedRef = useRef(false); const poolRef = useRef>(new Map()); @@ -396,6 +519,21 @@ function PlayInner() { // 不再单独维护 audioEnabledRef —— 单一来源避免两个 flag 漂移。 const mutedRef = useRef(muted); + // Resolved bring-your-own Xiaomi TTS config (region preset + key), read once + // from localStorage. When non-null, the browser provisions + synths voices + // directly against Xiaomi — the key never touches our server — and every + // start/scene/insert-beat request carries clientTts:true so the engine skips + // server-side TTS. null = user hasn't opted in (server default / silent). + const [byoTtsConfig, setByoTtsConfig] = useState(() => + loadClientTtsConfig(), + ); + const byoTtsRef = useRef(byoTtsConfig); + // BYO voice cache (see resolveByoVoice). Keyed by character name; persists + // across scenes so each speaker is provisioned at most once per session. + const provisionedVoicesRef = useRef>>( + new Map(), + ); + // Mirrors for use inside async handlers (closure-stable) const sessionRef = useRef(null); const currentSceneRef = useRef(null); @@ -411,9 +549,7 @@ function PlayInner() { return currentScene.beats.find((b) => b.id === currentBeatId) ?? null; }, [currentScene, currentBeatId]); - const currentBeatAudio = currentBeat ? beatAudioMap[currentBeat.id] : undefined; - const audioBase64 = currentBeatAudio?.base64 ?? null; - const audioMime = currentBeatAudio?.mime ?? null; + const audioSrc = (currentBeat ? beatAudioMap[currentBeat.id] : undefined) ?? null; useEffect(() => { sessionRef.current = session; @@ -476,31 +612,73 @@ function PlayInner() { // 「首页选关闭」也走这条路:bootstrap 时 muted 已被初始化为 true。 if (!beat.speaker || !beat.line) return; const speaker = sess.characters.find((c) => c.name === beat.speaker); - if (!speaker?.voice) return; // not yet provisioned — server can't synth anyway + if (!speaker) return; + + const byo = byoTtsRef.current; + // Non-BYO relies on the server having provisioned speaker.voice. BYO + // skipped server TTS, so it needs a baked voice (prebaked card) or a + // voiceDescription to provision from in the browser. + if (!byo && !speaker.voice) return; + if (byo && !speaker.voice && !speaker.voiceDescription) return; + if (beatAudioAbortRef.current.has(beat.id)) return; const abort = new AbortController(); beatAudioAbortRef.current.set(beat.id, abort); try { - const res = await fetch("/api/beat-audio", { - method: "POST", - headers: { "Content-Type": "application/json" }, - body: JSON.stringify({ - beat: { id: beat.id, line: beat.line, lineDelivery: beat.lineDelivery }, - voice: speaker.voice, - }), - signal: abort.signal, - }); - if (!res.ok) return; - const json = (await res.json()) as BeatAudioResponse; - // Skip the state write if we've been aborted between the .ok check and + let audioUrl: string | null = null; + if (byo) { + // Client-direct: provision (once per speaker, cached) + synth against + // Xiaomi with the user's own key — no /api/beat-audio round-trip and + // the key never touches our server. + const voice = await resolveByoVoice( + provisionedVoicesRef.current, + byo, + speaker, + ); + if (!voice || abort.signal.aborted) return; + const out = await synthesize( + byo, + voice, + beat.line, + beat.lineDelivery, + abort.signal, + ); + audioUrl = `data:${out.mimeType};base64,${out.audioBase64}`; + } else { + const res = await fetch("/api/beat-audio", { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify({ + beat: { id: beat.id, line: beat.line, lineDelivery: beat.lineDelivery }, + voice: speaker.voice, + }), + signal: abort.signal, + }); + if (res.status === 204) { + setSilenceStrikes((n) => Math.min(n + 1, 99)); + return; + } + if (!res.ok) { + setSilenceStrikes((n) => Math.min(n + 1, 99)); + return; + } + const blob = await res.blob(); + audioUrl = URL.createObjectURL(blob); + setSilenceStrikes(0); + } + // Skip the state write if we've been aborted between the await and // here — beat ids are scene-local, so a late arrival from a prior // scene would otherwise overwrite the current scene's audio under the // same id. - if (json.audio && !abort.signal.aborted) { - setBeatAudioMap((m) => ({ ...m, [beat.id]: json.audio as BeatAudio })); + if (audioUrl && !abort.signal.aborted) { + setBeatAudioMap((m) => ({ ...m, [beat.id]: audioUrl })); + } else if (audioUrl?.startsWith("blob:")) { + URL.revokeObjectURL(audioUrl); } } catch { - // aborted or network error — silent fallback + // aborted / network / Xiaomi rate-limit — silent fallback (no audio) } finally { // Only clear the slot if it's still ours. An aborted prior fetch // running its finally late could otherwise delete the controller of a @@ -536,7 +714,12 @@ function PlayInner() { // scenes) so a late arrival would land under the wrong beat otherwise. useEffect(() => { cancelBeatAudioFetches(); - setBeatAudioMap({}); + setBeatAudioMap((prev) => { + for (const url of Object.values(prev)) { + if (url.startsWith("blob:")) URL.revokeObjectURL(url); + } + return {}; + }); prefetchSceneAudio(); }, [currentScene?.id, prefetchSceneAudio]); @@ -571,10 +754,41 @@ function PlayInner() { if (prev === muted) return; cancelBeatAudioFetches(); if (muted) return; - setBeatAudioMap({}); + setBeatAudioMap((prev) => { + for (const url of Object.values(prev)) { + if (url.startsWith("blob:")) URL.revokeObjectURL(url); + } + return {}; + }); prefetchSceneAudio(); }, [muted, prefetchSceneAudio]); + // ── BYO key enabled/disabled from the play page (silence nudge → modal) ─ + // On enable: point the synth path at the user's key and immediately + // re-synthesize the current scene in-browser, so the voices the player just + // missed come back without a reload (their characters already carry + // server-provisioned `voice`, which resolveByoVoice reuses with the new key). + // On disable: just stop using it; later scenes fall back to the server. + const handleByoSaved = useCallback( + (configured: boolean) => { + const cfg = configured ? loadClientTtsConfig() : null; + byoTtsRef.current = cfg; + setByoTtsConfig(cfg); + if (cfg) { + setSilenceStrikes(0); + cancelBeatAudioFetches(); + setBeatAudioMap((prev) => { + for (const url of Object.values(prev)) { + if (url.startsWith("blob:")) URL.revokeObjectURL(url); + } + return {}; + }); + prefetchSceneAudio(); + } + }, + [prefetchSceneAudio], + ); + // ── Presentation mode toggle ───────────────────────────────────────── const togglePresentation = useCallback(async () => { const entering = !presentation; @@ -619,6 +833,16 @@ function PlayInner() { }; }, [togglePresentation, presentation]); + // Lock the visible orientation BEFORE the first paint, so portrait phones + // never flash the landscape loading chrome. The state inits to "landscape" + // for SSR-safety; this corrects it pre-paint (no-op re-render on landscape + // devices). Prebaked cards (decision C) stay landscape-baked regardless of + // device. The bootstrap effect below re-derives the same value for the + // /api/start payload. + useIsomorphicLayoutEffect(() => { + setOrientation(params.get("card") ? "landscape" : detectOrientation()); + }, [params]); + // ── Bootstrap: start session ───────────────────────────────────────── useEffect(() => { if (startedRef.current) return; @@ -638,6 +862,7 @@ function PlayInner() { worldSetting: string; styleGuide: string; styleReferenceImage?: string; + orientation?: Orientation; } | null = null; if (!cardName) { if (presetId) { @@ -666,6 +891,16 @@ function PlayInner() { } } + // Lock orientation for the whole session. Prebaked cards (decision C) are + // landscape-baked, so they stay landscape regardless of device; only the + // live /api/start path requests a portrait paint when the phone is upright. + // The visible state is already set pre-paint by the layout effect above; + // here we only need the value for the /api/start payload. + const sessionOrientation: Orientation = cardName + ? "landscape" + : detectOrientation(); + if (livePayload) livePayload.orientation = sessionOrientation; + if (!cardName && !livePayload) { router.replace("/"); return; @@ -693,8 +928,13 @@ function PlayInner() { ) : fetch("/api/start", { method: "POST", - headers: { "Content-Type": "application/json" }, - body: JSON.stringify(livePayload), + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify({ + ...livePayload, + clientTts: !!byoTtsRef.current, + }), }).then(async (r) => { if (!r.ok) { const j = (await r.json().catch(() => ({}))) as { error?: string }; @@ -734,6 +974,7 @@ function PlayInner() { characters: data.characters, storyState: data.storyState, styleReferenceImage: data.styleReferenceImage, + orientation: data.scene.orientation ?? sessionOrientation, }; visitedBeatsRef.current = [data.scene.entryBeatId]; setSession(initial); @@ -767,7 +1008,7 @@ function PlayInner() { nextSceneSeed: choice.effect.nextSceneSeed, }, }; - prefetchScenePath(poolRef.current, s, [step], 0); + prefetchScenePath(poolRef.current, s, [step], 0, !!byoTtsRef.current); } }, [currentScene?.id, session?.id]); @@ -844,7 +1085,10 @@ function PlayInner() { visitedBeatIds: [result.scene.entryBeatId], }, ], - characters: result.characters, + characters: mergeCharactersPreserveVoice( + base.characters, + result.characters, + ), storyState: result.storyState, }; visitedBeatsRef.current = [result.scene.entryBeatId]; @@ -918,8 +1162,13 @@ function PlayInner() { const promise = (async () => { const res = await fetch("/api/scene", { method: "POST", - headers: { "Content-Type": "application/json" }, - body: JSON.stringify({ session: specSession }), + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify({ + session: stripVoicesForTransport(specSession), + clientTts: !!byoTtsRef.current, + }), }); if (!res.ok) { const j = (await res.json().catch(() => ({}))) as { error?: string }; @@ -940,8 +1189,10 @@ function PlayInner() { const annotatedImageBase64 = await annotateClick(imageUrl, click); const visionRes = await fetch("/api/vision", { method: "POST", - headers: { "Content-Type": "application/json" }, - body: JSON.stringify({ session, annotatedImageBase64 }), + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify({ session: stripVoicesForTransport(session), annotatedImageBase64 }), }); if (!visionRes.ok) { const j = (await visionRes.json().catch(() => ({}))) as { @@ -956,10 +1207,13 @@ function PlayInner() { setPhase("inserting-beat"); const insertRes = await fetch("/api/insert-beat", { method: "POST", - headers: { "Content-Type": "application/json" }, + headers: { + "Content-Type": "application/json", + }, body: JSON.stringify({ - session, + session: stripVoicesForTransport(session), freeformAction: decision.intent.freeformAction, + clientTts: !!byoTtsRef.current, }), }); if (!insertRes.ok) { @@ -995,7 +1249,10 @@ function PlayInner() { history: session.history.map((h, i, arr) => i === arr.length - 1 ? { ...h, scene: patched } : h, ), - characters: insertChars, + characters: mergeCharactersPreserveVoice( + session.characters, + insertChars, + ), }; setSession(nextSession); setCurrentScene(patched); @@ -1036,8 +1293,13 @@ function PlayInner() { const promise = (async () => { const res = await fetch("/api/scene", { method: "POST", - headers: { "Content-Type": "application/json" }, - body: JSON.stringify({ session: specSession }), + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify({ + session: stripVoicesForTransport(specSession), + clientTts: !!byoTtsRef.current, + }), }); if (!res.ok) { const j = (await res.json().catch(() => ({}))) as { @@ -1071,12 +1333,12 @@ function PlayInner() {

出 · 了 · 点 · 状 · 况

-

+

{error}

返 回 @@ -1086,13 +1348,18 @@ function PlayInner() { ); } - if (presentation) { + // Mobile portrait renders full-bleed by default — it sidesteps the iOS + // Safari Fullscreen API (unsupported on iPhone) with a CSS full-viewport + // layout instead. Desktop "presentation" mode shares the same immersive + // canvas, toggled via the F key. + const immersive = presentation || orientation === "portrait"; + + if (immersive) { return (
+ {orientation === "portrait" && ( +
+ + + + +
+ )}
); } @@ -1109,6 +1401,16 @@ function PlayInner() { const sceneCount = session?.history.length ?? 0; const beatCount = visitedBeatsRef.current.length; + // Surface the BYO-key nudge only to an unmuted, non-BYO player whose last few + // beats came back silent (shared key rate-limited) — the exact pain BYO fixes. + // Dismissible for the session. + const showSilenceNudge = + phase === "ready" && + !muted && + !byoTtsConfig && + !nudgeDismissed && + silenceStrikes >= SILENCE_NUDGE_THRESHOLD; + return (
@@ -1131,8 +1433,7 @@ function PlayInner() {
} aboveCanvasLeft={ - + <> + + + {/* Silence nudge — a compact pill right beside the mute toggle. + Clicking opens the BYO-key modal in place (no trip to the + homepage). The × dismisses it for the session. */} + {showSilenceNudge && ( + + + + + )} + } /> @@ -1181,7 +1511,16 @@ function PlayInner() {

)}
+ + + {ttsModalOpen && ( + setTtsModalOpen(false)} + onSaved={handleByoSaved} + footerNote="保存后会立即用这把 Key 在你的浏览器里合成当前这一幕的配音;本设备后续游玩也会自动使用此 Key。" + /> + )} ); } diff --git a/components/PlayCanvas.tsx b/components/PlayCanvas.tsx index 8e58285..5ee2ced 100644 --- a/components/PlayCanvas.tsx +++ b/components/PlayCanvas.tsx @@ -1,7 +1,7 @@ "use client"; import { useCallback, useEffect, useRef, useState, type ReactNode } from "react"; -import type { Beat, BeatChoice } from "@infiplot/types"; +import type { Beat, BeatChoice, Orientation } from "@infiplot/types"; export type Phase = | "loading-first" // first scene not yet rendered @@ -109,11 +109,13 @@ function ChoiceButton({ index, label, disabled, + vertical, onClick, }: { index: number; label: string; disabled: boolean; + vertical: boolean; onClick: () => void; }) { return ( @@ -121,8 +123,8 @@ function ChoiceButton({ type="button" disabled={disabled} onClick={onClick} - className="group relative flex-1 min-w-0 px-4 py-3 text-left transition-all duration-200 - disabled:opacity-50 disabled:cursor-wait" + className={`group relative ${vertical ? "w-full" : "flex-1 min-w-0"} px-4 py-3 text-left transition-all duration-200 + disabled:opacity-50 disabled:cursor-wait`} style={{ background: "rgba(20, 14, 8, 0.68)", border: "1.5px solid rgba(180, 140, 80, 0.65)", @@ -141,13 +143,13 @@ function ChoiceButton({ /> {index + 1}. {label} @@ -160,8 +162,7 @@ function ChoiceButton({ // ── Main component ───────────────────────────────────────────────────── export function PlayCanvas({ imageUrl, - audioBase64, - audioMime, + audioSrc, muted, phase, beat, @@ -170,12 +171,12 @@ export function PlayCanvas({ onAdvance, onSelectChoice, fullViewport = false, + orientation = "landscape", aboveCanvas, aboveCanvasLeft, }: { imageUrl: string | null; - audioBase64: string | null; - audioMime: string | null; + audioSrc: string | null; muted: boolean; phase: Phase; beat: Beat | null; @@ -184,6 +185,8 @@ export function PlayCanvas({ onAdvance: () => void; onSelectChoice: (choice: BeatChoice) => void; fullViewport?: boolean; + // 会话锁定的图片朝向。"portrait" 时整图铺满视口(object-fit:cover)、选项竖排、字号放大。 + orientation?: Orientation; // 渲染在图片正上方、右对齐的 slot(画面外、紧贴右上角)。 aboveCanvas?: ReactNode; // 渲染在图片正上方、左对齐的 slot(画面外、紧贴左上角),与 aboveCanvas 水平镜像。 @@ -204,7 +207,7 @@ export function PlayCanvas({ const { shown: typedBody, done: typingDone, skip: skipTypewriter } = useTypewriter(displayBody, beat?.id ?? "", { targetDurationMs: audioDurationMs, - waitForAudio: Boolean(audioBase64), + waitForAudio: Boolean(audioSrc), }); // ── Audio source change ────────────────────────────────────────────── @@ -212,12 +215,12 @@ export function PlayCanvas({ // unblock the typewriter via timeout so text doesn't stall. useEffect(() => { setAudioDurationMs(undefined); - if (!audioBase64) return; + if (!audioSrc) return; const timer = setTimeout(() => { setAudioDurationMs((prev) => prev ?? 0); }, AUDIO_WAIT_TIMEOUT_MS); return () => clearTimeout(timer); - }, [audioBase64]); + }, [audioSrc]); // ── Mute toggle ─────────────────────────────────────────────────────── useEffect(() => { @@ -225,12 +228,12 @@ export function PlayCanvas({ if (!el) return; el.muted = muted; el.playbackRate = SPEECH_RATE; - if (!muted && audioBase64 && el.paused) { + if (!muted && audioSrc && el.paused) { el.play().catch(() => { // autoplay blocked — silent until next interaction }); } - }, [muted, audioBase64]); + }, [muted, audioSrc]); function handleAudioMetadata() { const el = audioRef.current; @@ -255,9 +258,27 @@ export function PlayCanvas({ function handleImageClick(e: React.MouseEvent) { if (phase !== "ready" || !imgRef.current || !beat) return; - const rect = imgRef.current.getBoundingClientRect(); - const x = (e.clientX - rect.left) / rect.width; - const y = (e.clientY - rect.top) / rect.height; + const el = imgRef.current; + const rect = el.getBoundingClientRect(); + // Portrait renders with object-fit:cover, which scales the 9:16 image to + // FILL the box and crops the overflow — so the rendered box ≠ the full + // image. Map the click from box-space back into full-image-space via the + // cover geometry so the marker lands where the user tapped. Landscape's box + // matches the image aspect (no crop), so it keeps simple normalization. + let x: number; + let y: number; + if (orientation === "portrait") { + const nw = el.naturalWidth || 1024; + const nh = el.naturalHeight || 1792; + const scale = Math.max(rect.width / nw, rect.height / nh); + const dispW = nw * scale; + const dispH = nh * scale; + x = (e.clientX - rect.left + (dispW - rect.width) / 2) / dispW; + y = (e.clientY - rect.top + (dispH - rect.height) / 2) / dispH; + } else { + x = (e.clientX - rect.left) / rect.width; + y = (e.clientY - rect.top) / rect.height; + } // If the typewriter is still printing, a click completes it instantly // (standard VN affordance) — the page never sees this click. if (!typingDone) { @@ -291,13 +312,26 @@ export function PlayCanvas({ const interactive = phase === "ready" && !!imageUrl; const dimmed = phase === "transitioning"; - const sizeStyle = fullViewport - ? { maxWidth: "100vw", maxHeight: "100dvh" } - : { maxWidth: "96vw", maxHeight: "calc(100dvh - 200px)" }; + const portrait = orientation === "portrait"; + const intrinsicW = portrait ? 1024 : 1792; + const intrinsicH = portrait ? 1792 : 1024; - const placeholderWidth = fullViewport - ? "min(100vw, calc(100dvh * 16 / 9))" - : "min(96vw, calc((100dvh - 200px) * 16 / 9))"; + // Portrait (mobile) always fills the whole viewport with object-fit:cover so + // the 9:16 image matches the exact device/window — no letterbox. Landscape + // keeps the prior contain-style sizing so the full 16:9 frame stays visible. + const sizeStyle: React.CSSProperties = portrait + ? { width: "100vw", height: "100dvh", objectFit: "cover" } + : fullViewport + ? { maxWidth: "100vw", maxHeight: "100dvh" } + : { maxWidth: "96vw", maxHeight: "calc(100dvh - 200px)" }; + + const placeholderStyle: React.CSSProperties = portrait + ? { width: "100vw", height: "100dvh" } + : { + width: fullViewport + ? "min(100vw, calc(100dvh * 16 / 9))" + : "min(96vw, calc((100dvh - 200px) * 16 / 9))", + }; return ( @@ -305,11 +339,11 @@ export function PlayCanvas({ className={`flex flex-col items-center ${fullViewport ? "w-full h-full justify-center" : "w-full"}`} > {/* Hidden audio element — voice playback for the current beat */} - {audioBase64 && ( + {audioSrc && (