feat(web): embed beat audio into gallery and infiplot exports

Walk every speaking beat at export time, reuse the current scene's beatAudioMap, and synthesize the rest via BYO TTS or /api/beat-audio with a concurrency of 4. Show a progress toast on the play page while collecting.

Gallery export keeps audio in a sidecar localStorage key so the first paint is not blocked by JSON.parse-ing several MB of base64; the gallery lazy-loads it after the first scene image, then plays per-beat audio with a mute toggle persisted to localStorage.

.infiplot share files embed audioByBeatId in the doc itself (v2); on import the data URIs survive scene swaps and feed back into the per-beat audio map so replayers hear the original voices for free.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-11 09:29:16 +08:00
parent a61a91060d
commit 621f83c47b
6 changed files with 528 additions and 59 deletions
@@ -53,7 +53,7 @@ type AnalyticsEventData = {
  tts_toggle: { muted: boolean };
  fullscreen_toggle: { on: boolean };
  play_heartbeat: never;
-  gallery_export: { scene_count: number };
+  gallery_export: { scene_count: number; audio_count: number };
 };

 export type AnalyticsEvent = keyof AnalyticsEventData;
@@ -0,0 +1,199 @@
+// ──────────────────────────────────────────────────────────────────────
+//  Audio collection for the gallery / .infiplot share exports.
+//
+//  Walks every speaking beat across `session.history` and produces a
+//  Record keyed by `${sceneId}:${beatId}` whose values are inline
+//  data: URIs (base64). Data URIs are the only audio form that survives
+//  transport through localStorage, AES-GCM ciphertext, and a fresh
+//  browser tab — blob: URLs from /api/beat-audio are tied to the document
+//  that created them.
+//
+//  Three sources, in priority order:
+//    1. prebaked  — audio that came in through a .infiplot share file.
+//                   Already a data URI, so just copied through.
+//    2. current beatAudioMap — the play page's per-beat audio for the
+//                   scene the player is on right now. Blob URLs get
+//                   converted to data URIs; data URIs pass through.
+//    3. fresh synth — BYO client TTS (browser-direct Xiaomi/StepFun) when
+//                   a key is configured, otherwise /api/beat-audio.
+//
+//  Concurrency 4 to keep TTS providers happy when a long session has
+//  dozens of speaking beats. Errors are silently skipped — a missing beat
+//  just plays without voice; we never block the export on a TTS hiccup.
+// ──────────────────────────────────────────────────────────────────────
+
+import { provisionVoice, synthesize } from "@infiplot/tts-client";
+import type {
+  Beat,
+  Character,
+  CharacterVoice,
+  Session,
+  TtsConfig,
+} from "@infiplot/types";
+
+const CONCURRENCY = 4;
+
+/**
+ * Inputs for `collectBeatAudioForExport`.
+ */
+export type CollectBeatAudioOptions = {
+  /** Session whose `history` scenes are walked for speaking beats and whose `characters` are used to look up speakers by name. */
+  session: Session;
+  /** Current-scene audio already loaded by the play page (keyed by bare beat id). */
+  beatAudioMap: Record<string, string>;
+  /** Scene id `beatAudioMap` belongs to (so we can promote its entries into the full key). */
+  currentSceneId: string | null;
+  /** BYO TTS config when the user supplied their own key; null for server-side TTS. */
+  byoTts: TtsConfig | null;
+  /** Cache of in-flight BYO voice provisions, keyed by character name. Reused across calls. */
+  byoVoiceCache: Map<string, Promise<CharacterVoice>>;
+  /** Audio carried in from a `.infiplot` share file (already keyed by `sceneId:beatId`). */
+  prebakedAudio?: Record<string, string>;
+  /**
+   * Progress callback (done/total). Fired after every beat (success or failure).
+   * It is also fired once before synthesis starts, with `done` equal to the
+   * number of beats already satisfied by the current scene's audio. `total`
+   * counts only the beats that were not already covered by `prebakedAudio`.
+   */
+  onProgress?: (done: number, total: number) => void;
+  /**
+   * Optional abort signal. Each worker checks it before taking the next beat,
+   * and it is forwarded to the synthesis calls.
+   */
+  signal?: AbortSignal;
+};
+
+/** One speaking beat that is not covered by prebaked audio, plus the scene it came from. */
+type Job = {
+  /** Output key for this beat: `${scene.id}:${beat.id}`. */
+  key: string;
+  /** The history scene that contains `beat`. */
+  scene: Session["history"][number]["scene"];
+  /** The beat itself; it has both a speaker and a line when queued. */
+  beat: Beat;
+};
+
+/**
+ * Collects audio for every speaking beat in `opts.session.history` as inline
+ * `data:` URIs, keyed by `${scene.id}:${beat.id}`.
+ *
+ * Sources are tried in this order: `opts.prebakedAudio`, then the current
+ * scene's `opts.beatAudioMap`, then fresh synthesis through
+ * `synthesizeBeatForExport` with up to `CONCURRENCY` workers.
+ *
+ * Prebaked entries are copied as-is (when they are `data:` strings), whether
+ * or not their key matches a beat in the history. Per-beat failures are
+ * swallowed, so the returned map may be missing keys. Once `opts.signal` is
+ * aborted the workers stop picking up new beats and whatever was collected so
+ * far is returned. The current-scene reuse loop does not check the signal.
+ *
+ * @param opts See `CollectBeatAudioOptions`.
+ * @returns Map from `sceneId:beatId` to a `data:` URI.
+ */
+export async function collectBeatAudioForExport(
+  opts: CollectBeatAudioOptions,
+): Promise<Record<string, string>> {
+  const out: Record<string, string> = {};
+
+  // Source 1: prebaked audio. Only string values that start with "data:" are kept.
+  if (opts.prebakedAudio) {
+    for (const [k, v] of Object.entries(opts.prebakedAudio)) {
+      if (typeof v === "string" && v.startsWith("data:")) out[k] = v;
+    }
+  }
+
+  // Queue every beat that has both a speaker and a line and is not already
+  // covered by a prebaked entry.
+  const jobs: Job[] = [];
+  for (const entry of opts.session.history) {
+    const scene = entry.scene;
+    for (const beat of scene.beats) {
+      if (!beat.speaker || !beat.line) continue;
+      const key = `${scene.id}:${beat.id}`;
+      if (out[key]) continue;
+      jobs.push({ key, scene, beat });
+    }
+  }
+
+  // Hoist current-scene blob/data URLs first so the play page's already-
+  // synthesized audio is reused instead of re-billed. Blob URLs are local to
+  // this document — convert to base64 so they survive export.
+  if (opts.currentSceneId) {
+    for (const job of jobs) {
+      if (job.scene.id !== opts.currentSceneId) continue;
+      const local = opts.beatAudioMap[job.beat.id];
+      if (!local) continue;
+      try {
+        out[job.key] = await urlToDataUri(local);
+      } catch {
+        // ignore — falls through to synth below
+      }
+    }
+  }
+
+  // `total` is the number of queued beats (prebaked ones are excluded);
+  // `done` starts at the number of beats satisfied by the loop above.
+  const remaining = jobs.filter((j) => !out[j.key]);
+  const total = jobs.length;
+  let done = jobs.length - remaining.length;
+  opts.onProgress?.(done, total);
+
+  // Speaker lookup: character name -> character.
+  const charByName = new Map(opts.session.characters.map((c) => [c.name, c]));
+
+  // `cursor` is shared by all workers. Each worker takes the next index
+  // synchronously (there is no await between the bounds check and the
+  // increment), so a job is handed to exactly one worker.
+  let cursor = 0;
+  async function worker(): Promise<void> {
+    while (cursor < remaining.length) {
+      if (opts.signal?.aborted) return;
+      const job = remaining[cursor++]!;
+      try {
+        const audio = await synthesizeBeatForExport(
+          job.beat,
+          charByName.get(job.beat.speaker!),
+          opts.byoTts,
+          opts.byoVoiceCache,
+          opts.signal,
+        );
+        if (audio) out[job.key] = audio;
+      } catch {
+        // silent — beat will play without voice
+      }
+      // Counted whether or not audio was produced.
+      done++;
+      opts.onProgress?.(done, total);
+    }
+  }
+
+  // Start between 1 and CONCURRENCY workers. When nothing remains, the single
+  // worker exits immediately because its loop condition is already false.
+  const workers = Array.from(
+    { length: Math.min(CONCURRENCY, Math.max(1, remaining.length)) },
+    () => worker(),
+  );
+  await Promise.all(workers);
+  return out;
+}
+
+/**
+ * Synthesizes audio for a single beat and returns it as a `data:` URI.
+ *
+ * With a BYO config the voice is taken from `voiceCache`, from `speaker.voice`,
+ * or provisioned from `speaker.voiceDescription`, and the line is synthesized
+ * through `synthesize`. Without one, the beat is POSTed to `/api/beat-audio`
+ * using `speaker.voice`.
+ *
+ * @param beat Beat to voice; its `line` and `lineDelivery` are sent to the synthesizer.
+ * @param speaker Character speaking the beat, or undefined when not found.
+ * @param byo BYO TTS config, or null to use the server endpoint.
+ * @param voiceCache Voice promises keyed by character name; read and updated here.
+ * @param signal Optional abort signal forwarded to `synthesize` / `fetch`.
+ * @returns A `data:` URI, or null when there is no speaker or line, no usable
+ *   voice, voice provisioning failed, or the server answered 204 / a non-OK status.
+ *   Rejections from `synthesize`, `fetch`, or the blob conversion are not caught here.
+ */
+async function synthesizeBeatForExport(
+  beat: Beat,
+  speaker: Character | undefined,
+  byo: TtsConfig | null,
+  voiceCache: Map<string, Promise<CharacterVoice>>,
+  signal?: AbortSignal,
+): Promise<string | null> {
+  if (!speaker || !beat.line) return null;
+
+  if (byo) {
+    // Resolve the voice once per character name; the promise itself is cached
+    // so concurrent beats for the same speaker share one provisioning call.
+    let voiceP = voiceCache.get(speaker.name);
+    if (!voiceP) {
+      if (speaker.voice) {
+        voiceP = Promise.resolve(speaker.voice);
+      } else if (speaker.voiceDescription) {
+        voiceP = provisionVoice(byo, speaker.voiceDescription, speaker.name);
+      } else {
+        return null;
+      }
+      voiceCache.set(speaker.name, voiceP);
+    }
+    let voice: CharacterVoice;
+    try {
+      voice = await voiceP;
+    } catch {
+      // Drop the rejected promise so a later call can attempt provisioning again.
+      voiceCache.delete(speaker.name);
+      return null;
+    }
+    const out = await synthesize(byo, voice, beat.line, beat.lineDelivery, signal);
+    return `data:${out.mimeType};base64,${out.audioBase64}`;
+  }
+
+  // Server path: requires a voice already present on the character.
+  if (!speaker.voice) return null;
+  const res = await fetch("/api/beat-audio", {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: JSON.stringify({
+      beat: { id: beat.id, line: beat.line, lineDelivery: beat.lineDelivery },
+      voice: speaker.voice,
+    }),
+    signal,
+  });
+  // 204 and any non-OK status are treated as "no audio" rather than as an error.
+  if (res.status === 204 || !res.ok) return null;
+  const blob = await res.blob();
+  return await blobToDataUri(blob);
+}
+
+/**
+ * Converts a URL into a `data:` URI.
+ *
+ * A URL that already starts with `data:` is returned unchanged. Any other URL
+ * is fetched and its response body is encoded through `blobToDataUri`. The
+ * response status is not checked before the body is read.
+ *
+ * @param url URL to convert.
+ * @returns The `data:` URI for the URL's content.
+ */
+async function urlToDataUri(url: string): Promise<string> {
+  if (url.startsWith("data:")) return url;
+  const res = await fetch(url);
+  const blob = await res.blob();
+  return await blobToDataUri(blob);
+}
+
+/**
+ * Reads a Blob with `FileReader.readAsDataURL` and resolves with the result.
+ *
+ * @param blob Blob to encode.
+ * @returns Promise resolving to the reader's string result; it rejects with
+ *   `reader.error` (or a generic Error) on a read error, or with an Error when
+ *   the reader finishes with a non-string result.
+ */
+function blobToDataUri(blob: Blob): Promise<string> {
+  return new Promise((resolve, reject) => {
+    const reader = new FileReader();
+    reader.onloadend = () => {
+      const v = reader.result;
+      if (typeof v === "string") resolve(v);
+      else reject(new Error("FileReader produced non-string result"));
+    };
+    reader.onerror = () => reject(reader.error ?? new Error("FileReader failed"));
+    reader.readAsDataURL(blob);
+  });
+}
@@ -11,7 +11,7 @@ import type {
 export const STORY_SHARE_STORAGE_KEY = "infiplot:story-import";

 export type StoryShareDoc = {
-  v: 1;
+  v: 1 | 2;
  kind: "infiplot-story";
  exportedAt: number;
  current: {
@@ -19,6 +19,11 @@ export type StoryShareDoc = {
    beatId?: string;
  };
  session: Session;
+  /** Pre-synthesized per-beat audio (data:audio/...;base64,...). Keyed by
+   *  `${sceneId}:${beatId}`. v2+ only — older files just have no audio and
+   *  play silent on replay. Embedding keeps the share file self-contained
+   *  so a friend can hear the recorded voices without their own TTS key. */
+  audioByBeatId?: Record<string, string>;
 };

 type JsonRecord = Record<string, unknown>;
@@ -133,13 +138,16 @@ function sanitizeSessionForShare(session: Session): Session {
 export function createStoryShareDoc(
  session: Session,
  current: { sceneIndex: number; beatId?: string },
+  audioByBeatId?: Record<string, string>,
 ): StoryShareDoc {
+  // Builds a share document from `session` (passed through
+  // `sanitizeSessionForShare`) and the `current` position, stamped with
+  // `Date.now()`.
+  //
+  // Audio only counts when the map is provided and has at least one key. With
+  // audio the doc is written as v2 and carries `audioByBeatId`; without it the
+  // doc stays v1 and the field is omitted entirely.
+  const hasAudio = !!audioByBeatId && Object.keys(audioByBeatId).length > 0;
  return {
-    v: 1,
+    v: hasAudio ? 2 : 1,
    kind: "infiplot-story",
    exportedAt: Date.now(),
    current,
    session: sanitizeSessionForShare(session),
+    ...(hasAudio ? { audioByBeatId } : {}),
  };
 }

@@ -149,7 +157,7 @@ export function storyShareFilename(doc: StoryShareDoc): string {

 export function parseStoryShareDoc(value: unknown): StoryShareDoc {
  if (!isRecord(value)) throw new Error("这不是有效的剧情分享文件");
-  if (value.kind !== "infiplot-story" || value.v !== 1) {
+  if (value.kind !== "infiplot-story" || (value.v !== 1 && value.v !== 2)) {
    throw new Error("剧情分享文件格式不支持");
  }
  if (typeof value.exportedAt !== "number" || !Number.isFinite(value.exportedAt)) {
@@ -211,9 +219,22 @@ export function parseStoryShareDoc(value: unknown): StoryShareDoc {
    }
  }

+  let audioByBeatId: Record<string, string> | undefined;
+  if (value.audioByBeatId !== undefined) {
+    if (!isRecord(value.audioByBeatId)) {
+      throw new Error("剧情分享文件配音数据不合法");
+    }
+    const cleaned: Record<string, string> = {};
+    for (const [k, v] of Object.entries(value.audioByBeatId)) {
+      if (typeof v === "string" && v.startsWith("data:")) cleaned[k] = v;
+    }
+    if (Object.keys(cleaned).length > 0) audioByBeatId = cleaned;
+  }
+
  const doc = value as StoryShareDoc;
  return {
    ...doc,
    session: sanitizeSessionForShare(doc.session),
+    ...(audioByBeatId ? { audioByBeatId } : {}),
  };
 }