feat(engine): merge cloudflare-migration — paradigm D engine, BYOK proxy, story persistence (#95)

Squash-merge the cloudflare-migration branch (7 commits by Kai ki) into staging with conflict resolution, feature integration, and bug fixes. Engine: - Paradigm D: single-stream Writer replacing dual-phase Plan/Beats - Delete Architect agent; story bible generated via Writer <plan> tag - Modular prompt architecture (segments/registry/builder) - StreamRouter for tagged stream splitting (<plan>/<story>/<choices>) Infrastructure: - Cloudflare Workers deployment (wrangler.jsonc, OpenNext adapter) - D1 database schema + Drizzle ORM (scaffolded, not yet active) - R2 storage helpers (scaffolded, not yet active) - Story persistence API routes + client-side persistence BYOK (Bring Your Own Key): - /api/llm/user-proxy with SSRF-protected LLM proxy (+ requireUser auth) - CORS-aware fetch in ai-client: auto-detect CORS failure, fallback to server proxy transparently via OpenAI SDK custom fetch - BYO config support added to classify-freeform and vision routes - SettingsModal CORS privacy notice (keys never logged/stored) SSE streaming: - engineClient.ts: fetchSSE helper for progressive scene events - startSession/requestScene accept optional emit callback - Fix SSE error event field name (error → message) in scene/start routes i18n integration: - Wire buildLanguageDirective into paradigm D's prompt builder - Update corsNotice i18n keys (zh-CN/en/ja) with CORS proxy privacy text - Preserve Session.language + LanguageSwitcher from i18n commit Co-authored-by: Kai ki <155355644+zbf1009@users.noreply.github.com> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-06-18 18:05:38 +08:00
parent 05bd7e229c
commit 0e4c2ebef4
78 changed files with 7396 additions and 919 deletions
@@ -156,6 +156,45 @@ export type WriterPlan = {
  entrySpeaker?: string;
 };

+// ──────────────────────────────────────────────────────────────────────
+//  Paradigm D — Writer single-pass streaming plan extensions.
+//
+//  In paradigm D the Writer streams one tagged response: <plan> → <story>
+//  → <choices>. WriterScenePlan is the parsed <plan> segment: the existing
+//  WriterPlan skeleton PLUS per-character scene intents (and story bible on
+//  first scene), handed to the downstream media translators the instant
+//  </plan> closes.
+// ──────────────────────────────────────────────────────────────────────
+
+/** Per-scene performance intent for one character, authored by the Writer in
+ *  the <plan> segment. Ephemeral (this scene only) — distinct from the
+ *  persistent CharacterPersona card. Feeds downstream media translators. */
+export type CharacterIntent = {
+  name: string;
+  /** 本幕情绪基调。 */
+  mood?: string;
+  /** 本幕动机 / 目的。 */
+  motivation?: string;
+  /** 本幕说话基调（指导对白质感 + TTS lineDelivery）。 */
+  speakingTone?: string;
+};
+
+/** Parsed <plan> tag: the existing WriterPlan shape plus per-character scene
+ *  intents and optional story bible (first scene only). The optional extension
+ *  keeps any degraded / minimal plan valid — downstream consumers see a
+ *  WriterPlan superset. */
+export type WriterScenePlan = WriterPlan & {
+  /** 各角色本幕表现意图，供 </plan> 闭合时分发下游媒体翻译官。 */
+  characterIntents?: CharacterIntent[];
+  /** 故事圣经（仅开局产出）——稳定区字段。后续场景 plan 不含此字段。 */
+  storyBible?: {
+    logline: string;
+    genreTags: string;
+    protagonist: string;
+    castNotes?: string;
+  };
+};
+
 // ──────────────────────────────────────────────────────────────────────
 //  Characters & voices (TTS)
 // ──────────────────────────────────────────────────────────────────────
@@ -179,6 +218,30 @@ export type CharacterVoice =
      mimeType: string;
    };

+// ──────────────────────────────────────────────────────────────────────
+//  CharacterPersona — narrative / story dimension of a Character.
+//  Merged into Character via intersection (all optional). Filled primarily
+//  by the Writer's <plan> 思维链 (paradigm D); the CharacterDesigner then
+//  realizes it into visual + voice cards. Absent on legacy sessions →
+//  callers degrade to "name only". SENTINEL append-only: adding persona
+//  only appends bytes to the stable prompt prefix — never reorders.
+// ──────────────────────────────────────────────────────────────────────
+
+export type CharacterPersona = {
+  /** 背景 / 身份 / 核心设定。 */
+  persona?: string;
+  /** 性格标签，如 ["傲娇", "腹黑", "重情义"]。 */
+  personalityTraits?: string[];
+  /** 说话风格 / 口头禅 — 对白质感的关键。 */
+  speakingStyle?: string;
+  /** 2-3 条代表性对白，作为 few-shot 锚定语气。 */
+  sampleDialogue?: string[];
+  /** 与玩家("你")的关系 / 态度。 */
+  relationshipToPlayer?: string;
+  /** 隐藏信息 / 伏笔，可驱动后续反转（默认不外显）。 */
+  secrets?: string[];
+};
+
 export type Character = {
  name: string;
  /**
@@ -215,7 +278,7 @@ export type Character = {
   *  server runs StepFun, and lets the server normalize an off-provider voice
   *  without a fresh provision. Validated against the catalog at synth time. */
  stepfunVoiceId?: string;
-};
+} & CharacterPersona;

 /** A single beat's synthesized audio, attached to the response. */
 export type BeatAudio = {
@@ -270,6 +333,33 @@ export type StoryStatePatch = {
  nextHook?: string;
 };

+// ──────────────────────────────────────────────────────────────────────
+//  WorldBook — lightweight lore injection system.
+//
+//  Entries with position "constant" are always injected into the stable
+//  prompt prefix. Entries with position "triggered" are scanned against
+//  recent beat text and injected into the dynamic suffix when keywords
+//  match. Priority controls ordering when multiple entries fire.
+// ──────────────────────────────────────────────────────────────────────
+
+export type WorldBookEntry = {
+  id: string;
+  /** Keywords that trigger this entry's injection (for triggered entries). */
+  keys: string[];
+  /** The lore content to inject into the prompt. */
+  content: string;
+  /** "constant" = always injected (stable prefix); "triggered" = keyword-matched (dynamic suffix). */
+  position: "constant" | "triggered";
+  /** Higher priority entries are injected first. Defaults to 0. */
+  priority?: number;
+};
+
+export type WorldBook = {
+  id: string;
+  name: string;
+  entries: WorldBookEntry[];
+};
+
 // ──────────────────────────────────────────────────────────────────────
 //  Session
 // ──────────────────────────────────────────────────────────────────────
@@ -317,6 +407,11 @@ export type Session = {
   * back-compat with sessions created before this field existed.
   */
  language?: string;
+  /**
+   * Optional world books for lore injection. "constant" entries are always in
+   * the prompt; "triggered" entries inject when keywords match recent text.
+   */
+  worldBooks?: WorldBook[];
 };

 // ──────────────────────────────────────────────────────────────────────
@@ -417,6 +512,18 @@ export type EngineConfig = {
 //  API contracts
 // ──────────────────────────────────────────────────────────────────────

+/**
+ * BYOK (Bring Your Own Key) LLM credentials carried in request bodies.
+ * Per-role: text/image/vision can be independently configured. Keys never
+ * persist or log server-side — they only pass through request→config build
+ * (see lib/config.ts buildByoEngineConfig). vision typically mirrors text.
+ */
+export type ByoLlmKeys = {
+  text?: { provider: string; apiKey: string; baseUrl?: string; model?: string };
+  image?: { provider: string; apiKey: string; baseUrl?: string; model?: string };
+  vision?: { provider: string; apiKey: string; baseUrl?: string; model?: string };
+};
+
 export type StartRequest = {
  worldSetting: string;
  styleGuide: string;
@@ -439,6 +546,13 @@ export type StartRequest = {
  /** Active UI locale — see Session.language. Drives the engine's language
   *  directive so AI output is generated in the player's chosen language. */
  language?: string;
+  /**
+   * BYOK: user-provided LLM keys. When present, server uses these to construct
+   * EngineConfig instead of reading from env. Per-role: text/image/vision can
+   * be independently configured. Keys never persist or log — they only pass
+   * through request→config construction.
+   */
+  byo?: ByoLlmKeys;
 };

 // /api/parse-style-image — vision LLM extracts a textual painting-style
@@ -473,6 +587,8 @@ export type SceneRequest = {
  session: Session;
  /** See StartRequest.clientTts — drops server-side TTS for BYO-key clients. */
  clientTts?: boolean;
+  /** See StartRequest.byo — BYOK LLM keys. */
+  byo?: ByoLlmKeys;
 };

 export type SceneResponse = {
@@ -534,6 +650,8 @@ export type VisionRequest = {
   * server-side image re-fetch per click.
   */
  annotatedImageBase64: string;
+  /** See StartRequest.byo — BYOK LLM keys. */
+  byo?: ByoLlmKeys;
 };

 export type VisionResponse = {
@@ -547,6 +665,8 @@ export type VisionResponse = {
 export type FreeformClassifyRequest = {
  session: Session;
  freeformText: string;
+  /** See StartRequest.byo — BYOK LLM keys. */
+  byo?: ByoLlmKeys;
 };

 export type FreeformClassify = "insert-beat" | "change-scene";
@@ -563,6 +683,8 @@ export type InsertBeatRequest = {
  freeformAction: string;
  /** See StartRequest.clientTts — drops server-side TTS for BYO-key clients. */
  clientTts?: boolean;
+  /** See StartRequest.byo — BYOK LLM keys. */
+  byo?: ByoLlmKeys;
 };

 /** Partial beat fields produced by the insert-beat director. */
@@ -577,3 +699,69 @@ export type InsertBeatResponse = {
  partial: InsertBeatPartial;
  characters: Character[];
 };
+
+// ──────────────────────────────────────────────────────────────────────
+//  Paradigm D — streaming primitives (chatStream / StreamRouter / SSE)
+//
+//  Output-side counterpart to prompt caching's input-side stable prefix
+//  (the two are orthogonal). chatStream yields incremental text + an
+//  end-of-stream usage promise. The StreamRouter slices the Writer's
+//  tagged stream into plan/story/choices and dispatches downstream. API
+//  routes serialize assembled fragments as SSE events for progressive
+//  client playback.
+// ──────────────────────────────────────────────────────────────────────
+
+/** Token usage stats returned at stream end. Kept SDK-agnostic so the type
+ *  file doesn't depend on any specific provider package. */
+export type ChatStreamUsage = {
+  prompt_tokens?: number;
+  completion_tokens?: number;
+  prompt_tokens_details?: { cached_tokens?: number };
+};
+
+/** Return shape of the streaming chat primitive (ai-client `chatStream`).
+ *  `textStream` yields incremental chunks; `usage` resolves at stream end
+ *  so `summarizeSdkUsage` cache accounting works unchanged. */
+export type ChatStreamResult = {
+  textStream: AsyncIterable<string>;
+  usage: Promise<ChatStreamUsage | undefined>;
+};
+
+/** Callbacks the StreamRouter fires as it slices the Writer's tagged stream.
+ *  All optional so a caller can subscribe to a subset. */
+export type StreamRouterHandlers = {
+  /** `</plan>` closed — dispatch downstream media translators in parallel. */
+  onPlan?: (plan: WriterScenePlan) => void;
+  /** `<story>` incremental text — push to client for progressive playback. */
+  onBeat?: (beatChunk: string) => void;
+  /** `</story>` closed — prose finalized, ready for splitting. */
+  onStoryComplete?: (rawStory: string) => void;
+  /** `</choices>` closed. */
+  onChoices?: (choices: BeatChoice[]) => void;
+};
+
+/** Aggregate result of routing one Writer stream to completion. `degraded` is
+ *  true when tag parsing fell back (missing / misordered / unclosed / timeout),
+ *  per the degrade-before-main-path reliability rule. */
+export type StreamRouterResult = {
+  plan?: WriterScenePlan;
+  beats: Beat[];
+  choices?: BeatChoice[];
+  /** Raw prose content of the <story> segment (not JSON-parsed). The director
+   *  feeds this to proseSplitter to produce Beat[]. */
+  rawStorySegment?: string;
+  degraded: boolean;
+};
+
+/** Server → client SSE events for progressive scene playback (paradigm D).
+ *  `TDone` is the terminal full-assembly payload — `SceneResponse` for
+ *  `/api/scene`, `StartResponse` for `/api/start`. The prefetch path
+ *  consumes events to `done` and reassembles a complete response. */
+export type SceneStreamEvent<TDone = SceneResponse> =
+  | { type: "plan"; plan: WriterScenePlan }
+  | { type: "beat"; beat: Beat }
+  | { type: "background"; imageUrl: string; sceneKey?: string }
+  | { type: "voice"; name: string; voice: CharacterVoice }
+  | { type: "choices"; choices: BeatChoice[] }
+  | { type: "done"; response: TDone }
+  | { type: "error"; message: string; degraded?: boolean };