feat(engine): merge cloudflare-migration — paradigm D engine, BYOK proxy, story persistence (#95)

Squash-merge the cloudflare-migration branch (7 commits by Kai ki) into
staging with conflict resolution, feature integration, and bug fixes.

Engine:
- Paradigm D: single-stream Writer replacing dual-phase Plan/Beats
- Delete Architect agent; story bible generated via Writer <plan> tag
- Modular prompt architecture (segments/registry/builder)
- StreamRouter for tagged stream splitting (<plan>/<story>/<choices>)

Infrastructure:
- Cloudflare Workers deployment (wrangler.jsonc, OpenNext adapter)
- D1 database schema + Drizzle ORM (scaffolded, not yet active)
- R2 storage helpers (scaffolded, not yet active)
- Story persistence API routes + client-side persistence

BYOK (Bring Your Own Key):
- /api/llm/user-proxy: SSRF-protected LLM proxy route (+ requireUser auth)
- CORS-aware fetch in ai-client: auto-detects CORS failures and falls back
  to the server proxy transparently via the OpenAI SDK's custom fetch
- BYO config support added to classify-freeform and vision routes
- SettingsModal CORS privacy notice (keys never logged/stored)

SSE streaming:
- engineClient.ts: fetchSSE helper for progressive scene events
- startSession/requestScene accept optional emit callback
- Fix SSE error event field name (error → message) in scene/start routes

i18n integration:
- Wire buildLanguageDirective into paradigm D's prompt builder
- Update corsNotice i18n keys (zh-CN/en/ja) with CORS proxy privacy text
- Preserve Session.language + LanguageSwitcher from i18n commit

Co-authored-by: Kai ki <155355644+zbf1009@users.noreply.github.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Zonghao Yuan
2026-06-18 18:05:38 +08:00
committed by GitHub
parent 05bd7e229c
commit 0e4c2ebef4
78 changed files with 7396 additions and 919 deletions
+189 -1
View File
@@ -156,6 +156,45 @@ export type WriterPlan = {
entrySpeaker?: string;
};
// ──────────────────────────────────────────────────────────────────────
// Paradigm D — Writer single-pass streaming plan extensions.
//
// In paradigm D the Writer streams one tagged response: <plan> → <story>
// → <choices>. WriterScenePlan is the parsed <plan> segment: the existing
// WriterPlan skeleton PLUS per-character scene intents (and story bible on
// first scene), handed to the downstream media translators the instant
// </plan> closes.
// ──────────────────────────────────────────────────────────────────────
/**
 * Scene-scoped performance intent for a single character.
 *
 * The Writer authors it inside the <plan> segment. It is ephemeral (valid
 * for the current scene only) and is not the same thing as the persistent
 * CharacterPersona card. Downstream media translators consume it.
 */
export type CharacterIntent = {
  name: string;

  /** Emotional tone of the character in this scene. */
  mood?: string;

  /** Motivation / goal of the character in this scene. */
  motivation?: string;

  /**
   * Speaking tone for this scene; guides dialogue texture and the TTS
   * lineDelivery.
   */
  speakingTone?: string;
};
/**
 * The parsed <plan> tag.
 *
 * Shape: the existing WriterPlan, extended with per-character scene intents
 * and, on the first scene only, an optional story bible. Both extensions are
 * optional so that a degraded / minimal plan remains valid; downstream
 * consumers simply see a superset of WriterPlan.
 */
export type WriterScenePlan = WriterPlan & {
  /**
   * Per-character performance intents for this scene, dispatched to the
   * downstream media translators when </plan> closes.
   */
  characterIntents?: CharacterIntent[];

  /**
   * Story bible — produced only at the opening; a stable-region field.
   * Plans for subsequent scenes do not carry it.
   *
   * NOTE(review): `genreTags` is a single string rather than string[] —
   * presumably a delimited list; confirm against the Writer prompt.
   */
  storyBible?: {
    logline: string;
    genreTags: string;
    protagonist: string;
    castNotes?: string;
  };
};
// ──────────────────────────────────────────────────────────────────────
// Characters & voices (TTS)
// ──────────────────────────────────────────────────────────────────────
@@ -179,6 +218,30 @@ export type CharacterVoice =
mimeType: string;
};
// ──────────────────────────────────────────────────────────────────────
// CharacterPersona — narrative / story dimension of a Character.
// Merged into Character via intersection (all optional). Filled primarily
// by the Writer's <plan> chain-of-thought (paradigm D); the CharacterDesigner then
// realizes it into visual + voice cards. Absent on legacy sessions →
// callers degrade to "name only". SENTINEL append-only: adding persona
// only appends bytes to the stable prompt prefix — never reorders.
// ──────────────────────────────────────────────────────────────────────
/**
 * Narrative-side fields of a character. Every field is optional, so an
 * empty object is a valid persona.
 */
export type CharacterPersona = {
  /** Background / identity / core premise. */
  persona?: string;

  /** Personality tags, e.g. ["tsundere", "scheming", "fiercely loyal"]. */
  personalityTraits?: string[];

  /** Speaking style / catchphrases — the key to the dialogue's texture. */
  speakingStyle?: string;

  /** 2-3 representative lines of dialogue, used as few-shot anchors for the voice. */
  sampleDialogue?: string[];

  /** Relationship / attitude toward the player ("you"). */
  relationshipToPlayer?: string;

  /**
   * Hidden information / foreshadowing that can drive later twists
   * (not surfaced by default).
   */
  secrets?: string[];
};
export type Character = {
name: string;
/**
@@ -215,7 +278,7 @@ export type Character = {
* server runs StepFun, and lets the server normalize an off-provider voice
* without a fresh provision. Validated against the catalog at synth time. */
stepfunVoiceId?: string;
};
} & CharacterPersona;
/** A single beat's synthesized audio, attached to the response. */
export type BeatAudio = {
@@ -270,6 +333,33 @@ export type StoryStatePatch = {
nextHook?: string;
};
// ──────────────────────────────────────────────────────────────────────
// WorldBook — lightweight lore injection system.
//
// Entries with position "constant" are always injected into the stable
// prompt prefix. Entries with position "triggered" are scanned against
// recent beat text and injected into the dynamic suffix when keywords
// match. Priority controls ordering when multiple entries fire.
// ──────────────────────────────────────────────────────────────────────
/** One lore entry of a world book. */
export type WorldBookEntry = {
  id: string;

  /** Trigger keywords; a match injects this entry (applies to "triggered" entries). */
  keys: string[];

  /** Lore text that gets injected into the prompt. */
  content: string;

  /**
   * Injection mode:
   * - "constant"  — always injected (stable prefix);
   * - "triggered" — injected on keyword match (dynamic suffix).
   */
  position: "constant" | "triggered";

  /** Ordering weight: higher values are injected first. Defaults to 0. */
  priority?: number;
};
/** A named collection of world-book entries. */
export type WorldBook = {
  id: string;
  name: string;
  /** The entries belonging to this book. */
  entries: WorldBookEntry[];
};
// ──────────────────────────────────────────────────────────────────────
// Session
// ──────────────────────────────────────────────────────────────────────
@@ -317,6 +407,11 @@ export type Session = {
* back-compat with sessions created before this field existed.
*/
language?: string;
/**
* Optional world books for lore injection. "constant" entries are always in
* the prompt; "triggered" entries inject when keywords match recent text.
*/
worldBooks?: WorldBook[];
};
// ──────────────────────────────────────────────────────────────────────
@@ -417,6 +512,18 @@ export type EngineConfig = {
// API contracts
// ──────────────────────────────────────────────────────────────────────
/**
 * BYOK (Bring Your Own Key) LLM credentials carried in request bodies.
 *
 * Each role — text, image, vision — is optional and can be configured
 * independently; every role uses the same credential shape. Keys never
 * persist or log server-side: they only pass through the request→config
 * build (see lib/config.ts buildByoEngineConfig). vision typically mirrors
 * text.
 */
export type ByoLlmKeys = Partial<
  Record<
    "text" | "image" | "vision",
    { provider: string; apiKey: string; baseUrl?: string; model?: string }
  >
>;
export type StartRequest = {
worldSetting: string;
styleGuide: string;
@@ -439,6 +546,13 @@ export type StartRequest = {
/** Active UI locale — see Session.language. Drives the engine's language
* directive so AI output is generated in the player's chosen language. */
language?: string;
/**
* BYOK: user-provided LLM keys. When present, server uses these to construct
* EngineConfig instead of reading from env. Per-role: text/image/vision can
* be independently configured. Keys never persist or log — they only pass
* through request→config construction.
*/
byo?: ByoLlmKeys;
};
// /api/parse-style-image — vision LLM extracts a textual painting-style
@@ -473,6 +587,8 @@ export type SceneRequest = {
session: Session;
/** See StartRequest.clientTts — drops server-side TTS for BYO-key clients. */
clientTts?: boolean;
/** See StartRequest.byo — BYOK LLM keys. */
byo?: ByoLlmKeys;
};
export type SceneResponse = {
@@ -534,6 +650,8 @@ export type VisionRequest = {
* server-side image re-fetch per click.
*/
annotatedImageBase64: string;
/** See StartRequest.byo — BYOK LLM keys. */
byo?: ByoLlmKeys;
};
export type VisionResponse = {
@@ -547,6 +665,8 @@ export type VisionResponse = {
/**
 * Request payload for classifying a piece of free-form text
 * (presumably the body of the classify-freeform route — confirm).
 */
export type FreeformClassifyRequest = {
  session: Session;
  /** The free-form text to classify. */
  freeformText: string;
  /** BYOK LLM keys — see StartRequest.byo. */
  byo?: ByoLlmKeys;
};
/** The two possible classifications of a free-form input. */
export type FreeformClassify =
  | "insert-beat"
  | "change-scene";
@@ -563,6 +683,8 @@ export type InsertBeatRequest = {
freeformAction: string;
/** See StartRequest.clientTts — drops server-side TTS for BYO-key clients. */
clientTts?: boolean;
/** See StartRequest.byo — BYOK LLM keys. */
byo?: ByoLlmKeys;
};
/** Partial beat fields produced by the insert-beat director. */
@@ -577,3 +699,69 @@ export type InsertBeatResponse = {
partial: InsertBeatPartial;
characters: Character[];
};
// ──────────────────────────────────────────────────────────────────────
// Paradigm D — streaming primitives (chatStream / StreamRouter / SSE)
//
// Output-side counterpart to prompt caching's input-side stable prefix
// (the two are orthogonal). chatStream yields incremental text + an
// end-of-stream usage promise. The StreamRouter slices the Writer's
// tagged stream into plan/story/choices and dispatches downstream. API
// routes serialize assembled fragments as SSE events for progressive
// client playback.
// ──────────────────────────────────────────────────────────────────────
/**
 * Token-usage statistics delivered when a stream ends.
 *
 * Deliberately SDK-agnostic, so this type file takes no dependency on any
 * specific provider package. Every field is optional.
 */
export type ChatStreamUsage = {
  prompt_tokens?: number;
  completion_tokens?: number;
  /** NOTE(review): presumably the cached share of the prompt tokens — confirm. */
  prompt_tokens_details?: {
    cached_tokens?: number;
  };
};
/**
 * What the streaming chat primitive (ai-client `chatStream`) returns.
 *
 * - `textStream` yields the incremental text chunks.
 * - `usage` resolves at stream end, so `summarizeSdkUsage` cache accounting
 *   works unchanged. It may resolve to `undefined`.
 */
export type ChatStreamResult = {
  textStream: AsyncIterable<string>;
  usage: Promise<ChatStreamUsage | undefined>;
};
/**
 * Callbacks fired by the StreamRouter while it slices the Writer's tagged
 * stream. Each one is optional, so a caller may subscribe to only a subset.
 */
export type StreamRouterHandlers = {
  /** Fired once `</plan>` closes — dispatch downstream media translators in parallel. */
  onPlan?: (plan: WriterScenePlan) => void;

  /** Fired with incremental `<story>` text — push it to the client for progressive playback. */
  onBeat?: (beatChunk: string) => void;

  /** Fired once `</story>` closes — the prose is final and ready for splitting. */
  onStoryComplete?: (rawStory: string) => void;

  /** Fired once `</choices>` closes. */
  onChoices?: (choices: BeatChoice[]) => void;
};
/**
 * Aggregate outcome of routing one Writer stream through to completion.
 *
 * `degraded` is set to true when tag parsing had to fall back (missing /
 * misordered / unclosed / timeout), per the degrade-before-main-path
 * reliability rule.
 */
export type StreamRouterResult = {
  plan?: WriterScenePlan;
  beats: Beat[];
  choices?: BeatChoice[];

  /**
   * Raw prose of the <story> segment (not JSON-parsed). The director passes
   * it to proseSplitter to produce Beat[].
   */
  rawStorySegment?: string;

  degraded: boolean;
};
/**
 * Server → client SSE events for progressive scene playback (paradigm D).
 *
 * A union discriminated on `type`. `TDone` is the terminal, fully assembled
 * payload: `SceneResponse` for `/api/scene`, `StartResponse` for
 * `/api/start`. The prefetch path consumes events up to `done` and
 * reassembles a complete response.
 */
export type SceneStreamEvent<TDone = SceneResponse> =
  | {
      type: "plan";
      plan: WriterScenePlan;
    }
  | {
      type: "beat";
      beat: Beat;
    }
  | {
      type: "background";
      imageUrl: string;
      sceneKey?: string;
    }
  | {
      type: "voice";
      name: string;
      voice: CharacterVoice;
    }
  | {
      type: "choices";
      choices: BeatChoice[];
    }
  | {
      type: "done";
      response: TDone;
    }
  | {
      type: "error";
      message: string;
      degraded?: boolean;
    };