0e4c2ebef4
Squash-merge the cloudflare-migration branch (7 commits by Kai ki) into staging with conflict resolution, feature integration, and bug fixes. Engine: - Paradigm D: single-stream Writer replacing dual-phase Plan/Beats - Delete Architect agent; story bible generated via Writer <plan> tag - Modular prompt architecture (segments/registry/builder) - StreamRouter for tagged stream splitting (<plan>/<story>/<choices>) Infrastructure: - Cloudflare Workers deployment (wrangler.jsonc, OpenNext adapter) - D1 database schema + Drizzle ORM (scaffolded, not yet active) - R2 storage helpers (scaffolded, not yet active) - Story persistence API routes + client-side persistence BYOK (Bring Your Own Key): - /api/llm/user-proxy with SSRF-protected LLM proxy (+ requireUser auth) - CORS-aware fetch in ai-client: auto-detect CORS failure, fallback to server proxy transparently via OpenAI SDK custom fetch - BYO config support added to classify-freeform and vision routes - SettingsModal CORS privacy notice (keys never logged/stored) SSE streaming: - engineClient.ts: fetchSSE helper for progressive scene events - startSession/requestScene accept optional emit callback - Fix SSE error event field name (error → message) in scene/start routes i18n integration: - Wire buildLanguageDirective into paradigm D's prompt builder - Update corsNotice i18n keys (zh-CN/en/ja) with CORS proxy privacy text - Preserve Session.language + LanguageSwitcher from i18n commit Co-authored-by: Kai ki <155355644+zbf1009@users.noreply.github.com> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
768 lines
35 KiB
TypeScript
768 lines
35 KiB
TypeScript
// ──────────────────────────────────────────────────────────────────────
// Beat — one dialogue / narration moment within a Scene.
// Multiple beats share the same background image; tapping or choosing
// advances among them WITHOUT regenerating the image.
// ──────────────────────────────────────────────────────────────────────

export type Beat = {
  /** Beat identifier (the kind of id carried by `Scene.entryBeatId`,
   * `BeatNext.nextBeatId` and `BeatChoiceEffect.targetBeatId`). */
  id: string;
  /** Narration text for this moment, if any. */
  narration?: string;
  /** Who speaks `line`, if anyone. */
  speaker?: string;
  /** The spoken dialogue line, if any. */
  line?: string;
  /** Free-form voice-acting direction for the line, sent to TTS only. Never displayed. */
  lineDelivery?: string;
  /**
   * Characters visible in this beat with their pose / expression for this moment.
   * Read by the Cinematographer when composing the scene's establishing shot —
   * the scene's entry beat is the visual anchor for the image.
   */
  activeCharacters?: BeatActiveCharacter[];
  /** How the player leaves this beat: plain continue, or a set of choices. */
  next: BeatNext;
};

/** One on-stage character within a beat. */
export type BeatActiveCharacter = {
  /** Character name. */
  name: string;
  /** Free-form Chinese-language description of pose / expression / what the character is doing. */
  pose?: string;
};

/** What follows a beat — discriminated on `type`. */
export type BeatNext =
  | { type: "continue"; nextBeatId: string }
  | { type: "choice"; choices: BeatChoice[] };

/** One selectable option offered by a `"choice"` beat. */
export type BeatChoice = {
  id: string;
  /** Text of the option. */
  label: string;
  /** What selecting this option does. */
  effect: BeatChoiceEffect;
};

/** Effect of a choice — discriminated on `kind`: either move to another beat
 * (`targetBeatId`) or leave the scene carrying a seed for the next one
 * (`nextSceneSeed`). */
export type BeatChoiceEffect =
  | { kind: "advance-beat"; targetBeatId: string }
  | { kind: "change-scene"; nextSceneSeed: string };

// ──────────────────────────────────────────────────────────────────────
// Orientation — session-wide image aspect, locked at session start.
// "landscape" → 16:9 (1792×1024), the default for desktop / mobile-landscape.
// "portrait"  → 9:16 (1024×1792), painted for mobile users holding the phone
// upright so the scene fills the screen instead of letterboxing a widescreen
// image. CSS object-fit then adapts the 9:16 frame to the exact device size.
// ──────────────────────────────────────────────────────────────────────

export type Orientation = "portrait" | "landscape";

/**
 * Normalize an untrusted orientation value (from a request body, or a
 * persisted session that predates the field) to a valid Orientation.
 *
 * @param value - Any value; only the exact string "portrait" is recognized.
 * @returns "portrait" when `value === "portrait"`, otherwise "landscape"
 *   (back-compat default). The comparison is strict and case-sensitive.
 */
export function coerceOrientation(value: unknown): Orientation {
  return value === "portrait" ? "portrait" : "landscape";
}

// ──────────────────────────────────────────────────────────────────────
// Scene — one background image + a graph of beats.
// The Director emits an entire Scene per call; the player navigates
// through its beats locally with zero network until exiting.
// ──────────────────────────────────────────────────────────────────────

export type Scene = {
  id: string;
  scenePrompt: string;
  /** The beats of this scene, linked to one another through `Beat.next`. */
  beats: Beat[];
  /** Id of the beat the player lands on when entering the scene. */
  entryBeatId: string;
  /**
   * Stable English slug identifying the visual scene's location + time,
   * e.g. "classroom-dusk", "rooftop-night". When the next Scene shares this
   * key, the Painter slots the previous Scene's image into Runware's
   * `referenceImages` (alongside character portraits) so the same physical
   * space stays visually consistent across cuts.
   */
  sceneKey?: string;
  /**
   * Runware UUID of this Scene's generated image. Cheapest form to send back
   * to Runware's `referenceImages` in subsequent calls (UUID > URL > base64
   * in transport cost). Not shown to the client — `imageUrl` is what renders.
   */
  imageUuid?: string;
  /**
   * Public CDN URL of this Scene's generated image. Returned to the client for
   * `<img src>` rendering; the client also feeds it through a Canvas 2D click
   * annotator before posting to `/api/vision` (see
   * `VisionRequest.annotatedImageBase64`).
   *
   * For MOCK_IMAGE=true this is a `data:image/svg+xml;...` data URI, not a
   * Runware URL — the client renders both forms transparently.
   */
  imageUrl?: string;
  /**
   * Orientation this scene's image was painted in. Mirrors the session's
   * locked orientation; recorded per-scene so the client can pick the right
   * intrinsic dimensions / object-fit even across legacy or mixed history.
   */
  orientation?: Orientation;
};

/** How the player left a scene — discriminated on `kind`: via one of the
 * offered choices, or via a free-form action. */
export type SceneExit =
  | {
      kind: "choice";
      choiceId: string;
      label: string;
      nextSceneSeed: string;
    }
  | { kind: "freeform"; action: string };

/** One played scene in `Session.history`. */
export type SceneHistoryEntry = {
  scene: Scene;
  /** Ids of the beats the player visited in this scene. */
  visitedBeatIds: string[];
  /** How the player left the scene; absent when no exit has been recorded. */
  exit?: SceneExit;
  /**
   * Story memory immediately after this scene was generated. Used by imported
   * story replays so continuing from an earlier shared scene preserves the
   * right narrative context instead of jumping to the export-time final state.
   */
  storyStateAfter?: StoryState;
};

// ──────────────────────────────────────────────────────────────────────
// Writer two-phase split
//
// The Writer runs as TWO LLM calls so scene-image generation can begin
// before the dialogue is fully written:
//   Phase A (WriterPlan) — the minimal skeleton the image pipeline needs:
//                          sceneSummary + sceneKey + the entry beat's
//                          on-stage roster + the full cast to design.
//   Phase B (beats)      — the full beats[] graph + storyStatePatch, written
//                          to honor the plan, overlapped with image gen.
// The Cinematographer + character design + Painter all run off the Plan, so
// Phase B's (longer) output is hidden behind the image pipeline.
//
// NOTE(review): the "Paradigm D" section of this file describes the Writer
// streaming ONE tagged response whose <plan> segment extends WriterPlan —
// confirm whether the two-call description above is still current.
// ──────────────────────────────────────────────────────────────────────

export type WriterPlan = {
  /**
   * Chinese-language scene synopsis (location + time + mood + key event +
   * opening hook). The sole input the Cinematographer composes the
   * establishing shot from.
   */
  sceneSummary: string;
  /** English location+time slug for cross-scene visual continuity. */
  sceneKey?: string;
  /**
   * Beat id the player lands on when entering the scene. Phase B must emit a
   * beat with this id (reconciled if it doesn't).
   */
  entryBeatId: string;
  /**
   * Every NPC name that appears anywhere in this scene. Drives character
   * design (card + portrait + voice) IN PARALLEL with Phase B beat writing, so
   * the whole cast is provisioned by the time the scene returns. Phase B may
   * only use names from this list (plus the POV "你"). Never includes the player.
   */
  cast: string[];
  /**
   * The entry beat's on-stage roster (who's visible + pose when the player
   * lands). Drives the Cinematographer's framing and the entry-beat portraits
   * the Painter anchors to. Never includes the POV player.
   */
  entryActiveCharacters: BeatActiveCharacter[];
  /**
   * The entry beat's speaker — an NPC name, "你" (player speaking), or
   * undefined for a pure narration/environment entry. Drives shot selection.
   */
  entrySpeaker?: string;
};

// ──────────────────────────────────────────────────────────────────────
// Paradigm D — Writer single-pass streaming plan extensions.
//
// In paradigm D the Writer streams one tagged response: <plan> → <story>
// → <choices>. WriterScenePlan is the parsed <plan> segment: the existing
// WriterPlan skeleton PLUS per-character scene intents (and story bible on
// first scene), handed to the downstream media translators the instant
// </plan> closes.
// ──────────────────────────────────────────────────────────────────────

/**
 * Per-scene performance intent for one character, authored by the Writer in
 * the <plan> segment. Ephemeral (this scene only) — distinct from the
 * persistent CharacterPersona card. Feeds downstream media translators.
 */
export type CharacterIntent = {
  name: string;
  /** Emotional tone for this scene. */
  mood?: string;
  /** Motivation / goal in this scene. */
  motivation?: string;
  /** Speaking tone for this scene (guides dialogue texture + TTS lineDelivery). */
  speakingTone?: string;
};

/**
 * Parsed <plan> tag: the existing WriterPlan shape plus per-character scene
 * intents and optional story bible (first scene only). The optional extension
 * keeps any degraded / minimal plan valid — downstream consumers see a
 * WriterPlan superset.
 */
export type WriterScenePlan = WriterPlan & {
  /** Per-character performance intents for this scene, dispatched to the
   * downstream media translators when </plan> closes. */
  characterIntents?: CharacterIntent[];
  /** Story bible (produced on the opening scene only) — the stable-region
   * fields. Plans for later scenes omit this field. */
  storyBible?: {
    logline: string;
    genreTags: string;
    protagonist: string;
    castNotes?: string;
  };
};

// ──────────────────────────────────────────────────────────────────────
// Characters & voices (TTS)
// ──────────────────────────────────────────────────────────────────────

/** A character's provisioned voice — discriminated on `provider`. */
export type CharacterVoice =
  | {
      provider: "xiaomi";
      /** Xiaomi MiMo design output stored as reference audio for later clones. */
      referenceAudioBase64: string;
      mimeType: string;
    }
  | {
      provider: "stepfun";
      /**
       * StepFun preset voice ID (e.g. "cixingnansheng"). Selected by keyword
       * matching against the LLM-written voiceDescription — no network call
       * on provision (StepFun has no voicedesign endpoint), so this carries
       * only the picked preset, not a clip.
       */
      voiceId: string;
      /** TTS model used at synth time (step-tts-mini / step-tts-2 / stepaudio-2.5-tts). */
      model: string;
      mimeType: string;
    };

// ──────────────────────────────────────────────────────────────────────
// CharacterPersona — narrative / story dimension of a Character.
// Merged into Character via intersection (all optional). Filled primarily
// by the Writer's <plan> chain-of-thought (paradigm D); the CharacterDesigner
// then realizes it into visual + voice cards. Absent on legacy sessions →
// callers degrade to "name only". SENTINEL append-only: adding persona
// only appends bytes to the stable prompt prefix — never reorders.
// ──────────────────────────────────────────────────────────────────────

export type CharacterPersona = {
  /** Background / identity / core premise. */
  persona?: string;
  /** Personality tags, e.g. ["tsundere", "scheming", "fiercely loyal"]. */
  personalityTraits?: string[];
  /** Speaking style / catchphrases — the key to dialogue texture. */
  speakingStyle?: string;
  /** 2-3 representative lines, used as few-shot anchors for tone. */
  sampleDialogue?: string[];
  /** Relationship / attitude toward the player ("你"). */
  relationshipToPlayer?: string;
  /** Hidden information / foreshadowing that can drive later reversals
   * (not surfaced by default). */
  secrets?: string[];
};

/** A registered character: voice + visual cards, intersected with the
 * optional narrative fields of CharacterPersona. */
export type Character = {
  name: string;
  /**
   * Chinese-language voice-acting direction card. Must begin with explicit
   * gender, then age / timbre / personality / speed / accent. Fed to Xiaomi
   * MiMo's voicedesign endpoint when the voice is first provisioned.
   */
  voiceDescription: string;
  /**
   * English appearance card — comma-separated visual attributes following
   * Runware/FLUX prompt-engineering convention. Fed to the Painter as a
   * character archetype anchor so the same face/outfit/style stays consistent
   * across every scene this character appears in.
   */
  visualDescription?: string;
  /**
   * Runware UUID for the base portrait. Generated by the CharacterDesigner
   * once, reused as a `referenceImages` entry on every subsequent scene the
   * character appears in. UUID is the cheapest reference form for Runware.
   */
  basePortraitUuid?: string;
  /**
   * Public CDN URL for the base portrait. Same image as `basePortraitUuid`;
   * kept around for the client (if it ever wants to render character cards)
   * and as a fallback reference form for `referenceImages` when UUID is absent.
   */
  basePortraitUrl?: string;
  /**
   * The character's provisioned voice. Per CharacterVoice this is either a
   * Xiaomi MiMo reference-audio clip or a StepFun preset selection.
   */
  voice?: CharacterVoice;
  /**
   * StepFun preset voice id (e.g. "cixingnansheng"). Only present on
   * characters designed while the server ran StepFun, OR on prebaked
   * homepage cards enriched with a StepFun voice id. Lets the client send a
   * lightweight beat-audio request (no ~220KB Xiaomi reference audio) when the
   * server runs StepFun, and lets the server normalize an off-provider voice
   * without a fresh provision. Validated against the catalog at synth time.
   */
  stepfunVoiceId?: string;
} & CharacterPersona;

/** A single beat's synthesized audio, attached to the response. */
export type BeatAudio = {
  /** Base64-encoded audio payload. */
  base64: string;
  /** MIME type of the payload. */
  mime: string;
};

// ──────────────────────────────────────────────────────────────────────
// StoryState — the persistent "story bible" + evolving narrative memory.
//
// Created once at session start by the Architect agent (rich opening
// planning), then carried across every scene and incrementally updated by
// the Writer. This is the single throughline that keeps tone, cast, and
// stakes coherent across scene cuts — without it each Writer call would
// re-derive the whole arc from a flat beat log and drift.
//
// Split into STABLE fields (set by the Architect, rarely change) and
// VOLATILE fields (rewritten each scene via StoryStatePatch).
//
// NOTE(review): the "Architect" wording may be stale — elsewhere in this
// file WriterScenePlan.storyBible documents the bible (logline / genreTags /
// protagonist / castNotes) as produced by the Writer's <plan> on the opening
// scene. Confirm which agent authors the stable fields.
// ──────────────────────────────────────────────────────────────────────

export type StoryState = {
  // ── Stable (Architect-authored; persists unless deliberately revised) ──
  /** One-line central dramatic question / main-plot hook. */
  logline: string;
  /** Genre + tone tags anchoring the payoff ("satisfaction beat") framework,
   * e.g. "sweet romance / campus / slow-burn healing". */
  genreTags: string;
  /**
   * Second-person protagonist card: who "你" (the player) is, the immediate
   * situation, the core want, and a flaw/secret. The audience proxy — never
   * rendered.
   */
  protagonist: string;
  /** Key supporting cast and their relationship/tension with the player (one per line). */
  castNotes?: string;

  // ── Volatile (rewritten each scene by the Writer's StoryStatePatch) ──
  /** Rolling, compressed synopsis of what has happened so far (~3-5 sentences). */
  synopsis: string;
  /** Unresolved hooks / mysteries / questions still owed to the player. */
  openThreads?: string[];
  /** Current relationship/emotion state per character, e.g.
   * "Xia Hai: affection rising, just got halfway through confessing to you". */
  relationships?: string[];
  /**
   * Where the story is heading next — the conflict/reversal/suspense the
   * next scene should drive toward. Seeds the next scene's hook.
   */
  nextHook?: string;
};

/**
 * The volatile subset the Writer rewrites after each scene. Stable fields
 * (logline/genreTags/protagonist/castNotes) are preserved by the merge.
 * Every field is optional, so a patch may carry any subset.
 */
export type StoryStatePatch = {
  synopsis?: string;
  openThreads?: string[];
  relationships?: string[];
  nextHook?: string;
};

// ──────────────────────────────────────────────────────────────────────
// WorldBook — lightweight lore injection system.
//
// Entries with position "constant" are always injected into the stable
// prompt prefix. Entries with position "triggered" are scanned against
// recent beat text and injected into the dynamic suffix when keywords
// match. Priority controls ordering when multiple entries fire.
// ──────────────────────────────────────────────────────────────────────

export type WorldBookEntry = {
  id: string;
  /** Keywords that trigger this entry's injection (for triggered entries). */
  keys: string[];
  /** The lore content to inject into the prompt. */
  content: string;
  /** "constant" = always injected (stable prefix); "triggered" = keyword-matched (dynamic suffix). */
  position: "constant" | "triggered";
  /** Higher priority entries are injected first. Defaults to 0. */
  priority?: number;
};

/** A named collection of lore entries. */
export type WorldBook = {
  id: string;
  name: string;
  entries: WorldBookEntry[];
};

// ──────────────────────────────────────────────────────────────────────
// Session
// ──────────────────────────────────────────────────────────────────────

export type Session = {
  id: string;
  /** Creation timestamp. NOTE(review): unit (presumably epoch ms) is not shown here — confirm. */
  createdAt: number;
  worldSetting: string;
  styleGuide: string;
  /** Scenes played so far, each with visited beats and how it was exited. */
  history: SceneHistoryEntry[];
  /** Character registry — accumulates across scenes; voices + portraits persist for reuse. */
  characters: Character[];
  /**
   * Persistent story bible + evolving narrative memory. Set at session start
   * by the Architect, carried by the client across every /api/scene call, and
   * updated by the Writer each scene. Optional for back-compat with any
   * session payload created before this field existed.
   *
   * NOTE(review): "Architect" may be stale — WriterScenePlan.storyBible in
   * this file documents the bible as emitted by the Writer's <plan>. Confirm.
   */
  storyState?: StoryState;
  /**
   * Optional user-uploaded style reference image (data URL — `data:image/...;base64,...`).
   * When set, the Painter prepends it to `referenceImages` on every scene so the
   * uploaded image anchors painting style (brush, color, mood) across the whole
   * session. Resized client-side before upload (~512px max dim) to keep session
   * payload small for /api/scene round-trips.
   */
  styleReferenceImage?: string;
  /**
   * Session-wide image orientation, locked at session start from the client's
   * device + orientation and carried on every /api/scene call so all scenes
   * share one aspect ratio. Absent → "landscape" (back-compat).
   */
  orientation?: Orientation;
  /**
   * Optional player-chosen display name. When set, NPC dialogue will address
   * the player by this name instead of the generic "你". Stored client-side
   * only (localStorage); never persisted server-side.
   */
  playerName?: string;
  /**
   * Active UI locale when the session was started, in BCP-47 form (e.g.
   * "zh-CN", "en", "ja"). The engine appends a single-line language directive
   * to the Architect / Writer user messages so AI-generated dialogue, beats,
   * and narration are produced in this language. Absent → "zh-CN" for
   * back-compat with sessions created before this field existed.
   *
   * NOTE(review): the "Architect" mention may be stale — see the note on
   * `storyState` in this type.
   */
  language?: string;
  /**
   * Optional world books for lore injection. "constant" entries are always in
   * the prompt; "triggered" entries inject when keywords match recent text.
   */
  worldBooks?: WorldBook[];
};

// ──────────────────────────────────────────────────────────────────────
// Vision
// ──────────────────────────────────────────────────────────────────────

/** Interpretation of a background click, carried as `VisionResponse.intent`. */
export type ClickIntent = {
  /** The click expressed as a free-form player action. */
  freeformAction: string;
  /** Explanation accompanying the interpretation. */
  reasoning: string;
};

/** Outcome of classifying a click: insert a beat within the current scene,
 * or trigger a scene change. */
export type VisionClassify = "insert-beat" | "change-scene";

// ──────────────────────────────────────────────────────────────────────
// Provider config
// ──────────────────────────────────────────────────────────────────────

/**
 * Wire protocol used to talk to a model provider. Which values are valid
 * depends on the model role — each ai-client adapter accepts its own subset
 * and falls back to a sensible default for anything else:
 *
 *   openai_compatible  text / vision / image — OpenAI Chat Completions +
 *                      `/images/generations` (self-implemented fetch; the
 *                      default for text/vision when unset)
 *   openai             image only — OpenAI gpt-image via the
 *                      official OpenAI SDK, unlocks reference-image editing
 *                      (for text/vision use openai_compatible, which already
 *                      speaks OpenAI's format)
 *   runware            image only — Runware task-array protocol
 *                      (self-implemented; the default for runware.ai URLs)
 */
export type ProviderProtocol =
  | "openai_compatible"
  | "openai"
  | "runware";

/** Connection settings for one model role (text / image / vision). */
export type ProviderConfig = {
  baseUrl: string;
  apiKey: string;
  model: string;
  /**
   * Wire protocol. When unset, callers apply a role-specific default:
   * text/vision → "openai_compatible"; image → inferred from baseUrl
   * (runware.ai → "runware", otherwise "openai_compatible") so existing
   * deployments keep working without setting *_PROVIDER.
   */
  provider?: ProviderProtocol;
};

/** Connection settings for the text-to-speech provider. */
export type TtsConfig = {
  baseUrl: string;
  apiKey: string;
  /** Base model name; adapter derives "-voicedesign" / "-voiceclone" suffixes. */
  speechModel: string;
};

/**
 * Which TTS provider the server is configured for (inferred from TtsConfig's
 * base URL by lib/tts-client's isStepfun). Exposed to the client via the
 * /api/tts-provider route so the play page can send only the voice fields
 * the server actually needs — e.g. skip the ~220KB Xiaomi reference audio
 * when the server runs StepFun (saving Fast Origin Transfer bandwidth).
 * `null` means no server-side TTS (silent). BYO client TTS takes precedence
 * over this signal.
 */
export type TtsProvider = "stepfun" | "xiaomi" | null;

// /api/tts-provider — lightweight GET returning the server's TTS provider so
// the client can shape beat-audio request bodies accordingly (see fetchBeatAudio
// in app/play/page.tsx). Response is a few dozen bytes; runs once per session.
export type TtsProviderResponse = {
  /** The server's TTS provider, or `null` when it has none. */
  provider: TtsProvider;
};

/** Full engine configuration: one provider per model role plus optional
 * TTS and image-pipeline tuning. */
export type EngineConfig = {
  text: ProviderConfig;
  image: ProviderConfig;
  vision: ProviderConfig;
  /** Optional — when missing the game runs silently (no TTS). */
  tts?: TtsConfig;
  /**
   * When true the renderer returns a placeholder image instead of calling the
   * image API. NOTE(review): this was documented as a "placeholder PNG", but
   * Scene.imageUrl documents the MOCK_IMAGE result as a `data:image/svg+xml`
   * data URI — confirm the actual format.
   */
  mockImage?: boolean;
  /**
   * Per-attempt hard timeout (ms) for image-generation requests. Unset → no
   * client-side timeout (only the provider's own gateway limits apply, e.g.
   * Runware kills tasks at ~55s with a 504).
   */
  imageTimeoutMs?: number;
  /**
   * Painter scene-paint hedge threshold (ms). When the Tier-A (referenced)
   * paint hasn't completed after this long, a second identical request races
   * the first and the earlier result wins. Unset/0 → hedging disabled.
   * Derived from healthy-day Runware p95 (~14s); recommended 15000.
   */
  imageHedgeMs?: number;
};

// ──────────────────────────────────────────────────────────────────────
// API contracts
// ──────────────────────────────────────────────────────────────────────

/** Credentials for a single model role inside a BYOK payload. Module-private:
 * factors out the shape shared by every role of ByoLlmKeys. */
type ByoRoleCredentials = {
  provider: string;
  apiKey: string;
  baseUrl?: string;
  model?: string;
};

/**
 * BYOK (Bring Your Own Key) LLM credentials carried in request bodies.
 * Per-role: text/image/vision can be independently configured. Keys never
 * persist or log server-side — they only pass through request→config build
 * (see lib/config.ts buildByoEngineConfig). vision typically mirrors text.
 */
export type ByoLlmKeys = {
  text?: ByoRoleCredentials;
  image?: ByoRoleCredentials;
  vision?: ByoRoleCredentials;
};

/** Request body for starting a session (the /api/start route). */
export type StartRequest = {
  worldSetting: string;
  styleGuide: string;
  /** Optional user-uploaded style reference image — see Session.styleReferenceImage. */
  styleReferenceImage?: string;
  /**
   * When true the client supplied its own Xiaomi TTS key and will provision +
   * synth voices in the browser (key never touches our server). The route then
   * drops `config.tts` so the engine skips all server-side TTS work.
   */
  clientTts?: boolean;
  /**
   * Device orientation chosen at session start. "portrait" makes the engine
   * paint 9:16 vertical scene images (mobile, held upright); "landscape"
   * (default) keeps 16:9 widescreen. Locked for the whole session.
   */
  orientation?: Orientation;
  /** Optional player display name — see Session.playerName. */
  playerName?: string;
  /**
   * Active UI locale — see Session.language. Drives the engine's language
   * directive so AI output is generated in the player's chosen language.
   */
  language?: string;
  /**
   * BYOK: user-provided LLM keys. When present, server uses these to construct
   * EngineConfig instead of reading from env. Per-role: text/image/vision can
   * be independently configured. Keys never persist or log — they only pass
   * through request→config construction.
   */
  byo?: ByoLlmKeys;
};

// /api/parse-style-image — vision LLM extracts a textual painting-style
// prompt from a user-uploaded reference image. The same base64 is echoed
// back so the client can later pass it through to /api/start.
export type ParseStyleImageRequest = {
  /** Data URL: `data:image/...;base64,...`. */
  imageDataUrl: string;
};

/** Response of /api/parse-style-image. */
export type ParseStyleImageResponse = {
  /** English style prompt suitable as a styleGuide (FLUX-friendly attributes). */
  stylePrompt: string;
};

/** Response of /api/start: the new session id plus the opening scene. */
export type StartResponse = {
  sessionId: string;
  scene: Scene;
  /** Public CDN URL (or data URI in MOCK_IMAGE mode) for the rendered scene background. */
  imageUrl: string;
  /** Character registry with voice references + visual cards provisioned. */
  characters: Character[];
  /**
   * Story bible created by the Architect + updated by the opening scene's
   * Writer. The client persists this into the session for later /api/scene calls.
   *
   * NOTE(review): "Architect" may be stale — WriterScenePlan.storyBible in
   * this file documents the bible as emitted by the Writer's <plan>. Confirm.
   */
  storyState: StoryState;
};

// /api/scene — generates the next Scene, given a session whose latest
// history entry has `exit` set. Also used for prefetch speculation
// (frontend synthesizes a speculative exit).
export type SceneRequest = {
  session: Session;
  /** See StartRequest.clientTts — drops server-side TTS for BYO-key clients. */
  clientTts?: boolean;
  /** See StartRequest.byo — BYOK LLM keys. */
  byo?: ByoLlmKeys;
};

/** Response of /api/scene: the generated scene plus updated registries. */
export type SceneResponse = {
  scene: Scene;
  /** Public CDN URL (or data URI in MOCK_IMAGE mode) for the rendered scene background. */
  imageUrl: string;
  characters: Character[];
  /**
   * Story bible after this scene's Writer applied its update. The client
   * must persist this back into the session so the throughline survives the
   * next scene cut.
   */
  storyState: StoryState;
};

// /api/beat-audio — lazily synthesize one beat's voice. Client fires this
// per beat after a scene loads; server has a per-call timeout so MiMo
// tail-latency cannot block the UI. A null audio response means "play silent."
export type BeatAudioRequest = {
  /** The subset of Beat fields needed for synthesis; `line` is required here. */
  beat: {
    id: string;
    line: string;
    lineDelivery?: string;
  };
  /**
   * The speaker's already-provisioned voice. Optional — when the server
   * runs a DIFFERENT provider than `voice.provider` (e.g. the client holds a
   * Xiaomi voice from a prebaked card but the server runs StepFun), the
   * client may omit `voice` and send `voiceDescription` + `stepfunVoiceId`
   * instead to save the ~220KB reference-audio transfer. The server then
   * re-provisions against its own provider before synthesizing.
   */
  voice?: CharacterVoice;
  /**
   * Voice-design card (Chinese-language). Used by the server to re-provision
   * when `voice` is absent or its provider doesn't match the server's TTS.
   */
  voiceDescription?: string;
  /**
   * Speaker name — used as the StepFun provision salt for archetype spreading
   * when the server falls back to pickStepfunVoiceId.
   */
  characterName?: string;
  /**
   * Pre-selected StepFun preset id (from a live CharacterDesigner pick or a
   * prebaked card). Honored directly when the server runs StepFun, skipping
   * both the keyword scorer and a network provision.
   */
  stepfunVoiceId?: string;
};

/** Response of /api/beat-audio. */
export type BeatAudioResponse = {
  /** Synthesized clip, or `null` meaning "play silent". */
  audio: BeatAudio | null;
};

// /api/vision — interprets a background click on the current image and
// classifies whether it should insert a beat (in-scene exploration) or
// trigger a scene change.
export type VisionRequest = {
  session: Session;
  /**
   * Raw PNG base64 (no `data:` prefix) of the scene image WITH the player's
   * click marker already drawn on it by the browser's Canvas 2D. The server
   * forwards this straight to the vision LLM as an OpenAI-compatible
   * image_url.
   *
   * Annotation lives client-side so the engine has no Node-native image
   * dependency (sharp doesn't run on Cloudflare Workers) and we save a
   * server-side image re-fetch per click.
   */
  annotatedImageBase64: string;
  /** See StartRequest.byo — BYOK LLM keys. */
  byo?: ByoLlmKeys;
};

/** Response of /api/vision. */
export type VisionResponse = {
  /** The click interpreted as a free-form action, with reasoning. */
  intent: ClickIntent;
  /** Whether that action inserts a beat or changes scene. */
  classify: VisionClassify;
};

// /api/classify-freeform — classifies a player's freeform text input
// into one of three paths: match an existing choice, insert a beat
// in-scene, or trigger a scene change.
//
// NOTE(review): the FreeformClassify type in this file carries only two
// outcomes ("insert-beat" | "change-scene"); the "match an existing choice"
// path is not represented in the response type — confirm where (or whether)
// it is handled.
export type FreeformClassifyRequest = {
  session: Session;
  /** The player's raw text input. */
  freeformText: string;
  /** See StartRequest.byo — BYOK LLM keys. */
  byo?: ByoLlmKeys;
};

/** Outcome of classifying freeform text: insert a beat within the current
 * scene, or trigger a scene change. */
export type FreeformClassify = "insert-beat" | "change-scene";

/** Response of /api/classify-freeform. */
export type FreeformClassifyResponse = {
  classify: FreeformClassify;
  /** The free-form action associated with the classification. */
  freeformAction: string;
};

// /api/insert-beat — generates a single transient beat in response to
// a freeform vision action. Does NOT regenerate the image.
export type InsertBeatRequest = {
  session: Session;
  /** The free-form action the inserted beat responds to. */
  freeformAction: string;
  /** See StartRequest.clientTts — drops server-side TTS for BYO-key clients. */
  clientTts?: boolean;
  /** See StartRequest.byo — BYOK LLM keys. */
  byo?: ByoLlmKeys;
};

/**
 * Partial beat fields produced by the insert-beat director. Field names and
 * types mirror the same-named optional fields on Beat.
 */
export type InsertBeatPartial = {
  narration?: string;
  speaker?: string;
  line?: string;
  lineDelivery?: string;
};

/** Response of /api/insert-beat. */
export type InsertBeatResponse = {
  /** The generated beat content. */
  partial: InsertBeatPartial;
  /** Character registry returned with the beat. */
  characters: Character[];
};

// ──────────────────────────────────────────────────────────────────────
// Paradigm D — streaming primitives (chatStream / StreamRouter / SSE)
//
// Output-side counterpart to prompt caching's input-side stable prefix
// (the two are orthogonal). chatStream yields incremental text + an
// end-of-stream usage promise. The StreamRouter slices the Writer's
// tagged stream into plan/story/choices and dispatches downstream. API
// routes serialize assembled fragments as SSE events for progressive
// client playback.
// ──────────────────────────────────────────────────────────────────────

/**
 * Token usage stats returned at stream end. Kept SDK-agnostic so the type
 * file doesn't depend on any specific provider package. Every field is
 * optional.
 */
export type ChatStreamUsage = {
  prompt_tokens?: number;
  completion_tokens?: number;
  prompt_tokens_details?: { cached_tokens?: number };
};

/**
 * Return shape of the streaming chat primitive (ai-client `chatStream`).
 * `textStream` yields incremental chunks; `usage` resolves at stream end
 * so `summarizeSdkUsage` cache accounting works unchanged.
 */
export type ChatStreamResult = {
  textStream: AsyncIterable<string>;
  /** Resolves to the usage stats, or `undefined` when none are available. */
  usage: Promise<ChatStreamUsage | undefined>;
};

/**
 * Callbacks the StreamRouter fires as it slices the Writer's tagged stream.
 * All optional so a caller can subscribe to a subset.
 */
export type StreamRouterHandlers = {
  /** `</plan>` closed — dispatch downstream media translators in parallel. */
  onPlan?: (plan: WriterScenePlan) => void;
  /** `<story>` incremental text — push to client for progressive playback. */
  onBeat?: (beatChunk: string) => void;
  /** `</story>` closed — prose finalized, ready for splitting. */
  onStoryComplete?: (rawStory: string) => void;
  /** `</choices>` closed. */
  onChoices?: (choices: BeatChoice[]) => void;
};

/**
 * Aggregate result of routing one Writer stream to completion. `degraded` is
 * true when tag parsing fell back (missing / misordered / unclosed / timeout),
 * per the degrade-before-main-path reliability rule.
 */
export type StreamRouterResult = {
  plan?: WriterScenePlan;
  beats: Beat[];
  choices?: BeatChoice[];
  /**
   * Raw prose content of the <story> segment (not JSON-parsed). The director
   * feeds this to proseSplitter to produce Beat[].
   */
  rawStorySegment?: string;
  degraded: boolean;
};

/**
 * Server → client SSE events for progressive scene playback (paradigm D),
 * discriminated on `type`.
 * `TDone` is the terminal full-assembly payload — `SceneResponse` for
 * `/api/scene`, `StartResponse` for `/api/start`. The prefetch path
 * consumes events to `done` and reassembles a complete response.
 */
export type SceneStreamEvent<TDone = SceneResponse> =
  | { type: "plan"; plan: WriterScenePlan }
  | { type: "beat"; beat: Beat }
  | { type: "background"; imageUrl: string; sceneKey?: string }
  | { type: "voice"; name: string; voice: CharacterVoice }
  | { type: "choices"; choices: BeatChoice[] }
  | { type: "done"; response: TDone }
  | { type: "error"; message: string; degraded?: boolean };