Files
infiplot-web/lib/types/index.ts
T
yuanzonghao d5b4a02cb3 refactor(engine): remove follow-up choices from insert-beat, keep multi-beat only
Insert-beat is a pure in-scene micro-interaction — adding choices that
lead to change-scene contradicted its purpose. Now insert-beat generates
1-3 richer beats then loops back to the original options, which is the
natural UX for "you glanced at something decorative."

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-06-24 19:09:09 +08:00

775 lines
35 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// ──────────────────────────────────────────────────────────────────────
// Beat — one dialogue / narration moment within a Scene.
// Multiple beats share the same background image; tapping or choosing
// advances among them WITHOUT regenerating the image.
// ──────────────────────────────────────────────────────────────────────
/**
 * A single dialogue / narration moment inside a Scene. Beats of one scene
 * share the scene's background image.
 */
export type Beat = {
  /** Identifier used by `continue` links and choice effects to target this beat. */
  id: string;
  /** Narration text for this moment, if any. */
  narration?: string;
  /** Name of whoever speaks `line`, if anyone. */
  speaker?: string;
  /** The spoken line, if any. */
  line?: string;
  /** Free-form voice-acting direction for `line`. Sent to TTS only; never displayed. */
  lineDelivery?: string;
  /**
   * Characters visible during this beat, each with the pose / expression for
   * this moment. The Cinematographer reads this when composing the scene's
   * establishing shot — the beat the player enters the scene on is the
   * visual anchor for the image.
   */
  activeCharacters?: BeatActiveCharacter[];
  /** What follows this beat: a link to another beat, or a set of choices. */
  next: BeatNext;
};
/** One character visible in a beat. */
export type BeatActiveCharacter = {
  /** Character name. */
  name: string;
  /**
   * Free-form Chinese-language description of the character's pose /
   * expression / current action.
   */
  pose?: string;
};
/**
 * Transition out of a beat, discriminated on `type`: either continue
 * straight to another beat, or present a list of choices.
 */
export type BeatNext =
  | {
      type: "continue";
      /** Id of the beat to advance to. */
      nextBeatId: string;
    }
  | {
      type: "choice";
      /** Options offered to the player. */
      choices: BeatChoice[];
    };
/** One selectable option offered by a `choice` beat. */
export type BeatChoice = {
  /** Identifier of this choice. */
  id: string;
  /** Text label of the option. */
  label: string;
  /** What picking this option does. */
  effect: BeatChoiceEffect;
};
/**
 * Outcome of a choice, discriminated on `kind`: move to another beat of the
 * same scene, or leave the scene carrying a seed for the next one.
 */
export type BeatChoiceEffect =
  | {
      kind: "advance-beat";
      /** Id of the beat to jump to. */
      targetBeatId: string;
    }
  | {
      kind: "change-scene";
      /** Seed text for the next scene. */
      nextSceneSeed: string;
    };
// ──────────────────────────────────────────────────────────────────────
// Orientation — session-wide image aspect, locked at session start.
// "landscape" → 16:9 (1792×1024), the default for desktop / mobile-landscape.
// "portrait" → 9:16 (1024×1792), painted for mobile users holding the phone
// upright so the scene fills the screen instead of letterboxing a widescreen
// image. CSS object-fit then adapts the 9:16 frame to the exact device size.
// ──────────────────────────────────────────────────────────────────────
/** Session-wide image aspect: "portrait" (9:16) or "landscape" (16:9). */
export type Orientation = "portrait" | "landscape";

/**
 * Turns an untrusted orientation value — e.g. one read from a request body,
 * or from a persisted session that predates the field — into a valid
 * Orientation.
 *
 * @param value Any value; only the exact string "portrait" is recognized.
 * @returns "portrait" when `value` is exactly "portrait"; "landscape" for
 *   everything else (the back-compat default).
 */
export function coerceOrientation(value: unknown): Orientation {
  if (value === "portrait") {
    return "portrait";
  }
  // Missing, malformed, or legacy values all collapse to the default.
  return "landscape";
}
// ──────────────────────────────────────────────────────────────────────
// Scene — one background image + a graph of beats.
// The Director emits an entire Scene per call; the player navigates
// through its beats locally with zero network until exiting.
// ──────────────────────────────────────────────────────────────────────
/** One background image plus the graph of beats played on top of it. */
export type Scene = {
  /** Identifier of this scene. */
  id: string;
  /** Prompt text associated with this scene's image. */
  scenePrompt: string;
  /** All beats belonging to this scene. */
  beats: Beat[];
  /** Id of the beat the player starts on. */
  entryBeatId: string;
  /**
   * Stable English slug for the scene's location + time, e.g.
   * "classroom-dusk" or "rooftop-night". If the following Scene carries the
   * same key, the Painter adds the previous Scene's image to Runware's
   * `referenceImages` (next to the character portraits) so the same physical
   * space stays visually consistent across cuts.
   */
  sceneKey?: string;
  /**
   * Runware UUID of the image generated for this Scene. It is the cheapest
   * form to pass back through `referenceImages` on later calls (transport
   * cost: UUID > URL > base64, cheapest first). Not shown to the client —
   * rendering uses `imageUrl`.
   */
  imageUuid?: string;
  /**
   * Public CDN URL of the generated image. The client renders it via
   * `<img src>` and also runs it through a Canvas 2D click annotator before
   * posting to `/api/vision` (see `VisionRequest.annotatedImageBase64`).
   *
   * With MOCK_IMAGE=true this holds a `data:image/svg+xml;...` data URI
   * rather than a Runware URL; the client renders either form.
   */
  imageUrl?: string;
  /**
   * Orientation the image was painted in. Mirrors the session's locked
   * orientation and is stored per scene so the client can choose the right
   * intrinsic dimensions / object-fit even with legacy or mixed history.
   */
  orientation?: Orientation;
};
/**
 * How the player left a scene, discriminated on `kind`: by picking a listed
 * choice, or through a free-form action.
 */
export type SceneExit =
  | {
      kind: "choice";
      /** Id of the choice that was picked. */
      choiceId: string;
      /** Label of that choice. */
      label: string;
      /** Seed text for the next scene. */
      nextSceneSeed: string;
    }
  | {
      kind: "freeform";
      /** The free-form action text. */
      action: string;
    };
/** One played scene as recorded in a session's history. */
export type SceneHistoryEntry = {
  /** The scene itself. */
  scene: Scene;
  /** Ids of the beats the player visited in this scene. */
  visitedBeatIds: string[];
  /** How the player left the scene, once they have. */
  exit?: SceneExit;
  /**
   * Story memory as it stood right after this scene was generated. Imported
   * story replays use it so that continuing from an earlier shared scene
   * keeps the correct narrative context rather than jumping to the
   * export-time final state.
   */
  storyStateAfter?: StoryState;
};
// ──────────────────────────────────────────────────────────────────────
// Writer two-phase split
//
// The Writer runs as TWO LLM calls so scene-image generation can begin
// before the dialogue is fully written:
// Phase A (WriterPlan) — the minimal skeleton the image pipeline needs:
// sceneSummary + sceneKey + the entry beat's
// on-stage roster + the full cast to design.
// Phase B (beats) — the full beats[] graph + storyStatePatch, written
// to honor the plan, overlapped with image gen.
// The Cinematographer + character design + Painter all run off the Plan, so
// Phase B's (longer) output is hidden behind the image pipeline.
// ──────────────────────────────────────────────────────────────────────
/**
 * Phase A output of the Writer: the minimal skeleton the image pipeline
 * needs before the full dialogue is written.
 */
export type WriterPlan = {
  /**
   * Chinese-language scene synopsis (location + time + mood + key event +
   * opening hook). It is the only input the Cinematographer uses to compose
   * the establishing shot.
   */
  sceneSummary: string;
  /** English location+time slug used for cross-scene visual continuity. */
  sceneKey?: string;
  /**
   * Id of the beat the player lands on when the scene opens. Phase B must
   * produce a beat with this id (it is reconciled when it does not).
   */
  entryBeatId: string;
  /**
   * Names of every NPC appearing anywhere in the scene. Character design
   * (card + portrait + voice) runs off this list IN PARALLEL with Phase B
   * beat writing, so the whole cast is provisioned by the time the scene
   * returns. Phase B may use only these names (plus the POV "你", i.e.
   * "you"). The player is never listed.
   */
  cast: string[];
  /**
   * On-stage roster of the entry beat (who is visible, and their pose, when
   * the player lands). Drives the Cinematographer's framing and the
   * entry-beat portraits the Painter anchors to. The POV player is never
   * listed.
   */
  entryActiveCharacters: BeatActiveCharacter[];
  /**
   * Speaker of the entry beat: an NPC name, "你" (the player speaking), or
   * undefined for a pure narration / environment entry. Drives shot
   * selection.
   */
  entrySpeaker?: string;
};
// ──────────────────────────────────────────────────────────────────────
// Paradigm D — Writer single-pass streaming plan extensions.
//
// In paradigm D the Writer streams one tagged response: <plan> → <story>
// → <choices>. WriterScenePlan is the parsed <plan> segment: the existing
// WriterPlan skeleton PLUS per-character scene intents (and story bible on
// first scene), handed to the downstream media translators the instant
// </plan> closes.
// ──────────────────────────────────────────────────────────────────────
/**
 * What one character is meant to convey in the current scene, written by the
 * Writer inside the <plan> segment. It is ephemeral (valid for this scene
 * only) and separate from the persistent CharacterPersona card. Consumed by
 * the downstream media translators.
 */
export type CharacterIntent = {
  /** Character name. */
  name: string;
  /** Emotional tone for this scene. */
  mood?: string;
  /** Motivation / goal for this scene. */
  motivation?: string;
  /** Speaking tone for this scene (guides dialogue texture and the TTS lineDelivery). */
  speakingTone?: string;
};
/**
 * The parsed <plan> tag: everything in WriterPlan, extended with
 * per-character scene intents and, on the first scene only, a story bible.
 * Both extensions are optional so a degraded / minimal plan is still valid;
 * downstream consumers simply see a superset of WriterPlan.
 */
export type WriterScenePlan = WriterPlan & {
  /**
   * Per-character performance intents for this scene, handed to the
   * downstream media translators when </plan> closes.
   */
  characterIntents?: CharacterIntent[];
  /**
   * Story bible (stable fields), produced only for the opening scene. Plans
   * of later scenes do not carry this field.
   */
  storyBible?: {
    logline: string;
    genreTags: string;
    protagonist: string;
    castNotes?: string;
  };
};
// ──────────────────────────────────────────────────────────────────────
// Characters & voices (TTS)
// ──────────────────────────────────────────────────────────────────────
/** A provisioned TTS voice, discriminated on `provider`. */
export type CharacterVoice =
  | {
      provider: "xiaomi";
      /** Xiaomi MiMo design output, kept as the reference audio for later clones. */
      referenceAudioBase64: string;
      mimeType: string;
    }
  | {
      provider: "stepfun";
      /**
       * StepFun preset voice ID (e.g. "cixingnansheng"). It is picked by
       * keyword matching against the LLM-written voiceDescription, with no
       * network call at provision time (StepFun has no voicedesign
       * endpoint) — so this variant holds just the chosen preset, not a clip.
       */
      voiceId: string;
      /** TTS model used at synth time (step-tts-mini / step-tts-2 / stepaudio-2.5-tts). */
      model: string;
      mimeType: string;
    };
// ──────────────────────────────────────────────────────────────────────
// CharacterPersona — narrative / story dimension of a Character.
// Merged into Character via intersection (all optional). Filled primarily
// by the Writer's <plan> 思维链 (paradigm D); the CharacterDesigner then
// realizes it into visual + voice cards. Absent on legacy sessions →
// callers degrade to "name only". SENTINEL append-only: adding persona
// only appends bytes to the stable prompt prefix — never reorders.
// ──────────────────────────────────────────────────────────────────────
/**
 * Narrative / story facet of a Character. Every field is optional; the type
 * is merged into Character by intersection.
 */
export type CharacterPersona = {
  /** Background / identity / core premise of the character. */
  persona?: string;
  /** Personality tags, e.g. ["tsundere", "scheming", "fiercely loyal"]. */
  personalityTraits?: string[];
  /** Speaking style / catchphrases — key to the texture of the dialogue. */
  speakingStyle?: string;
  /** Two or three representative lines, used as few-shot anchors for the voice. */
  sampleDialogue?: string[];
  /** Relationship with / attitude toward the player ("你", i.e. "you"). */
  relationshipToPlayer?: string;
  /** Hidden information / foreshadowing that can drive later twists (not surfaced by default). */
  secrets?: string[];
};
/**
 * A registered character: voice + visual cards and provisioned assets,
 * intersected with the optional narrative fields of CharacterPersona.
 */
export type Character = {
  /** Character name. */
  name: string;
  /**
   * Chinese-language voice-acting direction card. It must open with an
   * explicit gender, followed by age / timbre / personality / speed /
   * accent. Passed to Xiaomi MiMo's voicedesign endpoint the first time the
   * voice is provisioned.
   */
  voiceDescription: string;
  /**
   * English appearance card: comma-separated visual attributes in the
   * Runware/FLUX prompt-engineering convention. Given to the Painter as a
   * character archetype anchor so face / outfit / style stay consistent in
   * every scene the character appears in.
   */
  visualDescription?: string;
  /**
   * Runware UUID of the base portrait. The CharacterDesigner generates the
   * portrait once; it is then reused as a `referenceImages` entry in every
   * later scene featuring the character. UUID is the cheapest reference
   * form for Runware.
   */
  basePortraitUuid?: string;
  /**
   * Public CDN URL of the base portrait (the same image as
   * `basePortraitUuid`). Retained for the client, should it ever render
   * character cards, and as a fallback `referenceImages` form when the UUID
   * is missing.
   */
  basePortraitUrl?: string;
  /** Provisioned voice for this character; see CharacterVoice for the per-provider payload. */
  voice?: CharacterVoice;
  /**
   * StepFun preset voice id (e.g. "cixingnansheng"). Present only on
   * characters designed while the server ran StepFun, OR on prebaked
   * homepage cards enriched with a StepFun voice id. It lets the client send
   * a lightweight beat-audio request (no ~220KB Xiaomi reference audio) when
   * the server runs StepFun, and lets the server normalize an off-provider
   * voice without provisioning afresh. Validated against the catalog at
   * synth time.
   */
  stepfunVoiceId?: string;
} & CharacterPersona;
/** Synthesized audio for one beat, as attached to a response. */
export type BeatAudio = {
  /** Audio payload, base64-encoded. */
  base64: string;
  /** MIME type of the payload. */
  mime: string;
};
// ──────────────────────────────────────────────────────────────────────
// StoryState — the persistent "story bible" + evolving narrative memory.
//
// Created once at session start by the Architect agent (rich opening
// planning), then carried across every scene and incrementally updated by
// the Writer. This is the single throughline that keeps tone, cast, and
// stakes coherent across scene cuts — without it each Writer call would
// re-derive the whole arc from a flat beat log and drift.
//
// Split into STABLE fields (set by the Architect, rarely change) and
// VOLATILE fields (rewritten each scene via StoryStatePatch).
// ──────────────────────────────────────────────────────────────────────
/**
 * Persistent story bible plus evolving narrative memory. The first group of
 * fields is stable (authored by the Architect); the second group is volatile
 * and rewritten every scene through a StoryStatePatch.
 */
export type StoryState = {
  // ── Stable: authored by the Architect; kept unless deliberately revised ──

  /** One-line central dramatic question — the main-plot hook. */
  logline: string;
  /**
   * Genre + tone tags that anchor the story's payoff ("satisfaction point")
   * framework, e.g. "sweet romance / campus / slow-burn healing".
   */
  genreTags: string;
  /**
   * Second-person protagonist card: who "你" ("you") are, the immediate
   * situation, the core want, and a flaw / secret. This is the audience
   * proxy and is never rendered.
   */
  protagonist: string;
  /** Key supporting cast and their relationship / tension with "你" (one per line). */
  castNotes?: string;

  // ── Volatile: rewritten each scene by the Writer's StoryStatePatch ──

  /** Rolling, compressed synopsis of events so far (roughly 3-5 sentences). */
  synopsis: string;
  /** Unresolved hooks / mysteries / questions still owed to the player. */
  openThreads?: string[];
  /**
   * Current relationship / emotion state per character, e.g.
   * "Xia Hai: affection growing, just got halfway through a confession to you".
   */
  relationships?: string[];
  /**
   * Where the story heads next: the conflict / reversal / suspense the next
   * scene should push toward. Seeds the next scene's hook.
   */
  nextHook?: string;
};
/**
 * The volatile part of StoryState, rewritten by the Writer after each scene.
 * The stable fields (logline / genreTags / protagonist / castNotes) are left
 * intact by the merge.
 */
export type StoryStatePatch = {
  /** Replacement rolling synopsis. */
  synopsis?: string;
  /** Replacement list of unresolved threads. */
  openThreads?: string[];
  /** Replacement per-character relationship states. */
  relationships?: string[];
  /** Replacement hook for the next scene. */
  nextHook?: string;
};
// ──────────────────────────────────────────────────────────────────────
// WorldBook — lightweight lore injection system.
//
// Entries with position "constant" are always injected into the stable
// prompt prefix. Entries with position "triggered" are scanned against
// recent beat text and injected into the dynamic suffix when keywords
// match. Priority controls ordering when multiple entries fire.
// ──────────────────────────────────────────────────────────────────────
/** One piece of lore that can be injected into the prompt. */
export type WorldBookEntry = {
  /** Identifier of this entry. */
  id: string;
  /** Keywords whose presence triggers injection (relevant to triggered entries). */
  keys: string[];
  /** Lore text injected into the prompt. */
  content: string;
  /**
   * Injection mode: "constant" entries are always injected (stable prefix);
   * "triggered" entries are injected on keyword match (dynamic suffix).
   */
  position: "constant" | "triggered";
  /** Entries with higher priority are injected first. Defaults to 0. */
  priority?: number;
};
/** A named collection of lore entries. */
export type WorldBook = {
  /** Identifier of this world book. */
  id: string;
  /** Display name of this world book. */
  name: string;
  /** Lore entries contained in the book. */
  entries: WorldBookEntry[];
};
// ──────────────────────────────────────────────────────────────────────
// Session
// ──────────────────────────────────────────────────────────────────────
/** A play session: world setup, scene history, cast and story memory. */
export type Session = {
  /** Identifier of this session. */
  id: string;
  /** Creation timestamp. */
  createdAt: number;
  /** World-setting text of the session. */
  worldSetting: string;
  /** Style-guide text of the session. */
  styleGuide: string;
  /** Scenes played so far, in order. */
  history: SceneHistoryEntry[];
  /** Character registry. Grows across scenes; voices and portraits persist for reuse. */
  characters: Character[];
  /**
   * Persistent story bible + evolving narrative memory. The Architect sets
   * it at session start, the client carries it on every /api/scene call,
   * and the Writer updates it each scene. Optional for back-compat with
   * session payloads created before the field existed.
   */
  storyState?: StoryState;
  /**
   * Optional user-uploaded style reference image, as a data URL
   * (`data:image/...;base64,...`). When present, the Painter prepends it to
   * `referenceImages` for every scene so the upload anchors the painting
   * style (brush, color, mood) for the whole session. The client resizes it
   * before upload (~512px max dimension) to keep the session payload small
   * for /api/scene round-trips.
   */
  styleReferenceImage?: string;
  /**
   * Session-wide image orientation. Locked at session start from the
   * client's device + orientation and sent on every /api/scene call so all
   * scenes share one aspect ratio. Absent means "landscape" (back-compat).
   */
  orientation?: Orientation;
  /**
   * Optional display name chosen by the player. When set, NPC dialogue
   * addresses the player by this name rather than the generic "你" ("you").
   * Kept client-side only (localStorage); never persisted server-side.
   */
  playerName?: string;
  /**
   * UI locale active when the session started, in BCP-47 form (e.g.
   * "zh-CN", "en", "ja"). The engine appends a one-line language directive
   * to the Architect / Writer user messages so AI-generated dialogue, beats
   * and narration come out in this language. Absent means "zh-CN", for
   * back-compat with sessions created before the field existed.
   */
  language?: string;
  /**
   * Optional world books for lore injection. "constant" entries are always
   * part of the prompt; "triggered" entries are injected when their
   * keywords match recent text.
   */
  worldBooks?: WorldBook[];
};
// ──────────────────────────────────────────────────────────────────────
// Vision
// ──────────────────────────────────────────────────────────────────────
/** Interpretation of a player's click on the scene image. */
export type ClickIntent = {
  /** The click expressed as a free-form action. */
  freeformAction: string;
  /** Reasoning given for that interpretation. */
  reasoning: string;
};
/** Classification of a click: insert a beat in-scene, or change scene. */
export type VisionClassify =
  | "insert-beat"
  | "change-scene";
// ──────────────────────────────────────────────────────────────────────
// Provider config
// ──────────────────────────────────────────────────────────────────────
/**
 * Wire protocol spoken to a model provider. Which values apply depends on
 * the model role: every ai-client adapter accepts its own subset and uses a
 * sensible default for anything else.
 *
 * - `openai_compatible` — text / vision / image. OpenAI Chat Completions +
 *   `/images/generations` (self-implemented fetch); the default for
 *   text/vision when unset.
 * - `openai` — image only. OpenAI gpt-image through the official OpenAI SDK,
 *   which unlocks reference-image editing. For text/vision use
 *   `openai_compatible`, which already speaks OpenAI's format.
 * - `runware` — image only. Runware task-array protocol (self-implemented);
 *   the default for runware.ai URLs.
 */
export type ProviderProtocol = "openai_compatible" | "openai" | "runware";
/** Connection settings for one model provider. */
export type ProviderConfig = {
  /** Base URL of the provider API. */
  baseUrl: string;
  /** API key for the provider. */
  apiKey: string;
  /** Model name to request. */
  model: string;
  /**
   * Wire protocol. If unset, callers fall back to a role-specific default:
   * text/vision use "openai_compatible"; image is inferred from baseUrl
   * (runware.ai gives "runware", anything else "openai_compatible"), so
   * existing deployments keep working without setting *_PROVIDER.
   */
  provider?: ProviderProtocol;
};
/** Connection settings for the TTS provider. */
export type TtsConfig = {
  /** Base URL of the TTS API. */
  baseUrl: string;
  /** API key for the TTS API. */
  apiKey: string;
  /** Base model name; the adapter derives the "-voicedesign" / "-voiceclone" suffixes from it. */
  speechModel: string;
};
/**
 * The TTS provider the server is configured for, inferred from TtsConfig's
 * base URL by lib/tts-client's isStepfun. The /api/tts-provider route
 * exposes it to the client so the play page sends only the voice fields the
 * server needs — for example, skipping the ~220KB Xiaomi reference audio
 * when the server runs StepFun (which saves Fast Origin Transfer
 * bandwidth). `null` means there is no server-side TTS (silent). BYO client
 * TTS takes precedence over this signal.
 */
export type TtsProvider =
  | "stepfun"
  | "xiaomi"
  | null;
// /api/tts-provider — lightweight GET returning the server's TTS provider so
// the client can shape beat-audio request bodies accordingly (see fetchBeatAudio
// in app/play/page.tsx). Response is a few dozen bytes; runs once per session.
/** Response body of the /api/tts-provider route. */
export type TtsProviderResponse = {
  /** The server's TTS provider, or `null` when there is no server-side TTS. */
  provider: TtsProvider;
};
/** Full engine configuration: one provider per model role plus tuning knobs. */
export type EngineConfig = {
  /** Provider used for text generation. */
  text: ProviderConfig;
  /** Provider used for image generation. */
  image: ProviderConfig;
  /** Provider used for vision. */
  vision: ProviderConfig;
  /** Optional. When missing, the game runs silently (no TTS). */
  tts?: TtsConfig;
  /**
   * When true, the renderer returns a placeholder image instead of calling
   * the image API.
   * NOTE(review): this was documented as a placeholder PNG, while
   * Scene.imageUrl describes an SVG data URI for MOCK_IMAGE — confirm the
   * actual format.
   */
  mockImage?: boolean;
  /**
   * Hard per-attempt timeout (ms) for image-generation requests. When unset
   * there is no client-side timeout and only the provider's own gateway
   * limits apply (e.g. Runware kills tasks at ~55s with a 504).
   */
  imageTimeoutMs?: number;
  /**
   * Hedge threshold (ms) for the Painter's scene paint. If the Tier-A
   * (referenced) paint has not finished after this long, a second identical
   * request races the first and whichever finishes earlier wins. Unset or 0
   * disables hedging. Derived from the healthy-day Runware p95 (~14s);
   * recommended value 15000.
   */
  imageHedgeMs?: number;
};
// ──────────────────────────────────────────────────────────────────────
// API contracts
// ──────────────────────────────────────────────────────────────────────
/** Credentials for a single model role inside a BYOK payload. */
type ByoRoleCredentials = {
  provider: string;
  apiKey: string;
  baseUrl?: string;
  model?: string;
};

/**
 * BYOK (Bring Your Own Key) LLM credentials carried in request bodies. Each
 * role (text / image / vision) can be configured independently. Keys are
 * never persisted or logged server-side; they only pass through the
 * request→config build (see lib/config.ts buildByoEngineConfig). The vision
 * entry typically mirrors the text entry.
 */
export type ByoLlmKeys = {
  text?: ByoRoleCredentials;
  image?: ByoRoleCredentials;
  vision?: ByoRoleCredentials;
};
/** Request body for starting a session. */
export type StartRequest = {
  /** World-setting text for the new session. */
  worldSetting: string;
  /** Style-guide text for the new session. */
  styleGuide: string;
  /** Optional user-uploaded style reference image; see Session.styleReferenceImage. */
  styleReferenceImage?: string;
  /**
   * True when the client brought its own Xiaomi TTS key and will provision
   * and synthesize voices in the browser (the key never reaches our
   * server). The route then drops `config.tts`, so the engine skips every
   * piece of server-side TTS work.
   */
  clientTts?: boolean;
  /**
   * Device orientation picked at session start. "portrait" has the engine
   * paint 9:16 vertical scene images (mobile, held upright); "landscape"
   * (the default) keeps 16:9 widescreen. Locked for the whole session.
   */
  orientation?: Orientation;
  /** Optional player display name; see Session.playerName. */
  playerName?: string;
  /**
   * Active UI locale; see Session.language. Drives the engine's language
   * directive so AI output is produced in the player's chosen language.
   */
  language?: string;
  /**
   * BYOK: LLM keys supplied by the user. When present, the server builds
   * the EngineConfig from them instead of reading env. Text / image /
   * vision can each be configured independently. Keys are never persisted
   * or logged; they only pass through request→config construction.
   */
  byo?: ByoLlmKeys;
};
// /api/parse-style-image — vision LLM extracts a textual painting-style
// prompt from a user-uploaded reference image. The same base64 is echoed
// back so the client can later pass it through to /api/start.
/** Request body for /api/parse-style-image. */
export type ParseStyleImageRequest = {
  /** The uploaded reference image as a data URL: `data:image/...;base64,...`. */
  imageDataUrl: string;
};
/** Response body for /api/parse-style-image. */
export type ParseStyleImageResponse = {
  /** English style prompt usable as a styleGuide (FLUX-friendly attributes). */
  stylePrompt: string;
};
/** Response body for starting a session. */
export type StartResponse = {
  /** Identifier of the new session. */
  sessionId: string;
  /** The opening scene. */
  scene: Scene;
  /** Public CDN URL (or, in MOCK_IMAGE mode, a data URI) of the rendered scene background. */
  imageUrl: string;
  /** Character registry, with voice references and visual cards provisioned. */
  characters: Character[];
  /**
   * Story bible as created by the Architect and updated by the opening
   * scene's Writer. The client stores it in the session for later
   * /api/scene calls.
   */
  storyState: StoryState;
};
// /api/scene — generates the next Scene, given session whose latest
// history entry has `exit` set. Also used for prefetch speculation
// (frontend synthesizes a speculative exit).
/** Request body for /api/scene. */
export type SceneRequest = {
  /** The session to generate the next scene for. */
  session: Session;
  /** See StartRequest.clientTts: drops server-side TTS for BYO-key clients. */
  clientTts?: boolean;
  /** BYOK LLM keys; see StartRequest.byo. */
  byo?: ByoLlmKeys;
};
/** Response body for /api/scene. */
export type SceneResponse = {
  /** The generated scene. */
  scene: Scene;
  /** Public CDN URL (or, in MOCK_IMAGE mode, a data URI) of the rendered scene background. */
  imageUrl: string;
  /** Character registry returned with the scene. */
  characters: Character[];
  /**
   * Story bible after this scene's Writer applied its update. The client
   * must write it back into the session so the throughline survives the
   * next scene cut.
   */
  storyState: StoryState;
};
// /api/beat-audio — lazily synthesize one beat's voice. Client fires this
// per beat after a scene loads; server has a per-call timeout so MiMo
// tail-latency cannot block the UI. A null audio response means "play silent."
/** Request body for /api/beat-audio: synthesize the voice of one beat. */
export type BeatAudioRequest = {
  /** The beat to voice. */
  beat: {
    id: string;
    line: string;
    lineDelivery?: string;
  };
  /**
   * The speaker's already-provisioned voice. It is optional: when the
   * server runs a DIFFERENT provider than `voice.provider` (e.g. the client
   * holds a Xiaomi voice from a prebaked card while the server runs
   * StepFun), the client may leave `voice` out and send `voiceDescription`
   * + `stepfunVoiceId` instead, saving the ~220KB reference-audio transfer.
   * The server then re-provisions against its own provider before
   * synthesizing.
   */
  voice?: CharacterVoice;
  /**
   * Voice-design card (Chinese-language). The server uses it to
   * re-provision when `voice` is absent or its provider does not match the
   * server's TTS.
   */
  voiceDescription?: string;
  /**
   * Speaker name. Serves as the StepFun provision salt for archetype
   * spreading when the server falls back to pickStepfunVoiceId.
   */
  characterName?: string;
  /**
   * Pre-selected StepFun preset id (from a live CharacterDesigner pick or a
   * prebaked card). Honored directly when the server runs StepFun, which
   * skips both the keyword scorer and a network provision.
   */
  stepfunVoiceId?: string;
};
/** Response body for /api/beat-audio. */
export type BeatAudioResponse = {
  /** The synthesized audio, or `null` when the beat has no audio. */
  audio: BeatAudio | null;
};
// /api/vision — interprets a background click on the current image and
// classifies whether it should insert a beat (in-scene exploration) or
// trigger a scene change.
/** Request body for /api/vision. */
export type VisionRequest = {
  /** The session the click belongs to. */
  session: Session;
  /**
   * Raw PNG base64 (without a `data:` prefix) of the scene image with the
   * player's click marker ALREADY drawn onto it by the browser's Canvas 2D.
   * The server passes it straight to the vision LLM as an
   * OpenAI-compatible image_url.
   *
   * Annotation happens client-side so the engine needs no Node-native image
   * dependency (sharp does not run on Cloudflare Workers) and so the server
   * avoids re-fetching the image on every click.
   */
  annotatedImageBase64: string;
  /** BYOK LLM keys; see StartRequest.byo. */
  byo?: ByoLlmKeys;
};
/** Response body for /api/vision. */
export type VisionResponse = {
  /** Interpretation of the click. */
  intent: ClickIntent;
  /** Whether the click inserts a beat in-scene or changes scene. */
  classify: VisionClassify;
};
// /api/classify-freeform — classifies a player's freeform text input
// into one of two paths (see FreeformClassify): insert a beat in-scene,
// or trigger a scene change.
/** Request body for /api/classify-freeform. */
export type FreeformClassifyRequest = {
  /** The session the input belongs to. */
  session: Session;
  /** The player's free-form text input. */
  freeformText: string;
  /** BYOK LLM keys; see StartRequest.byo. */
  byo?: ByoLlmKeys;
};
/** Classification of freeform input: insert a beat in-scene, or change scene. */
export type FreeformClassify =
  | "insert-beat"
  | "change-scene";
/** Response body for /api/classify-freeform. */
export type FreeformClassifyResponse = {
  /** Which path the input was classified into. */
  classify: FreeformClassify;
  /** The free-form action returned with the classification. */
  freeformAction: string;
};
// /api/insert-beat — generates 1-3 transient beats in response to a
// freeform action. Does NOT regenerate the image.
/** Request body for /api/insert-beat. */
export type InsertBeatRequest = {
  /** The session the action belongs to. */
  session: Session;
  /** The free-form action to respond to. */
  freeformAction: string;
  /** See StartRequest.clientTts: drops server-side TTS for BYO-key clients. */
  clientTts?: boolean;
  /** BYOK LLM keys; see StartRequest.byo. */
  byo?: ByoLlmKeys;
};
/** The subset of beat fields that the insert-beat director produces. */
export type InsertBeatPartial = {
  /** Narration text, if any. */
  narration?: string;
  /** Name of the speaker, if any. */
  speaker?: string;
  /** The spoken line, if any. */
  line?: string;
  /** Voice-acting direction for `line`, if any. */
  lineDelivery?: string;
};
/** Multi-beat payload carrying 1-3 beats. */
export type InsertBeatMulti = {
  /** The generated partial beats. */
  beats: InsertBeatPartial[];
};
/** Response body for /api/insert-beat. */
export type InsertBeatResponse = {
  /** The first generated beat. */
  partial: InsertBeatPartial;
  /** Any further beats after the first, for richer insert-beat interactions. */
  extraBeats?: InsertBeatPartial[];
  /** Character registry returned with the beats. */
  characters: Character[];
};
// ──────────────────────────────────────────────────────────────────────
// Paradigm D — streaming primitives (chatStream / StreamRouter / SSE)
//
// Output-side counterpart to prompt caching's input-side stable prefix
// (the two are orthogonal). chatStream yields incremental text + an
// end-of-stream usage promise. The StreamRouter slices the Writer's
// tagged stream into plan/story/choices and dispatches downstream. API
// routes serialize assembled fragments as SSE events for progressive
// client playback.
// ──────────────────────────────────────────────────────────────────────
/**
 * Token usage statistics reported when a stream ends. Deliberately
 * SDK-agnostic so this type file depends on no particular provider package.
 */
export type ChatStreamUsage = {
  prompt_tokens?: number;
  completion_tokens?: number;
  prompt_tokens_details?: {
    cached_tokens?: number;
  };
};
/**
 * What the streaming chat primitive (ai-client `chatStream`) returns.
 * `textStream` yields incremental chunks, and `usage` resolves once the
 * stream ends, so `summarizeSdkUsage` cache accounting works unchanged.
 */
export type ChatStreamResult = {
  /** Incremental text chunks. */
  textStream: AsyncIterable<string>;
  /** Usage statistics, available at stream end (possibly undefined). */
  usage: Promise<ChatStreamUsage | undefined>;
};
/**
 * Callbacks fired by the StreamRouter while it slices the Writer's tagged
 * stream. Each one is optional, so a caller can subscribe to just a subset.
 */
export type StreamRouterHandlers = {
  /** Fired when `</plan>` closes: dispatch the downstream media translators in parallel. */
  onPlan?: (plan: WriterScenePlan) => void;
  /** Fired with incremental `<story>` text: push it to the client for progressive playback. */
  onBeat?: (beatChunk: string) => void;
  /** Fired when `</story>` closes: the prose is final and ready for splitting. */
  onStoryComplete?: (rawStory: string) => void;
  /** Fired when `</choices>` closes. */
  onChoices?: (choices: BeatChoice[]) => void;
};
/**
 * Combined outcome of routing one Writer stream through to completion.
 * `degraded` is true when tag parsing had to fall back (tags missing,
 * misordered or unclosed, or a timeout), in line with the
 * degrade-before-main-path reliability rule.
 */
export type StreamRouterResult = {
  /** The parsed plan, when one was obtained. */
  plan?: WriterScenePlan;
  /** Beats produced from the stream. */
  beats: Beat[];
  /** Choices parsed from the stream, when present. */
  choices?: BeatChoice[];
  /**
   * Raw prose of the <story> segment (not JSON-parsed). The director passes
   * it to proseSplitter to obtain Beat[].
   */
  rawStorySegment?: string;
  /** True when tag parsing fell back. */
  degraded: boolean;
};
/**
 * Server-to-client SSE events for progressive scene playback (paradigm D),
 * discriminated on `type`. `TDone` is the terminal, fully assembled payload:
 * `SceneResponse` for `/api/scene`, `StartResponse` for `/api/start`. The
 * prefetch path consumes events up to `done` and reassembles a complete
 * response.
 */
export type SceneStreamEvent<TDone = SceneResponse> =
  | { type: "plan"; plan: WriterScenePlan }
  | { type: "beat"; beat: Beat }
  | { type: "background"; imageUrl: string; sceneKey?: string }
  | { type: "voice"; name: string; voice: CharacterVoice }
  | { type: "choices"; choices: BeatChoice[] }
  | { type: "done"; response: TDone }
  | { type: "error"; message: string; degraded?: boolean };