// ──────────────────────────────────────────────────────────────────────
// Beat — one dialogue / narration moment within a Scene.
// Multiple beats share the same background image; tapping or choosing
// advances among them WITHOUT regenerating the image.
// ──────────────────────────────────────────────────────────────────────

export type Beat = {
  id: string;
  narration?: string;
  speaker?: string;
  line?: string;
  /** Free-form voice-acting direction for the line, sent to TTS only. Never displayed. */
  lineDelivery?: string;
  /**
   * Characters visible in this beat with their pose / expression for this moment.
   * Read by the Cinematographer when composing the scene's establishing shot —
   * the beat the entry beat lands in is the visual anchor for the image.
   */
  activeCharacters?: BeatActiveCharacter[];
  next: BeatNext;
};

export type BeatActiveCharacter = {
  name: string;
  /** Free-form Chinese description of pose / expression / what the character is doing. */
  pose?: string;
};

/** How a beat hands off: straight to another beat, or via a set of player choices. */
export type BeatNext =
  | { type: "continue"; nextBeatId: string }
  | { type: "choice"; choices: BeatChoice[] };

export type BeatChoice = {
  id: string;
  label: string;
  effect: BeatChoiceEffect;
};

/** Outcome of picking a choice: jump to a beat, or leave with a seed for the next scene. */
export type BeatChoiceEffect =
  | { kind: "advance-beat"; targetBeatId: string }
  | { kind: "change-scene"; nextSceneSeed: string };

// ──────────────────────────────────────────────────────────────────────
// Scene — one background image + a graph of beats.
// The Director emits an entire Scene per call; the player navigates
// through its beats locally with zero network until exiting.
// ──────────────────────────────────────────────────────────────────────

export type Scene = {
  id: string;
  scenePrompt: string;
  beats: Beat[];
  entryBeatId: string;
  /**
   * Stable English slug identifying the visual scene's location + time,
   * e.g. "classroom-dusk", "rooftop-night". When the next Scene shares this
   * key, the Painter slots the previous Scene's image into Runware's
   * `referenceImages` (alongside character portraits) so the same physical
   * space stays visually consistent across cuts.
   */
  sceneKey?: string;
  /**
   * Runware UUID of this Scene's generated image. Cheapest form to send back
   * to Runware's `referenceImages` in subsequent calls (UUID > URL > base64
   * in transport cost). Not shown to the client — `imageUrl` is what renders.
   */
  imageUuid?: string;
  /**
   * Public CDN URL of this Scene's generated image. Returned to the client for
   * `img` element rendering; the client also feeds it through a Canvas 2D click
   * annotator before posting to `/api/vision` (see
   * `VisionRequest.annotatedImageBase64`).
   *
   * For MOCK_IMAGE=true this is a `data:image/svg+xml;...` data URI, not a
   * Runware URL — the client renders both forms transparently.
   */
  imageUrl?: string;
};

/** How the player left a scene: through an authored choice or a freeform action. */
export type SceneExit =
  | {
      kind: "choice";
      choiceId: string;
      label: string;
      nextSceneSeed: string;
    }
  | { kind: "freeform"; action: string };

export type SceneHistoryEntry = {
  scene: Scene;
  visitedBeatIds: string[];
  exit?: SceneExit;
};

// ──────────────────────────────────────────────────────────────────────
// Characters & voices (TTS)
// ──────────────────────────────────────────────────────────────────────

export type CharacterVoice = {
  provider: "xiaomi";
  /** Xiaomi MiMo design output stored as reference audio for later clones. */
  referenceAudioBase64: string;
  mimeType: string;
};

export type Character = {
  name: string;
  /**
   * Chinese-language voice-acting direction card. Must begin with explicit
   * gender, then age / timbre / personality / speed / accent. Fed to Xiaomi
   * MiMo's voicedesign endpoint when the voice is first provisioned.
   */
  voiceDescription: string;
  /**
   * English appearance card — comma-separated visual attributes following
   * Runware/FLUX prompt-engineering convention. Fed to the Painter as a
   * character archetype anchor so the same face/outfit/style stays consistent
   * across every scene this character appears in.
   */
  visualDescription?: string;
  /**
   * Runware UUID for the base portrait. Generated by the CharacterDesigner
   * once, reused as a `referenceImages` entry on every subsequent scene the
   * character appears in. UUID is the cheapest reference form for Runware.
   */
  basePortraitUuid?: string;
  /**
   * Public CDN URL for the base portrait. Same image as `basePortraitUuid`;
   * kept around for the client (if it ever wants to render character cards)
   * and as a fallback reference form for `referenceImages` when UUID is absent.
   */
  basePortraitUrl?: string;
  /** Xiaomi MiMo voice reference audio. */
  voice?: CharacterVoice;
};

/** A single beat's synthesized audio, attached to the response. */
export type BeatAudio = {
  base64: string;
  mime: string;
};

// ──────────────────────────────────────────────────────────────────────
// StoryState — the persistent "story bible" + evolving narrative memory.
//
// Created once at session start by the Architect agent (rich opening
// planning), then carried across every scene and incrementally updated by
// the Writer. This is the single throughline that keeps tone, cast, and
// stakes coherent across scene cuts — without it each Writer call would
// re-derive the whole arc from a flat beat log and drift.
//
// Split into STABLE fields (set by the Architect, rarely change) and
// VOLATILE fields (rewritten each scene via StoryStatePatch).
// ──────────────────────────────────────────────────────────────────────

export type StoryState = {
  // ── Stable (Architect-authored; persists unless deliberately revised) ──

  /** One-line central dramatic question / main-plot hook. */
  logline: string;
  /**
   * Genre + tone tags anchoring the story's payoff framework,
   * e.g. "甜宠 / 校园 / 慢热治愈" (sweet romance / campus / slow-burn healing).
   */
  genreTags: string;
  /**
   * Second-person protagonist card: who "you" are, the immediate situation,
   * the core want, and a flaw/secret. The audience proxy — never rendered.
   */
  protagonist: string;
  /** Key supporting cast and their relationship/tension with "you" (one per line). */
  castNotes?: string;

  // ── Volatile (rewritten each scene by the Writer's StoryStatePatch) ──

  /** Rolling, compressed synopsis of what has happened so far (~3-5 sentences). */
  synopsis: string;
  /** Unresolved hooks / mysteries / questions still owed to the player. */
  openThreads?: string[];
  /**
   * Current relationship/emotion state per character, e.g.
   * "夏海:好感升温,刚向你告白了一半"
   * (roughly: "Xia Hai: affection rising, just got halfway through confessing to you").
   */
  relationships?: string[];
  /**
   * Where the story is heading next — the conflict/reversal/suspense the
   * next scene should drive toward. Seeds the next scene's hook.
   */
  nextHook?: string;
};

/**
 * The volatile subset the Writer rewrites after each scene. Stable fields
 * (logline/genreTags/protagonist/castNotes) are preserved by the merge.
 *
 * Derived from `StoryState` so the patch can never drift from the state it
 * updates; every listed field is optional here.
 */
export type StoryStatePatch = Partial<
  Pick<StoryState, "synopsis" | "openThreads" | "relationships" | "nextHook">
>;

// ──────────────────────────────────────────────────────────────────────
// Session
// ──────────────────────────────────────────────────────────────────────

export type Session = {
  id: string;
  createdAt: number;
  worldSetting: string;
  styleGuide: string;
  history: SceneHistoryEntry[];
  /** Character registry — accumulates across scenes; voices + portraits persist for reuse. */
  characters: Character[];
  /**
   * Persistent story bible + evolving narrative memory. Set at session start
   * by the Architect, carried by the client across every /api/scene call, and
   * updated by the Writer each scene. Optional for back-compat with any
   * session payload created before this field existed.
   */
  storyState?: StoryState;
};

// ──────────────────────────────────────────────────────────────────────
// Vision
// ──────────────────────────────────────────────────────────────────────

export type ClickIntent = {
  freeformAction: string;
  reasoning: string;
};

export type VisionClassify = "insert-beat" | "change-scene";

// ──────────────────────────────────────────────────────────────────────
// Provider config
// ──────────────────────────────────────────────────────────────────────

export type ProviderConfig = {
  baseUrl: string;
  apiKey: string;
  model: string;
};

export type TtsConfig = {
  baseUrl: string;
  apiKey: string;
  /** Base model name; adapter derives "-voicedesign" / "-voiceclone" suffixes. */
  speechModel: string;
};

export type EngineConfig = {
  text: ProviderConfig;
  image: ProviderConfig;
  vision: ProviderConfig;
  /** Optional — when missing the game runs silently (no TTS). */
  tts?: TtsConfig;
  /**
   * When true the renderer returns a placeholder image instead of calling the image API.
   * NOTE(review): this comment used to say "placeholder PNG", while `Scene.imageUrl`
   * describes the MOCK_IMAGE result as an SVG data URI — confirm the actual format.
   */
  mockImage?: boolean;
};

// ──────────────────────────────────────────────────────────────────────
// API contracts
// ──────────────────────────────────────────────────────────────────────

export type StartRequest = {
  worldSetting: string;
  styleGuide: string;
};

export type StartResponse = {
  sessionId: string;
  scene: Scene;
  /** Public CDN URL (or data URI in MOCK_IMAGE mode) for the rendered scene background. */
  imageUrl: string;
  /** Character registry with voice references + visual cards provisioned. */
  characters: Character[];
  /**
   * Story bible created by the Architect + updated by the opening scene's
   * Writer. The client persists this into the session for later /api/scene calls.
   */
  storyState: StoryState;
};

// /api/scene — generates the next Scene, given session whose latest
// history entry has `exit` set. Also used for prefetch speculation
// (frontend synthesizes a speculative exit).
export type SceneRequest = {
  session: Session;
};

export type SceneResponse = {
  scene: Scene;
  /** Public CDN URL (or data URI in MOCK_IMAGE mode) for the rendered scene background. */
  imageUrl: string;
  characters: Character[];
  /**
   * Story bible after this scene's Writer applied its update. The client
   * must persist this back into the session so the throughline survives the
   * next scene cut.
   */
  storyState: StoryState;
};

// /api/beat-audio — lazily synthesize one beat's voice. Client fires this
// per beat after a scene loads; server has a per-call timeout so MiMo
// tail-latency cannot block the UI. A null audio response means "play silent."
export type BeatAudioRequest = {
  beat: {
    id: string;
    line: string;
    lineDelivery?: string;
  };
  voice: CharacterVoice;
};

export type BeatAudioResponse = {
  audio: BeatAudio | null;
};

// /api/vision — interprets a background click on the current image and
// classifies whether it should insert a beat (in-scene exploration) or
// trigger a scene change.
export type VisionRequest = {
  session: Session;
  /**
   * Raw PNG base64 (no `data:` prefix) of the scene image WITH the player's
   * click marker already drawn on it by the browser's Canvas 2D. The server
   * forwards this straight to the vision LLM as an OpenAI-compatible
   * image_url.
   *
   * Annotation lives client-side so the engine has no Node-native image
   * dependency (sharp doesn't run on Cloudflare Workers) and we save a
   * server-side image re-fetch per click.
   */
  annotatedImageBase64: string;
};

export type VisionResponse = {
  intent: ClickIntent;
  classify: VisionClassify;
};

// /api/insert-beat — generates a single transient beat in response to
// a freeform vision action. Does NOT regenerate the image.
export type InsertBeatRequest = {
  session: Session;
  freeformAction: string;
};

/**
 * Partial beat fields produced by the insert-beat director.
 *
 * Derived from `Beat` so the field names and types stay in lockstep with the
 * beat they are merged into; all four fields remain optional.
 */
export type InsertBeatPartial = Pick<
  Beat,
  "narration" | "speaker" | "line" | "lineDelivery"
>;

export type InsertBeatResponse = {
  partial: InsertBeatPartial;
  characters: Character[];
};