From d1f13d51a34784c573c83476057943b8220cdc3c Mon Sep 17 00:00:00 2001 From: Zonghao Yuan <64521992+zonghaoyuan@users.noreply.github.com> Date: Thu, 28 May 2026 15:20:12 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20scene/beat=20architecture=20=E2=80=94?= =?UTF-8?q?=20decouple=20dialogue=20from=20image=20generation=20(#2)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the one-image-per-interaction model with scenes that hold multiple dialogue beats. The image regenerates only on scene-change actions; tapping through beats and in-scene choices are instant and zero-network. Squashed from #2: - feat: scene/beat architecture — decouple dialogue from image generation - fix: harden LLM-output parsing, prefetch lifecycle, and typewriter (PR review) - fix: dedupe beat ids; fallback narration on empty insert-beat (PR review #2) 🤖 Generated with [Claude Code](https://claude.com/claude-code) --- README.md | 28 +- .../api/{interact => insert-beat}/route.ts | 14 +- apps/web/app/api/scene/route.ts | 29 + apps/web/app/api/vision/route.ts | 4 +- apps/web/app/play/page.tsx | 620 +++++++++++++----- apps/web/components/PlayCanvas.tsx | 196 +++--- packages/engine/src/director.ts | 304 ++++++++- packages/engine/src/index.ts | 7 +- packages/engine/src/orchestrator.ts | 82 ++- packages/engine/src/prompts.ts | 226 +++++-- packages/engine/src/renderer.ts | 6 +- packages/engine/src/vision.ts | 35 +- packages/types/src/index.ts | 126 +++- 13 files changed, 1275 insertions(+), 402 deletions(-) rename apps/web/app/api/{interact => insert-beat}/route.ts (63%) create mode 100644 apps/web/app/api/scene/route.ts diff --git a/README.md b/README.md index 6b70bb5..c700021 100644 --- a/README.md +++ b/README.md @@ -1,31 +1,37 @@ # 云梦 -> An AI-driven visual novel where every frame — scenes, dialogue, choices — is rendered by an AI, one frame at a time. You click. It paints. The story unfolds. 
+> An AI-driven visual novel painted by an AI, one scene at a time. You talk and explore within a scene; when the story turns a corner, it paints the next. You click. It paints. The story unfolds. --- ## How it works -Each turn is three model calls: +The story unfolds as a sequence of **scenes**. Each scene is one AI-painted background plus a short tree of **beats** — moments of narration, dialogue, and the occasional choice. You tap through a scene's beats and the image stays put; only when a choice leads somewhere genuinely new — another place, a new point of view, a jump in time — does the AI paint the next scene. ``` -[user clicks somewhere on the image] +entering a scene │ ▼ -1. Vision model interprets the click against the visible UI +1. Text LLM directs the whole scene at once — a background prompt + plus a tree of beats (narration / dialogue / choices) │ ▼ -2. Text LLM writes the next frame (narration, dialogue, choices) +2. Image model paints the background once, 16:9, no UI baked in │ ▼ -3. Image model renders the entire next UI screen — scene, dialogue, - buttons, all of it — as one painted frame +[ tap through beats — no model calls, instant ] │ - ▼ -[new image is shown; repeat] + ├─ in-scene choice ──────▶ jump to another beat (instant) + │ + └─ scene-change choice ──▶ the next scene + (usually pre-generated — see below) ``` -There is no traditional UI. There is only the image. The AI chooses the layout, the colors, the typography, the buttons. Pick "stick figure on grid paper" as your style and you'll get hand-drawn UI. Pick "cyberpunk noir" and you'll get neon HUDs. Whatever fits the world. +While you're reading one scene, the engine **speculatively generates the scenes your choices could lead to** — and, for unavoidable next steps, the scene after that. By the time you pick a direction, its image is usually already painted, so the cut feels instant. 
+ +Clicking the background itself (not a button) routes through a **vision** model: it reads where you tapped and decides whether you're exploring the current scene (it inserts a beat — no new image) or moving on (a new scene). + +There is no traditional game UI baked into the art. The AI paints the world in whatever style you pick — "stick figure on grid paper" or "cyberpunk noir" — and the dialogue panel and choice buttons are a light HTML layer drawn on top, tuned to sit over the scene. --- @@ -82,4 +88,4 @@ yume/ ## Cost & limits -Each turn costs roughly **\$0.15–0.25** in API fees with the recommended model trio. A 30-turn session is **\~\$5–8**. There is no rate limiting or auth out of the box — if you make your deployment public, your bill will reflect that. Add limits before sharing widely. +Each **scene** costs roughly **\$0.15–0.25** in API fees with the recommended model trio (one text + one image call); tapping through a scene's beats is free. To keep transitions instant, the engine also **pre-generates scenes you might pick but don't** — so real spend runs somewhat higher than the scenes you actually see. There is no rate limiting or auth out of the box — if you make your deployment public, your bill will reflect that. Add limits (and consider lowering the prefetch depth) before sharing widely. 
diff --git a/apps/web/app/api/interact/route.ts b/apps/web/app/api/insert-beat/route.ts similarity index 63% rename from apps/web/app/api/interact/route.ts rename to apps/web/app/api/insert-beat/route.ts index f4d3c4d..892e2aa 100644 --- a/apps/web/app/api/interact/route.ts +++ b/apps/web/app/api/insert-beat/route.ts @@ -1,5 +1,5 @@ -import { takeTurn } from "@yume/engine"; -import type { InteractRequest } from "@yume/types"; +import { requestInsertBeat } from "@yume/engine"; +import type { InsertBeatRequest } from "@yume/types"; import { NextResponse } from "next/server"; import { loadEngineConfig } from "@/lib/config"; @@ -7,23 +7,23 @@ export const runtime = "nodejs"; export const maxDuration = 60; export async function POST(req: Request) { - let body: InteractRequest; + let body: InsertBeatRequest; try { - body = (await req.json()) as InteractRequest; + body = (await req.json()) as InsertBeatRequest; } catch { return NextResponse.json({ error: "Invalid JSON" }, { status: 400 }); } - if (!body.session || !body.intent) { + if (!body.session || !body.freeformAction) { return NextResponse.json( - { error: "session and intent are required" }, + { error: "session and freeformAction are required" }, { status: 400 }, ); } try { const config = loadEngineConfig(); - const result = await takeTurn(config, body); + const result = await requestInsertBeat(config, body); return NextResponse.json(result); } catch (err) { const message = err instanceof Error ? 
err.message : "Unknown error"; diff --git a/apps/web/app/api/scene/route.ts b/apps/web/app/api/scene/route.ts new file mode 100644 index 0000000..bcec19b --- /dev/null +++ b/apps/web/app/api/scene/route.ts @@ -0,0 +1,29 @@ +import { requestScene } from "@yume/engine"; +import type { SceneRequest } from "@yume/types"; +import { NextResponse } from "next/server"; +import { loadEngineConfig } from "@/lib/config"; + +export const runtime = "nodejs"; +export const maxDuration = 120; + +export async function POST(req: Request) { + let body: SceneRequest; + try { + body = (await req.json()) as SceneRequest; + } catch { + return NextResponse.json({ error: "Invalid JSON" }, { status: 400 }); + } + + if (!body.session) { + return NextResponse.json({ error: "session is required" }, { status: 400 }); + } + + try { + const config = loadEngineConfig(); + const result = await requestScene(config, body); + return NextResponse.json(result); + } catch (err) { + const message = err instanceof Error ? err.message : "Unknown error"; + return NextResponse.json({ error: message }, { status: 500 }); + } +} diff --git a/apps/web/app/api/vision/route.ts b/apps/web/app/api/vision/route.ts index 85b8a5e..81d0487 100644 --- a/apps/web/app/api/vision/route.ts +++ b/apps/web/app/api/vision/route.ts @@ -1,4 +1,4 @@ -import { visionTurn } from "@yume/engine"; +import { visionDecide } from "@yume/engine"; import type { VisionRequest } from "@yume/types"; import { NextResponse } from "next/server"; import { loadEngineConfig } from "@/lib/config"; @@ -23,7 +23,7 @@ export async function POST(req: Request) { try { const config = loadEngineConfig(); - const result = await visionTurn(config, body); + const result = await visionDecide(config, body); return NextResponse.json(result); } catch (err) { const message = err instanceof Error ? 
err.message : "Unknown error"; diff --git a/apps/web/app/play/page.tsx b/apps/web/app/play/page.tsx index 715004b..0e56ebd 100644 --- a/apps/web/app/play/page.tsx +++ b/apps/web/app/play/page.tsx @@ -2,39 +2,236 @@ import Link from "next/link"; import { useRouter, useSearchParams } from "next/navigation"; -import { Suspense, useCallback, useEffect, useRef, useState } from "react"; +import { + Suspense, + useCallback, + useEffect, + useMemo, + useRef, + useState, +} from "react"; import { PlayCanvas, type Phase } from "@/components/PlayCanvas"; import { PRESETS } from "@/lib/presets"; import type { - ClickIntent, - InteractResponse, + Beat, + BeatChoice, + InsertBeatResponse, + Scene, + SceneExit, + SceneResponse, Session, StartResponse, - StoryFrame, VisionResponse, } from "@yume/types"; +// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +// Prefetch pool โ€” speculative SceneResponses keyed by choice path. +// +// Key format: "C1" โ†’ reached by choosing C1 from current scene. +// "C1/C2" โ†’ after C1, then C2 (recursive must-pass prefetch). +// +// When the player picks a change-scene choice, we keep that key's +// descendants (re-rooted) and abort the rest. 
+// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +const PREFETCH_MAX_DEPTH = 3; + +type PrefetchEntry = { + promise: Promise; + abort: AbortController; +}; + +type ScenePathStep = { + fromScene: Scene; + fromVisitedBeats: string[]; + exit: { choiceId: string; label: string; nextSceneSeed: string }; +}; + +function pathKey(steps: ScenePathStep[]): string { + return steps.map((s) => s.exit.choiceId).join("/"); +} + +function buildSpeculativeSession( + base: Session, + steps: ScenePathStep[], +): Session { + // Drop base's current (last) entry and re-add each step's `fromScene` with + // its exit set. Final result has `history.length = base.length - 1 + steps.length`. + const newHistory = [...base.history.slice(0, -1)]; + for (const step of steps) { + newHistory.push({ + scene: step.fromScene, + visitedBeatIds: step.fromVisitedBeats, + exit: { + kind: "choice", + choiceId: step.exit.choiceId, + label: step.exit.label, + nextSceneSeed: step.exit.nextSceneSeed, + }, + }); + } + return { ...base, history: newHistory }; +} + +function findAllChangeSceneChoices(scene: Scene): BeatChoice[] { + const result: BeatChoice[] = []; + const seen = new Set(); + for (const b of scene.beats) { + if (b.next.type === "choice") { + for (const c of b.next.choices) { + if (c.effect.kind === "change-scene" && !seen.has(c.id)) { + seen.add(c.id); + result.push(c); + } + } + } + } + return result; +} + +function findSoleChangeSceneChoice(scene: Scene): BeatChoice | null { + const all = findAllChangeSceneChoices(scene); + return all.length === 1 ? all[0]! 
: null; +} + +function prefetchScenePath( + pool: Map, + baseSession: Session, + steps: ScenePathStep[], + depth: number, +): void { + if (depth >= PREFETCH_MAX_DEPTH) return; + const key = pathKey(steps); + if (pool.has(key)) return; + + const specSession = buildSpeculativeSession(baseSession, steps); + const abort = new AbortController(); + const promise = (async () => { + const res = await fetch("/api/scene", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ session: specSession }), + signal: abort.signal, + }); + if (!res.ok) { + const j = (await res.json().catch(() => ({}))) as { error?: string }; + throw new Error(j.error ?? res.statusText); + } + const data = (await res.json()) as SceneResponse; + + // Recursive: if the resulting scene has exactly one change-scene exit, + // it is a must-pass node โ€” prefetch its child too. + if (depth + 1 < PREFETCH_MAX_DEPTH) { + const sole = findSoleChangeSceneChoice(data.scene); + if (sole && sole.effect.kind === "change-scene") { + const nextStep: ScenePathStep = { + fromScene: data.scene, + fromVisitedBeats: [data.scene.entryBeatId], + exit: { + choiceId: sole.id, + label: sole.label, + nextSceneSeed: sole.effect.nextSceneSeed, + }, + }; + prefetchScenePath(pool, baseSession, [...steps, nextStep], depth + 1); + } + } + + return data; + })(); + + promise.catch(() => {}); + pool.set(key, { promise, abort }); +} + +function consumeChoice( + pool: Map, + choiceId: string, +): PrefetchEntry | undefined { + const my = pool.get(choiceId); + const survivors = new Map(); + for (const [key, entry] of pool) { + if (key === choiceId) continue; + if (key.startsWith(choiceId + "/")) { + survivors.set(key.slice(choiceId.length + 1), entry); + } else { + entry.abort.abort(); + } + } + pool.clear(); + for (const [k, e] of survivors) pool.set(k, e); + return my; +} + +function clearPool(pool: Map): void { + for (const e of pool.values()) e.abort.abort(); + pool.clear(); +} + +// 
โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +// Component +// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + function PlayInner() { const router = useRouter(); const params = useSearchParams(); const [phase, setPhase] = useState("loading-first"); const [session, setSession] = useState(null); + const [currentScene, setCurrentScene] = useState(null); + const [currentBeatId, setCurrentBeatId] = useState(null); const [imageBase64, setImageBase64] = useState(null); - const [frame, setFrame] = useState(null); - const [intent, setIntent] = useState(null); const [pendingClick, setPendingClick] = useState<{ x: number; y: number; } | null>(null); - const [turnNum, setTurnNum] = useState(0); const [error, setError] = useState(null); const [presentation, setPresentation] = useState(false); + const [lastExitLabel, setLastExitLabel] = useState(null); const startedRef = useRef(false); - const prefetchAbortRef = useRef(null); - const prefetchRef = useRef>>({}); + const poolRef = useRef>(new Map()); + // Mirrors for use inside async handlers (closure-stable) + const sessionRef = useRef(null); + const currentSceneRef = useRef(null); + const currentBeatRef = useRef(null); + const visitedBeatsRef = useRef([]); + + const currentBeat = useMemo(() => { + if (!currentScene || !currentBeatId) return null; + return currentScene.beats.find((b) => b.id === currentBeatId) ?? 
null; + }, [currentScene, currentBeatId]); + + useEffect(() => { + sessionRef.current = session; + }, [session]); + useEffect(() => { + currentSceneRef.current = currentScene; + }, [currentScene]); + useEffect(() => { + currentBeatRef.current = currentBeat; + }, [currentBeat]); + + // Whenever currentBeatId changes, append it to visited (skip consecutive dups) + useEffect(() => { + if (!currentBeatId) return; + if (visitedBeatsRef.current.at(-1) === currentBeatId) return; + visitedBeatsRef.current = [...visitedBeatsRef.current, currentBeatId]; + setSession((s) => { + if (!s) return s; + return { + ...s, + history: s.history.map((h, i, arr) => + i === arr.length - 1 + ? { ...h, visitedBeatIds: [...visitedBeatsRef.current] } + : h, + ), + }; + }); + }, [currentBeatId]); + + // โ”€โ”€ Presentation mode toggle โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ const togglePresentation = useCallback(async () => { const entering = !presentation; if (entering) { @@ -43,14 +240,12 @@ function PlayInner() { await document.documentElement.requestFullscreen(); } } catch { - // Browser may refuse fullscreen โ€” still enter chrome-less mode + // ignore โ€” fall through to chrome-less mode anyway } setPresentation(true); } else { try { - if (document.fullscreenElement) { - await document.exitFullscreen(); - } + if (document.fullscreenElement) await document.exitFullscreen(); } catch { // ignore } @@ -69,10 +264,7 @@ function PlayInner() { } } function onFullscreenChange() { - // Sync if user exited browser fullscreen via Esc / system gesture - if (!document.fullscreenElement && presentation) { - setPresentation(false); - } + if (!document.fullscreenElement && presentation) setPresentation(false); } window.addEventListener("keydown", onKey); document.addEventListener("fullscreenchange", onFullscreenChange); @@ -82,6 +274,7 @@ function PlayInner() { }; }, [togglePresentation, presentation]); + // โ”€โ”€ 
Bootstrap: start session โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ useEffect(() => { if (startedRef.current) return; startedRef.current = true; @@ -91,9 +284,7 @@ function PlayInner() { if (presetId) { const p = PRESETS.find((x) => x.id === presetId); - if (p) { - payload = { worldSetting: p.worldSetting, styleGuide: p.styleGuide }; - } + if (p) payload = { worldSetting: p.worldSetting, styleGuide: p.styleGuide }; } else if (params.get("custom") === "1") { const stored = sessionStorage.getItem("yume:custom"); if (stored) { @@ -122,151 +313,176 @@ function PlayInner() { const j = (await r.json().catch(() => ({}))) as { error?: string }; throw new Error(j.error ?? r.statusText); } - return r.json() as Promise; + return (await r.json()) as StartResponse; }) .then((data) => { - setSession({ + const initial: Session = { id: data.sessionId, createdAt: Date.now(), worldSetting: finalPayload.worldSetting, styleGuide: finalPayload.styleGuide, - history: [{ frame: data.frame }], - characters: [], - }); - setFrame(data.frame); + history: [ + { + scene: data.scene, + visitedBeatIds: [data.scene.entryBeatId], + }, + ], + }; + visitedBeatsRef.current = [data.scene.entryBeatId]; + setSession(initial); + setCurrentScene(data.scene); + setCurrentBeatId(data.scene.entryBeatId); setImageBase64(data.imageBase64); setPhase("ready"); - setTurnNum(1); }) .catch((e) => setError(String(e))); }, [params, router]); - // Prefetch next-frame candidates whenever current frame becomes ready. - // All three fire in parallel for fastest cache fill. NOT depending on - // `phase` โ€” we don't want to abort in-flight prefetches just because - // the user clicked. They should continue so handleClick can await them. 
+ // โ”€โ”€ Prefetch on scene entry: L1 + recursive L2/L3 for must-pass โ”€โ”€โ”€โ”€โ”€โ”€ useEffect(() => { - if (!session || !frame) return; + const s = session; + const scene = currentScene; + if (!s || !scene) return; - prefetchAbortRef.current?.abort(); - const ctrl = new AbortController(); - prefetchAbortRef.current = ctrl; - - const choices = frame.uiElements.filter((e) => e.kind === "choice"); - const promises: Record> = {}; - - for (const choice of choices) { - const syntheticIntent: ClickIntent = { - targetId: choice.id, - targetLabel: choice.label, - reasoning: "prefetch", + const exits = findAllChangeSceneChoices(scene); + for (const choice of exits) { + if (choice.effect.kind !== "change-scene") continue; + const step: ScenePathStep = { + fromScene: scene, + // Snapshot of visited beats at prefetch start. Slight drift is OK. + fromVisitedBeats: [...visitedBeatsRef.current], + exit: { + choiceId: choice.id, + label: choice.label, + nextSceneSeed: choice.effect.nextSceneSeed, + }, }; - const p = fetch("/api/interact", { - method: "POST", - headers: { "Content-Type": "application/json" }, - body: JSON.stringify({ session, intent: syntheticIntent }), - signal: ctrl.signal, - }).then(async (r) => { - if (!r.ok) { - const j = (await r.json().catch(() => ({}))) as { error?: string }; - throw new Error(j.error ?? r.statusText); - } - return r.json() as Promise; - }); - p.catch(() => {}); - promises[choice.id] = p; + prefetchScenePath(poolRef.current, s, [step], 0); } + }, [currentScene?.id, session?.id]); - prefetchRef.current = promises; - + // Abort all in-flight speculative prefetches when the page unmounts, so we + // stop paying for background scene/image generation. Empty deps โ†’ fires only + // on unmount; it must NOT run on scene transitions, which rely on + // consumeChoice keeping the re-rooted survivor prefetches alive. 
+ useEffect(() => { + const pool = poolRef.current; return () => { - ctrl.abort(); + clearPool(pool); }; - }, [frame?.id, session?.id]); + }, []); - // โ”€โ”€ Shared result applier โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ - async function applyInteractResult( - resultPromise: Promise, - clickIntent: ClickIntent, - click?: { x: number; y: number }, - ) { - const result = await resultPromise; - // Overwrite synthetic prefetch intent with the real click intent - const lastIdx = result.session.history.length - 1; - const patched: InteractResponse = { - ...result, - intent: clickIntent, - session: { - ...result.session, - history: result.session.history.map((entry, idx) => - idx === lastIdx ? { ...entry, click, intent: clickIntent } : entry, - ), - }, - }; - const updatedHistory = [ - ...patched.session.history, - { frame: patched.frame }, - ]; - setSession({ ...patched.session, history: updatedHistory }); - setFrame(patched.frame); - setImageBase64(patched.imageBase64); - setIntent(clickIntent); - setPendingClick(null); - setTurnNum((t) => t + 1); - setPhase("ready"); + // โ”€โ”€ Handlers โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + + function onAdvance() { + if (phase !== "ready") return; + const beat = currentBeatRef.current; + if (!beat || beat.next.type !== "continue") return; + setCurrentBeatId(beat.next.nextBeatId); } - // โ”€โ”€ HTML button click โ€” bypasses Vision entirely โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ - async function handleChoiceSelect(choiceId: string, label: string) { - if (phase !== "ready" || !session) return; - setPhase("interacting"); - setIntent(null); - - const clickIntent: ClickIntent = { - targetId: choiceId, - targetLabel: label, - reasoning: "direct-button-click", - }; - - const 
cacheSnapshot = prefetchRef.current; - const cached = cacheSnapshot[choiceId]; - + async function performSceneTransition( + source: PrefetchEntry | Promise, + exit: SceneExit, + visitedForCurrent: string[], + exitLabel: string, + ) { + setPhase("transitioning"); + setPendingClick(null); try { - if (cached) { - // Cache hit โ€” zero extra wait - await applyInteractResult(cached, clickIntent); - } else { - // Cache miss โ€” call interact directly (no Vision roundtrip) - prefetchAbortRef.current?.abort(); - const res = await fetch("/api/interact", { - method: "POST", - headers: { "Content-Type": "application/json" }, - body: JSON.stringify({ session, intent: clickIntent }), - }); - if (!res.ok) { - const j = (await res.json().catch(() => ({}))) as { error?: string }; - throw new Error(j.error ?? res.statusText); - } - await applyInteractResult( - res.json() as Promise, - clickIntent, - ); - } + const result = await ("promise" in source ? source.promise : source); + + const base = sessionRef.current; + if (!base) throw new Error("Session lost mid-transition"); + + const closedHistory = base.history.map((h, i, arr) => + i === arr.length - 1 + ? 
{ ...h, visitedBeatIds: visitedForCurrent, exit } + : h, + ); + const newSession: Session = { + ...base, + history: [ + ...closedHistory, + { + scene: result.scene, + visitedBeatIds: [result.scene.entryBeatId], + }, + ], + }; + visitedBeatsRef.current = [result.scene.entryBeatId]; + setSession(newSession); + setCurrentScene(result.scene); + setCurrentBeatId(result.scene.entryBeatId); + setImageBase64(result.imageBase64); + setLastExitLabel(exitLabel); + setPhase("ready"); } catch (e) { + if ((e as { name?: string }).name === "AbortError") { + setPhase("ready"); + return; + } setError(String(e)); - setPendingClick(null); setPhase("ready"); } } - // โ”€โ”€ Background / free-form click โ€” still uses Vision โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ - async function handleClick(click: { x: number; y: number }) { - if (phase !== "ready" || !session || !imageBase64) return; - setPhase("interacting"); - setPendingClick(click); - setIntent(null); + function onSelectChoice(choice: BeatChoice) { + if (phase !== "ready" || !session || !currentScene) return; - const cacheSnapshot = prefetchRef.current; + if (choice.effect.kind === "advance-beat") { + // Pure local jump. No network. No pool changes. 
+ setCurrentBeatId(choice.effect.targetBeatId); + return; + } + + const visited = [...visitedBeatsRef.current]; + const exit: SceneExit = { + kind: "choice", + choiceId: choice.id, + label: choice.label, + nextSceneSeed: choice.effect.nextSceneSeed, + }; + + const cached = consumeChoice(poolRef.current, choice.id); + if (cached) { + void performSceneTransition(cached, exit, visited, choice.label); + return; + } + + // Cold path โ€” start a fresh fetch + const step: ScenePathStep = { + fromScene: currentScene, + fromVisitedBeats: visited, + exit: { + choiceId: choice.id, + label: choice.label, + nextSceneSeed: choice.effect.nextSceneSeed, + }, + }; + const specSession = buildSpeculativeSession(session, [step]); + clearPool(poolRef.current); + + const promise = (async () => { + const res = await fetch("/api/scene", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ session: specSession }), + }); + if (!res.ok) { + const j = (await res.json().catch(() => ({}))) as { error?: string }; + throw new Error(j.error ?? res.statusText); + } + return (await res.json()) as SceneResponse; + })(); + + void performSceneTransition(promise, exit, visited, choice.label); + } + + async function onBackgroundClick(click: { x: number; y: number }) { + if (phase !== "ready" || !session || !currentScene || !imageBase64) return; + setPhase("vision-thinking"); + setPendingClick(click); try { const visionRes = await fetch("/api/vision", { @@ -280,32 +496,99 @@ function PlayInner() { }; throw new Error(j.error ?? visionRes.statusText); } - const { intent: clickIntent } = - (await visionRes.json()) as VisionResponse; + const decision = (await visionRes.json()) as VisionResponse; - const cached = clickIntent.targetId - ? 
cacheSnapshot[clickIntent.targetId] - : undefined; - - if (cached) { - await applyInteractResult(cached, clickIntent, click); - } else { - prefetchAbortRef.current?.abort(); - const liveRes = await fetch("/api/interact", { + if (decision.classify === "insert-beat") { + setPhase("inserting-beat"); + const insertRes = await fetch("/api/insert-beat", { method: "POST", headers: { "Content-Type": "application/json" }, - body: JSON.stringify({ session, intent: clickIntent, click }), + body: JSON.stringify({ + session, + freeformAction: decision.intent.freeformAction, + }), }); - if (!liveRes.ok) { - const j = (await liveRes.json().catch(() => ({}))) as { + if (!insertRes.ok) { + const j = (await insertRes.json().catch(() => ({}))) as { error?: string; }; - throw new Error(j.error ?? liveRes.statusText); + throw new Error(j.error ?? insertRes.statusText); } - await applyInteractResult( - liveRes.json() as Promise, - clickIntent, - click, + const { partial } = (await insertRes.json()) as InsertBeatResponse; + + const fromBeatId = + currentBeatRef.current?.id ?? currentScene.entryBeatId; + const newBeatId = `b_ins_${Date.now()}_${Math.random() + .toString(36) + .slice(2, 6)}`; + const newBeat: Beat = { + id: newBeatId, + narration: partial.narration, + speaker: partial.speaker, + line: partial.line, + next: { type: "continue", nextBeatId: fromBeatId }, + }; + + const patched: Scene = { + ...currentScene, + beats: [...currentScene.beats, newBeat], + }; + + setSession((s) => + s + ? { + ...s, + history: s.history.map((h, i, arr) => + i === arr.length - 1 ? 
{ ...h, scene: patched } : h, + ), + } + : s, + ); + setCurrentScene(patched); + setCurrentBeatId(newBeatId); + setLastExitLabel(decision.intent.freeformAction); + setPhase("ready"); + setPendingClick(null); + } else { + const exit: SceneExit = { + kind: "freeform", + action: decision.intent.freeformAction, + }; + const visited = [...visitedBeatsRef.current]; + const base = sessionRef.current; + if (!base) { + setPhase("ready"); + setPendingClick(null); + return; + } + const specSession: Session = { + ...base, + history: base.history.map((h, i, arr) => + i === arr.length - 1 ? { ...h, visitedBeatIds: visited, exit } : h, + ), + }; + clearPool(poolRef.current); + + const promise = (async () => { + const res = await fetch("/api/scene", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ session: specSession }), + }); + if (!res.ok) { + const j = (await res.json().catch(() => ({}))) as { + error?: string; + }; + throw new Error(j.error ?? res.statusText); + } + return (await res.json()) as SceneResponse; + })(); + + await performSceneTransition( + promise, + exit, + visited, + decision.intent.freeformAction, ); } } catch (e) { @@ -315,6 +598,8 @@ function PlayInner() { } } + // โ”€โ”€ Render โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + if (error) { return (
@@ -343,16 +628,20 @@ function PlayInner() {
); } + const sceneCount = session?.history.length ?? 0; + const beatCount = visitedBeatsRef.current.length; + return (
@@ -364,7 +653,9 @@ function PlayInner() { 云梦
- 第 · {String(turnNum).padStart(3, "0")} · 帧 + 第 · {String(sceneCount).padStart(3, "0")} · 幕 + · + {String(beatCount).padStart(3, "0")} · 拍 · {session?.id.slice(2, 14) ?? "—"} @@ -376,22 +667,23 @@ function PlayInner() {
{phase === "loading-first" && (

- 正 · 在 · 唤 · 起 · 第 · 一 · 帧 + 正 · 在 · 唤 · 起 · 第 · 一 · 幕

)} - {phase === "ready" && intent?.targetLabel && ( + {phase === "ready" && lastExitLabel && (

上 · 一 · 步 · - {intent.targetLabel} + {lastExitLabel}

)}
diff --git a/apps/web/components/PlayCanvas.tsx b/apps/web/components/PlayCanvas.tsx index 19405e6..51acdaa 100644 --- a/apps/web/components/PlayCanvas.tsx +++ b/apps/web/components/PlayCanvas.tsx @@ -1,34 +1,70 @@ "use client"; -import { useEffect, useRef, useState } from "react"; -import type { StoryFrame } from "@yume/types"; +import { useCallback, useEffect, useRef, useState } from "react"; +import type { Beat, BeatChoice } from "@yume/types"; -export type Phase = "loading-first" | "ready" | "interacting"; +export type Phase = + | "loading-first" // first scene not yet rendered + | "ready" // current beat is interactive + | "vision-thinking" // background click โ†’ waiting on vision verdict + | "inserting-beat" // vision-driven beat being generated + | "transitioning"; // changing scenes (cache miss or speculative wait) const SHADOW = "0 1px 0 rgba(45,24,16,0.05), 0 36px 64px -28px rgba(45,24,16,0.25), 0 8px 18px -6px rgba(45,24,16,0.10)"; // โ”€โ”€ Typewriter hook โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ -function useTypewriter(text: string, speed = 28): string { +// Returns the progressively-revealed text, a `done` flag, and a `skip()` that +// instantly completes the current text. Reset is keyed by `resetKey` (the beat +// id) rather than the text, so a new beat whose line happens to match the +// previous one still replays from scratch. `done` is derived synchronously +// (not from a post-paint effect) so a stale "done" frame never paints. 
+function useTypewriter( + text: string, + resetKey: string, + speed = 28, +): { shown: string; done: boolean; skip: () => void } { const [displayed, setDisplayed] = useState(""); - const textRef = useRef(text); + const [prevKey, setPrevKey] = useState(resetKey); + const timer = useRef | null>(null); + + // Render-phase reset (React "adjust state on prop change" pattern): when the + // beat changes, drop the old progress before this render commits. + if (resetKey !== prevKey) { + setPrevKey(resetKey); + setDisplayed(""); + } useEffect(() => { - // Reset immediately when the text changes - setDisplayed(""); - textRef.current = text; if (!text) return; - let i = 0; - const id = setInterval(() => { + timer.current = setInterval(() => { i += 1; setDisplayed(text.slice(0, i)); - if (i >= text.length) clearInterval(id); + if (i >= text.length && timer.current) { + clearInterval(timer.current); + timer.current = null; + } }, speed); - return () => clearInterval(id); - }, [text, speed]); + return () => { + if (timer.current) clearInterval(timer.current); + timer.current = null; + }; + }, [resetKey, text, speed]); - return displayed; + const skip = useCallback(() => { + if (timer.current) { + clearInterval(timer.current); + timer.current = null; + } + setDisplayed(text); + }, [text]); + + // During the throwaway render where the beat just changed, `displayed` still + // holds the previous beat's text โ€” coerce it to empty so nothing stale shows. + const shown = resetKey === prevKey ? 
displayed : ""; + const done = text.length === 0 || shown.length >= text.length; + return { shown, done, skip }; } // โ”€โ”€ Choice button โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ @@ -59,7 +95,6 @@ function ChoiceButton({ boxShadow: "0 2px 12px rgba(0,0,0,0.4), inset 0 1px 0 rgba(200,165,90,0.12)", }} > - {/* Hover shimmer overlay */} void; - onSelectChoice?: (choiceId: string, label: string) => void; + onBackgroundClick: (click: { x: number; y: number }) => void; + onAdvance: () => void; + onSelectChoice: (choice: BeatChoice) => void; fullViewport?: boolean; }) { const imgRef = useRef(null); const [dims, setDims] = useState<{ w: number; h: number } | null>(null); - const choices = frame?.uiElements.filter((e) => e.kind === "choice") ?? []; - const dialogueText = frame - ? [frame.speaker ? `${frame.speaker}๏ผš${frame.line ?? ""}` : frame.line, frame.narration] - .filter(Boolean) - .join("\n") - : ""; - const narrationOnly = !frame?.speaker && !frame?.line && !!frame?.narration; - const displayBody = frame?.speaker - ? frame.line ?? "" - : frame?.narration ?? ""; + const isChoiceBeat = beat?.next.type === "choice"; + const choices: BeatChoice[] = isChoiceBeat + ? (beat!.next as { type: "choice"; choices: BeatChoice[] }).choices + : []; - const typedBody = useTypewriter(displayBody, 30); + const displayBody = beat?.speaker ? beat.line ?? "" : beat?.narration ?? ""; + const { shown: typedBody, done: typingDone, skip: skipTypewriter } = + useTypewriter(displayBody, beat?.id ?? 
"", 30); - function handleClick(e: React.MouseEvent) { - if (phase !== "ready" || !imgRef.current) return; + function handleImageClick(e: React.MouseEvent) { + if (phase !== "ready" || !imgRef.current || !beat) return; const rect = imgRef.current.getBoundingClientRect(); const x = (e.clientX - rect.left) / rect.width; const y = (e.clientY - rect.top) / rect.height; - onClick({ + // If the typewriter is still printing, a click completes it instantly + // (standard VN affordance) โ€” the page never sees this click. + if (!typingDone) { + skipTypewriter(); + return; + } + // For continue-type beats, image click advances; for choice beats, + // image click goes through vision (treat as freeform action). + if (beat.next.type === "continue") { + onAdvance(); + return; + } + onBackgroundClick({ x: Math.max(0, Math.min(1, x)), y: Math.max(0, Math.min(1, y)), }); } const interactive = phase === "ready" && !!imageBase64; - const dimmed = phase === "interacting"; + const dimmed = phase === "transitioning"; const sizeStyle = fullViewport ? { maxWidth: "100vw", maxHeight: "100dvh" } @@ -141,6 +186,13 @@ export function PlayCanvas({ ? "min(100vw, calc(100dvh * 16 / 9))" : "min(96vw, calc((100dvh - 200px) * 16 / 9))"; + const footerHint = + phase === "ready" + ? isChoiceBeat + ? "้€‰ ยท ๆ‹ฉ ยท ไธ€ ยท ้กน" + : "็‚น ยท ๅ‡ป ยท ๆŽจ ยท ่ฟ›" + : "ยทยทยท"; + return (
- {/* โ”€โ”€ Background image โ”€โ”€ */} + {/* Background image */} Generated frame { const img = e.currentTarget; setDims({ w: img.naturalWidth, h: img.naturalHeight }); @@ -168,37 +220,27 @@ export function PlayCanvas({ style={sizeStyle} /> - {/* โ”€โ”€ Top/bottom gradient vignette โ”€โ”€ */} {!fullViewport && ( - <> -
- +
)} - {/* โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ• - PREFAB UI OVERLAY โ€” rendered on top of image - โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ• */} - {frame && ( + {beat && (
- {/* โ”€โ”€ Choices row โ”€โ”€ */} {choices.length > 0 && ( -
+
{choices.map((choice, i) => ( onSelectChoice?.(choice.id, choice.label)} + onClick={() => onSelectChoice(choice)} /> ))}
)} - {/* โ”€โ”€ Dialogue / narration box โ”€โ”€ */} - {(frame.narration || frame.line) && ( + {(beat.narration || beat.line) && (
- {/* Inner golden corner decoration */} - {/* Speaker name tag */} - {frame.speaker && ( + {beat.speaker && (

- {frame.speaker} + {beat.speaker}

)} - {/* Main text */}

{typedBody} - {/* Narration only โ€” also show secondary line */} - {frame.speaker && frame.narration && ( + {beat.speaker && beat.narration && ( - {frame.narration} + {beat.narration} )}

- {/* Scroll hint โ–ผ */} - - โ–ผ - + {typingDone && beat.next.type === "continue" && ( + + โ–ผ + + )}
)}
)} - {/* Loading/interacting dim overlay */} - {phase === "interacting" && ( + {(phase === "transitioning" || phase === "inserting-beat") && (

- AI · 正 · 在 · 描 · 画 · 下 · 一 · 刻 + {phase === "transitioning" + ? "AI · 正 · 在 · 描 · 画 · 下 · 一 · 幕" + : "AI · 正 · 在 · 想 · 你 · 看 · 到 · 了 · 什 · 么"}

)} - {/* Click ripple indicator */} {pendingClick && ( <>

- 正 · 在 · 绘 · 制 · 第 · 一 · 帧 + 正 · 在 · 绘 · 制 · 第 · 一 · 幕

)} @@ -330,9 +372,7 @@ export function PlayCanvas({ {dims ? `${dims.w} ร— ${dims.h} ยท png` : "โ€”"} - - {phase === "ready" ? (choices.length > 0 ? "้€‰ ยท ๆ‹ฉ ยท ไธ€ ยท ้กน" : "ไปป ยท ๆ„ ยท ็‚น ยท ๅ‡ป") : "ยทยทยท"} - + {footerHint}
)}
diff --git a/packages/engine/src/director.ts b/packages/engine/src/director.ts index c478cc9..3ef8d43 100644 --- a/packages/engine/src/director.ts +++ b/packages/engine/src/director.ts @@ -1,20 +1,239 @@ import { chat } from "@yume/ai-client"; -import type { ProviderConfig, Session, StoryFrame, UIElement } from "@yume/types"; +import type { + Beat, + BeatChoice, + BeatChoiceEffect, + BeatNext, + ProviderConfig, + Scene, + Session, +} from "@yume/types"; import { parseJsonLoose } from "./jsonParser"; -import { DIRECTOR_SYSTEM, buildDirectorUserMessage } from "./prompts"; +import { + DIRECTOR_SYSTEM, + INSERT_BEAT_SYSTEM, + buildDirectorUserMessage, + buildInsertBeatUserMessage, +} from "./prompts"; -type DirectorOutput = { +// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +// Raw shape produced by the model โ€” we coerce + validate into a Scene. 
+// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +type RawEffect = { + kind?: string; + targetBeatId?: string; + nextSceneSeed?: string; +}; + +type RawChoice = { + id?: string; + label?: string; + effect?: RawEffect; +}; + +type RawNext = { + type?: string; + nextBeatId?: string; + choices?: RawChoice[]; +}; + +type RawBeat = { + id?: string; narration?: string; speaker?: string; line?: string; - scenePrompt: string; - uiElements: UIElement[]; + next?: RawNext; }; -export async function direct( +type RawScene = { + scenePrompt?: string; + entryBeatId?: string; + beats?: RawBeat[]; +}; + +function coerceEffect(raw: RawEffect | undefined): BeatChoiceEffect { + if (raw?.kind === "advance-beat" && raw.targetBeatId?.trim()) { + return { kind: "advance-beat", targetBeatId: raw.targetBeatId.trim() }; + } + return { + kind: "change-scene", + nextSceneSeed: raw?.nextSceneSeed?.trim() || "ๆœชๆŒ‡ๅฎš", + }; +} + +function coerceChoice(raw: RawChoice, idx: number): BeatChoice { + return { + id: raw.id?.trim() || `c${idx + 1}`, + label: raw.label?.trim() || `้€‰้กน ${idx + 1}`, + effect: coerceEffect(raw.effect), + }; +} + +function coerceNext(raw: RawNext | undefined, fallbackBeatId: string): BeatNext { + if (raw?.type === "choice" && Array.isArray(raw.choices) && raw.choices.length) { + return { + type: "choice", + choices: raw.choices.map((c, i) => coerceChoice(c, i)), + }; + } + return { + type: "continue", + nextBeatId: raw?.nextBeatId?.trim() || fallbackBeatId, + }; +} + +function coerceBeat(raw: RawBeat, idx: number, totalBeats: number): Beat { + const id = raw.id?.trim() || `b${idx + 1}`; + // Non-last beats default their `continue` target to the following beat. 
+ // The last beat gets an empty fallback on purpose: repairBeats() turns a + // last/dangling continue into a real scene-change exit so the player can + // never get stuck self-looping on it. + const fallback = idx + 1 < totalBeats ? `b${idx + 2}` : ""; + return { + id, + narration: raw.narration?.trim() || undefined, + speaker: raw.speaker?.trim() || undefined, + line: raw.line?.trim() || undefined, + next: coerceNext(raw.next, fallback), + }; +} + +const FALLBACK_SEED = "ๆ•…ไบ‹็ปง็ปญๆŽจ่ฟ›"; + +function fallbackExitChoice(beatId: string): BeatChoice { + return { + id: `${beatId}__exit`, + label: "็ปง็ปญ", + effect: { kind: "change-scene", nextSceneSeed: FALLBACK_SEED }, + }; +} + +// Beat ids are graph keys (the front-end's `beats.find(b => b.id === ...)`, +// the session's `visitedBeatIds`, and `continue`/`advance-beat` targets). If +// the model reuses an id across beats, the second occurrence becomes silently +// unreachable and external references collapse to the first beat. Rename +// duplicates; rewrite the renamed beat's OWN self-references (the most +// natural interpretation of a duplicate id being referenced from inside that +// same beat). External references stay pointing at the first occurrence. +function ensureUniqueBeatIds(beats: Beat[]): Beat[] { + const seen = new Set(); + return beats.map((b): Beat => { + if (!seen.has(b.id)) { + seen.add(b.id); + return b; + } + const oldId = b.id; + let n = 2; + while (seen.has(`${oldId}_${n}`)) n += 1; + const newId = `${oldId}_${n}`; + seen.add(newId); + + let next = b.next; + if (next.type === "continue" && next.nextBeatId === oldId) { + next = { type: "continue", nextBeatId: newId }; + } else if (next.type === "choice") { + next = { + type: "choice", + choices: next.choices.map((c) => + c.effect.kind === "advance-beat" && c.effect.targetBeatId === oldId + ? 
{ + ...c, + effect: { kind: "advance-beat" as const, targetBeatId: newId }, + } + : c, + ), + }; + } + return { ...b, id: newId, next }; + }); +} + +// Repairs referential integrity AND guarantees the scene is escapable: +// - a `continue` to a missing/self id is repointed to the next beat in order; +// a last/dangling continue with nowhere to go becomes a scene-change exit +// (never a self-loop, which would strand the player on "click to advance") +// - an `advance-beat` to a missing id is downgraded to a scene change +// - if no change-scene exit exists anywhere, one is appended to the last beat +function repairBeats(beats: Beat[]): Beat[] { + const ids = new Set(beats.map((b) => b.id)); + + const fixed: Beat[] = beats.map((b, idx): Beat => { + if (b.next.type === "continue") { + const target = b.next.nextBeatId; + if (ids.has(target) && target !== b.id) return b; + const nextByIndex = beats[idx + 1]?.id; + if (nextByIndex) { + return { ...b, next: { type: "continue", nextBeatId: nextByIndex } }; + } + return { ...b, next: { type: "choice", choices: [fallbackExitChoice(b.id)] } }; + } + + const patched = b.next.choices.map((c) => + c.effect.kind === "advance-beat" && !ids.has(c.effect.targetBeatId) + ? { + ...c, + effect: { + kind: "change-scene" as const, + nextSceneSeed: "ๆœชๆŒ‡ๅฎš๏ผˆๅฏผๆผ”ๅผ•็”จไธๅญ˜ๅœจ็š„ beat๏ผŒๅทฒ้™็บงไธบๆขๅœบ๏ผ‰", + }, + } + : c, + ); + return { ...b, next: { type: "choice", choices: patched } }; + }); + + const hasExit = fixed.some( + (b) => + b.next.type === "choice" && + b.next.choices.some((c) => c.effect.kind === "change-scene"), + ); + if (!hasExit && fixed.length > 0) { + const lastIdx = fixed.length - 1; + const last = fixed[lastIdx]!; + const existing = last.next.type === "choice" ? 
last.next.choices : []; + fixed[lastIdx] = { + ...last, + next: { type: "choice", choices: [...existing, fallbackExitChoice(last.id)] }, + }; + } + + return fixed; +} + +// Choice ids are the keys the front-end uses to cache and consume prefetched +// scenes. Two beats both defaulting to c1/c2 (or the model reusing ids across +// beats) would make a transition reuse the WRONG prefetched scene โ€” so force +// every choice id to be unique within the scene. +function ensureUniqueChoiceIds(beats: Beat[]): Beat[] { + const seen = new Set(); + for (const b of beats) { + if (b.next.type !== "choice") continue; + for (const c of b.next.choices) { + if (seen.has(c.id)) { + let n = 2; + while (seen.has(`${c.id}_${n}`)) n += 1; + c.id = `${c.id}_${n}`; + } + seen.add(c.id); + } + } + return beats; +} + +function newSceneId(): string { + return `scene_${Date.now()}_${Math.random().toString(36).slice(2, 6)}`; +} + +// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +// directScene โ€” generates one Scene (multi-beat) for the player. +// Called both on real scene transitions AND on speculative prefetch. +// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +export async function directScene( config: ProviderConfig, session: Session, -): Promise { +): Promise { const raw = await chat( config, [ @@ -24,14 +243,71 @@ export async function direct( { temperature: 0.9, responseFormat: "json_object" }, ); - const parsed = parseJsonLoose(raw); + const parsed = parseJsonLoose(raw); + const rawBeats = Array.isArray(parsed.beats) ? 
parsed.beats : []; + if (rawBeats.length === 0) { + throw new Error("Director returned no beats"); + } + + const beats = ensureUniqueChoiceIds( + repairBeats( + ensureUniqueBeatIds( + rawBeats.map((b, i) => coerceBeat(b, i, rawBeats.length)), + ), + ), + ); + + const declaredEntry = parsed.entryBeatId?.trim(); + const entryBeatId = + declaredEntry && beats.some((b) => b.id === declaredEntry) + ? declaredEntry + : beats[0]!.id; return { - id: `frame_${Date.now()}`, - narration: parsed.narration?.trim() || undefined, - speaker: parsed.speaker?.trim() || undefined, - line: parsed.line?.trim() || undefined, - scenePrompt: parsed.scenePrompt, - uiElements: parsed.uiElements ?? [], + id: newSceneId(), + scenePrompt: parsed.scenePrompt?.trim() || "an empty scene", + beats, + entryBeatId, }; } + +// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +// directInsertBeat โ€” generates a one-off transient beat in response to +// a freeform vision action that stays in-scene. Used by /api/insert-beat. 
+// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +export async function directInsertBeat( + config: ProviderConfig, + session: Session, + freeformAction: string, +): Promise<{ narration?: string; speaker?: string; line?: string }> { + const raw = await chat( + config, + [ + { role: "system", content: INSERT_BEAT_SYSTEM }, + { + role: "user", + content: buildInsertBeatUserMessage(session, freeformAction), + }, + ], + { temperature: 0.9, responseFormat: "json_object" }, + ); + + const parsed = parseJsonLoose<{ + narration?: string; + speaker?: string; + line?: string; + }>(raw); + + const narration = parsed.narration?.trim() || undefined; + const speaker = parsed.speaker?.trim() || undefined; + const line = parsed.line?.trim() || undefined; + + // If the model returned nothing usable, supply a fallback narration so the + // frontend doesn't append a silent empty beat that renders no dialogue โ€” + // which would make the click appear to do nothing. 
+ if (!narration && !speaker && !line) { + return { narration: "๏ผˆไฝ ๅœไธ‹่„šๆญฅ๏ผŒ็Žฏ่ง†็‰‡ๅˆปใ€‚๏ผ‰" }; + } + return { narration, speaker, line }; +} diff --git a/packages/engine/src/index.ts b/packages/engine/src/index.ts index f4915fb..7ddb445 100644 --- a/packages/engine/src/index.ts +++ b/packages/engine/src/index.ts @@ -1,3 +1,8 @@ -export { startSession, takeTurn, visionTurn } from "./orchestrator"; +export { + startSession, + requestScene, + visionDecide, + requestInsertBeat, +} from "./orchestrator"; export { annotateClick } from "./annotate"; export * from "./prompts"; diff --git a/packages/engine/src/orchestrator.ts b/packages/engine/src/orchestrator.ts index f0b47ec..339fa8f 100644 --- a/packages/engine/src/orchestrator.ts +++ b/packages/engine/src/orchestrator.ts @@ -1,8 +1,9 @@ import type { - ClickIntent, EngineConfig, - InteractRequest, - InteractResponse, + InsertBeatRequest, + InsertBeatResponse, + SceneRequest, + SceneResponse, Session, StartRequest, StartResponse, @@ -10,7 +11,7 @@ import type { VisionResponse, } from "@yume/types"; import { annotateClick } from "./annotate"; -import { direct } from "./director"; +import { directInsertBeat, directScene } from "./director"; import { render } from "./renderer"; import { interpret } from "./vision"; @@ -18,6 +19,10 @@ function newSessionId(): string { return `s_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`; } +// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +// startSession โ€” first scene + image +// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + export async function startSession( config: EngineConfig, req: StartRequest, @@ -30,51 +35,56 @@ export async 
function startSession( history: [], }; - const frame = await direct(config.text, session); - const imageBase64 = await render(config.image, frame, session.styleGuide); + const scene = await directScene(config.text, session); + const imageBase64 = await render(config.image, scene, session.styleGuide); return { sessionId: session.id, - frame, + scene, imageBase64, }; } -export async function visionTurn( +// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +// requestScene โ€” generate the NEXT scene + image. +// Frontend passes a session whose latest history entry has `exit` set. +// Also used for prefetch speculation (frontend synthesizes the exit). +// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +export async function requestScene( + config: EngineConfig, + req: SceneRequest, +): Promise { + const scene = await directScene(config.text, req.session); + const imageBase64 = await render(config.image, scene, req.session.styleGuide); + return { scene, imageBase64 }; +} + +// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +// visionDecide โ€” interprets a background click into intent + classify. 
+// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +export async function visionDecide( config: EngineConfig, req: VisionRequest, ): Promise { const annotated = await annotateClick(req.prevImageBase64, req.click); - const lastFrame = req.session.history.at(-1)?.frame; - const uiElements = lastFrame?.uiElements ?? []; - const intent = await interpret(config.vision, annotated, uiElements); - return { intent }; + const current = req.session.history.at(-1)?.scene ?? null; + return interpret(config.vision, annotated, current); } -export async function takeTurn( +// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +// requestInsertBeat โ€” generates a transient in-scene beat (no image regen) +// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +export async function requestInsertBeat( config: EngineConfig, - req: InteractRequest, -): Promise { - const updatedSession: Session = { - ...req.session, - history: req.session.history.map((entry, idx, arr) => - idx === arr.length - 1 - ? 
{ ...entry, click: req.click, intent: req.intent } - : entry, - ), - }; - - const nextFrame = await direct(config.text, updatedSession); - const nextImage = await render( - config.image, - nextFrame, - updatedSession.styleGuide, + req: InsertBeatRequest, +): Promise { + const partial = await directInsertBeat( + config.text, + req.session, + req.freeformAction, ); - - return { - session: updatedSession, - frame: nextFrame, - imageBase64: nextImage, - intent: req.intent, - }; + return { partial }; } diff --git a/packages/engine/src/prompts.ts b/packages/engine/src/prompts.ts index d2594d0..4af26c2 100644 --- a/packages/engine/src/prompts.ts +++ b/packages/engine/src/prompts.ts @@ -1,28 +1,76 @@ -import type { Character, Session, StoryFrame, UIElement } from "@yume/types"; +import type { Scene, Session } from "@yume/types"; +// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +// Director โ€” emits one Scene (background + a graph of beats) at a time. 
+// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ -export const DIRECTOR_SYSTEM = `ไฝ ๆ˜ฏไธ€ไธชไบคไบ’่ง†่ง‰ๅฐ่ฏด็š„็ผ–ๅ‰งๅฏผๆผ”ใ€‚ๆฏๆฌกๆ นๆฎไธ–็•Œ่ง‚ใ€็”ป้ฃŽๅ’Œๅކๅฒ๏ผŒ่พ“ๅ‡บๅฝ“ๅ‰็”ป้ข่ฆๅ‘ˆ็Žฐ็š„ๅ†…ๅฎนใ€‚ +export const DIRECTOR_SYSTEM = `ไฝ ๆ˜ฏไธ€ไธชไบคไบ’่ง†่ง‰ๅฐ่ฏด็š„ใ€Œๅœบๆ™ฏๅฏผๆผ”ใ€ใ€‚ๆฏๆฌกๅŸบไบŽไธ–็•Œ่ง‚ใ€็”ป้ฃŽใ€็Žฉๅฎถๅކๅฒ๏ผŒ่พ“ๅ‡บ**ไธ€ไธชๅฎŒๆ•ด็š„ๅœบๆ™ฏ**ใ€‚ + +ไธ€ไธชๅœบๆ™ฏๅŒ…ๅซ๏ผš +- ไธ€ๅผ ่ƒŒๆ™ฏๅ›พ๏ผˆไฝ ็ป™ๅ‡บ่‹ฑๆ–‡ scenePrompt๏ผ‰ +- ไธ€็ป„ๅฏน่ฏ่Š‚ๆ‹ beats๏ผŒ็ŽฉๅฎถไผšๆŒ‰้กบๅบ็ปๅކๅฎƒไปฌ + +ๆฏไธช beat ๆ˜ฏ็Žฉๅฎถไผš็œ‹ๅˆฐ็š„ไธ€ๆฎตๅ™่ฟฐ / ๅฏน่ฏ / ้€‰ๆ‹ฉใ€‚beat ไน‹้—ด้€š่ฟ‡ next ๅญ—ๆฎต่ฟžๆŽฅ๏ผš +- "continue": ็Žฉๅฎถ็‚นๅ‡ปๅ›พ็‰‡่ƒŒๆ™ฏ / ๆŒ‰็ปง็ปญ๏ผŒ่‡ช็„ถๆŽจ่ฟ›ๅˆฐไธ‹ไธ€ไธช beat +- "choice": ๅœจๆญค่ฎฉ็Žฉๅฎถๅš้€‰ๆ‹ฉ๏ผŒๆŒ‰ๆ‰€้€‰ choice ็š„ effect ่ตฐๅ‘ + +choice ็š„ effect ๆœ‰ไธค็ง๏ผš +- "advance-beat": ็Žฉๅฎถ้€‰ไบ†ไน‹ๅŽ่ทณๅˆฐ**ๅŒๅœบๆ™ฏๅ†…**็š„ๅฆไธ€ไธช beat๏ผˆไธๆข่ƒŒๆ™ฏๅ›พ๏ผŒ้€Ÿๅบฆๆžๅฟซ๏ผ‰ +- "change-scene": ็Žฉๅฎถ้€‰ไบ†ไน‹ๅŽๅˆ‡ๆขๅˆฐ**ๆ–ฐๅœบๆ™ฏ**๏ผˆ่ง†่ง’ๅ˜ไบ† / ่ตฐๅˆฐๆ–ฐๅœฐๆ–น / ๆ—ถ้—ด่ทณไบ†๏ผ‰ + +่ฎพ่ฎกๅŽŸๅˆ™๏ผš +- ๅŒๅœบๆ™ฏๅ†… beat ๆ•ฐ่‡ช็”ฑๅ‘ๆŒฅ๏ผŒๆŒ‰ๅ‰งๆƒ…่Š‚ๅฅ่‡ช็„ถ็ป™ๅ‡บ๏ผˆ้€šๅธธ 2โ€“6 ไธช๏ผŒๅฏไปฅๆ›ดๅคš๏ผ‰ +- ๅคš็”จ continue๏ผŒๅฐ‘็”จ choice โ€” ้€‰ๆ‹ฉๅชๅบ”ๅ‡บ็Žฐๅœจใ€Œ็œŸๆญฃ็š„ๅฒ”่ทฏๅฃใ€ +- advance-beat ้€‚ๅˆๅค„็†ๅฏน่ฏๅˆ†ๆ”ฏ๏ผˆๅŒไธ€ๅœบๆ™ฏ้‡Œๆขไธช่ฏ้ข˜ใ€่ฟฝ้—ฎใ€ๆ’’ๅจ‡๏ผ‰ +- change-scene ้€‚ๅˆ็ฉบ้—ด/ๆ—ถ้—ด่ทณ่ทƒ๏ผˆๅ‡บ้—จใ€่ฝฌ่บซ็œ‹็ช—ๅค–ใ€็ฌฌไบŒๅคฉๆธ…ๆ™จ๏ผ‰ +- ไธ€ไธชๅœบๆ™ฏ่‡ณๅฐ‘่ฆๆœ‰ไธ€ไธช change-scene ๅ‡บๅฃ๏ผˆ้™ค้ž็œŸๅˆฐ็ป“ๅฑ€๏ผ‰ +- ๆฏไธช change-scene ๅฟ…้กปๅธฆ nextSceneSeed โ€”โ€” ไธ€ๅฅไธญๆ–‡็ฎ€่ฟฐใ€Œไธ‹ไธ€ๅœบๆ˜ฏๅ“ช้‡Œใ€่ฐๅœจใ€่ฆๅ‘็”Ÿไป€ไนˆใ€๏ผŒ็”จๆฅๅผ•ๅฏผไธ‹ไธ€ๆฌกๅฏผๆผ”่ฐƒ็”จ +- ๅŒไธ€ๅœบๆ™ฏ็š„ beat id ไบ’ไธ้‡ๅค +- next.nextBeatId ๅผ•็”จ็š„ beat ๅฟ…้กปๅญ˜ๅœจ +- choice ่‡ณๅฐ‘ 2 ไธช๏ผŒ่‡ณๅคš 4 ไธช๏ผŒไบ’ไธ้‡ๅค + +ๆ–‡ๆœฌ้ฃŽๆ 
ผ็บฆๆŸ๏ผš +- narration / line ็”จไธญๆ–‡๏ผŒscenePrompt ็”จ่‹ฑๆ–‡ +- ๅ•ไธช beat ็š„ narration ไธŽ line ๅŠ ่ตทๆฅ โ‰ค80 ๅญ— +- ๅ•ไธช choice label โ‰ค15 ๅญ— +- scenePrompt ๅชๆ่ฟฐ็”ป้ข้‡Œ็œ‹ๅˆฐไป€ไนˆ๏ผŒไธ่ฆๆ่ฟฐ UI ๅฟ…้กป่พ“ๅ‡บไธฅๆ ผ JSON๏ผŒ็ป“ๆž„ๅฆ‚ไธ‹๏ผš { - "narration": "ๆœฌๅธงๆ—็™ฝ๏ผˆๅฏ็ฉบๅญ—็ฌฆไธฒ๏ผ‰", - "speaker": "ๆœฌๅธง่ฏด่ฏ่ง’่‰ฒๅ๏ผˆๅฏ็ฉบ๏ผ‰", - "line": "ๆœฌๅธง่ง’่‰ฒๅฐ่ฏ๏ผˆๅฏ็ฉบ๏ผ‰", - "scenePrompt": "่‹ฑๆ–‡ๅœบๆ™ฏๆ่ฟฐ๏ผŒ็ป™ๅ›พๅƒๆจกๅž‹็”จ๏ผŒๆ่ฟฐ็”ป้ข้‡Œ็œ‹ๅˆฐไป€ไนˆ", - "uiElements": [ - { "id": "choice_1", "kind": "choice", "label": "้€‰้กนไธ€ๆ–‡ๅญ—๏ผˆโ‰ค15 ๅญ—๏ผ‰" }, - { "id": "choice_2", "kind": "choice", "label": "้€‰้กนไบŒๆ–‡ๅญ—๏ผˆโ‰ค15 ๅญ—๏ผ‰" }, - { "id": "choice_3", "kind": "choice", "label": "้€‰้กนไธ‰ๆ–‡ๅญ—๏ผˆโ‰ค15 ๅญ—๏ผ‰" } + "scenePrompt": "english scene description, no UI", + "entryBeatId": "b1", + "beats": [ + { + "id": "b1", + "narration": "ๅฏ็ฉบ", + "speaker": "ๅฏ็ฉบ", + "line": "ๅฏ็ฉบ", + "next": { "type": "continue", "nextBeatId": "b2" } + }, + { + "id": "b2", + "speaker": "...", + "line": "...", + "next": { + "type": "choice", + "choices": [ + { + "id": "c1", + "label": "็ปง็ปญ่ฟฝ้—ฎ", + "effect": { "kind": "advance-beat", "targetBeatId": "b3" } + }, + { + "id": "c2", + "label": "่ตท่บซ็ฆปๅผ€ๆ•™ๅฎค", + "effect": { "kind": "change-scene", "nextSceneSeed": "้›จๅŽๆนฟๆผ‰ๆผ‰็š„่ตฐๅปŠ๏ผŒๅฅน่ฟฝไบ†ๅ‡บๆฅ" } + } + ] + } + } ] } -่ง„ๅˆ™๏ผš -- narration / line ไธญๆ–‡๏ผŒscenePrompt ่‹ฑๆ–‡ -- ้ป˜่ฎค 3 ไธช choice ๅ…ƒ็ด ๏ผŒๅฏไปฅๆ นๆฎๆƒ…ๅขƒ้ขๅค–ๅŠ  menu/item/custom๏ผˆ็ฝ•่ง๏ผ‰ -- ้€‰้กนๅฟ…้กป่ƒฝๅˆ‡ๅฎžๆŽจ่ฟ›ๅ‰งๆƒ…๏ผŒไธ”ไบ’ไธ้‡ๅค -- scenePrompt ๆ่ฟฐๅฝ“ๅ‰็š„็”ป้ข๏ผŒไธ่ฆๅŒ…ๆ‹ฌ UI ๅ…ƒ็ด  -- ๅ•ๅธงๆ—็™ฝไธŽๅฐ่ฏๅŠ ่ตทๆฅๆŽงๅˆถๅœจ 80 ๅญ—ไปฅๅ†… -- ไธ่ฆ่พ“ๅ‡บ JSON ไปฅๅค–็š„ไปปไฝ•ๆ–‡ๆœฌ`; +ไธ่ฆ่พ“ๅ‡บ JSON ไปฅๅค–็š„ไปปไฝ•ๆ–‡ๆœฌใ€‚`; export function buildDirectorUserMessage(session: Session): string { const parts: string[] = []; @@ -30,38 +78,120 @@ export function buildDirectorUserMessage(session: Session): string { 
parts.push(`็”ป้ฃŽ๏ผš${session.styleGuide}`); if (session.history.length === 0) { - parts.push("\n่ฟ™ๆ˜ฏๆ•…ไบ‹็š„ๅผ€ๅœบใ€‚่ฏท็”Ÿๆˆๅผ€ๅœบ็”ป้ข๏ผŒไธฅๆ ผไปฅ JSON ๆ ผๅผ่ฟ”ๅ›žใ€‚"); + parts.push("\n่ฟ™ๆ˜ฏๆ•…ไบ‹็š„ๅผ€ๅœบใ€‚่ฏท็”Ÿๆˆ็ฌฌไธ€ไธชๅœบๆ™ฏ๏ผŒไธฅๆ ผไปฅ JSON ๆ ผๅผ่ฟ”ๅ›žใ€‚"); return parts.join("\n"); } - parts.push("\nๅކๅฒ๏ผš"); + parts.push("\nๅœบๆ™ฏๅކๅฒ๏ผˆๆŒ‰ๆ—ถ้—ด้กบๅบ๏ผ‰๏ผš"); session.history.forEach((entry, idx) => { - const f = entry.frame; - const beat: string[] = [`ใ€็ฌฌ ${idx + 1} ๅธงใ€‘`]; - if (f.narration) beat.push(`ๆ—็™ฝ๏ผš${f.narration}`); - if (f.line) beat.push(`${f.speaker ?? "?"}๏ผš${f.line}`); - if (entry.intent) { - beat.push( - `็”จๆˆท่กŒไธบ๏ผš${entry.intent.targetLabel ?? entry.intent.freeformAction ?? "ๆœช็Ÿฅ"}`, - ); + const lines: string[] = [`ใ€ๅœบๆ™ฏ ${idx + 1}ใ€‘`]; + lines.push(` scenePrompt: ${entry.scene.scenePrompt}`); + + const visited = entry.visitedBeatIds.length + ? entry.visitedBeatIds + : [entry.scene.entryBeatId]; + const beatById = new Map(entry.scene.beats.map((b) => [b.id, b])); + const visitedBeats = visited + .map((id) => beatById.get(id)) + .filter((b): b is NonNullable => Boolean(b)); + + for (const b of visitedBeats) { + const fragments: string[] = []; + if (b.narration) fragments.push(`ๆ—็™ฝ๏ผš${b.narration}`); + if (b.line) fragments.push(`${b.speaker ?? 
"?"}๏ผš${b.line}`); + if (fragments.length) lines.push(" " + fragments.join(" / ")); } - parts.push(beat.join("\n")); + + if (entry.exit) { + if (entry.exit.kind === "choice") { + lines.push( + ` ็Žฉๅฎถๆœ€็ปˆ้€‰ๆ‹ฉ๏ผš${entry.exit.label}๏ผˆๅŽปๅพ€๏ผš${entry.exit.nextSceneSeed}๏ผ‰`, + ); + } else { + lines.push(` ็Žฉๅฎถ่‡ช็”ฑๅŠจไฝœ๏ผš${entry.exit.action}`); + } + } + parts.push(lines.join("\n")); }); - parts.push("\n่ฏท็”Ÿๆˆไธ‹ไธ€ๅธง๏ผŒไธฅๆ ผไปฅ JSON ๆ ผๅผ่ฟ”ๅ›žใ€‚"); + const last = session.history.at(-1); + const lastExit = last?.exit; + if (lastExit) { + if (lastExit.kind === "choice") { + parts.push( + `\n่ฏทๅŸบไบŽใ€Œ็ŽฉๅฎถๅœจไธŠไธ€ๅœบ้€‰ๆ‹ฉไบ†๏ผš${lastExit.label}ใ€๏ผŒ็”Ÿๆˆไธ‹ไธ€ไธชๅœบๆ™ฏ๏ผˆๅ‚่€ƒ็งๅญ๏ผš${lastExit.nextSceneSeed}๏ผ‰ใ€‚`, + ); + } else { + parts.push( + `\n่ฏทๅŸบไบŽใ€Œ็Žฉๅฎถ่‡ช็”ฑๅŠจไฝœ๏ผš${lastExit.action}ใ€๏ผŒ็”Ÿๆˆไธ‹ไธ€ไธชๅœบๆ™ฏใ€‚`, + ); + } + } else { + parts.push("\n่ฏท็”Ÿๆˆไธ‹ไธ€ไธชๅœบๆ™ฏใ€‚"); + } + + parts.push("ไธฅๆ ผไปฅ JSON ๆ ผๅผ่ฟ”ๅ›žใ€‚"); return parts.join("\n"); } -export function buildImagePrompt( - frame: StoryFrame, - styleGuide: string, +// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +// Insert-Beat โ€” given a freeform vision action that is judged to stay +// *within* the current scene, generate one transient beat. 
+// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +export const INSERT_BEAT_SYSTEM = `ไฝ ๆ˜ฏ่ง†่ง‰ๅฐ่ฏด็ผ–ๅ‰งใ€‚็Žฉๅฎถๅœจๅฝ“ๅ‰ๅœบๆ™ฏๅ†…ๅšไบ†ไธ€ไธช**ไธไผšๆขๅœบๆ™ฏ็š„่‡ช็”ฑๅŠจไฝœ**๏ผˆๆฏ”ๅฆ‚็œ‹ไธ€็œผๆกŒไธŠ็š„็›ธๆก†ใ€ๆƒณไบ†ๆƒณๅˆšๆ‰้‚ฃๅฅ่ฏ๏ผ‰ใ€‚่ฏทๅŸบไบŽๆญคๅŠจไฝœ๏ผŒๅ†™ๅ‡บไธ€ไธช**ๅ•็‹ฌ็š„ใ€่ฟ‡ๆธกๆ€ง็š„ beat**๏ผšๅฏไปฅๆ˜ฏๆ—็™ฝใ€่ง’่‰ฒๅฐ่ฏใ€ๆˆ–ไธค่€…็ป“ๅˆใ€‚ + +ๆ–‡ๆœฌ้ฃŽๆ ผ็บฆๆŸ๏ผš +- narration / line ็”จไธญๆ–‡ +- narration ไธŽ line ๅŠ ่ตทๆฅ โ‰ค80 ๅญ— +- ไธ่ฆๆ‰“็ ดๅฝ“ๅ‰ๅœบๆ™ฏ็š„็‰ฉ็†็Šถๆ€๏ผˆ็ŽฉๅฎถไปๅœจๅŽŸๅœฐใ€ๅฏน้ขไปๆ˜ฏๅŒไธ€ไธช่ง’่‰ฒ๏ผ‰ +- ไธ่ฆ็”Ÿๆˆ้€‰้กนๆˆ–ไธ‹ไธ€ๆญฅๆŒ‡ๅผ• โ€”โ€” ็Žฉๅฎถ็‚นๅ‡ปไผš่‡ช็„ถๅ›žๅˆฐๅŽŸ beat + +ๅฟ…้กป่พ“ๅ‡บไธฅๆ ผ JSON๏ผš +{ + "narration": "...", + "speaker": "...", + "line": "..." +} + +ๅญ—ๆฎต้ƒฝๅฏไธบ็ฉบๅญ—็ฌฆไธฒใ€‚ไธ่ฆ่พ“ๅ‡บ JSON ไปฅๅค–็š„ไปปไฝ•ๆ–‡ๆœฌใ€‚`; + +export function buildInsertBeatUserMessage( + session: Session, + freeformAction: string, ): string { + const parts: string[] = []; + parts.push(`ไธ–็•Œ่ง‚๏ผš${session.worldSetting}`); + + const current = session.history.at(-1); + if (current) { + parts.push(`ๅฝ“ๅ‰ๅœบๆ™ฏ๏ผš${current.scene.scenePrompt}`); + const lastBeatId = current.visitedBeatIds.at(-1) ?? current.scene.entryBeatId; + const lastBeat = current.scene.beats.find((b) => b.id === lastBeatId); + if (lastBeat) { + const recent: string[] = []; + if (lastBeat.narration) recent.push(`ๆ—็™ฝ๏ผš${lastBeat.narration}`); + if (lastBeat.line) recent.push(`${lastBeat.speaker ?? 
"?"}๏ผš${lastBeat.line}`); + if (recent.length) parts.push(`ๅˆšๆ‰ๅ‘็”Ÿ๏ผš${recent.join(" / ")}`); + } + } + + parts.push(`\n็Žฉๅฎถๆญคๅˆป็š„่‡ช็”ฑๅŠจไฝœ๏ผš${freeformAction}`); + parts.push("\n่ฏท็”Ÿๆˆไธ€ไธช่ฟ‡ๆธกๆ€ง beat๏ผŒไธฅๆ ผไปฅ JSON ๆ ผๅผ่ฟ”ๅ›žใ€‚"); + return parts.join("\n"); +} + +// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +// Image renderer +// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +export function buildImagePrompt(scene: Scene, styleGuide: string): string { return `Generate a cinematic landscape background illustration, 16:9 widescreen (1792x1024). ART STYLE: ${styleGuide} SCENE (fill the ENTIRE canvas โ€” no UI elements, no text overlays): -${frame.scenePrompt} +${scene.scenePrompt} STRICT RULES โ€” NEVER violate these: - DO NOT draw any dialogue boxes, speech bubbles, text panels, or any rectangular overlay. @@ -74,25 +204,31 @@ STRICT RULES โ€” NEVER violate these: - Characters or key scene elements should be positioned in the upper 65% of the frame.`; } +// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +// Vision โ€” interprets a background click and classifies the action. 
+// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ -export const VISION_SYSTEM_PROMPT = `ไฝ ๆ˜ฏ่ง†่ง‰็†่งฃๅŠฉๆ‰‹ใ€‚็”จๆˆทๅœจ่ง†่ง‰ๅฐ่ฏด็•Œ้ขไธŠ็‚นๅ‡ปไบ†็บข่‰ฒๅœ†็‚นไฝ็ฝฎ๏ผŒไฝ ่ฆๆ นๆฎ็บข็‚นไฝ็ฝฎๅ’Œๅ›พไธญๅฏ่ง็š„ UI ๅ…ƒ็ด ๏ผŒๅˆคๆ–ญ็”จๆˆท็š„ๆ„ๅ›พใ€‚ +export const VISION_SYSTEM_PROMPT = `ไฝ ๆ˜ฏ่ง†่ง‰็†่งฃๅŠฉๆ‰‹ใ€‚็Žฉๅฎถๅœจ่ง†่ง‰ๅฐ่ฏด็š„่ƒŒๆ™ฏๅ›พไธŠ็‚นๅ‡ปไบ†็บข่‰ฒๅœ†็‚นไฝ็ฝฎ๏ผˆHTML ไธŠ็š„้€‰้กนๆŒ‰้’ฎไธไผš่ตฐๅˆฐไฝ ่ฟ™้‡Œ๏ผ‰ใ€‚ไฝ ็š„ไปปๅŠกๆ˜ฏ๏ผš +1. ็œ‹ๆธ…็บข็‚นๆŒ‡ๅ‘็”ป้ข้‡Œ็š„ไป€ไนˆ๏ผˆ็‰ฉไปถใ€่ง’่‰ฒใ€็ฉบ้—ดใ€่ฟœๅค„็š„ๆ–นๅ‘๏ผ‰ +2. ๆŽจๆ–ญ็Žฉๅฎถๆƒณๅนฒไป€ไนˆ +3. ๅˆคๆ–ญ่ฟ™ไธชๅŠจไฝœๆ˜ฏใ€Œๅœบๅ†…ๆŽข็ดขใ€๏ผˆไธ่ฏฅๆขๅ›พ๏ผ‰่ฟ˜ๆ˜ฏใ€Œๅœบๆ™ฏๅˆ‡ๆขใ€๏ผˆ่ฆๆขๅ›พ๏ผ‰ + +ๅˆคๆ–ญๅ‡†ๅˆ™๏ผš +- "insert-beat"๏ผˆๅœบๅ†…ๆŽข็ดข๏ผ‰๏ผš่ง‚ๅฏŸ็”ป้ข้‡ŒๆŸไธช็ป†่Š‚ใ€่‡ช่จ€่‡ช่ฏญใ€ๅ’Œๅฝ“ๅ‰่ง’่‰ฒ็ปง็ปญไบ’ๅŠจใ€็œ‹ไธ€็œผๆŸไธช็‰ฉไปถ +- "change-scene"๏ผˆๅœบๆ™ฏๅˆ‡ๆข๏ผ‰๏ผš่ตฐๅ‘็”ป้ขๆทฑๅค„็š„้—จ / ่ตฐๅปŠใ€่ฝฌๅคด็œ‹ๅ‘ๆ–ฐๆ–นๅ‘๏ผˆ่ง†่ง’ๅ˜ไบ†๏ผ‰ใ€็‚นไบ†่ฟœๅค„็š„ๅฆไธ€ไธช็ฉบ้—ดใ€ๆš—็คบๆ—ถ้—ด่ทณ่ทƒ็š„็‰ฉไปถ๏ผˆๅฆ‚ๆ—ถ้’Ÿ๏ผ‰ ๅฟ…้กป่พ“ๅ‡บไธฅๆ ผ JSON๏ผš { - "targetId": "ๅฏนๅบ”็š„ UI ๅ…ƒ็ด  id๏ผˆchoice_1 / choice_2 / choice_3 / menu / ...๏ผ‰๏ผŒๅฆ‚ๆžœ็‚นๅ‡ป็š„ๆ˜ฏ้ž UI ๅŒบๅŸŸๅˆ™ไธบ null", - "targetLabel": "ๅฏนๅบ” UI ๅ…ƒ็ด ็š„ๆ–‡ๅญ—ๆ่ฟฐ๏ผˆๅฆ‚ 'ๅ‘Š่ฏ‰ๅฅน็œŸ็›ธ'๏ผ‰๏ผŒๆœช็Ÿฅๅˆ™ไธบ null", - "reasoning": "ไธ€ๅฅ่ฏ่ฏดๆ˜Žๅˆคๆ–ญ็†็”ฑ", - "freeformAction": "ๅฆ‚ๆžœ็”จๆˆท็‚น็š„ๆ˜ฏๅœบๆ™ฏไธญ็š„็‰ฉไปถ/่ง’่‰ฒ็ญ‰้ž้€‰้กนๅŒบๅŸŸ๏ผŒๆ่ฟฐไป–ๅฏ่ƒฝ็š„ๆ„ๅ›พ๏ผˆๅฆ‚ 'ๆƒณๆ‹ฟ่ตทๆกŒไธŠ็š„้’ฅๅŒ™'๏ผ‰๏ผŒๅฆๅˆ™็ฉบๅญ—็ฌฆไธฒ" + "freeformAction": "็Žฉๅฎถๆƒณๅšไป€ไนˆ็š„ไธ€ๅฅไธญๆ–‡ๆ่ฟฐ๏ผŒไพ‹ๅฆ‚ใ€Œๆƒณๆ‹ฟ่ตทๆกŒไธŠ็š„้’ฅๅŒ™ใ€", + "classify": "insert-beat" ๆˆ– "change-scene", + "reasoning": "ไธ€ๅฅ่ฏ่ฏดๆ˜Žๅˆคๆ–ญ็†็”ฑ" } ไธ่ฆ่พ“ๅ‡บ JSON ไปฅๅค–็š„ไปปไฝ•ๆ–‡ๆœฌใ€‚`; -export function buildVisionUserPrompt(uiElements: UIElement[]): string { - const list = 
uiElements - .map((e) => `- id="${e.id}" kind="${e.kind}" label="${e.label}"`) - .join("\n"); - return `ๅฝ“ๅ‰็”ป้ขๅŒ…ๅซไปฅไธ‹ๅทฒ็Ÿฅ UI ๅ…ƒ็ด ๏ผš -${list} +export function buildVisionUserPrompt(scene: Scene | null): string { + if (!scene) return "่ฏทๅˆคๆ–ญ็Žฉๅฎถๆ„ๅ›พ๏ผŒๅนถไปฅ JSON ๆ ผๅผ่ฟ”ๅ›žใ€‚"; + return `ๅฝ“ๅ‰ๅœบๆ™ฏๆ่ฟฐ๏ผš${scene.scenePrompt} -็บข็‚นไฝ็ฝฎๅณไธบ็”จๆˆท็‚นๅ‡ปไฝ็ฝฎใ€‚่ฏทๅˆคๆ–ญ็”จๆˆท็š„ๆ„ๅ›พ๏ผŒๅนถไปฅ JSON ๆ ผๅผ่ฟ”ๅ›ž็ป“ๆžœใ€‚`; +็บข็‚นไฝ็ฝฎๅณไธบ็Žฉๅฎถ็‚นๅ‡ปไฝ็ฝฎใ€‚่ฏทๅˆคๆ–ญ็Žฉๅฎถๆ„ๅ›พไธŽๅˆ†็ฑป๏ผŒไปฅ JSON ๆ ผๅผ่ฟ”ๅ›žใ€‚`; } diff --git a/packages/engine/src/renderer.ts b/packages/engine/src/renderer.ts index c395266..27f4292 100644 --- a/packages/engine/src/renderer.ts +++ b/packages/engine/src/renderer.ts @@ -1,12 +1,12 @@ import { generateImage } from "@yume/ai-client"; -import type { ProviderConfig, StoryFrame } from "@yume/types"; +import type { ProviderConfig, Scene } from "@yume/types"; import { buildImagePrompt } from "./prompts"; export async function render( config: ProviderConfig, - frame: StoryFrame, + scene: Scene, styleGuide: string, ): Promise { - const prompt = buildImagePrompt(frame, styleGuide); + const prompt = buildImagePrompt(scene, styleGuide); return generateImage(config, prompt); } diff --git a/packages/engine/src/vision.ts b/packages/engine/src/vision.ts index c56c365..dab7da6 100644 --- a/packages/engine/src/vision.ts +++ b/packages/engine/src/vision.ts @@ -1,26 +1,39 @@ import { interpretClick } from "@yume/ai-client"; -import type { ClickIntent, ProviderConfig, UIElement } from "@yume/types"; +import type { + ClickIntent, + ProviderConfig, + Scene, + VisionClassify, +} from "@yume/types"; import { parseJsonLoose } from "./jsonParser"; import { VISION_SYSTEM_PROMPT, buildVisionUserPrompt } from "./prompts"; +export type VisionInterpretation = { + intent: ClickIntent; + classify: VisionClassify; +}; + export async function interpret( config: ProviderConfig, annotatedImageBase64: string, - uiElements: 
UIElement[], -): Promise { - const userPrompt = `${VISION_SYSTEM_PROMPT}\n\n${buildVisionUserPrompt(uiElements)}`; + scene: Scene | null, +): Promise { + const userPrompt = `${VISION_SYSTEM_PROMPT}\n\n${buildVisionUserPrompt(scene)}`; const raw = await interpretClick(config, annotatedImageBase64, userPrompt); const parsed = parseJsonLoose<{ - targetId?: string | null; - targetLabel?: string | null; - reasoning?: string; freeformAction?: string; + classify?: string; + reasoning?: string; }>(raw); + const classify: VisionClassify = + parsed.classify === "change-scene" ? "change-scene" : "insert-beat"; + return { - targetId: parsed.targetId ?? null, - targetLabel: parsed.targetLabel ?? null, - reasoning: parsed.reasoning ?? "", - freeformAction: parsed.freeformAction || undefined, + intent: { + freeformAction: parsed.freeformAction?.trim() || "็Žฉๅฎถ็‚นไบ†็”ป้ข๏ผŒไฝ†ๆ„ๅ›พไธๆ˜Ž", + reasoning: parsed.reasoning?.trim() || "", + }, + classify, }; } diff --git a/packages/types/src/index.ts b/packages/types/src/index.ts index f021148..da4a9ad 100644 --- a/packages/types/src/index.ts +++ b/packages/types/src/index.ts @@ -1,42 +1,86 @@ -export type UIElementKind = "choice" | "menu" | "item" | "custom"; +// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +// Beat โ€” one dialogue / narration moment within a Scene. +// Multiple beats share the same background image; tapping or choosing +// advances among them WITHOUT regenerating the image. 
+// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ -export type UIElement = { - id: string; - kind: UIElementKind; - label: string; - hint?: string; -}; - -export type StoryFrame = { +export type Beat = { id: string; narration?: string; speaker?: string; line?: string; + next: BeatNext; +}; + +export type BeatNext = + | { type: "continue"; nextBeatId: string } + | { type: "choice"; choices: BeatChoice[] }; + +export type BeatChoice = { + id: string; + label: string; + effect: BeatChoiceEffect; +}; + +export type BeatChoiceEffect = + | { kind: "advance-beat"; targetBeatId: string } + | { kind: "change-scene"; nextSceneSeed: string }; + +// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +// Scene โ€” one background image + a graph of beats. +// The Director emits an entire Scene per call; the player navigates +// through its beats locally with zero network until exiting. 
+// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +export type Scene = { + id: string; scenePrompt: string; - uiElements: UIElement[]; + beats: Beat[]; + entryBeatId: string; }; -export type ClickIntent = { - targetId: string | null; - targetLabel: string | null; - reasoning: string; - freeformAction?: string; +export type SceneExit = + | { + kind: "choice"; + choiceId: string; + label: string; + nextSceneSeed: string; + } + | { kind: "freeform"; action: string }; + +export type SceneHistoryEntry = { + scene: Scene; + visitedBeatIds: string[]; + exit?: SceneExit; }; -export type HistoryEntry = { - frame: StoryFrame; - click?: { x: number; y: number }; - intent?: ClickIntent; -}; +// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +// Session +// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ export type Session = { id: string; createdAt: number; worldSetting: string; styleGuide: string; - history: HistoryEntry[]; + history: SceneHistoryEntry[]; }; +// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +// Vision +// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +export type ClickIntent = { + freeformAction: string; + reasoning: string; +}; + +export type 
VisionClassify = "insert-beat" | "change-scene"; + +// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +// Provider config +// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + export type ProviderConfig = { baseUrl: string; apiKey: string; @@ -49,6 +93,10 @@ export type EngineConfig = { vision: ProviderConfig; }; +// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +// API contracts +// โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + export type StartRequest = { worldSetting: string; styleGuide: string; @@ -56,10 +104,25 @@ export type StartRequest = { export type StartResponse = { sessionId: string; - frame: StoryFrame; + scene: Scene; imageBase64: string; }; +// /api/scene โ€” generates the next Scene, given session whose latest +// history entry has `exit` set. Also used for prefetch speculation +// (frontend synthesizes a speculative exit). +export type SceneRequest = { + session: Session; +}; + +export type SceneResponse = { + scene: Scene; + imageBase64: string; +}; + +// /api/vision โ€” interprets a background click on the current image and +// classifies whether it should insert a beat (in-scene exploration) or +// trigger a scene change. 
export type VisionRequest = { session: Session; prevImageBase64: string; @@ -68,17 +131,20 @@ export type VisionRequest = { export type VisionResponse = { intent: ClickIntent; + classify: VisionClassify; }; -export type InteractRequest = { +// /api/insert-beat — generates a single transient beat in response to +// a freeform vision action. Does NOT regenerate the image. +export type InsertBeatRequest = { session: Session; - intent: ClickIntent; - click?: { x: number; y: number }; + freeformAction: string; }; -export type InteractResponse = { - session: Session; - frame: StoryFrame; - imageBase64: string; - intent: ClickIntent; +export type InsertBeatResponse = { + partial: { + narration?: string; + speaker?: string; + line?: string; + }; };