refactor: flatten monorepo to single web package (#12)

Flatten the pnpm monorepo (apps/web + packages/*) into a single web package at the repo root.

- Move app/lib/components/scripts/public to root; drop apps/web and packages/* wrappers
- Rewrite tsconfig paths (@infiplot/*) to ./lib/*; turbopack.root = __dirname
- Update Vercel (no root-directory) and Cloudflare (pnpm build:cf at root) deploy paths
- Regenerate pnpm-lock.yaml to drop stale workspace importers
- Bump engines.node to >=22 to match wrangler

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Zonghao Yuan
2026-06-03 00:55:45 +08:00
committed by GitHub
parent 9543c3dba1
commit dc5ecd60f6
221 changed files with 241 additions and 379 deletions
+90
View File
@@ -0,0 +1,90 @@
import { chat } from "@infiplot/ai-client";
import type { ProviderConfig, Session, StoryState } from "@infiplot/types";
import { parseJsonLoose } from "../jsonParser";
import { ARCHITECT_SYSTEM, buildArchitectUserMessage } from "../prompts";
// ──────────────────────────────────────────────────────────────────────
// Architect agent — ONE LLM call at session start.
//
// Expands the user's (often terse) world + style prompt into a real story
// bible: a second-person protagonist with a want and a flaw, a single
// central dramatic question (logline), a genre frame that anchors the
// 爽点 rhythm, an engineered cold-open for scene 1 (nextHook), and a small
// intentional cast. Seeds the StoryState that the Writer reads and updates
// every scene — so the story has a spine from beat one instead of being
// improvised cold.
//
// Everything is best-effort coerced with fallbacks: a malformed LLM
// response can never abort session start — worst case the Writer just gets
// a thinner bible and improvises more.
// ──────────────────────────────────────────────────────────────────────
// Untrusted shape of the Architect's JSON reply. Every field is `unknown`
// because the model may omit it or return the wrong type; runArchitect runs
// each one through str() / strArray() before it reaches the StoryState.
type RawStoryState = {
logline?: unknown;
genreTags?: unknown;
protagonist?: unknown;
castNotes?: unknown;
synopsis?: unknown;
openThreads?: unknown;
relationships?: unknown;
nextHook?: unknown;
};
// Coerces an untrusted value to a trimmed string. Anything that is not a
// string (undefined, numbers, objects, ...) collapses to "".
function str(raw: unknown): string {
  if (typeof raw !== "string") {
    return "";
  }
  return raw.trim();
}
// Coerces an untrusted value to a list of trimmed, non-empty strings.
// Returns undefined when the input is not an array or nothing usable remains.
function strArray(raw: unknown): string[] | undefined {
  if (!Array.isArray(raw)) {
    return undefined;
  }
  const cleaned: string[] = [];
  for (const item of raw) {
    if (typeof item !== "string") continue;
    const text = item.trim();
    if (text.length > 0) cleaned.push(text);
  }
  return cleaned.length === 0 ? undefined : cleaned;
}
// Runs the Architect: a single chat() call whose JSON reply is coerced
// field-by-field into the session's initial StoryState. Never rejects on a
// chat()/parse failure — the catch branch returns a minimal bible instead.
export async function runArchitect(
  config: ProviderConfig,
  session: Session,
): Promise<StoryState> {
  // Shared by the per-field fallbacks and by the failure path.
  const defaultProtagonist =
    "你是这个故事的主角(第二人称视角,永不出现在画面里)。";
  const defaultSynopsis = "故事即将开始。";

  try {
    const reply = await chat(
      config,
      [
        { role: "system", content: ARCHITECT_SYSTEM },
        { role: "user", content: buildArchitectUserMessage(session) },
      ],
      { temperature: 0.85, responseFormat: "json_object" },
    );
    const bible = parseJsonLoose<RawStoryState>(reply);

    // Stable spine: empty fields fall back to the raw prompt / canned text so
    // the bible is never wholly empty.
    const logline = str(bible.logline);
    const genreTags = str(bible.genreTags);
    const protagonist = str(bible.protagonist);
    const castNotes = str(bible.castNotes);
    // Volatile seeds: the opening Writer rewrites these through its patch.
    const synopsis = str(bible.synopsis);
    const openThreads = strArray(bible.openThreads);
    const relationships = strArray(bible.relationships);
    const nextHook = str(bible.nextHook);

    return {
      logline: logline !== "" ? logline : session.worldSetting,
      genreTags,
      protagonist: protagonist !== "" ? protagonist : defaultProtagonist,
      castNotes: castNotes !== "" ? castNotes : undefined,
      synopsis: synopsis !== "" ? synopsis : defaultSynopsis,
      openThreads,
      relationships,
      nextHook: nextHook !== "" ? nextHook : undefined,
    };
  } catch (err) {
    // Network errors and unrepairable JSON both land here; log and continue
    // with a bible seeded only from the user's world prompt.
    const reason = err instanceof Error ? err.message : String(err);
    console.error(`[architect] failed, using minimal bible: ${reason}`);
    return {
      logline: session.worldSetting,
      genreTags: "",
      protagonist: defaultProtagonist,
      synopsis: defaultSynopsis,
    };
  }
}
+155
View File
@@ -0,0 +1,155 @@
import { chat, generateImage } from "@infiplot/ai-client";
import { provisionVoice } from "@infiplot/tts-client";
import type {
Character,
CharacterVoice,
EngineConfig,
Session,
} from "@infiplot/types";
import { parseJsonLoose } from "../jsonParser";
import { mockImageDataUri } from "../mockImage";
import {
CHARACTER_DESIGNER_SYSTEM,
buildCharacterDesignerUserMessage,
buildCharacterPortraitPrompt,
} from "../prompts";
// ──────────────────────────────────────────────────────────────────────
// CharacterDesigner agent — designs ONE new character.
//
// Exposed as three GRANULAR stages so the director can schedule the slow
// parts around the Painter (a voice is never needed to paint a scene, and
// only entry-beat characters' portraits are referenced by the Painter):
//
// 1. designCharacterCard — ONE LLM call → visual + voice TEXT cards
// (intentional bundling: the same agent thinks about who this character
// IS, keeping appearance and vocal personality coherent)
// 2. renderCharacterPortrait — base portrait image (Runware URL + UUID)
// 3. provisionCharacterVoice — Xiaomi MiMo voicedesign → reference audio
//
// Each step degrades gracefully — if image gen fails the character just has
// no portrait; if voice gen fails it has no voice. The game keeps running.
// ──────────────────────────────────────────────────────────────────────
// Shape the design LLM is asked to return; used as the type argument to
// parseJsonLoose in runDesignLLM. Both fields are optional because the model
// may omit either one.
type CharacterDesignOutput = {
visualDescription?: string;
voiceDescription?: string;
};
// TEMP: per-phase latency logging (same convention as the orchestrator's
// tlog). Prints the milliseconds elapsed since `t0`; remove once real-world
// numbers have been collected.
function tlog(label: string, t0: number): void {
  const elapsedMs = Date.now() - t0;
  console.log(`${label}: ${elapsedMs}ms`);
}
// Sends the character-designer prompt for `charName` to the text model and
// returns the loosely-parsed JSON reply. Errors from chat() or
// parseJsonLoose() propagate to the caller.
async function runDesignLLM(
  config: EngineConfig,
  session: Session,
  charName: string,
): Promise<CharacterDesignOutput> {
  const systemTurn = {
    role: "system" as const,
    content: CHARACTER_DESIGNER_SYSTEM,
  };
  const reply = await chat(
    config.text,
    [
      systemTurn,
      {
        role: "user",
        content: buildCharacterDesignerUserMessage(charName, session),
      },
    ],
    { temperature: 0.7, responseFormat: "json_object" },
  );
  return parseJsonLoose<CharacterDesignOutput>(reply);
}
// Renders the per-character base portrait — a "concept sheet" (single
// character, neutral pose, plain background) meant to serve as a Runware
// referenceImages anchor for later scenes.
//
// Resolves to the image URL plus its UUID, both taken from the single
// generateImage() result. In mock mode only basePortraitUrl is set (the mock
// data URI); the Painter is short-circuited in that mode, so the missing
// UUID is moot. Any failure is logged and yields `{}` — the character simply
// has no portrait.
export async function renderCharacterPortrait(
  config: EngineConfig,
  charName: string,
  visualDescription: string,
  styleGuide: string,
): Promise<{ basePortraitUrl?: string; basePortraitUuid?: string }> {
  try {
    if (!config.mockImage) {
      const portraitPrompt = buildCharacterPortraitPrompt(
        charName,
        visualDescription,
        styleGuide,
      );
      const generated = await generateImage(config.image, portraitPrompt);
      return {
        basePortraitUrl: generated.imageUrl,
        basePortraitUuid: generated.imageUuid,
      };
    }
    const placeholder = await mockImageDataUri();
    return { basePortraitUrl: placeholder };
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    console.error(`[characterDesigner] portrait gen failed for ${charName}: ${reason}`);
    return {}; // degrade gracefully: no portrait at all
  }
}
// Provisions a voice from a textual voice description. Resolves to undefined
// when no TTS provider is configured or when provisioning fails (the failure
// is logged with the character's name); it never rejects.
export async function provisionCharacterVoice(
  config: EngineConfig,
  voiceDescription: string,
  charName: string,
): Promise<CharacterVoice | undefined> {
  if (config.tts) {
    try {
      return await provisionVoice(config.tts, voiceDescription);
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      console.error(`[characterDesigner] voice provision failed for ${charName}: ${reason}`);
    }
  }
  return undefined;
}
// The cheap first stage: design the visual + voice TEXT cards in one LLM
// call. The director then schedules renderCharacterPortrait /
// provisionCharacterVoice around the Painter. Multiple new characters in the
// same scene run this stage in parallel at the director level.
//
// CharacterCard is the result of that stage. `visualDescription` is optional
// (the model may not supply one); `voiceDescription` is always present
// because designCharacterCard substitutes a name-based prompt when the model
// returns none.
export type CharacterCard = {
name: string;
visualDescription?: string;
voiceDescription: string;
};
// Designs the text card (appearance + voice description) for one character
// with a single LLM call and logs how long the call took.
//
// The reply is unvalidated model output, so each field is type-checked before
// use: a null/non-object reply or a non-string field is treated as "missing"
// rather than crashing on `.trim()`. A missing voice description falls back
// to a prompt derived from the character name and the world setting.
// Errors thrown by the LLM call itself still propagate to the caller.
export async function designCharacterCard(
  config: EngineConfig,
  session: Session,
  charName: string,
): Promise<CharacterCard> {
  const tDesign = Date.now();
  const design: unknown = await runDesignLLM(config, session, charName);
  tlog(`[charDesigner ${charName}] design LLM`, tDesign);

  const fields: Record<string, unknown> =
    design !== null && typeof design === "object"
      ? (design as Record<string, unknown>)
      : {};
  // Trimmed text, or undefined for anything that is not a non-blank string.
  const clean = (value: unknown): string | undefined => {
    if (typeof value !== "string") return undefined;
    const trimmed = value.trim();
    return trimmed.length > 0 ? trimmed : undefined;
  };

  return {
    name: charName,
    visualDescription: clean(fields.visualDescription),
    voiceDescription:
      clean(fields.voiceDescription) ??
      `请根据角色名「${charName}」推断其性别、年龄与气质,生成最贴合的音色。所属世界观:${session.worldSetting}`,
  };
}
// Voice-only provisioning for a character the LLM mentioned (e.g. as a
// speaker) without us having designed them — the name was not in
// `activeCharacters`. Used by the directInsertBeat path and as a safety net
// in directScene. No portrait is generated: the result carries just the
// name, a name-derived voice description, and whatever voice
// provisionCharacterVoice managed to produce (possibly undefined).
export async function provisionVoiceForName(
  config: EngineConfig,
  session: Session,
  charName: string,
): Promise<Character> {
  const voiceDescription = `请根据角色名「${charName}」推断其性别、年龄与气质,生成最贴合的音色。所属世界观:${session.worldSetting}`;
  return {
    name: charName,
    voiceDescription,
    voice: await provisionCharacterVoice(config, voiceDescription, charName),
  };
}
+86
View File
@@ -0,0 +1,86 @@
import { chat } from "@infiplot/ai-client";
import type { BeatActiveCharacter, ProviderConfig } from "@infiplot/types";
import { parseJsonLoose } from "../jsonParser";
import {
CINEMATOGRAPHER_SYSTEM,
buildCinematographerUserMessage,
} from "../prompts";
// ──────────────────────────────────────────────────────────────────────
// Cinematographer agent — translates the Writer's narrative scene
// summary into an English compositional prompt for FLUX.
//
// Reads: sceneSummary + entry beat's activeCharacters (poses)
// + prior sceneKey (for continuity hints)
// Writes: { shotType, integratedPrompt }
//
// Does NOT describe character APPEARANCE — that's appended at the
// Painter stage from session.characters[].visualDescription. The
// Cinematographer only positions named characters in the frame and
// describes the environment + lighting + camera framing.
//
// This separation lets the Cinematographer run IN PARALLEL with the
// CharacterDesigner — neither needs the other's output. They both
// feed independently into the Painter prompt.
// ──────────────────────────────────────────────────────────────────────
// Validated result returned by runCinematographer: both fields are always
// non-empty because runCinematographer fills in fallbacks.
export type CinematographerOutput = {
shotType: string;
integratedPrompt: string;
};
// What the LLM may actually send back — either field can be missing.
type RawCinematographerOutput = {
shotType?: string;
integratedPrompt?: string;
};
// Everything runCinematographer forwards to buildCinematographerUserMessage.
export type CinematographerInput = {
sceneSummary: string;
styleGuide: string;
entryBeatActive: BeatActiveCharacter[];
/** Entry beat's speaker — drives the dynamic camera policy:
* NPC name → NPC looks toward camera (close-up)
* "你" → medium shot, NPC listens
* undefined → wide establishing shot */
entryBeatSpeaker?: string;
priorSceneKey?: string;
currentSceneKey?: string;
};
// Asks the Cinematographer LLM for a shot type and an English compositional
// prompt for the scene, then coerces the reply.
//
// The reply is unvalidated model output: a null/non-object reply or a
// non-string field is treated as missing so the fallbacks below actually
// apply instead of `.trim()` throwing. Errors from chat() or
// parseJsonLoose() themselves still propagate.
export async function runCinematographer(
  config: ProviderConfig,
  input: CinematographerInput,
): Promise<CinematographerOutput> {
  const raw = await chat(
    config,
    [
      { role: "system", content: CINEMATOGRAPHER_SYSTEM },
      {
        role: "user",
        content: buildCinematographerUserMessage(
          input.sceneSummary,
          input.styleGuide,
          input.entryBeatActive,
          input.entryBeatSpeaker,
          input.priorSceneKey,
          input.currentSceneKey,
        ),
      },
    ],
    { temperature: 0.6, responseFormat: "json_object" },
  );
  const parsed: unknown = parseJsonLoose<RawCinematographerOutput>(raw);
  const fields: Record<string, unknown> =
    parsed !== null && typeof parsed === "object"
      ? (parsed as Record<string, unknown>)
      : {};
  const text = (value: unknown): string =>
    typeof value === "string" ? value.trim() : "";

  // Fallback: if the LLM produced nothing usable, synthesize a minimal
  // integratedPrompt from the Writer's sceneSummary so the Painter has
  // SOMETHING to work with rather than blowing up the whole pipeline.
  const integratedPrompt =
    text(fields.integratedPrompt) ||
    `A cinematic illustration depicting: ${input.sceneSummary}. Wide establishing shot, natural lighting, atmospheric mood.`;
  return {
    shotType: text(fields.shotType) || "medium shot",
    integratedPrompt,
  };
}
+163
View File
@@ -0,0 +1,163 @@
import { generateImage } from "@infiplot/ai-client";
import type { GenerateImageOptions, GenerateImageResult } from "@infiplot/ai-client";
import type {
Beat,
Character,
EngineConfig,
ProviderConfig,
} from "@infiplot/types";
import { mockImageDataUri } from "../mockImage";
import { buildPainterPrompt } from "../prompts";
// ──────────────────────────────────────────────────────────────────────
// Painter — final image generation with multi-reference anchoring.
//
// FLUX.2 [klein] 9B KV does NOT support seedImage (img2img). Instead,
// visual continuity comes entirely from `referenceImages` (capped at 4),
// which the KV-optimized variant accelerates ~2.5× via key-value caching
// of reference latents.
//
// References are slotted in priority order (max 4):
// 1. Prior scene image — when sceneKey matched a previous scene, this
// anchors the same physical space (lighting/layout/style continuity)
// 2. Entry beat's speaker portrait — the NPC the player is talking with
// (most visually prominent)
// 3. Other on-stage NPCs' portraits — secondary characters in the frame
//
// References are sent as URLs (preferred — Runware's own CDN links, which the
// `referenceImages` pipeline can always fetch) or UUIDs (backstop when a URL
// is missing). Base64 fallback was removed when generateImage switched to
// outputType=URL, which always returns both a UUID and a URL so we never lack
// a cheap reference handle.
//
// Failure handling — two-tier degradation:
// A. referenceImages call (preferred — full visual anchoring)
// B. pure text-to-image fallback (last resort if Runware refs API errors)
// ──────────────────────────────────────────────────────────────────────
// Hard cap on `referenceImages` entries sent with one Painter request.
const MAX_REFERENCE_IMAGES = 4;
export type PainterInput = {
  integratedPrompt: string;
  styleGuide: string;
  onStageCharacters: Character[];
  /**
   * Prior scene's Runware UUID or URL. When set (= sceneKey hit a prior
   * scene), it slots into referenceImages[0] for spatial continuity.
   * Capacity-wise this displaces ONE character portrait — slot is shared
   * with character refs, capped at 4 total per Runware spec.
   */
  priorSceneImage?: string;
};
// Pick the references we send to Runware as `referenceImages`. Priority:
//   slot 0:  priorSceneImage (if any — sceneKey continuity)
//   slot 1:  entry beat's speaker portrait (the NPC speaking to the player)
//   slot 2+: other on-stage NPCs from entry beat's activeCharacters
// Caps at MAX_REFERENCE_IMAGES total. Returns the array exactly as it'll be
// sent — already truncated and deduplicated, both by character name and by
// handle value, so the same image can never occupy two slots.
//
// Per character the portrait URL is preferred over the UUID: Runware's
// `imageInference` returns a UUID, but that UUID isn't always recognized by
// the `referenceImages` pipeline (the error surfaces as
// `failedToTransferImage`), whereas the URL is Runware's own CDN link. The
// UUID is the backstop whenever the URL is missing or empty.
export function collectReferenceImages(
  characters: Character[],
  entryBeat: Beat | undefined,
  priorSceneImage: string | undefined,
): string[] {
  const refs: string[] = [];
  const seenNames = new Set<string>();

  // Appends a handle unless it is empty, already queued, or the cap is hit.
  const pushRef = (ref: string | undefined): boolean => {
    if (!ref || refs.length >= MAX_REFERENCE_IMAGES || refs.includes(ref)) {
      return false;
    }
    refs.push(ref);
    return true;
  };
  // `||` (not `??`) so an empty-string URL still falls back to the UUID.
  const portraitOf = (name: string): string | undefined => {
    const character = characters.find((c) => c.name === name);
    return character?.basePortraitUrl || character?.basePortraitUuid || undefined;
  };

  // Slot 0 — prior scene image. Goes first because backdrop drift is the most
  // jarring discontinuity across same-sceneKey scenes.
  pushRef(priorSceneImage);

  // Slot 1 — the entry beat's speaker.
  const speakerName = entryBeat?.speaker;
  if (speakerName && pushRef(portraitOf(speakerName))) {
    seenNames.add(speakerName);
  }

  // Remaining slots — other on-stage characters, in declaration order.
  for (const active of entryBeat?.activeCharacters ?? []) {
    if (refs.length >= MAX_REFERENCE_IMAGES) break;
    if (seenNames.has(active.name)) continue;
    if (pushRef(portraitOf(active.name))) {
      seenNames.add(active.name);
    }
  }
  return refs;
}
// Calls generateImage() and converts a failure into `null` (after logging a
// warning tagged with `label`) so the caller can fall through to its next
// tier instead of handling an exception.
async function tryGenerate(
  config: ProviderConfig,
  prompt: string,
  options: GenerateImageOptions,
  label: string,
): Promise<GenerateImageResult | null> {
  let result: GenerateImageResult | null = null;
  try {
    result = await generateImage(config, prompt, options);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    console.warn(`[painter] ${label} failed: ${reason}`);
  }
  return result;
}
// Result of runPainter, discriminated on `kind`:
// - "real": an image produced by generateImage, with its URL and UUID
// - "mock": the placeholder from mockImageDataUri (config.mockImage set); no UUID
export type PainterResult =
| { kind: "real"; imageUrl: string; imageUuid: string }
| { kind: "mock"; imageUrl: string };
// Produces the scene image.
//   mock mode — returns the placeholder data URI without calling the provider.
//   Tier A    — generateImage with referenceImages (prior scene image +
//               character portraits); a failure is swallowed by tryGenerate.
//   Tier B    — pure text-to-image, used when Tier A failed or there were no
//               references to send. Errors here propagate to the caller.
export async function runPainter(
  config: EngineConfig,
  input: PainterInput,
  entryBeat: Beat | undefined,
): Promise<PainterResult> {
  if (config.mockImage) {
    const placeholder = await mockImageDataUri();
    return { kind: "mock", imageUrl: placeholder };
  }

  const { integratedPrompt, styleGuide, onStageCharacters, priorSceneImage } =
    input;
  const prompt = buildPainterPrompt(
    integratedPrompt,
    styleGuide,
    onStageCharacters,
  );
  const refs = collectReferenceImages(
    onStageCharacters,
    entryBeat,
    priorSceneImage,
  );

  // Tier A is attempted only when there is at least one reference.
  const anchored =
    refs.length > 0
      ? await tryGenerate(
          config.image,
          prompt,
          { referenceImages: refs },
          `referenceImages (${refs.length})`,
        )
      : null;

  // Tier B runs only if Tier A was skipped or came back empty.
  const image = anchored ? anchored : await generateImage(config.image, prompt);
  return { kind: "real", imageUrl: image.imageUrl, imageUuid: image.imageUuid };
}
+425
View File
@@ -0,0 +1,425 @@
import { chat } from "@infiplot/ai-client";
import type {
Beat,
BeatActiveCharacter,
BeatChoice,
BeatChoiceEffect,
BeatNext,
ProviderConfig,
Session,
StoryStatePatch,
} from "@infiplot/types";
import { parseJsonLoose } from "../jsonParser";
import { WRITER_SYSTEM, buildWriterUserMessage } from "../prompts";
// ──────────────────────────────────────────────────────────────────────
// Writer agent — owns the narrative half of scene generation.
//
// Output: { sceneSummary, sceneKey, entryBeatId, beats[] }
// Each beat carries activeCharacters[] (names + poses) the
// Cinematographer reads when composing the establishing shot.
//
// Character DESIGN (visual + voice) is NOT this agent's job —
// it only names characters; the CharacterDesigner picks up any
// unknown name from beats[].activeCharacters.
// ──────────────────────────────────────────────────────────────────────
// Validated result of runWriter. `beats` is never empty (runWriter throws
// otherwise) and `entryBeatId` always names one of the returned beats.
// `sceneKey` is the normalized slug, or undefined when nothing usable was
// supplied.
export type WriterOutput = {
sceneSummary: string;
sceneKey?: string;
entryBeatId: string;
beats: Beat[];
/** Rewritten volatile story memory — merged onto the carried StoryState by
* the director. Absent when the model omitted it (rare; bible just stales). */
storyStatePatch?: StoryStatePatch;
};
// Raw shapes — what the LLM produces before validation / coercion.
// Every field is optional: the coerce* helpers below supply defaults for
// anything the model left out. The declared `string` types describe the
// REQUESTED format only; nothing checks them at parse time.

// One on-stage character entry inside a beat.
type RawActiveCharacter = {
name?: string;
pose?: string;
};
// Effect of picking a choice; coerceEffect maps it to advance-beat or
// change-scene.
type RawEffect = {
kind?: string;
targetBeatId?: string;
nextSceneSeed?: string;
};
// One selectable option offered by a beat.
type RawChoice = {
id?: string;
label?: string;
effect?: RawEffect;
};
// What follows a beat: a plain continue (nextBeatId) or a list of choices.
type RawNext = {
type?: string;
nextBeatId?: string;
choices?: RawChoice[];
};
// One beat of the scene as written by the model.
type RawBeat = {
id?: string;
narration?: string;
speaker?: string;
line?: string;
lineDelivery?: string;
activeCharacters?: RawActiveCharacter[];
next?: RawNext;
};
// Volatile story-memory rewrite; fields are `unknown` and validated in
// coerceStoryStatePatch.
type RawStoryStatePatch = {
synopsis?: unknown;
openThreads?: unknown;
relationships?: unknown;
nextHook?: unknown;
};
// Top-level JSON object expected from the Writer.
type RawScene = {
sceneSummary?: string;
sceneKey?: string;
entryBeatId?: string;
beats?: RawBeat[];
storyStatePatch?: RawStoryStatePatch;
};
// ──────────────────────────────────────────────────────────────────────
// POV (player viewpoint) handling — Pattern B (galgame standard):
// - speaker = "你" → ALLOWED (renders as dialog box, never TTS'd)
// - any other POV term → normalized to "你" (LLM slip-up safety net)
// - activeCharacters → POV is NEVER allowed (player has no body in-scene)
// - CharacterDesigner → never invoked for "你" or POV variants
// ──────────────────────────────────────────────────────────────────────
// Canonical speaker name for the player.
const POV_DISPLAY_NAME = "你";
// Known aliases the model may use for the player instead of "你".
const POV_VARIANTS = new Set([
  "玩家",
  "我",
  "主角",
  "protagonist",
  "Protagonist",
  "player",
  "Player",
  "PLAYER",
  "MC",
  "mc",
  "Mc",
  "I",
  "i",
  "me",
  "Me",
  "ME",
]);
// True when `name` refers to the player: the display name itself or any
// variant. English variants match case-insensitively (every one has an
// all-lowercase entry in POV_VARIANTS), so casings that were never listed
// by hand — "PROTAGONIST", "pLaYeR" — are recognized too.
function isPovName(name: string): boolean {
  return (
    name === POV_DISPLAY_NAME ||
    POV_VARIANTS.has(name) ||
    POV_VARIANTS.has(name.toLowerCase())
  );
}
// Normalize a speaker name: any POV variant collapses to "你"; an NPC name
// passes through unchanged. Caller passes already-trimmed input.
function normalizeSpeakerName(name: string): string {
  return isPovName(name) ? POV_DISPLAY_NAME : name;
}
// Coerces a raw choice effect into a valid BeatChoiceEffect.
// - kind "advance-beat" with a usable targetBeatId → advance-beat
// - anything else → change-scene, seeded with nextSceneSeed or "未指定"
// The input is unvalidated LLM JSON, so fields are type-checked before use:
// a numeric targetBeatId is stringified (models often emit ids as numbers)
// and any other non-string value counts as missing instead of throwing.
function coerceEffect(raw: RawEffect | undefined): BeatChoiceEffect {
  const source: RawEffect = raw && typeof raw === "object" ? raw : {};
  const text = (value: unknown): string => {
    if (typeof value === "string") return value.trim();
    if (typeof value === "number" && Number.isFinite(value)) return String(value);
    return "";
  };

  const targetBeatId = text(source.targetBeatId);
  if (source.kind === "advance-beat" && targetBeatId) {
    return { kind: "advance-beat", targetBeatId };
  }
  const seedField: unknown = source.nextSceneSeed;
  const seed = typeof seedField === "string" ? seedField.trim() : "";
  return {
    kind: "change-scene",
    nextSceneSeed: seed || "未指定",
  };
}
// Coerces one raw choice. Missing id/label get positional defaults
// (`c<n>` / `选项 <n>`, 1-based) and the effect goes through coerceEffect.
// The input is unvalidated LLM JSON: a null/non-object entry is treated as an
// empty choice, numeric id/label values are stringified, and other
// non-string values count as missing instead of throwing on `.trim()`.
function coerceChoice(raw: RawChoice, idx: number): BeatChoice {
  const source: RawChoice = raw && typeof raw === "object" ? raw : {};
  const text = (value: unknown): string => {
    if (typeof value === "string") return value.trim();
    if (typeof value === "number" && Number.isFinite(value)) return String(value);
    return "";
  };
  return {
    id: text(source.id) || `c${idx + 1}`,
    label: text(source.label) || `选项 ${idx + 1}`,
    effect: coerceEffect(source.effect),
  };
}
// Coerces a beat's `next` field.
// - type "choice" with a non-empty choices array → choice (each entry coerced)
// - anything else → continue, targeting nextBeatId or `fallbackBeatId`
// The input is unvalidated LLM JSON: a numeric nextBeatId is stringified and
// any other non-string value falls back instead of throwing on `.trim()`.
function coerceNext(raw: RawNext | undefined, fallbackBeatId: string): BeatNext {
  const source: RawNext = raw && typeof raw === "object" ? raw : {};
  if (
    source.type === "choice" &&
    Array.isArray(source.choices) &&
    source.choices.length > 0
  ) {
    return {
      type: "choice",
      choices: source.choices.map((c, i) => coerceChoice(c, i)),
    };
  }

  const targetField: unknown = source.nextBeatId;
  let target = "";
  if (typeof targetField === "string") {
    target = targetField.trim();
  } else if (typeof targetField === "number" && Number.isFinite(targetField)) {
    target = String(targetField);
  }
  return {
    type: "continue",
    nextBeatId: target || fallbackBeatId,
  };
}
// Coerces a beat's on-stage character list. Returns undefined when the input
// is not an array or no valid entry survives.
// Dropped entries: null / non-object items, items without a non-blank string
// name, and any POV name — the player is never IN the picture, so stripping
// the LLM's slip-up keeps CharacterDesigner from generating a portrait for
// the player. A pose is kept only when it is a non-blank string.
function coerceActiveCharacters(
  raw: RawActiveCharacter[] | undefined,
): BeatActiveCharacter[] | undefined {
  if (!Array.isArray(raw)) return undefined;
  const out: BeatActiveCharacter[] = [];
  for (const entry of raw) {
    // Unvalidated LLM JSON: guard before touching properties.
    if (!entry || typeof entry !== "object") continue;
    const nameField: unknown = entry.name;
    const name = typeof nameField === "string" ? nameField.trim() : "";
    if (!name || isPovName(name)) continue;
    const poseField: unknown = entry.pose;
    const pose = typeof poseField === "string" ? poseField.trim() : "";
    out.push(pose ? { name, pose } : { name });
  }
  return out.length > 0 ? out : undefined;
}
// Coerces one raw beat into a Beat.
// - id defaults to `b<n>` (1-based position)
// - speaker: POV variants (玩家/我/主角/protagonist/...) collapse to "你";
//   NPC names pass through. TTS is skipped for "你" because no Character
//   record exists for it.
// - lineDelivery is kept only for a spoken line by a non-POV speaker
// - a missing `continue` target defaults to the following beat's default id
// The input is unvalidated LLM JSON: a null/non-object beat is treated as
// empty, a numeric id is stringified, and other non-string fields count as
// missing instead of throwing on `.trim()`.
function coerceBeat(raw: RawBeat, idx: number, totalBeats: number): Beat {
  const source: RawBeat = raw && typeof raw === "object" ? raw : {};
  const text = (value: unknown): string =>
    typeof value === "string" ? value.trim() : "";

  const idField: unknown = source.id;
  const id =
    (typeof idField === "number" && Number.isFinite(idField)
      ? String(idField)
      : text(idField)) || `b${idx + 1}`;
  // Non-last beats default their `continue` target to the following beat.
  // The last beat gets an empty fallback on purpose: repairBeats() turns a
  // last/dangling continue into a real scene-change exit so the player can
  // never get stuck self-looping on it.
  const fallback = idx + 1 < totalBeats ? `b${idx + 2}` : "";

  const rawSpeaker = text(source.speaker);
  const speaker = rawSpeaker ? normalizeSpeakerName(rawSpeaker) : undefined;
  const line = text(source.line) || undefined;
  return {
    id,
    narration: text(source.narration) || undefined,
    speaker,
    line,
    // lineDelivery is meaningful only for NPC speakers (TTS). For POV
    // speaker ("你") TTS is skipped, so lineDelivery would never be used.
    lineDelivery:
      line && speaker !== POV_DISPLAY_NAME
        ? text(source.lineDelivery) || undefined
        : undefined,
    activeCharacters: coerceActiveCharacters(source.activeCharacters),
    next: coerceNext(source.next, fallback),
  };
}
// Seed handed to the next scene when an exit had to be synthesized.
const FALLBACK_SEED = "故事继续推进";
// Builds the synthetic "继续" choice that leaves the scene from `beatId`.
// Its id is derived from the beat id (`<beatId>__exit`).
function fallbackExitChoice(beatId: string): BeatChoice {
  const exit: BeatChoice = {
    id: beatId + "__exit",
    label: "继续",
    effect: { kind: "change-scene", nextSceneSeed: FALLBACK_SEED },
  };
  return exit;
}
// Beat ids are graph keys (the front-end's `beats.find(b => b.id === ...)`,
// the session's `visitedBeatIds`, and `continue`/`advance-beat` targets). If
// the model reuses an id across beats, the second occurrence becomes silently
// unreachable and external references collapse to the first beat. Rename
// duplicates; rewrite the renamed beat's OWN self-references. External
// references stay pointing at the first occurrence.
//
// A generated name (`<id>_<n>`) never reuses an id that ANY beat in the scene
// declares — not just the ones already visited. Otherwise a later beat that
// genuinely owns e.g. `a_2` would itself be renamed, and references to `a_2`
// would land on the renamed duplicate instead.
// Beats that need no rename are returned as the same object.
function ensureUniqueBeatIds(beats: Beat[]): Beat[] {
  const declared = new Set(beats.map((b) => b.id));
  const seen = new Set<string>();
  return beats.map((b): Beat => {
    if (!seen.has(b.id)) {
      seen.add(b.id);
      return b;
    }
    const oldId = b.id;
    let n = 2;
    while (seen.has(`${oldId}_${n}`) || declared.has(`${oldId}_${n}`)) n += 1;
    const newId = `${oldId}_${n}`;
    seen.add(newId);

    // Only this beat's own references to its old id follow the rename.
    let next = b.next;
    if (next.type === "continue" && next.nextBeatId === oldId) {
      next = { type: "continue", nextBeatId: newId };
    } else if (next.type === "choice") {
      next = {
        type: "choice",
        choices: next.choices.map((c) =>
          c.effect.kind === "advance-beat" && c.effect.targetBeatId === oldId
            ? {
                ...c,
                effect: { kind: "advance-beat" as const, targetBeatId: newId },
              }
            : c,
        ),
      };
    }
    return { ...b, id: newId, next };
  });
}
// Restores referential integrity of the beat graph and makes sure the scene
// can be left:
//   - `continue` pointing at a missing id or at itself → repointed to the
//     beat that follows in array order; when there is no following beat it
//     becomes a single-choice scene-change exit
//   - `advance-beat` choice pointing at a missing id → downgraded to a
//     change-scene choice
//   - no change-scene choice anywhere → an exit choice is appended to the
//     last beat (replacing its `continue`, if that is what it had)
function repairBeats(beats: Beat[]): Beat[] {
  const knownIds = new Set<string>();
  for (const beat of beats) knownIds.add(beat.id);

  const repaired: Beat[] = [];
  for (let i = 0; i < beats.length; i += 1) {
    const beat = beats[i]!;
    const next = beat.next;

    if (next.type === "continue") {
      const target = next.nextBeatId;
      const targetIsUsable = target !== beat.id && knownIds.has(target);
      if (targetIsUsable) {
        repaired.push(beat);
        continue;
      }
      const successorId = beats[i + 1]?.id;
      if (successorId) {
        repaired.push({
          ...beat,
          next: { type: "continue", nextBeatId: successorId },
        });
        continue;
      }
      repaired.push({
        ...beat,
        next: { type: "choice", choices: [fallbackExitChoice(beat.id)] },
      });
      continue;
    }

    const choices = next.choices.map((choice) => {
      const dangling =
        choice.effect.kind === "advance-beat" &&
        !knownIds.has(choice.effect.targetBeatId);
      if (!dangling) return choice;
      return {
        ...choice,
        effect: {
          kind: "change-scene" as const,
          nextSceneSeed: "未指定(导演引用不存在的 beat,已降级为换场)",
        },
      };
    });
    repaired.push({ ...beat, next: { type: "choice", choices } });
  }

  const escapable = repaired.some(
    (beat) =>
      beat.next.type === "choice" &&
      beat.next.choices.some((choice) => choice.effect.kind === "change-scene"),
  );
  if (escapable || repaired.length === 0) {
    return repaired;
  }

  // No way out anywhere: bolt an exit onto the final beat.
  const lastIdx = repaired.length - 1;
  const last = repaired[lastIdx]!;
  const kept = last.next.type === "choice" ? last.next.choices : [];
  repaired[lastIdx] = {
    ...last,
    next: { type: "choice", choices: [...kept, fallbackExitChoice(last.id)] },
  };
  return repaired;
}
// Choice ids are keys the front-end uses to cache + consume prefetched
// scenes. Two beats both defaulting to c1/c2 would make a transition reuse
// the WRONG prefetched scene — so force every choice id to be unique within
// the scene. A repeated id gets the first free `<id>_<n>` suffix (n ≥ 2).
//
// Non-mutating: the input beats and their choice objects are left untouched.
// Beats and choices that need no rename are passed through as the same
// objects; renamed ones are shallow copies.
function ensureUniqueChoiceIds(beats: Beat[]): Beat[] {
  const seen = new Set<string>();
  return beats.map((beat): Beat => {
    const next = beat.next;
    if (next.type !== "choice") return beat;

    let renamedAny = false;
    const choices = next.choices.map((choice) => {
      let id = choice.id;
      if (seen.has(id)) {
        let n = 2;
        while (seen.has(`${choice.id}_${n}`)) n += 1;
        id = `${choice.id}_${n}`;
      }
      seen.add(id);
      if (id === choice.id) return choice;
      renamedAny = true;
      return { ...choice, id };
    });
    return renamedAny ? { ...beat, next: { ...next, choices } } : beat;
  });
}
// Normalize sceneKey to a safe lowercase-with-dashes English slug. If the
// model returns something weird (中文 / spaces / mixed case), best-effort
// fix; if it ends up empty — or the value is not a string at all — return
// undefined (the scene just won't be matched against a prior scene for
// reference-image reuse).
function normalizeSceneKey(raw: string | undefined): string | undefined {
  // Unvalidated LLM JSON: a number or object here must not reach `.trim()`.
  if (typeof raw !== "string") return undefined;
  const slug = raw
    .trim()
    .toLowerCase()
    // Every run of characters outside [a-z0-9] (dashes included) becomes a
    // single dash ...
    .replace(/[^a-z0-9]+/g, "-")
    // ... and a dash left at either end is dropped.
    .replace(/^-|-$/g, "");
  return slug.length > 0 ? slug : undefined;
}
// Keeps only the non-blank string entries of an untrusted array, trimmed.
// Returns undefined for non-arrays and when nothing survives.
function coerceStringArray(raw: unknown): string[] | undefined {
  if (!Array.isArray(raw)) {
    return undefined;
  }
  const kept = raw.reduce<string[]>((acc, item) => {
    const text = typeof item === "string" ? item.trim() : "";
    if (text.length > 0) acc.push(text);
    return acc;
  }, []);
  return kept.length === 0 ? undefined : kept;
}
// Extracts the volatile story-memory rewrite from the Writer's JSON. A field
// is copied only when it holds usable content (non-blank string, or an array
// with at least one non-blank string). When nothing is usable — or the patch
// is absent / not an object — the result is undefined so the director leaves
// the carried StoryState untouched.
function coerceStoryStatePatch(
  raw: RawStoryStatePatch | undefined,
): StoryStatePatch | undefined {
  if (!raw || typeof raw !== "object") return undefined;

  const trimmed = (value: unknown): string =>
    typeof value === "string" ? value.trim() : "";
  const patch: StoryStatePatch = {};

  const synopsis = trimmed(raw.synopsis);
  if (synopsis) patch.synopsis = synopsis;

  const openThreads = coerceStringArray(raw.openThreads);
  if (openThreads) patch.openThreads = openThreads;

  const relationships = coerceStringArray(raw.relationships);
  if (relationships) patch.relationships = relationships;

  const nextHook = trimmed(raw.nextHook);
  if (nextHook) patch.nextHook = nextHook;

  return Object.keys(patch).length === 0 ? undefined : patch;
}
// Runs the Writer: one chat() call, then validation of the returned scene.
// Beats go through coerceBeat → ensureUniqueBeatIds → repairBeats →
// ensureUniqueChoiceIds. The entry beat is the declared one when it exists
// among the repaired beats, otherwise the first beat.
//
// The reply is unvalidated model output: a null/non-object reply is treated
// as an empty scene (so it fails with the explicit "no beats" error rather
// than an opaque TypeError), a numeric entryBeatId is stringified, and other
// non-string top-level fields count as missing.
//
// Throws when the reply contains no beats; errors from chat() and
// parseJsonLoose() propagate.
export async function runWriter(
  config: ProviderConfig,
  session: Session,
): Promise<WriterOutput> {
  const raw = await chat(
    config,
    [
      { role: "system", content: WRITER_SYSTEM },
      { role: "user", content: buildWriterUserMessage(session) },
    ],
    { temperature: 0.9, responseFormat: "json_object" },
  );
  const candidate: unknown = parseJsonLoose<RawScene>(raw);
  const parsed: RawScene =
    candidate !== null && typeof candidate === "object"
      ? (candidate as RawScene)
      : {};
  const text = (value: unknown): string =>
    typeof value === "string" ? value.trim() : "";

  const rawBeats = Array.isArray(parsed.beats) ? parsed.beats : [];
  if (rawBeats.length === 0) {
    throw new Error("Writer returned no beats");
  }
  const beats = ensureUniqueChoiceIds(
    repairBeats(
      ensureUniqueBeatIds(
        rawBeats.map((b, i) => coerceBeat(b, i, rawBeats.length)),
      ),
    ),
  );

  const entryField: unknown = parsed.entryBeatId;
  const declaredEntry =
    typeof entryField === "number" && Number.isFinite(entryField)
      ? String(entryField)
      : text(entryField);
  const entryBeatId =
    declaredEntry && beats.some((b) => b.id === declaredEntry)
      ? declaredEntry
      : beats[0]!.id;

  const sceneKeyField: unknown = parsed.sceneKey;
  return {
    sceneSummary: text(parsed.sceneSummary) || "未指定场景概要",
    sceneKey: normalizeSceneKey(
      typeof sceneKeyField === "string" ? sceneKeyField : undefined,
    ),
    entryBeatId,
    beats,
    storyStatePatch: coerceStoryStatePatch(parsed.storyStatePatch),
  };
}
// Lists every distinct character name that appears in the scene's beats, in
// first-seen order, so the orchestrator can decide which ones need the
// CharacterDesigner. Names come from both `speaker` and `activeCharacters`
// (a character can be on-screen without speaking).
//
// POV names ("你" / 玩家 / 主角 / ...) are left out entirely — the player is
// never designed (no portrait, no voice, no archetype).
export function collectActiveCharacterNames(beats: Beat[]): string[] {
  const names = new Set<string>();
  beats.forEach((beat) => {
    const { speaker, activeCharacters } = beat;
    if (speaker && !isPovName(speaker)) {
      names.add(speaker);
    }
    for (const member of activeCharacters || []) {
      if (isPovName(member.name)) continue;
      names.add(member.name);
    }
  });
  return Array.from(names);
}
// Re-export POV constants for downstream filters (director's orphanSpeakers).
export { POV_DISPLAY_NAME, POV_VARIANTS, isPovName, normalizeSpeakerName };
+429
View File
@@ -0,0 +1,429 @@
import { chat } from "@infiplot/ai-client";
import type {
Character,
EngineConfig,
InsertBeatPartial,
ProviderConfig,
Scene,
Session,
StoryState,
StoryStatePatch,
} from "@infiplot/types";
import type { CharacterCard } from "./agents/characterDesigner";
import {
designCharacterCard,
provisionCharacterVoice,
provisionVoiceForName,
renderCharacterPortrait,
} from "./agents/characterDesigner";
import { runCinematographer } from "./agents/cinematographer";
import { runPainter } from "./agents/painter";
import {
collectActiveCharacterNames,
isPovName,
normalizeSpeakerName,
POV_DISPLAY_NAME,
runWriter,
} from "./agents/writer";
import { parseJsonLoose } from "./jsonParser";
import { INSERT_BEAT_SYSTEM, buildInsertBeatUserMessage } from "./prompts";
// ══════════════════════════════════════════════════════════════════════
// director.ts — multi-agent orchestrator for one full Scene generation.
//
// Critical path (per Scene call):
//
// Writer LLM (~3s, serial)
// │
// ├─ CharacterCard LLM × N (parallel per new char — TEXT only)
// ├─ Cinematographer LLM (parallel with the cards)
// │
// └─ wait for cards + cinema
// │
// ├─ entry-beat portraits ──┐ (block the Painter — its refs)
// ▼ │
// Painter — generateImage │ (overlapped, NOT on the paint path):
// with referenceImages ├─ non-entry-beat portraits
// │ └─ ALL voice provisioning + orphan voices
// ▼
// await the overlapped work, fold into the registry
// │
// ▼
// return { scene, sceneImageUrl, characters, storyState }
//
// Two deliberate decouplings unlock the parallelism:
// 1. The Cinematographer only POSITIONS named characters, so it needs no
// visualDescription and runs alongside the card LLMs.
// 2. The Painter only needs visualDescription TEXT (all on-stage) + the
// entry-beat characters' PORTRAITS (its referenceImages). Voices are
// never needed to paint, and non-entry portraits are never referenced —
// so both overlap the (longest) paint call instead of blocking it.
// ══════════════════════════════════════════════════════════════════════
// Builds a scene id shaped like `scene_<epoch-ms>_<suffix>`, where the suffix
// is up to four base-36 characters sliced out of Math.random().
function newSceneId(): string {
  const stamp = Date.now();
  const suffix = Math.random().toString(36).slice(2, 6);
  return ["scene", stamp, suffix].join("_");
}
// Prints "<label>: <elapsed>ms", where elapsed is measured from `t0`
// (a Date.now() timestamp taken by the caller) to now.
function tlog(label: string, t0: number): void {
  const elapsedMs = Date.now() - t0;
  console.log(label + ": " + elapsedMs + "ms");
}
// Merge a freshly-designed Character into a registry, preserving any
// previously-set voice/portrait that the new design didn't fill in (so
// re-designing a known character can't silently drop their voice or wipe
// out an already-generated portrait UUID). Match by name.
export function mergeCharacters(
  existing: Character[],
  updates: Character[],
): Character[] {
  // Nothing to fold in: hand back the caller's array untouched.
  if (updates.length === 0) return existing;

  // Name-keyed registry; Map iteration order keeps existing characters first
  // and appends newcomers in the order they arrive.
  const registry = new Map<string, Character>();
  for (const known of existing) registry.set(known.name, known);

  for (const incoming of updates) {
    const previous = registry.get(incoming.name);
    const merged: Character = previous
      ? {
          ...incoming,
          // Anything the update left unset falls back to what was already
          // provisioned, so voices and portraits are never dropped.
          voice: incoming.voice ?? previous.voice,
          visualDescription:
            incoming.visualDescription ?? previous.visualDescription,
          basePortraitUrl: incoming.basePortraitUrl ?? previous.basePortraitUrl,
          basePortraitUuid:
            incoming.basePortraitUuid ?? previous.basePortraitUuid,
          // `||` (not `??`): an empty-string description also defers to the
          // previous one.
          voiceDescription:
            incoming.voiceDescription || previous.voiceDescription,
        }
      : incoming;
    registry.set(incoming.name, merged);
  }

  return [...registry.values()];
}
// Pick a reference to the prior scene image when sceneKey matches a prior
// scene — used by the Painter as one of the `referenceImages` (NOT as a
// seedImage, because FLUX.2 [klein] 9B KV does not support seedImage).
//
// Prefer URL over UUID for the same reason painter.collectReferenceImages
// does: the UUID returned by `imageInference` isn't always recognized by
// Runware's `referenceImages` pipeline, surfacing as `failedToTransferImage`.
// The URL is Runware's own CDN link — they can always fetch it. UUID is kept
// as a backstop. Returns undefined when no prior scene shares the sceneKey.
function pickPriorSceneReference(
  session: Session,
  currentSceneKey: string | undefined,
): { priorSceneReference?: string; priorSceneKey?: string } {
  // Without a key there is nothing to match against.
  if (!currentSceneKey) return {};

  // Walk history newest-first so the most recent matching scene wins.
  const newestFirst = [...session.history].reverse();
  for (const entry of newestFirst) {
    const candidate = entry.scene;
    if (candidate.sceneKey !== currentSceneKey) continue;

    // URL is preferred; the UUID is only a fallback when no URL was stored.
    const reference = candidate.imageUrl ?? candidate.imageUuid;
    if (!reference) continue; // matching scene without an image: keep looking

    return {
      priorSceneReference: reference,
      priorSceneKey: candidate.sceneKey,
    };
  }

  return {};
}
// Merge the Writer's volatile story-memory patch onto the carried StoryState.
// The stable spine (logline/genreTags/protagonist/castNotes) is preserved;
// only the volatile fields the Writer is allowed to rewrite are overwritten,
// and only when the patch actually provided them. A missing carried state
// (legacy session from before the Architect existed) degrades to an empty
// spine rather than throwing.
function applyStoryStatePatch(
  base: StoryState | undefined,
  patch: StoryStatePatch | undefined,
): StoryState {
  // No carried state: fall back to an empty spine instead of failing.
  const carried: StoryState = base ?? {
    logline: "",
    genreTags: "",
    protagonist: "",
    synopsis: "",
  };

  // No patch: the carried state is returned as-is (same object).
  if (!patch) return carried;

  // Only these four volatile fields may be overwritten, and only when the
  // patch supplies a non-nullish value for them.
  const { synopsis, openThreads, relationships, nextHook } = patch;
  return {
    ...carried,
    synopsis: synopsis ?? carried.synopsis,
    openThreads: openThreads ?? carried.openThreads,
    relationships: relationships ?? carried.relationships,
    nextHook: nextHook ?? carried.nextHook,
  };
}
// Value returned by directScene for one generated scene.
export type SceneResult = {
// The assembled Scene record (beats, entry beat id, sceneKey, image refs).
scene: Scene;
// URL of the painted scene image (the Painter's imageUrl).
sceneImageUrl: string;
// Character registry after this scene's cards, portraits and voices were
// merged into the session's existing characters.
characters: Character[];
// Carried StoryState with the Writer's storyStatePatch applied.
storyState: StoryState;
};
// Marks work that is awaited LATER as "observed" so a rejection that lands
// before that await is not reported as an unhandled rejection (which, under
// Node's default mode, terminates the process). The returned promise still
// carries the original outcome: whoever awaits it sees the same value or the
// same rejection. Only the unhandled-rejection report is suppressed.
function observeLater<T>(work: T): Promise<Awaited<T>> {
  const tracked = Promise.resolve(work);
  void tracked.catch(() => undefined);
  return tracked;
}

// ──────────────────────────────────────────────────────────────────────
// directScene — the multi-agent pipeline. Used by orchestrator's
// startSession and requestScene.
//
// Runs Writer → (character cards ∥ Cinematographer) → entry-beat portraits
// → Painter, while the remaining portraits and all voice provisioning
// overlap the paint call. Returns the new Scene, its image URL, the merged
// character registry and the patched StoryState.
//
// Rejections from the Writer, Cinematographer, entry-beat portraits, Painter
// or the overlapped work propagate to the caller; only designCharacterCard
// failures are absorbed (replaced by a minimal fallback card).
// ──────────────────────────────────────────────────────────────────────
export async function directScene(
  config: EngineConfig,
  session: Session,
): Promise<SceneResult> {
  const tTotal = Date.now();

  // Stage 1 — Writer (serial; everything downstream needs sceneSummary +
  // beats[] to know who's on stage and what to compose around).
  const tWriter = Date.now();
  const writerOut = await runWriter(config.text, session);
  tlog("[directScene] Writer", tWriter);

  // Identify NEW characters introduced by this scene that need to be
  // designed (LLM + portrait + voice). Existing characters in the registry
  // are skipped — their cards / portraits / voices persist across scenes.
  const allActiveNames = collectActiveCharacterNames(writerOut.beats);
  const newCharNames = allActiveNames.filter(
    (n) => !session.characters.some((c) => c.name === n),
  );

  // Find the entry beat for the Cinematographer (which characters are
  // on-screen in the establishing shot).
  const entryBeat = writerOut.beats.find((b) => b.id === writerOut.entryBeatId);
  const entryBeatActive = entryBeat?.activeCharacters ?? [];

  // For sceneKey-based visual continuity, look up the prior matching scene's
  // image to slot into Painter's referenceImages (max 4 of which include
  // character portraits too).
  const { priorSceneReference, priorSceneKey } = pickPriorSceneReference(
    session,
    writerOut.sceneKey,
  );

  // ── Stage 2 — character cards (LLM) ∥ Cinematographer ──────────────────
  // Both are cheap LLM calls and neither needs the other's output, so they
  // run concurrently. The cards give us each new character's visualDescription
  // TEXT; portraits + voices are deferred to Stage 3 so they can overlap the
  // paint instead of blocking it.
  const tParallel = Date.now();
  const cardPromises = newCharNames.map((name) =>
    designCharacterCard(config, session, name).catch((err): CharacterCard => {
      const msg = err instanceof Error ? err.message : String(err);
      console.error(`[directScene] designCharacterCard(${name}) failed: ${msg}`);
      // Last-resort fallback: a name + generic voice card so the speaker isn't
      // unknown. No visualDescription → no portrait is attempted for them.
      return {
        name,
        voiceDescription: `请根据角色名「${name}」推断其性别、年龄与气质。所属世界观:${session.worldSetting}`,
      };
    }),
  );
  const cinemaPromise = runCinematographer(config.text, {
    sceneSummary: writerOut.sceneSummary,
    styleGuide: session.styleGuide,
    entryBeatActive,
    entryBeatSpeaker: entryBeat?.speaker,
    priorSceneKey,
    currentSceneKey: writerOut.sceneKey,
  });
  const [cards, cinemaOut] = await Promise.all([
    Promise.all(cardPromises),
    cinemaPromise,
  ]);
  tlog("[directScene] CharacterCards+Cinematographer parallel", tParallel);

  // Working registry: existing characters + new cards. visualDescription text
  // is present now; portraits + voices fill in over the next two phases.
  let characters = mergeCharacters(
    session.characters,
    cards.map((c) => ({
      name: c.name,
      voiceDescription: c.voiceDescription,
      visualDescription: c.visualDescription,
    })),
  );

  // ── Stage 3 — portraits + voices, scheduled around the Painter ─────────
  const tProvision = Date.now();

  // Entry-beat character names: the ONLY portraits the Painter references
  // (collectReferenceImages slots in the entry beat's speaker + activeChars).
  const entryNames = new Set<string>();
  if (entryBeat?.speaker && !isPovName(entryBeat.speaker)) {
    entryNames.add(entryBeat.speaker);
  }
  for (const c of entryBeatActive) {
    if (!isPovName(c.name)) entryNames.add(c.name);
  }

  type NamedPortrait = {
    name: string;
    basePortraitUrl?: string;
    basePortraitUuid?: string;
  };

  // Kick off portrait gen for every NEW char that has a visualDescription.
  // Entry-beat portraits block the Painter; the rest overlap it.
  const entryPortraitPromises: Promise<NamedPortrait>[] = [];
  const restPortraitPromises: Promise<NamedPortrait>[] = [];
  for (const card of cards) {
    const vd = card.visualDescription;
    if (!vd) continue;
    const p = renderCharacterPortrait(
      config,
      card.name,
      vd,
      session.styleGuide,
    ).then((res): NamedPortrait => ({ name: card.name, ...res }));
    if (entryNames.has(card.name)) {
      // Awaited below before any other await, so it needs no guard.
      entryPortraitPromises.push(p);
    } else {
      // Not awaited until after the paint — guard the gap.
      restPortraitPromises.push(observeLater(p));
    }
  }

  // Kick off voice provisioning for every NEW char (never on the paint path).
  // These are only awaited after the Painter, hence observeLater.
  const voicePromises = cards.map((card) =>
    observeLater(
      provisionCharacterVoice(config, card.voiceDescription, card.name).then(
        (voice): Character => ({
          name: card.name,
          voiceDescription: card.voiceDescription,
          voice,
        }),
      ),
    ),
  );

  // Edge case: a speaker the Writer referenced without listing in any beat's
  // activeCharacters. collectActiveCharacterNames already includes speakers,
  // so this is a rare defensive net. Provision a voice only (never on-screen).
  const speakerNames = new Set(
    writerOut.beats.map((b) => b.speaker).filter((n): n is string => Boolean(n)),
  );
  const orphanSpeakers = [...speakerNames].filter(
    // Pattern B: "你" (player) is a valid speaker but never gets a Character
    // record — TTS is intentionally skipped on the client.
    (n) =>
      !isPovName(n) &&
      !characters.some((c) => c.name === n) &&
      !cards.some((c) => c.name === n),
  );
  const orphanPromises = orphanSpeakers.map((n) =>
    observeLater(provisionVoiceForName(config, session, n)),
  );

  // Block the Painter ONLY on entry-beat portraits (its referenceImages).
  const entryPortraits = await Promise.all(entryPortraitPromises);
  characters = mergeCharacters(
    characters,
    entryPortraits.map((p) => ({
      name: p.name,
      voiceDescription: "", // preserved from the card by mergeCharacters
      basePortraitUrl: p.basePortraitUrl,
      basePortraitUuid: p.basePortraitUuid,
    })),
  );
  tlog("[directScene] entry-beat portraits", tProvision);

  // ── Stage 4 — Painter (depends on cinemaOut + on-stage visual cards +
  // entry portraits). On-stage = everyone named in any beat, so the archetype
  // block covers anyone the player might encounter in this scene.
  const onStageCharacters = characters.filter((c) =>
    allActiveNames.includes(c.name),
  );
  const tPainter = Date.now();
  const painted = await runPainter(
    config,
    {
      integratedPrompt: cinemaOut.integratedPrompt,
      styleGuide: session.styleGuide,
      onStageCharacters,
      priorSceneImage: priorSceneReference,
    },
    entryBeat,
  );
  tlog("[directScene] Painter", tPainter);

  // Fold in the work that overlapped the paint: remaining portraits, all
  // voices, and any orphan-speaker voices. Awaited before returning so the
  // session the client persists is fully provisioned for later scenes.
  // A rejection from any of them still surfaces here.
  const tOverlap = Date.now();
  const [restPortraits, voicedChars, orphanChars] = await Promise.all([
    Promise.all(restPortraitPromises),
    Promise.all(voicePromises),
    Promise.all(orphanPromises),
  ]);
  characters = mergeCharacters(
    characters,
    restPortraits.map((p) => ({
      name: p.name,
      voiceDescription: "",
      basePortraitUrl: p.basePortraitUrl,
      basePortraitUuid: p.basePortraitUuid,
    })),
  );
  characters = mergeCharacters(characters, voicedChars);
  if (orphanChars.length > 0) {
    characters = mergeCharacters(characters, orphanChars);
  }
  tlog("[directScene] overlapped portraits+voices", tOverlap);

  const scene: Scene = {
    id: newSceneId(),
    // scenePrompt is the cinematographer's English compositional output;
    // the Writer's sceneSummary stays in the session log via beats[]/
    // history. Keeping the original field name preserves compat with
    // anything that already reads scene.scenePrompt (e.g., insert-beat
    // user prompt).
    scenePrompt: cinemaOut.integratedPrompt,
    beats: writerOut.beats,
    entryBeatId: writerOut.entryBeatId,
    sceneKey: writerOut.sceneKey,
    imageUuid: painted.kind === "real" ? painted.imageUuid : undefined,
    imageUrl: painted.imageUrl,
  };

  // Merge the Writer's volatile memory rewrite onto the carried bible so the
  // throughline survives the next scene cut (orchestrator returns it; the
  // client persists it back into the session).
  const storyState = applyStoryStatePatch(
    session.storyState,
    writerOut.storyStatePatch,
  );

  tlog("[directScene] TOTAL", tTotal);
  return { scene, sceneImageUrl: painted.imageUrl, characters, storyState };
}
// Coerces one untrusted field of the model's JSON into a trimmed, non-empty
// string. Anything that is not a string (missing, number, object, …) or is
// blank after trimming becomes undefined.
function trimmedOrUndefined(value: unknown): string | undefined {
  if (typeof value !== "string") return undefined;
  const text = value.trim();
  return text === "" ? undefined : text;
}

// ──────────────────────────────────────────────────────────────────────
// directInsertBeat — single-agent path for vision-driven in-scene
// exploration. Generates ONE transient beat with NO new image, NO new
// characters. Multi-agent pipeline doesn't apply here (no rendering, no
// character introduction allowed by the prompt).
//
// The model's JSON is treated as untrusted: a non-object top-level value or
// non-string fields are ignored rather than throwing. If nothing usable is
// left, a fixed fallback narration is returned. Errors from chat() or from
// parseJsonLoose (unparseable output) still propagate to the caller.
// ──────────────────────────────────────────────────────────────────────
export async function directInsertBeat(
  config: ProviderConfig,
  session: Session,
  freeformAction: string,
): Promise<InsertBeatPartial> {
  const raw = await chat(
    config,
    [
      { role: "system", content: INSERT_BEAT_SYSTEM },
      {
        role: "user",
        content: buildInsertBeatUserMessage(session, freeformAction),
      },
    ],
    { temperature: 0.9, responseFormat: "json_object" },
  );

  // parseJsonLoose only guarantees syntactically valid JSON, not its shape,
  // so narrow to a plain record before reading any field.
  const parsed = parseJsonLoose<unknown>(raw);
  const fields: Record<string, unknown> =
    typeof parsed === "object" && parsed !== null
      ? (parsed as Record<string, unknown>)
      : {};

  const narration = trimmedOrUndefined(fields.narration);
  const rawSpeaker = trimmedOrUndefined(fields.speaker);
  // Pattern B (mirrors Writer): normalize POV variants → "你"; NPCs pass through.
  const speaker = rawSpeaker ? normalizeSpeakerName(rawSpeaker) : undefined;
  const line = trimmedOrUndefined(fields.line);
  // lineDelivery is only meaningful for NPC speakers (TTS). For POV ("你")
  // TTS is intentionally skipped on the client, so lineDelivery is dropped.
  const lineDelivery =
    line && speaker !== POV_DISPLAY_NAME
      ? trimmedOrUndefined(fields.lineDelivery)
      : undefined;

  if (!narration && !speaker && !line) {
    return { narration: "(你停下脚步,环视片刻。)" };
  }
  return { narration, speaker, line, lineDelivery };
}
+15
View File
@@ -0,0 +1,15 @@
export {
startSession,
requestScene,
visionDecide,
requestInsertBeat,
requestBeatAudio,
} from "./orchestrator";
export { synthesizeBeat } from "./voice";
export { mergeCharacters } from "./director";
export type { SceneResult } from "./director";
export { runArchitect } from "./agents/architect";
export type { WriterOutput } from "./agents/writer";
export type { CinematographerOutput } from "./agents/cinematographer";
export type { InsertBeatPartial } from "@infiplot/types";
export * from "./prompts";
+95
View File
@@ -0,0 +1,95 @@
import { jsonrepair, JSONRepairError } from "jsonrepair";
// Strict-then-forgiving JSON parser for LLM output. Tries in order:
// 1. Direct JSON.parse on the trimmed text.
// 2. Extract from ```json``` fenced block.
// 3. Slice between first { and last } and parse.
// 4. Apply targeted regex pre-repairs (see preRepair) and try jsonrepair.
//
// On final failure, logs the first 800 chars of the raw model output so we
// can diagnose the actual syntax error without flooding logs or leaking
// sensitive content.
//
// jsonrepair (npm package josdejong/jsonrepair — 2.3k+ stars) handles the
// broad LLM-output failure modes: truncated JSON, missing commas/brackets,
// single quotes, Python None/True/False, JS comments. We layer a small set
// of targeted pre-repairs in front of it for failure modes jsonrepair can't
// disambiguate on its own (see preRepair).
// ──────────────────────────────────────────────────────────────────────
// preRepair — fix specific LLM error patterns before handing to jsonrepair.
//
// Pattern 1: missing closing quote on a key.
// Broken: "lineDelivery: "语速稍快...",
// Correct: "lineDelivery": "语速稍快...",
//
// jsonrepair fails on this because it's ambiguous — "lineDelivery: " could
// be a complete string value, leaving "语速稍快..." as a syntax error. But
// if we see "<key-like>:<whitespace>" we know structurally it should be
// a key-colon-value triplet.
//
// Match constraints:
// - The key match excludes " \n : so we can't overrun into adjacent
// fields or absorb the colon as part of the key name.
// - The colon must be followed by whitespace and another " (the value
// string's opening quote). This is what disambiguates from a value
// string that happens to contain a colon.
// ──────────────────────────────────────────────────────────────────────
function preRepair(s: string): string {
  // "<key-like>:<whitespace>" immediately followed by an opening quote.
  // Group 1 is the key text, group 2 the whitespace run to keep verbatim.
  const unterminatedKey = /"([^"\n:]+):(\s+)"/g;
  return s.replace(
    unterminatedKey,
    // Re-emit the match with the key's missing closing quote inserted.
    (_whole: string, key: string, gap: string) => `"${key}":${gap}"`,
  );
}
export function parseJsonLoose<T>(raw: string): T {
  // One strict parse attempt, reported as a tagged result so that legitimate
  // falsy JSON values (null, 0, false, "") are not mistaken for failure.
  type Attempt = { ok: true; value: T } | { ok: false };
  const attempt = (text: string): Attempt => {
    try {
      return { ok: true, value: JSON.parse(text) as T };
    } catch {
      return { ok: false };
    }
  };

  // 1. The trimmed text as-is.
  const trimmed = raw.trim();
  const direct = attempt(trimmed);
  if (direct.ok) return direct.value;

  // 2. The body of the first ``` / ```json fenced block, if there is one.
  const fencedBody = /```(?:json)?\s*([\s\S]*?)\s*```/.exec(trimmed)?.[1];
  if (fencedBody) {
    const fromFence = attempt(fencedBody);
    if (fromFence.ok) return fromFence.value;
  }

  // 3. Everything between the first "{" and the last "}". With no usable
  //    brace pair this is just the trimmed text again.
  const open = trimmed.indexOf("{");
  const close = trimmed.lastIndexOf("}");
  const candidate =
    open !== -1 && close > open ? trimmed.slice(open, close + 1) : trimmed;
  const fromSlice = attempt(candidate);
  if (fromSlice.ok) return fromSlice.value;

  // 4. Targeted pre-repair. If it changed anything, a plain parse may
  //    already succeed without needing jsonrepair.
  const patched = preRepair(candidate);
  if (patched !== candidate) {
    const fromPatched = attempt(patched);
    if (fromPatched.ok) return fromPatched.value;
  }

  // 5. Last resort: jsonrepair, then parse its output. On failure, log which
  //    of the two steps failed plus the head of the raw output, and rethrow.
  try {
    return JSON.parse(jsonrepair(patched)) as T;
  } catch (err) {
    const verdict =
      err instanceof JSONRepairError
        ? "could not repair"
        : "succeeded but JSON.parse rejected its output";
    console.error(
      `[parseJsonLoose] jsonrepair ${verdict}. Raw output (first 800 chars):\n${raw.slice(0, 800)}`,
    );
    throw err;
  }
}
+25
View File
@@ -0,0 +1,25 @@
// Static SVG placeholder used when MOCK_IMAGE=true, so we can exercise the
// TTS path without paying for image generation. Returned as a data URI so the
// rest of the pipeline can treat it as an `imageUrl` interchangeably with
// real Runware URLs (the client's <img src> accepts both, and we never feed
// a mock image to Runware's referenceImages because mockImage mode
// short-circuits the Painter entirely).
//
// Previously rendered to PNG via sharp; switched to a self-describing SVG
// data URI so the engine has zero Node-native dependencies and runs on
// Cloudflare Workers. SVG also stays crisp at any display size.
// Canvas size of the placeholder, in SVG user units.
const MOCK_WIDTH = 1792;
const MOCK_HEIGHT = 1024;

// The placeholder markup, one element per entry; joined with "\n" below.
// The dashed frame is inset by 2 units on every side (hence the "- 4").
const MOCK_SVG_LINES = [
  `<svg xmlns="http://www.w3.org/2000/svg" width="${MOCK_WIDTH}" height="${MOCK_HEIGHT}">`,
  `<rect width="${MOCK_WIDTH}" height="${MOCK_HEIGHT}" fill="#161109"/>`,
  `<rect x="2" y="2" width="${MOCK_WIDTH - 4}" height="${MOCK_HEIGHT - 4}" fill="none" stroke="#5a4628" stroke-width="3" stroke-dasharray="14 10"/>`,
  `<text x="50%" y="45%" fill="#b88f4a" font-family="Georgia, serif" font-size="72" letter-spacing="6" text-anchor="middle">MOCK IMAGE</text>`,
  `<text x="50%" y="53%" fill="#6e5430" font-family="Georgia, serif" font-size="30" letter-spacing="3" text-anchor="middle">TTS TEST — image generation skipped</text>`,
  `</svg>`,
];

// Percent-encoded once at module load; every call hands back the same string.
const MOCK_DATA_URI =
  "data:image/svg+xml;charset=utf-8," +
  encodeURIComponent(MOCK_SVG_LINES.join("\n"));

// Resolves to the static placeholder image as an SVG data URI.
export async function mockImageDataUri(): Promise<string> {
  return MOCK_DATA_URI;
}
+180
View File
@@ -0,0 +1,180 @@
import type {
BeatAudioRequest,
BeatAudioResponse,
EngineConfig,
InsertBeatRequest,
InsertBeatResponse,
Session,
SceneRequest,
SceneResponse,
StartRequest,
StartResponse,
VisionRequest,
VisionResponse,
} from "@infiplot/types";
import { runArchitect } from "./agents/architect";
import { directInsertBeat, directScene } from "./director";
import { synthesizeBeat } from "./voice";
import { interpret } from "./vision";
// Builds a session id shaped like `s_<epoch-ms>_<suffix>`, where the suffix
// is up to six base-36 characters sliced out of Math.random().
function newSessionId(): string {
  const stamp = Date.now();
  const suffix = Math.random().toString(36).slice(2, 8);
  return ["s", stamp, suffix].join("_");
}
// Writes "<label>: <elapsed>ms" to the console; `t0` is the Date.now()
// value captured when the timed step began.
function tlog(label: string, t0: number): void {
  const finishedAt = Date.now();
  console.log([label, ": ", finishedAt - t0, "ms"].join(""));
}
// ──────────────────────────────────────────────────────────────────────
// startSession — initial Scene via the multi-agent pipeline.
//
// directScene internally handles: Writer → (CharacterDesigner+
// Cinematographer parallel) → Painter → upload. Voice provisioning and
// portrait generation happen inside CharacterDesigner per new character,
// so the orchestrator no longer needs to coordinate them separately.
// ──────────────────────────────────────────────────────────────────────
export async function startSession(
  config: EngineConfig,
  req: StartRequest,
): Promise<StartResponse> {
  const startedAt = Date.now();

  // Fresh session shell: trimmed user prompts, no history, no characters.
  const session: Session = {
    id: newSessionId(),
    createdAt: Date.now(),
    worldSetting: req.worldSetting.trim(),
    styleGuide: req.styleGuide.trim(),
    history: [],
    characters: [],
  };

  // Stage 0 — Architect. Must finish before the first scene because
  // directScene is handed this session with storyState already attached.
  const architectStartedAt = Date.now();
  const bible = await runArchitect(config.text, session);
  session.storyState = bible;
  tlog("[start] Architect", architectStartedAt);

  // Opening scene through the regular multi-agent pipeline.
  const opening = await directScene(config, session);
  tlog("[start] TOTAL", startedAt);

  return {
    sessionId: session.id,
    scene: opening.scene,
    imageUrl: opening.sceneImageUrl,
    characters: opening.characters,
    storyState: opening.storyState,
  };
}
// ──────────────────────────────────────────────────────────────────────
// requestScene — next Scene from existing session.
// ──────────────────────────────────────────────────────────────────────
export async function requestScene(
  config: EngineConfig,
  req: SceneRequest,
): Promise<SceneResponse> {
  const startedAt = Date.now();

  // The session arrives in the request; directScene does all the work.
  const next = await directScene(config, req.session);
  tlog("[scene] TOTAL", startedAt);

  // Same payload as directScene's result, with sceneImageUrl exposed to the
  // client under the name `imageUrl`.
  return {
    scene: next.scene,
    imageUrl: next.sceneImageUrl,
    characters: next.characters,
    storyState: next.storyState,
  };
}
// ──────────────────────────────────────────────────────────────────────
// visionDecide — interprets a background click into intent + classify.
// No change from staging — vision lives outside the scene-generation
// pipeline.
// ──────────────────────────────────────────────────────────────────────
export async function visionDecide(
  config: EngineConfig,
  req: VisionRequest,
): Promise<VisionResponse> {
  // The scene the player is currently looking at is the last history entry;
  // with no history (or no scene on that entry), null is passed instead.
  const latestEntry = req.session.history.at(-1);
  const currentScene = latestEntry?.scene ?? null;
  return interpret(config.vision, req.annotatedImageBase64, currentScene);
}
// ──────────────────────────────────────────────────────────────────────
// requestInsertBeat — single-agent transient beat (no image, no new
// characters). Stays single-LLM by design — the INSERT_BEAT prompt
// forbids new characters and there's nothing to render.
// ──────────────────────────────────────────────────────────────────────
export async function requestInsertBeat(
  config: EngineConfig,
  req: InsertBeatRequest,
): Promise<InsertBeatResponse> {
  const startedAt = Date.now();
  const partial = await directInsertBeat(
    config.text,
    req.session,
    req.freeformAction,
  );

  // A speaker is acceptable when there is none, when it is the player ("你" —
  // no Character record exists for the player by design, and directInsertBeat
  // has already normalized POV variants to this literal), or when it names a
  // character already in the session registry.
  const roster = req.session.characters;
  const { speaker } = partial;
  const speakerAllowed =
    !speaker || speaker === "你" || roster.some((c) => c.name === speaker);

  if (speakerAllowed) {
    tlog("[insert-beat] TOTAL", startedAt);
    return { partial, characters: roster };
  }

  // The INSERT_BEAT prompt forbids new NPCs. Rather than dropping the text,
  // fold the line into the narration so the player still sees it (the client
  // only renders `line` when a `speaker` is present).
  console.warn(
    `[insert-beat] unregistered speaker "${partial.speaker}" ignored`,
  );
  const promotedNarration =
    [partial.narration, partial.line].filter(Boolean).join("\n") || undefined;

  tlog("[insert-beat] TOTAL", startedAt);
  return {
    partial: {
      narration: promotedNarration,
      speaker: undefined,
      line: undefined,
      lineDelivery: undefined,
    },
    characters: roster,
  };
}
// ──────────────────────────────────────────────────────────────────────
// requestBeatAudio — lazy per-beat synth. Returns audio:null on
// timeout / failure / TTS disabled, so the client just plays silent.
// ──────────────────────────────────────────────────────────────────────
export async function requestBeatAudio(
  config: EngineConfig,
  req: BeatAudioRequest,
): Promise<BeatAudioResponse> {
  const ttsConfig = config.tts;
  if (ttsConfig) {
    // TTS is configured: synthesize this one beat with the requested voice.
    return { audio: await synthesizeBeat(ttsConfig, req.voice, req.beat) };
  }
  // No TTS config: report "no audio" without calling the synthesizer.
  return { audio: null };
}
+739
View File
@@ -0,0 +1,739 @@
import type {
BeatActiveCharacter,
Character,
Scene,
Session,
StoryState,
} from "@infiplot/types";
// ══════════════════════════════════════════════════════════════════════
// Multi-agent scene generation pipeline:
// Architect (总编剧) — ONE-TIME at session start: the story bible
// (protagonist / logline / genre / opening hook /
// planned cast) → seeds StoryState
// Writer (编剧) — narrative + beats[] + per-beat activeCharacters,
// reads StoryState and emits a StoryStatePatch
// CharacterDesigner — per-new-character visual + voice cards
// Cinematographer (分镜导演) — sceneKey + English compositional prompt
// Painter (画师) — FLUX rendering with character archetypes
//
// Each agent owns one system prompt + one user-message builder below.
// All agents see the same world / style guide, but each only reads the
// slice of session state it needs to make its decision.
// ══════════════════════════════════════════════════════════════════════
// ──────────────────────────────────────────────────────────────────────
// Shared — render the StoryState bible into a compact prompt block read
// by the Writer (and Architect, on revisions). Keeping one renderer means
// the bible looks identical to every agent that consumes it.
// ──────────────────────────────────────────────────────────────────────
export function renderStoryState(s: StoryState | undefined): string {
  // No bible at all → contribute nothing to the prompt.
  if (!s) return "";

  // Renders a string list as "- item" lines.
  const bullets = (items: string[]): string =>
    items.map((item) => `- ${item}`).join("\n");

  // One candidate per section, in display order. Sections whose source field
  // is empty or missing evaluate to null and are dropped; the header always
  // stays.
  const sections: Array<string | null> = [
    "【故事档案 / 主线记忆】",
    s.logline ? `主线(中心钩子):${s.logline}` : null,
    s.genreTags ? `题材基调:${s.genreTags}` : null,
    s.protagonist ? `主角「你」:${s.protagonist}` : null,
    s.castNotes ? `核心配角:\n${s.castNotes}` : null,
    s.synopsis ? `已发生(梗概):${s.synopsis}` : null,
    s.relationships?.length
      ? `当前关系/情绪:\n${bullets(s.relationships)}`
      : null,
    s.openThreads?.length
      ? `未收的悬念/伏笔:\n${bullets(s.openThreads)}`
      : null,
    s.nextHook ? `接下来要往哪走(下一个钩子方向):${s.nextHook}` : null,
  ];

  return sections
    .filter((section): section is string => section !== null)
    .join("\n");
}
// ──────────────────────────────────────────────────────────────────────
// 0. Architect (总编剧) — ONE LLM call at session start.
//
// Turns the (often terse) user world + style prompt into a real story
// bible: a second-person protagonist with a want and a flaw, a single
// central dramatic question, a genre frame that anchors the 爽点 rhythm,
// an engineered opening hook (前3秒冷开场), and a small intentional cast.
// Everything downstream — Writer, CharacterDesigner — reads this so the
// story has a spine from beat one instead of being improvised cold.
// ──────────────────────────────────────────────────────────────────────
// System prompt for the Architect call. The template text is sent to the
// model verbatim, so every character inside the backticks is behavior.
// The `\\n` in the castNotes example is deliberate: it reaches the model as
// the two characters `\n`, i.e. a JSON newline escape inside the example.
export const ARCHITECT_SYSTEM = `你是一部交互视觉小说的「总编剧 / 故事架构师」。玩家只给了你一句到几句的世界观和画风,你要在开拍前把它扩写成一份**故事档案(story bible)**,为后续每一幕定下脊梁。你不写具体台词、不写分镜、不设计立绘——你只搭骨架。
你深谙网文(番茄)、短剧(红果)与视觉小说(galgame)的爆款心法:
- **开篇即钩子**:黄金三章 / 前3秒法则。开场不铺垫世界观,直接抛出冲突、悬念或一个反常的瞬间。
- **代入感**:主角是第二人称「你」,是玩家的化身——要让玩家一进场就清楚"我是谁、我此刻卡在什么处境里、我想要什么"。
- **题材锚定爽点**:先选定一个清晰的题材框架(如 甜宠 / 校园暗恋 / 悬疑追凶 / 复仇逆袭 / 救赎治愈),它决定了情绪回报的节奏与类型。
- **戏剧问题**:整部故事由一个悬而未决的中心问题驱动(她到底是谁?你能否在记忆消失前查明真相?这场暗恋会走向哪里?)。
- **人设要鲜明且有反差**:每个核心角色一个强标签 + 一个反差面(外冷内热 / 傲娇 / 看似柔弱实则腹黑)。
你要产出(全部用中文,不需要英文):
- logline:一句话主线 / 中心戏剧问题,必须带钩子,让人想看下去
- genreTags:题材+基调标签,斜杠分隔,如 "甜宠 / 校园 / 慢热治愈带点伤感"
- protagonist:第二人称主角卡。包含:你是谁、你此刻正卡在什么具体处境里(要有即时张力)、你想要什么、一个软肋或秘密。50–120 字。
- castNotes:2–3 个核心配角,每行一个「名字:一句话人设(强标签+反差)+ 与你的关系/张力」。给真实好记的中文名字(不要"神秘女子"这种占位)。
- synopsis:开场此刻的情境梗概(故事尚未展开,就写"故事从……开始"),1–3 句。
- openThreads:开场就埋下的 1–3 个悬念/问题(数组)。
- nextHook:**第一幕**应当如何冷开场——具体描述开场那个抓人的瞬间/冲突(这会直接指导编剧写开场)。要画面感强、有张力。
设计硬规则:
- 主角「你」永不出现在画面里(第二人称 POV),所以 castNotes 里**不要**把"你/主角"当成一个角色。
- 配角名字要符合世界观(年代、地域、文化)。
- 一切服从玩家给的世界观与画风,不要擅自跑题;玩家信息少时,做最贴合、最有戏的合理扩写。
必须输出严格 JSON:
{
"logline": "...",
"genreTags": "...",
"protagonist": "...",
"castNotes": "夏海:表面开朗的天台诗人,实则在用诗逃避家里的变故;与你是同班转学的邻座,对你有种说不清的在意。\\n班主任老周:…",
"synopsis": "...",
"openThreads": ["...", "..."],
"nextHook": "第一幕冷开场:……"
}
不要输出 JSON 以外的任何文本。`;
export function buildArchitectUserMessage(session: Session): string {
  // Three newline-joined parts: world setting, style guide, then the
  // instruction. The instruction starts with its own "\n", which leaves a
  // blank line before it in the final message.
  return [
    `世界观:${session.worldSetting}`,
    `画风:${session.styleGuide}`,
    "\n请据此产出这部交互剧的故事档案(story bible),严格以 JSON 格式返回。",
  ].join("\n");
}
// ──────────────────────────────────────────────────────────────────────
// 1. Writer (编剧) — drives the narrative.
//
// Emits a full Scene: beats[] graph + entryBeatId + sceneKey hint +
// activeCharacters per beat. Does NOT design characters (that's the
// CharacterDesigner's job) — only names them in `activeCharacters`.
// The CharacterDesigner is invoked separately for any name not yet in
// session.characters.
// ──────────────────────────────────────────────────────────────────────
export const WRITER_SYSTEM = `你是一部交互视觉小说的「编剧」。每次基于【故事档案 / 主线记忆】、世界观、画风、玩家历史、已登记角色,写出**一个完整场景的剧本**:场景背景概要 + 一组对话节拍 beats,并在最后更新主线记忆。你只负责**剧情和台词**——不设计角色形象、不写出图提示词、不做镜头调度,这些由其他 agent 完成。
═══════════════════════════════════════════════════════════════════
爆款心法(番茄网文 / 红果短剧 / galgame 的叙事手感)—— 必须贯彻
═══════════════════════════════════════════════════════════════════
- **每个场景都要有钩子**:开头 1–2 个 beat 内就抛出新信息、悬念、冲突或情绪冲击,绝不平铺直叙地交代背景;结尾 beat 留一个让玩家"想知道接下来"的扣子。
- **兑现爽点 / 情绪回报**:按题材给观众想要的情绪(甜宠的心动、暗恋的暧昧拉扯、逆袭的扬眉吐气、悬疑的真相一角)。让玩家这一场"有所得"。
- **反转与反差**:适时打破预期——以为是 A 结果是 B、角色露出与第一印象相反的一面;但反转要可信、要扣主线。
- **快节奏、入戏快**:进场即冲突,少铺陈,删掉一切"为完整而存在"却不推进情绪的对话。
- **show, don't tell**:用动作、神态、潜台词、环境细节传递情绪,别直接旁白"她很难过"——让玩家自己读出来。
- **人设鲜明有反差**:每个角色一个强标签 + 一个反差面,台词紧贴其腔调(傲娇嘴硬心软、外冷内热、看似柔弱实则强势)。
- **选择要有分量**:choice 只出现在真正的岔路口,每个选项都要让玩家感到"通向不同的东西"(情绪指向不同 / 关系走向不同),别给等价的废选项。
═══════════════════════════════════════════════════════════════════
连贯性铁律(跨场景切换不能跳戏 —— 最重要)
═══════════════════════════════════════════════════════════════════
- 你会收到【故事档案 / 主线记忆】和上一场的结尾。**新场景必须从上一刻自然承接**——承接上一场的情绪、地点逻辑、人物状态与未收的悬念。
- 若给了「转场种子 nextSceneSeed」,把它当作"下一场的命题"去兑现,而不是另起炉灶;开场要让玩家感到"这正是我上一个动作 / 选择导致的结果"。
- 沿用主线记忆里的人物关系与情绪温度——别让刚告白的人下一场形同陌路,也别凭空遗忘已埋的伏笔。
- 推进、但别重置:每一场都让主线问题往前走一点(关系变化 / 真相揭露一角 / 新悬念浮现)。
一个场景包含:
- sceneSummary:当前场景的中文概要(地点、时间、氛围、关键事件——给后续的分镜导演看)
- sceneKey:当前场景的英文 slug(如 "classroom-dusk"、"rooftop-night"、"rainy-street")——同一物理空间应沿用相同 slug
- beats[]:玩家依次经历的对话节拍
- entryBeatId:玩家进入场景时落在哪个 beat
每个 beat 是玩家会看到的一段叙述 / 对话 / 选择。beat 之间通过 next 字段连接:
- "continue":玩家点击图片背景 / 按继续,自然推进到下一个 beat
- "choice":在此让玩家做选择,按所选 choice 的 effect 走向
choice 的 effect 有两种:
- "advance-beat":玩家选了之后跳到**同场景内**的另一个 beat(不换背景图,速度极快)
- "change-scene":玩家选了之后切换到**新场景**(视角变了 / 走到新地方 / 时间跳了)
设计原则:
- 同场景内 beat 数自由发挥,按剧情节奏自然给出(通常 2–6 个,可以更多)
- 多用 continue,少用 choice — 选择只应出现在「真正的岔路口」
- advance-beat 适合处理对话分支(同一场景里换个话题、追问、撒娇)
- change-scene 适合空间/时间跳跃(出门、转身看窗外、第二天清晨)
- 一个场景至少要有一个 change-scene 出口(除非真到结局)
- 每个 change-scene 必须带 nextSceneSeed —— 一句中文简述「下一场是哪里、谁在、要发生什么」
- 同一场景的 beat id 互不重复
- next.nextBeatId 引用的 beat 必须存在
- choice 至少 2 个,至多 4 个,互不重复
sceneKey 设计原则(重要 — 用于跨场景视觉一致性):
- 同一物理空间 + 同一时段 → 必须沿用**完全相同**的英文 slug
- 时段或空间变化时换 slug(如 "classroom-dusk" → "classroom-night""classroom-dusk" → "corridor-dusk"
- slug 规范:lowercase-with-dashes24 个英文单词
- 已登记的历史场景 sceneKey 会在用户消息里列出,请优先**复用**这些已有 slug
文本风格约束:
- narration / line 用中文(**纯净可显示文本**,绝不要写 (叹气)(语速快) 这类标注 —— 那是给配音的,会被玩家看见)
- sceneSummary / lineDelivery / activeCharacters[].pose 内的文字也用中文
- sceneKey 用英文 slug
- 单个 beat 的 narration 与 line 加起来 ≤80 字
- 单个 choice label ≤15 字
配音相关字段:
- 每个有 line 的 beat **必须**给出 lineDelivery —— 自由中文的「配音导演指令」,描述该句台词怎么念(情绪 / 语气 / 语速 / 气息 / 停顿 / 重音 / 音色起伏)。例:"鼓起勇气又害羞,声音发颤、偏小,句尾带一丝气声,语速偏慢"。平淡场合写"平静自然、语速适中"即可,但要贴当下情境。
角色与台词的硬性规则:
- 任何 beat 的 speaker 字段一旦填了名字,**该名字必须**:① 是 "你"(玩家本人,见下方"玩家视角硬规则"),或 ② 在「已登记角色」列表中存在,或 ③ 出现在本场景的某个 beat 的 activeCharacters 里。
- speaker 名字必须与登记名**完全一致**,不要加「(回忆)」「学姐」之类后缀或别名。
- 每个 beat 的 activeCharacters 列出**此时此刻画面里出现的 NPC 角色**及其当下姿态/神情(中文)。即使没人说话,画面里有谁在也要列出。
玩家视角硬规则(重要 — 违反这条会破坏整个 galgame):
【画面规则 — 严格禁止】
- 玩家是第二人称 POV,**永远不出现在任何 Scene 画面里**
- activeCharacters[].name 数组**绝不允许**包含任何下列名字(任何大小写、中英文变体):
「玩家」「你」「我」「主角」「protagonist」「player」「Player」「MC」「I」「me」
- 玩家不会被设计立绘、不会被设计音色
【对白规则 — galgame 标准做法(Pattern B)】
- 玩家**可以正常说话**——当主角对 NPC 开口时:
speaker = "你"**固定用这两个字,不要用其他变体**)
line = 实际说的话(如「学姐,下雨了」)
lineDelivery 可以留空(玩家对白不会被 TTS 合成)
- speaker 字段允许的取值**只有两种**:① NPC 真名(必须在 activeCharacters 里)② "你"
- 其它 POV 变体(玩家 / 我 / 主角 / protagonist / player / MC / I / me**一律视为错误**
【内心 vs 外显的区分】
- 主角在心里想 / 在做某个动作 / 在观察 / 自己的体感 → 用 narrationspeaker 留空)
例:"你的心跳得很快,几乎听不见外面的雨声。"
- 主角真的开口对 NPC 说出来 → 用 speaker="你" + line
例:speaker="你" line="学姐,这把伞你拿着。"
- 同一个 beat 可以同时有 narration(心理活动 / 动作)和 speaker="你" + line(说出口的话)
更新主线记忆(storyStatePatch)—— 写完这一场后必做:
- synopsis:把这一场并入后的整体梗概,**压缩**到 3–5 句(别越写越长,旧细节该丢就丢)
- relationships:每个核心角色此刻与「你」的关系 / 情绪温度,每条一句(如 "夏海:暗恋升温,刚向你说了一半的告白被打断")
- openThreads:仍未收的悬念 / 伏笔——已收束的可移除、新埋的加入(但至少保留一条正在推进的主线,别把列表清空)
- nextHook:基于这一场的结尾,下一场应往哪走(给"下一次的你"一个明确命题,接住本场留下的扣子)
这些字段是写给"未来的你"的连贯性记忆,请认真写。
必须输出严格 JSON,结构如下:
{
"sceneSummary": "中文场景概要:地点+时间+氛围+关键事件",
"sceneKey": "classroom-dusk",
"entryBeatId": "b1",
"beats": [
{
"id": "b1",
"narration": "可空(纯净文本)",
"speaker": "可空",
"line": "可空(纯净文本)",
"lineDelivery": "line 非空时必填:配音导演指令",
"activeCharacters": [
{ "name": "夏海", "pose": "脸红害羞地绞着衣角,双眼躲闪" }
],
"next": { "type": "continue", "nextBeatId": "b2" }
},
{
"id": "b2",
"speaker": "夏海",
"line": "学长,我有话想对你说。",
"lineDelivery": "鼓起勇气,但又有点害羞,语速偏慢,句尾微微上扬",
"activeCharacters": [
{ "name": "夏海", "pose": "鼓起勇气直视对方,双手紧握" }
],
"next": { "type": "continue", "nextBeatId": "b3" }
},
{
"id": "b3",
"narration": "你下意识攥紧了书包带,喉咙有点干。",
"speaker": "你",
"line": "……你说。",
"activeCharacters": [
{ "name": "夏海", "pose": "鼓起勇气直视对方,双手紧握" }
],
"next": {
"type": "choice",
"choices": [
{
"id": "c1",
"label": "继续追问",
"effect": { "kind": "advance-beat", "targetBeatId": "b4" }
},
{
"id": "c2",
"label": "起身离开教室",
"effect": { "kind": "change-scene", "nextSceneSeed": "雨后湿漉漉的走廊,她追了出来" }
}
]
}
}
],
"storyStatePatch": {
"synopsis": "把这一场并入后的滚动梗概,压缩到 3–5 句",
"relationships": ["夏海:暗恋升温,刚向你说了一半的告白被打断"],
"openThreads": ["夏海没说完的那句话到底是什么", "她书包里掉出的那张旧照片"],
"nextHook": "下一场:放学后的天台,她把你单独叫上去,要把话说完"
}
}
不要输出 JSON 以外的任何文本。`;
/**
 * Builds the user message for the Writer agent.
 *
 * Sections, in order: rendered story state (when non-empty), world setting,
 * style guide, registered character names, previously used sceneKeys, and
 * then either the opening-scene instruction (empty history) or the full scene
 * history followed by the "last moment" recap and the continuation
 * instruction derived from the last scene's exit.
 *
 * @param session - Current session; reads `storyState`, `worldSetting`,
 *   `styleGuide`, `characters` and `history`.
 * @returns The message text, sections joined with newlines.
 */
export function buildWriterUserMessage(session: Session): string {
  // Renders one beat as prompt fragments: narration first, then the spoken line.
  const describeBeat = (beat: {
    narration?: string | null;
    speaker?: string | null;
    line?: string | null;
  }): string[] => {
    const pieces: string[] = [];
    if (beat.narration) pieces.push(`旁白:${beat.narration}`);
    if (beat.line) pieces.push(`${beat.speaker ?? "?"}:${beat.line}`);
    return pieces;
  };

  const out: string[] = [];

  const bible = renderStoryState(session.storyState);
  if (bible) out.push(bible, "");

  out.push(`世界观:${session.worldSetting}`, `画风:${session.styleGuide}`);

  if (session.characters.length > 0) {
    out.push(
      "\n已登记角色(speaker 必须用这些名字之一,或本场景新引入):",
      ...session.characters.map((c) => `- ${c.name}`),
    );
  }

  const priorKeys = collectPriorSceneKeys(session);
  if (priorKeys.length > 0) {
    out.push(
      "\n已使用的 sceneKey(同一物理空间请沿用,不要新造):",
      ...priorKeys.map((k) => `- ${k}`),
    );
  }

  // Opening scene: there is no history to replay, so stop here.
  if (session.history.length === 0) {
    out.push(
      "\n这是故事的开场。请按【故事档案】里的 nextHook 把第一幕的冷开场写出来——开场即抓人,别花笔墨铺垫世界观。写完后更新 storyStatePatch。严格以 JSON 格式返回。",
    );
    return out.join("\n");
  }

  out.push("\n场景历史(按时间顺序):");
  for (const [idx, entry] of session.history.entries()) {
    const { scene, exit } = entry;
    const block: string[] = [`【场景 ${idx + 1}】`];
    if (scene.sceneKey) block.push(` sceneKey: ${scene.sceneKey}`);

    // Replay only what the player saw; fall back to the entry beat when
    // nothing was recorded as visited.
    const visitedIds =
      entry.visitedBeatIds.length > 0 ? entry.visitedBeatIds : [scene.entryBeatId];
    const beatById = new Map(scene.beats.map((b) => [b.id, b]));
    for (const id of visitedIds) {
      const beat = beatById.get(id);
      if (!beat) continue; // visited id that no longer resolves to a beat
      const pieces = describeBeat(beat);
      if (pieces.length > 0) block.push(` ${pieces.join(" / ")}`);
    }

    if (exit) {
      block.push(
        exit.kind === "choice"
          ? ` 玩家最终选择:${exit.label}(去往:${exit.nextSceneSeed})`
          : ` 玩家自由动作:${exit.action}`,
      );
    }
    out.push(block.join("\n"));
  }

  const last = session.history.at(-1);
  // The exact last moment the player stopped on — the new scene must continue
  // seamlessly from this emotional beat, not reset to a neutral state.
  if (last) {
    const lastBeatId = last.visitedBeatIds.at(-1) ?? last.scene.entryBeatId;
    const lastBeat = last.scene.beats.find((b) => b.id === lastBeatId);
    const pieces = lastBeat ? describeBeat(lastBeat) : [];
    if (pieces.length > 0) {
      out.push(
        `\n上一刻(玩家停留的最后一个画面,新场景要从这里的情绪无缝承接):\n ${pieces.join(" / ")}`,
      );
    }
  }

  const lastExit = last?.exit;
  if (!lastExit) {
    out.push("\n无缝续写下一个场景,延续上一刻的情绪。");
  } else if (lastExit.kind === "choice") {
    out.push(
      `\n承接「玩家在上一场选择了:${lastExit.label}」无缝续写下一个场景(转场命题:${lastExit.nextSceneSeed})。开场要让玩家感到这正是上一步的结果,并延续此刻的情绪。`,
    );
  } else {
    out.push(
      `\n承接「玩家自由动作:${lastExit.action}」无缝续写下一个场景,延续此刻的情绪与处境。`,
    );
  }

  out.push("写完后别忘了更新 storyStatePatch。严格以 JSON 格式返回。");
  return out.join("\n");
}
/**
 * Lists the distinct, non-empty `sceneKey` values found in the session
 * history, in order of first appearance.
 *
 * @param session - Session whose `history[].scene.sceneKey` values are read.
 * @returns De-duplicated scene keys; empty when no scene carries a key.
 */
function collectPriorSceneKeys(session: Session): string[] {
  const keys = session.history
    .map((entry) => entry.scene.sceneKey)
    .filter((key): key is string => Boolean(key));
  // A Set iterates in insertion order, so first-seen order is kept.
  return [...new Set(keys)];
}
// ──────────────────────────────────────────────────────────────────────
// 2. CharacterDesigner (角色设定师) — designs one new character.
//
// Receives a character NAME (extracted by the Writer's activeCharacters)
// and produces BOTH the English visual card AND the Chinese voice card
// in a single LLM call. Bundling these two is intentional: a single agent
// that "knows who this character is" produces internally-consistent
// appearance + vocal personality, whereas split agents tend to diverge
// (e.g., gentle-looking character with energetic voice).
// ──────────────────────────────────────────────────────────────────────
export const CHARACTER_DESIGNER_SYSTEM = `你是视觉小说的「角色设定师」。给你一个**新登场角色的名字**,你要为这个角色同时设计两份卡片:
1. **视觉设定卡(英文)**——给生图模型 FLUX 用,遵循 prompt engineering 风格
2. **音色设定卡(中文)**——给小米 MiMo 配音设计用
两份卡片要描绘**同一个人**——外貌温柔的人不该被配上张扬聒噪的嗓音;冷酷干练的人不该用甜软糯的童声。先在心里想清楚这个人的整体气质,再分两面落笔。
视觉设定卡 visualDescription 规则:
- **必须完全用英文**
- 风格:用形容词 + 短语,**英文逗号分隔**,符合 FLUX/Stable Diffusion prompt 习惯
- 包含:年龄段、发型发色、眼睛 / 神情基调、面部特征、标志性服饰(款式 + 配色 + 花纹)、整体气质
- **不要写瞬时姿势或表情**(这些由编剧/分镜每帧实时控制)
- **必须融入全局画风** styleGuide 的美术指向(比如 styleGuide 是「赛博朋克」时,服饰要赛博朋克化)
- 长度:80150 个英文词为宜
- 不要包含背景环境(这不是场景图,是角色立绘卡)
音色设定卡 voiceDescription 规则:
- **必须以明确性别开头**"女性,…" / "男性,…"
- 随后描述:年龄段(如「约17岁少女」「30 出头男性」)、音色质感、性格情绪基调、语速节奏、人设腔调、口音方言
- 用中文,整段连续描述,不分段
- 长度:5080 个中文字为宜
- 例:"女性,约17岁少女,音色清亮带点稚嫩甜美,性格开朗外向但容易害羞,语速偏快,标准普通话"
必须输出严格 JSON
{
"visualDescription": "English visual card, comma-separated tags...",
"voiceDescription": "中文音色卡,以性别开头..."
}
不要输出 JSON 以外的任何文本。`;
/**
 * Builds the user message for the CharacterDesigner agent.
 *
 * @param charName - Name of the character to design.
 * @param session - Session supplying `worldSetting`, `styleGuide`, and the
 *   already-designed `characters` (only those with a `visualDescription` are
 *   listed, so the model can keep the new look distinct from them).
 * @returns The message text, lines joined with newlines.
 */
export function buildCharacterDesignerUserMessage(
  charName: string,
  session: Session,
): string {
  const lines: string[] = [
    `角色名:${charName}`,
    `世界观:${session.worldSetting}`,
    `全局美术画风:${session.styleGuide}`,
  ];

  const designed = session.characters.filter((c) => c.visualDescription);
  if (designed.length > 0) {
    lines.push(
      "\n已设定角色(外貌应与他们有区分):",
      ...designed.map((c) => `- ${c.name}: ${c.visualDescription}`),
    );
  }

  lines.push(
    "\n请为该角色同时设计 visualDescription(英文)和 voiceDescription(中文),严格以 JSON 格式返回。",
  );
  return lines.join("\n");
}
// ──────────────────────────────────────────────────────────────────────
// 3. Cinematographer (分镜导演) — composes the visual frame.
//
// Reads the Writer's sceneSummary + active characters and produces the
// English compositional prompt fed to FLUX. Does NOT describe the
// characters themselves (those archetypes are appended at the Painter
// stage from session.characters.visualDescription). Only describes the
// ENVIRONMENT, lighting, camera framing, and how the characters are
// positioned within the frame.
// ──────────────────────────────────────────────────────────────────────
export const CINEMATOGRAPHER_SYSTEM = `你是视觉小说的「分镜导演」。给你编剧的当前场景概要、活跃角色名单和他们在场景里的姿态描述,以及**入口 beat 的 speaker 信息**(用来决定镜头语言)。你的任务是**只用英文**写一段**纯环境+构图**的描述(integratedPrompt),交给画师作为出图主提示词。
你**不要**写角色的外貌细节——发色、服饰、脸型这些由其他 agent 提供,画师会把"角色档案卡"附加到你的 integratedPrompt 后面。你只关心:
- **环境**:地点、时间、天气、光线、空间细节(什么家具/植物/物件)
- **构图 / 镜头**:景别(wide shot / medium shot / close-up / over-the-shoulder)、机位、视角
- **人物在画面中的位置和姿态**(不写脸 / 不写穿什么——只写"哪个角色站在哪儿、在做什么")
- **氛围**:情绪基调、色调、影调(warm dusk / cold neon / soft morning light
═══════════════════════════════════════════════════════════════════
玩家视角硬规则(与画面相关,必须严格遵守)
═══════════════════════════════════════════════════════════════════
- 玩家本人**永远不出现在画面里**——不画 player 的身体、手、肩膀、背影、剪影、脚、头发
- integratedPrompt 中**绝对禁止**出现下列英文(或中文等价):
"first-person view" · "POV of the protagonist" · "player's hand / arm / shoulder / back"
"protagonist visible" · "from the player's perspective" · "MC" · "player's silhouette"
- 镜头是一个"隐形的观察者位置"——可以位于玩家的视角附近(NPC 像在看玩家),但**绝不画出玩家本身**
═══════════════════════════════════════════════════════════════════
动态镜头策略(根据入口 beat 的 speaker 字段选择镜头)
═══════════════════════════════════════════════════════════════════
你会收到 entryBeatSpeaker 字段。按以下规则选镜头:
【entryBeatSpeaker = 某个 NPC 名字】 → NPC 正在对玩家说话
- 优先 **close-up 或 medium close-up**NPC 看向画面外(= 看玩家)
- 关键英文:close-up / medium close-up, looking toward camera, eyes meeting the viewer,
direct gaze, lips parted mid-speech
- 制造"她正在对你说话"的代入感(galgame 经典直视镜头)
【entryBeatSpeaker = "你"】 → 玩家正在对 NPC 说话
- 优先 **medium shot**NPC 居中,做"在听玩家说话"的姿态
- 关键英文:medium shot, attentively listening, facing the camera,
head slightly tilted, expression of attention
- ❌ 不要写 over-the-shoulder(因为这会暗示画出玩家肩膀,违反 POV 规则)
【entryBeatSpeaker 为空】 → 纯环境 / 旁白 beat
- 优先 **wide establishing shot**,展现环境氛围
- 关键英文:wide establishing shot, atmospheric mood, environmental detail
- 如果有 NPC 在场,他们可以处于远处 / 中景 / 自然状态(不必看镜头)
【entryBeatActive 有多个角色】 → 群像
- 使用 **medium group shot 或 medium wide shot**,多人在一个框内
- 关键英文:medium group shot, two-shot / three-shot, characters arranged in the frame
═══════════════════════════════════════════════════════════════════
输出 JSON 结构
═══════════════════════════════════════════════════════════════════
{
"shotType": "close-up / medium shot / wide establishing / medium group shot / ...",
"integratedPrompt": "English. Environment + composition + character positioning + camera language. No dialogue boxes, no UI. 80-150 words."
}
写作要求:
- integratedPrompt **必须英文**,遵循 FLUX prompt engineering 习惯(形容词 + 短语,英文逗号分隔,必要时短句)
- 提到具体角色时**只用其名字 + 动作**,例如 "Natsumi standing by the window, head slightly bowed"——绝不要写她长什么样
- 不描述任何 UI、字幕、对话框、边框
- 不描述图像之外的事情(不要写"this scene depicts..."这种 meta 句)
- 长度 80150 英文词
不要输出 JSON 以外的任何文本。`;
/**
 * Builds the user message for the Cinematographer agent.
 *
 * @param sceneSummary - The Writer's scene summary.
 * @param styleGuide - Global art style.
 * @param entryBeatActive - Characters on screen in the entry beat, with poses.
 * @param entryBeatSpeaker - Speaker of the entry beat: "你" for the player,
 *   an NPC name, or empty/undefined for a narration-only beat.
 * @param priorSceneKey - sceneKey of the previous scene, if any.
 * @param currentSceneKey - sceneKey of the scene being rendered, if any.
 * @returns The message text, sections joined with newlines.
 */
export function buildCinematographerUserMessage(
  sceneSummary: string,
  styleGuide: string,
  entryBeatActive: BeatActiveCharacter[],
  entryBeatSpeaker: string | undefined,
  priorSceneKey: string | undefined,
  currentSceneKey: string | undefined,
): string {
  const castSection: string[] =
    entryBeatActive.length > 0
      ? [
          "\n开场画面里的角色及其姿态:",
          ...entryBeatActive.map((c) => `- ${c.name}:${c.pose ?? "(无具体姿态描述)"}`),
        ]
      : ["\n开场画面里没有角色(纯环境)。"];

  // entryBeatSpeaker drives the dynamic camera policy (see CINEMATOGRAPHER_SYSTEM):
  // empty → narration/environment beat, "你" → the player speaks,
  // anything else → that NPC speaks.
  let cameraHint: string;
  if (!entryBeatSpeaker) {
    cameraHint =
      "\n开场 beat 没有 speaker(纯旁白/环境)——按动态镜头策略:wide establishing shot 展现环境氛围。";
  } else if (entryBeatSpeaker === "你") {
    cameraHint =
      '\n开场 beat 是**玩家说话**(speaker = "你")——按动态镜头策略:medium shot,NPC 居中、做听玩家说话的姿态、看向画面外。**绝不要画出玩家**。';
  } else {
    cameraHint = `\n开场 beat 是 **${entryBeatSpeaker} 在对玩家说话**(speaker = "${entryBeatSpeaker}")——按动态镜头策略:close-up 或 medium close-up,${entryBeatSpeaker} 看向画面外(看玩家),眼神交流。`;
  }

  const out: string[] = [
    `全局美术画风:${styleGuide}`,
    `\n当前场景(来自编剧):${sceneSummary}`,
    ...castSection,
    cameraHint,
  ];

  // Same non-empty sceneKey twice in a row: ask for continuity, not a new space.
  const staysInSameSpace = Boolean(currentSceneKey) && priorSceneKey === currentSceneKey;
  if (staysInSameSpace) {
    out.push(
      `\n注意:上一场和本场 sceneKey 都是 "${currentSceneKey}"——画师会把上一张场景图作为 referenceImages 之一锚定同一空间。你的 integratedPrompt 应该**强调连续性**,描述时段/情绪/构图的细微变化,而不是完全重新设定空间。`,
    );
  }

  out.push("\n请输出 shotType + integratedPrompt,严格以 JSON 格式返回。");
  return out.join("\n");
}
// ──────────────────────────────────────────────────────────────────────
// 4. Painter (画师) — final image prompt assembly.
//
// Not an LLM agent — a pure prompt-building function that combines the
// Cinematographer's integratedPrompt with character archetype blocks
// (visual cards) and the standard FLUX constraints.
// ──────────────────────────────────────────────────────────────────────
/**
 * Assembles the final image prompt for a scene.
 *
 * @param integratedPrompt - The Cinematographer's composition prompt.
 * @param styleGuide - Global art style, inserted after "ART STYLE:".
 * @param characters - Characters in the frame; only those with a
 *   `visualDescription` contribute a "[CHARACTER: name]" archetype card.
 * @returns The complete prompt: header, style, composition, optional
 *   archetype section, then the fixed UI and player-POV rules.
 */
export function buildPainterPrompt(
  integratedPrompt: string,
  styleGuide: string,
  characters: { name: string; visualDescription?: string }[],
): string {
  const cards: string[] = [];
  for (const { name, visualDescription } of characters) {
    if (visualDescription) cards.push(`[CHARACTER: ${name}]\n${visualDescription}`);
  }

  // The section is omitted entirely when no character has a visual card.
  const archetypeSection =
    cards.length > 0
      ? `\n\nCHARACTER ARCHETYPES (anchor identity, outfit, and style across scenes — keep each character visually identical to their archetype):\n${cards.join("\n\n")}`
      : "";

  // NOTE(review): 1792x1024 is a 7:4 ratio, not exactly 16:9 — confirm which
  // of the two the image model should be told.
  return `Generate a cinematic landscape background illustration, 16:9 widescreen (1792x1024).
ART STYLE: ${styleGuide}
SCENE COMPOSITION (from cinematographer — environment + camera framing + character positioning):
${integratedPrompt}${archetypeSection}
STRICT RULES — NEVER violate these:
- DO NOT draw any dialogue boxes, speech bubbles, text panels, or any rectangular overlay.
- DO NOT draw any buttons, choice options, menu items, or interactive UI elements.
- DO NOT render any Chinese or English text anywhere in the image.
- DO NOT add any HUD, interface chrome, or game UI elements.
- The image is a PURE BACKGROUND SCENE ONLY. All UI will be added as HTML on top.
- 16:9 LANDSCAPE orientation — wider than tall. No portrait or square output.
- Leave the bottom 35% of the frame relatively uncluttered (darker or softer) so overlaid UI panels remain readable.
- Characters or key scene elements should be positioned in the upper 65% of the frame.
- Maintain character identity exactly as specified in CHARACTER ARCHETYPES — same face, same hairstyle, same outfit across every scene.
PLAYER POV RULES — the player / protagonist is the unseen viewer:
- The player / protagonist is NEVER visible in the frame — no body parts, no hands, no shoulders, no back of head, no silhouette, no feet, no hair.
- DO NOT use first-person POV that implies the player's body in frame.
- When an NPC is speaking to the player, they SHOULD look toward the camera (toward the player's implied position) — this creates eye contact without showing the player.
- The camera position represents the player's gaze; only NPCs, scenery, and objects are rendered.`;
}
/**
 * Builds the image prompt for a character's base portrait.
 *
 * The portrait is generated once, when the CharacterDesigner introduces a new
 * character. It is used both as a client-side asset (立绘登场) and as a
 * referenceImages entry when rendering later scenes for visual consistency.
 *
 * @param charName - Character name, shown in the "CHARACTER (...)" header.
 * @param visualDescription - The character's visual card.
 * @param styleGuide - Global art style.
 * @returns The portrait prompt, one instruction per line.
 */
export function buildCharacterPortraitPrompt(
  charName: string,
  visualDescription: string,
  styleGuide: string,
): string {
  const promptLines: string[] = [
    "Character concept portrait sheet, single character, full-body or upper-body composition, neutral standing pose, looking toward camera, neutral expression, plain neutral background (no environment, no scenery).",
    `ART STYLE: ${styleGuide}`,
    `CHARACTER (${charName}):`,
    `${visualDescription}`,
    "STRICT RULES:",
    "- ONE character only — no other people, no crowd, no background characters.",
    "- Plain neutral background (off-white or soft gradient). NO environment, NO furniture, NO props beyond what's worn.",
    "- Neutral, calm pose and expression — this is a reference sheet, not a dramatic shot.",
    "- NO text, NO UI, NO watermark, NO border.",
    "- The character should be clearly visible and centered, the pose natural and relaxed.",
    "- 16:9 landscape orientation.",
  ];
  return promptLines.join("\n");
}
// ──────────────────────────────────────────────────────────────────────
// Insert-Beat — given a freeform vision action that is judged to stay
// *within* the current scene, generate one transient beat.
// Single-agent path; no character design / no rendering involved.
// ──────────────────────────────────────────────────────────────────────
export const INSERT_BEAT_SYSTEM = `你是视觉小说编剧。玩家在当前场景内做了一个**不会换场景的自由动作**(比如看一眼桌上的相框、想了想刚才那句话)。请基于此动作,写出一个**单独的、过渡性的 beat**:可以是旁白、角色台词、或两者结合。
文本风格约束:
- narration / line 用中文,**纯净可显示文本**,不要写 (叹气) 这类配音标注
- narration 与 line 加起来 ≤80 字
- 不要打破当前场景的物理状态(玩家仍在原地、对面仍是同一个角色)
- 不要生成选项或下一步指引 —— 玩家点击会自然回到原 beat
- 这个 beat 也要"有所得"——给玩家一个新细节、一丝潜台词或情绪波动(show, don't tell),别写成无意义的空台词
speaker 字段允许的取值**只有两种**(与主路径 Writer 一致 — Pattern B galgame 标准):
1. **已登记角色**里的 NPC 真名(**绝不允许引入新角色**)
2. **"你"** — 玩家本人在自言自语 / 说一句过渡性的话(对白框显示,但不调 TTS)
其它任何 POV 变体(玩家 / 我 / 主角 / protagonist / player / MC / I / me**一律错误**,请用 "你" 代替。
- 如果有 line 且 speaker = NPC**必须**给出 lineDelivery(配音导演指令)
- 如果有 line 且 speaker = "你"lineDelivery 可以留空(玩家对白不调 TTS)
必须输出严格 JSON
{
"narration": "...",
"speaker": "...",
"line": "...",
"lineDelivery": "..."
}
narration/speaker/line/lineDelivery 都可为空字符串。不要输出 JSON 以外的任何文本。`;
/**
 * Builds the user message for the insert-beat path: one transient beat in
 * response to a free-form action that stays inside the current scene.
 *
 * @param session - Session supplying `worldSetting`, registered `characters`,
 *   and `history` (the last entry is treated as the current scene).
 * @param freeformAction - Description of what the player wants to do.
 * @returns The message text, lines joined with newlines.
 */
export function buildInsertBeatUserMessage(
  session: Session,
  freeformAction: string,
): string {
  const out: string[] = [`世界观:${session.worldSetting}`];

  if (session.characters.length > 0) {
    out.push(
      "\n已登记角色(speaker 只能用这些名字):",
      ...session.characters.map((c) => `- ${c.name}`),
    );
  }

  const current = session.history.at(-1);
  if (current) {
    const { scene, visitedBeatIds } = current;
    out.push(`\n当前场景:${scene.scenePrompt}`);

    // The beat the player is on: the last visited one, else the entry beat.
    const currentBeatId = visitedBeatIds.at(-1) ?? scene.entryBeatId;
    const currentBeat = scene.beats.find((b) => b.id === currentBeatId);
    if (currentBeat) {
      const recap: string[] = [];
      if (currentBeat.narration) recap.push(`旁白:${currentBeat.narration}`);
      if (currentBeat.line) recap.push(`${currentBeat.speaker ?? "?"}:${currentBeat.line}`);
      if (recap.length > 0) out.push(`刚才发生:${recap.join(" / ")}`);
    }
  }

  out.push(
    `\n玩家此刻的自由动作:${freeformAction}`,
    "\n请生成一个过渡性 beat,严格以 JSON 格式返回。",
  );
  return out.join("\n");
}
// ──────────────────────────────────────────────────────────────────────
// Vision — interprets a background click and classifies the action.
// Unchanged from staging (UI choices live in HTML, vision only judges
// background clicks).
// ──────────────────────────────────────────────────────────────────────
// Instruction text for the vision model. It asks for strict JSON with the keys
// freeformAction, classify ("insert-beat" | "change-scene") and reasoning.
// interpret() does not send this as a separate system message: it prepends the
// text to buildVisionUserPrompt(scene) and passes the result as one prompt.
// interpret() also treats any classify value other than "change-scene" as
// "insert-beat". The text is a runtime string — edit with care.
export const VISION_SYSTEM_PROMPT = `你是视觉理解助手。玩家在视觉小说的背景图上点击了红色圆点位置(HTML 上的选项按钮不会走到你这里)。你的任务是:
1. 看清红点指向画面里的什么(物件、角色、空间、远处的方向)
2. 推断玩家想干什么
3. 判断这个动作是「场内探索」(不该换图)还是「场景切换」(要换图)
判断准则:
- "insert-beat"(场内探索):观察画面里某个细节、自言自语、和当前角色继续互动、看一眼某个物件
- "change-scene"(场景切换):走向画面深处的门 / 走廊、转头看向新方向(视角变了)、点了远处的另一个空间、暗示时间跳跃的物件(如时钟)
必须输出严格 JSON:
{
"freeformAction": "玩家想做什么的一句中文描述,例如「想拿起桌上的钥匙」",
"classify": "insert-beat" 或 "change-scene",
"reasoning": "一句话说明判断理由"
}
不要输出 JSON 以外的任何文本。`;
/**
 * Builds the scene-specific part of the vision prompt.
 *
 * @param scene - Scene currently on screen, or `null` when none is available.
 * @returns With a scene: its `scenePrompt` plus the click-interpretation
 *   instruction on a second line. Without one: a generic one-line instruction.
 */
export function buildVisionUserPrompt(scene: Scene | null): string {
  return scene
    ? [
        `当前场景描述:${scene.scenePrompt}`,
        "红点位置即为玩家点击位置。请判断玩家意图与分类,以 JSON 格式返回。",
      ].join("\n")
    : "请判断玩家意图,并以 JSON 格式返回。";
}
/**
 * The `name` and `visualDescription` fields of `Character`.
 * NOTE(review): presumably the per-character input for the painter prompt, but
 * buildPainterPrompt declares its own inline `{ name; visualDescription? }`
 * parameter type instead of using this alias — confirm which one callers use.
 */
export type PainterCharacterInput = Pick<Character, "name" | "visualDescription">;
+39
View File
@@ -0,0 +1,39 @@
import { interpretClick } from "@infiplot/ai-client";
import type {
ClickIntent,
ProviderConfig,
Scene,
VisionClassify,
} from "@infiplot/types";
import { parseJsonLoose } from "./jsonParser";
import { VISION_SYSTEM_PROMPT, buildVisionUserPrompt } from "./prompts";
/** Result returned by interpret() for one background click. */
export type VisionInterpretation = {
// What the model inferred the player wants (free-form action + reasoning).
intent: ClickIntent;
// Routing decision; interpret() sets it to "change-scene" or "insert-beat".
classify: VisionClassify;
};
/**
 * Asks the vision model what the player meant by clicking the annotated
 * background image, and classifies the action.
 *
 * The instruction text (VISION_SYSTEM_PROMPT) and the scene-specific prompt
 * are concatenated and sent as a single prompt.
 *
 * The reply is untrusted LLM output, so every field is validated before use.
 * A missing, empty, or non-string `freeformAction` / `reasoning` falls back
 * to a default instead of throwing, and any `classify` value other than the
 * exact string "change-scene" is treated as "insert-beat".
 *
 * @param config - Provider configuration forwarded to `interpretClick`.
 * @param annotatedImageBase64 - Scene image with the click position marked.
 * @param scene - Scene currently on screen, or `null` when none is available.
 * @returns The inferred intent and its classification.
 */
export async function interpret(
  config: ProviderConfig,
  annotatedImageBase64: string,
  scene: Scene | null,
): Promise<VisionInterpretation> {
  const userPrompt = `${VISION_SYSTEM_PROMPT}\n\n${buildVisionUserPrompt(scene)}`;
  const raw = await interpretClick(config, annotatedImageBase64, userPrompt);

  // Fields are `unknown` on purpose: the model may return numbers, objects or
  // null where strings were requested.
  const parsed = parseJsonLoose<{
    freeformAction?: unknown;
    classify?: unknown;
    reasoning?: unknown;
  }>(raw);

  // Trimmed text for string values, "" for anything else.
  const textOf = (value: unknown): string =>
    typeof value === "string" ? value.trim() : "";

  const classify: VisionClassify =
    parsed.classify === "change-scene" ? "change-scene" : "insert-beat";

  return {
    intent: {
      freeformAction: textOf(parsed.freeformAction) || "玩家点了画面,但意图不明",
      reasoning: textOf(parsed.reasoning),
    },
    classify,
  };
}
+68
View File
@@ -0,0 +1,68 @@
import { synthesize } from "@infiplot/tts-client";
import type { BeatAudio, CharacterVoice, TtsConfig } from "@infiplot/types";
// Per-beat synth budget, in milliseconds; synthesizeBeat passes it to
// withTimeout. MiMo's median synth is 3–7s; the tail can spike to 30–70s
// under concurrent load. Capping here means a single bad beat degrades to
// silent in <15s instead of blocking the whole UI flow.
const SYNTH_TIMEOUT_MS = 15000;
/**
 * Races `p` against a deadline of `ms` milliseconds.
 *
 * The timer is cleared whichever side wins; otherwise the success path would
 * leave a pending reject closure in the timer heap for every call. When the
 * deadline fires first, `ctrl` is aborted before rejecting so the underlying
 * request can be cancelled rather than running on after the caller gave up.
 *
 * @param p - The work to wait for.
 * @param ms - Deadline in milliseconds.
 * @param label - Prefix for the timeout error message.
 * @param ctrl - Controller to abort on timeout; the caller is expected to
 *   have handed its signal to the work behind `p`.
 * @returns The value `p` resolves to.
 * @throws Whatever `p` rejects with, or `Error("<label> timed out after <ms>ms")`.
 */
async function withTimeout<T>(
  p: Promise<T>,
  ms: number,
  label: string,
  ctrl: AbortController,
): Promise<T> {
  let timer: ReturnType<typeof setTimeout> | undefined;

  // Never resolves; it only rejects once the deadline passes.
  const deadline = new Promise<T>((_resolve, reject) => {
    timer = setTimeout(() => {
      ctrl.abort();
      reject(new Error(`${label} timed out after ${ms}ms`));
    }, ms);
  });

  try {
    return await Promise.race([p, deadline]);
  } finally {
    if (timer) clearTimeout(timer);
  }
}
/**
 * Synthesizes audio for one beat with an already-provisioned voice.
 *
 * The caller is expected to have resolved the speaker's voice beforehand
 * (from session.characters in the client); passing it in directly keeps the
 * /api/beat-audio payload small and keeps this function independent of
 * session state.
 *
 * (Voice PROVISIONING — designing a voice for a new character from a
 * voiceDescription — lives in agents/characterDesigner.ts now. This function
 * only handles per-beat SYNTHESIS.)
 *
 * @param cfg - TTS configuration forwarded to `synthesize`.
 * @param voice - The speaker's voice.
 * @param beat - Beat id (used in logs), the line to speak, and the optional
 *   delivery direction.
 * @returns The audio as base64 plus MIME type, or `null` on error or timeout;
 *   the caller treats `null` as "play silent".
 */
export async function synthesizeBeat(
  cfg: TtsConfig,
  voice: CharacterVoice,
  beat: { id: string; line: string; lineDelivery?: string },
): Promise<BeatAudio | null> {
  const startedAt = Date.now();
  const elapsedMs = (): number => Date.now() - startedAt;
  // The signal goes to synthesize; withTimeout aborts it when the budget runs out.
  const ctrl = new AbortController();

  try {
    const result = await withTimeout(
      synthesize(cfg, voice, beat.line, beat.lineDelivery, ctrl.signal),
      SYNTH_TIMEOUT_MS,
      `synth ${beat.id}`,
      ctrl,
    );
    console.log(` [voice ${beat.id}] synth=${elapsedMs()}ms`);
    return { base64: result.audioBase64, mime: result.mimeType };
  } catch (err) {
    // Deliberate best-effort: log and degrade to silence rather than fail the beat.
    const msg = err instanceof Error ? err.message : String(err);
    console.error(
      `[voice] synth degraded for ${beat.id} (after ${elapsedMs()}ms): ${msg}`,
    );
    return null;
  }
}