feat(engine): Architect agent + cross-scene StoryState coherence

Add a dedicated Architect LLM call at session start that expands the terse
world/style prompt into a persistent story bible (logline, genre, second-
person protagonist, cast, engineered opening hook). The bible seeds a
StoryState that the Writer reads and patches on every scene; that state is
carried and merged across cuts (applyStoryStatePatch), so the story keeps a
spine from beat one instead of jumping between scenes.

- prompts: inject web-novel / short-drama / galgame craft into Writer +
  Architect; Writer emits storyStatePatch to update the running bible
- director: parallelize voice + non-entry portraits with the Painter
  (only entry-beat portraits block paint) to offset Architect latency
- architect: chat/parse guarded so a malformed response never aborts start
- types: StoryState / StoryStatePatch; required on Start/SceneResponse

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
yuanzonghao
2026-06-02 11:44:55 +08:00
parent 16707cc255
commit 15ce03a912
10 changed files with 576 additions and 109 deletions
+30 -49
View File
@@ -15,25 +15,20 @@ import {
} from "../prompts";
// ──────────────────────────────────────────────────────────────────────
// CharacterDesigner agent — designs ONE new character end-to-end.
// CharacterDesigner agent — designs ONE new character.
//
// Pipeline (per character, all the slow parts are parallelized):
// Exposed as three GRANULAR stages so the director can schedule the slow
// parts around the Painter (a voice is never needed to paint a scene, and
// only entry-beat characters' portraits are referenced by the Painter):
//
// 1. LLM call — designs BOTH visual + voice cards in one shot
// (intentional: same agent thinks about who this character IS,
// which keeps appearance and vocal personality coherent)
// 1. designCharacterCard — ONE LLM call → visual + voice TEXT cards
// (intentional bundling: the same agent thinks about who this character
// IS, keeping appearance and vocal personality coherent)
// 2. renderCharacterPortrait — base portrait image (Runware URL + UUID)
// 3. provisionCharacterVoice — Xiaomi MiMo voicedesign → reference audio
//
// 2. In parallel:
// a. Image gen — base portrait (Runware returns URL + UUID in one shot;
// no separate upload round-trip is needed for cheap re-reference)
// b. Voice provisioning — Xiaomi MiMo voicedesign from voiceDescription
// → reference audio for later voiceclone synth
//
// 3. Returns merged Character ready to be added to session.characters
//
// Each step degrades gracefully — if image gen fails we return the
// character without a portrait; if voice gen fails we return without
// voice. The game keeps running even when sub-components fail.
// Each step degrades gracefully — if image gen fails the character just has
// no portrait; if voice gen fails it has no voice. The game keeps running.
// ──────────────────────────────────────────────────────────────────────
type CharacterDesignOutput = {
@@ -77,7 +72,7 @@ async function runDesignLLM(
//
// In mock mode we return the data URI as basePortraitUrl with no UUID
// (Painter is short-circuited anyway, so the lack of a UUID is moot).
async function renderPortrait(
export async function renderCharacterPortrait(
config: EngineConfig,
charName: string,
visualDescription: string,
@@ -101,7 +96,7 @@ async function renderPortrait(
}
}
async function provisionVoiceSafe(
export async function provisionCharacterVoice(
config: EngineConfig,
voiceDescription: string,
charName: string,
@@ -116,45 +111,31 @@ async function provisionVoiceSafe(
}
}
// Single-character design pipeline. Called by the orchestrator once per
// NEW character name; multiple characters in the same scene run their
// pipelines in parallel at the orchestrator level.
export async function designCharacter(
// The cheap first stage: design the visual + voice TEXT cards in one LLM
// call. The director then schedules renderCharacterPortrait /
// provisionCharacterVoice around the Painter. Multiple new characters in the
// same scene run this stage in parallel at the director level.
/**
 * Text-only character card produced by designCharacterCard: the character's
 * name plus its visual and voice descriptions. It carries no portrait or
 * voice asset.
 */
export type CharacterCard = {
// Character name, copied from the charName argument.
name: string;
// Trimmed visual description from the design LLM; undefined when the LLM
// returned nothing or only whitespace.
visualDescription?: string;
// Trimmed voice description from the design LLM, or a generic fallback
// prompt built from the character name and the session's world setting.
voiceDescription: string;
};
export async function designCharacterCard(
config: EngineConfig,
session: Session,
charName: string,
): Promise<Character> {
const tTotal = Date.now();
// Step 1 — LLM design (visual + voice). Must complete first.
): Promise<CharacterCard> {
const tDesign = Date.now();
const design = await runDesignLLM(config, session, charName);
tlog(`[charDesigner ${charName}] design LLM`, tDesign);
const visualDescription = design.visualDescription?.trim();
const voiceDescription =
design.voiceDescription?.trim() ||
`请根据角色名「${charName}」推断其性别、年龄与气质,生成最贴合的音色。所属世界观:${session.worldSetting}`;
// Step 2 — parallel: portrait + voice provisioning.
const tProvision = Date.now();
const portraitPromise = visualDescription
? renderPortrait(config, charName, visualDescription, session.styleGuide)
: Promise.resolve({} as Awaited<ReturnType<typeof renderPortrait>>);
const voicePromise = provisionVoiceSafe(config, voiceDescription, charName);
const [portrait, voice] = await Promise.all([portraitPromise, voicePromise]);
tlog(`[charDesigner ${charName}] portrait+voice parallel`, tProvision);
tlog(`[charDesigner ${charName}] TOTAL`, tTotal);
return {
name: charName,
voiceDescription,
visualDescription,
basePortraitUrl: portrait.basePortraitUrl,
basePortraitUuid: portrait.basePortraitUuid,
voice,
visualDescription: design.visualDescription?.trim() || undefined,
voiceDescription:
design.voiceDescription?.trim() ||
`请根据角色名「${charName}」推断其性别、年龄与气质,生成最贴合的音色。所属世界观:${session.worldSetting}`,
};
}
@@ -169,6 +150,6 @@ export async function provisionVoiceForName(
charName: string,
): Promise<Character> {
const voiceDescription = `请根据角色名「${charName}」推断其性别、年龄与气质,生成最贴合的音色。所属世界观:${session.worldSetting}`;
const voice = await provisionVoiceSafe(config, voiceDescription, charName);
const voice = await provisionCharacterVoice(config, voiceDescription, charName);
return { name: charName, voiceDescription, voice };
}