d5b4a02cb3
Insert-beat is a pure in-scene micro-interaction — adding choices that lead to change-scene contradicted its purpose. Now insert-beat generates 1-3 richer beats then loops back to the original options, which is the natural UX for "you glanced at something decorative." Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
312 lines
12 KiB
TypeScript
312 lines
12 KiB
TypeScript
import type {
|
|
BeatAudioRequest,
|
|
BeatAudioResponse,
|
|
CharacterVoice,
|
|
EngineConfig,
|
|
FreeformClassify,
|
|
FreeformClassifyRequest,
|
|
FreeformClassifyResponse,
|
|
InsertBeatRequest,
|
|
InsertBeatResponse,
|
|
SceneStreamEvent,
|
|
Session,
|
|
SceneRequest,
|
|
SceneResponse,
|
|
StartRequest,
|
|
StartResponse,
|
|
VisionRequest,
|
|
VisionResponse,
|
|
} from "@infiplot/types";
|
|
import { coerceOrientation } from "@infiplot/types";
|
|
import { chat } from "@infiplot/ai-client";
|
|
import { isStepfun, isValidStepfunVoiceId, provisionVoice } from "@infiplot/tts-client";
|
|
import { selectStyle } from "./agents/styleSelector";
|
|
import { directInsertBeat, directScene } from "./director";
|
|
import { STYLE_MAP } from "@/lib/options";
|
|
import { parseJsonLoose } from "./jsonParser";
|
|
import { isValidLocale } from "@/lib/i18n/utils";
|
|
import {
|
|
FREEFORM_CLASSIFY_SYSTEM,
|
|
buildFreeformClassifyUserMessage,
|
|
} from "./prompts";
|
|
import { synthesizeBeat } from "./voice";
|
|
import { interpret } from "./vision";
|
|
|
|
/**
 * Builds a session identifier shaped like `s_<epoch-ms>_<suffix>`.
 *
 * The suffix is whatever sits at positions 2..8 of `Math.random()` rendered
 * in base 36, i.e. at most six characters from `[0-9a-z]`.
 */
function newSessionId(): string {
  const timestamp = Date.now();
  const suffix = Math.random().toString(36).slice(2, 8);
  return ["s", timestamp, suffix].join("_");
}
|
|
|
|
/**
 * Logs how many milliseconds have elapsed since `t0`.
 *
 * @param label Prefix printed before the duration.
 * @param t0 Start timestamp in epoch milliseconds (a `Date.now()` reading).
 */
function tlog(label: string, t0: number): void {
  const elapsedMs = Date.now() - t0;
  console.log(label + ": " + elapsedMs + "ms");
}
|
|
|
|
// ──────────────────────────────────────────────────────────────────────
|
|
// startSession — initial Scene via the multi-agent pipeline.
|
|
//
|
|
// directScene internally handles: Writer → (CharacterDesigner+
|
|
// Cinematographer parallel) → Painter → upload. Voice provisioning and
|
|
// portrait generation happen inside CharacterDesigner per new character,
|
|
// so the orchestrator no longer needs to coordinate them separately.
|
|
// ──────────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Creates a fresh session from the start request and produces its opening
 * scene.
 *
 * @param config Engine configuration; `config.text` is used for style
 *   selection and the whole config is handed to `directScene`.
 * @param req Start request carrying the world setting, style guide and the
 *   optional style reference image, orientation, player name and language.
 * @param emit Optional stream-event sink forwarded to `directScene`.
 * @returns The new session id plus the scene, image URL, characters and story
 *   state returned by `directScene`.
 */
export async function startSession(
  config: EngineConfig,
  req: StartRequest,
  emit?: (event: SceneStreamEvent) => void,
): Promise<StartResponse> {
  const startedAt = Date.now();

  // Only keep the requested language when it is a locale we recognise.
  const requestedLanguage = req.language?.trim();

  const session: Session = {
    id: newSessionId(),
    createdAt: Date.now(),
    worldSetting: req.worldSetting.trim(),
    styleGuide: req.styleGuide.trim(),
    history: [],
    characters: [],
    styleReferenceImage: req.styleReferenceImage?.trim() || undefined,
    orientation: coerceOrientation(req.orientation),
    playerName: req.playerName?.trim() || undefined,
    language:
      requestedLanguage && isValidLocale(requestedLanguage)
        ? requestedLanguage
        : undefined,
  };

  // Stage 0 — optional automatic style selection. There is no separate
  // Architect call for the story bible any more: the Writer's <plan> creates
  // it on the opening scene.
  console.log(
    `[start] worldSetting (${session.worldSetting.length} chars):\n${session.worldSetting}`,
  );

  if (session.styleGuide === "auto") {
    // Placeholder text held while the selector runs; replaced right below.
    session.styleGuide = "由 AI 根据剧情自动匹配最佳画风";
    const styleStartedAt = Date.now();
    const pickedStyle = await selectStyle(config.text, session.worldSetting).catch(
      (err: unknown) => {
        console.warn("[styleSelector] failed, falling back to 吉卜力:", err);
        return null;
      },
    );
    // A failed (or empty) selection falls back to the 吉卜力 preset.
    session.styleGuide = pickedStyle ?? STYLE_MAP["吉卜力"]!;
    tlog("[start] StyleSelector", styleStartedAt);
    console.log(`[start] auto-selected style: ${session.styleGuide.slice(0, 60)}…`);
  }

  const directed = await directScene(config, session, emit);

  tlog("[start] TOTAL", startedAt);

  return {
    sessionId: session.id,
    scene: directed.scene,
    imageUrl: directed.sceneImageUrl,
    characters: directed.characters,
    storyState: directed.storyState,
  };
}
|
|
|
|
// ──────────────────────────────────────────────────────────────────────
|
|
// requestScene — next Scene from existing session.
|
|
// ──────────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Generates the next scene for an already-running session.
 *
 * @param config Engine configuration handed to `directScene`.
 * @param req Scene request; its `session` is passed to `directScene` as-is.
 * @param emit Optional stream-event sink forwarded to `directScene`.
 * @returns The scene, image URL, characters and story state from
 *   `directScene`.
 */
export async function requestScene(
  config: EngineConfig,
  req: SceneRequest,
  emit?: (event: SceneStreamEvent) => void,
): Promise<SceneResponse> {
  const startedAt = Date.now();

  const directed = await directScene(config, req.session, emit);

  tlog("[scene] TOTAL", startedAt);

  return {
    scene: directed.scene,
    imageUrl: directed.sceneImageUrl,
    characters: directed.characters,
    storyState: directed.storyState,
  };
}
|
|
|
|
// ──────────────────────────────────────────────────────────────────────
|
|
// visionDecide — interprets a background click into intent + classify.
|
|
// No change from staging — vision lives outside the scene-generation
|
|
// pipeline.
|
|
// ──────────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Interprets a click on the scene background.
 *
 * @param config Engine configuration; only `config.vision` is used.
 * @param req Vision request with the annotated image and the session.
 * @returns Whatever `interpret` resolves to for the annotated image and the
 *   most recent scene in the session history (`null` when there is none).
 */
export async function visionDecide(
  config: EngineConfig,
  req: VisionRequest,
): Promise<VisionResponse> {
  const latestEntry = req.session.history.at(-1);
  const currentScene = latestEntry?.scene ?? null;
  return interpret(config.vision, req.annotatedImageBase64, currentScene);
}
|
|
|
|
// ──────────────────────────────────────────────────────────────────────
|
|
// classifyFreeform — classifies a freeform text input at a choice node
|
|
// into match-choice / insert-beat / change-scene. Single lightweight
|
|
// LLM call; no image, no scene generation.
|
|
// ──────────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Classifies freeform player text with a single LLM call.
 *
 * The model is asked for JSON with a `classify` verdict and a rewritten
 * `freeformAction`. Its output is untrusted, so both fields are validated at
 * runtime before use.
 *
 * @param config Engine configuration; only `config.text` is used.
 * @param req Request carrying the player's `freeformText` and the session.
 * @returns `classify` is `"change-scene"` only when the model returned exactly
 *   that string, and `"insert-beat"` for anything else. `freeformAction` is
 *   the trimmed model rewrite when it is a non-empty string, otherwise the
 *   original `req.freeformText`.
 */
export async function classifyFreeform(
  config: EngineConfig,
  req: FreeformClassifyRequest,
): Promise<FreeformClassifyResponse> {
  const current = req.session.history.at(-1)?.scene ?? null;
  const userMsg = buildFreeformClassifyUserMessage(
    req.freeformText,
    current?.scenePrompt,
  );

  const raw = await chat(
    config.text,
    [
      { role: "system", content: FREEFORM_CLASSIFY_SYSTEM },
      { role: "user", content: userMsg },
    ],
    { temperature: 0, tag: "freeform-classify" },
  );

  // Field types are `unknown` on purpose: the JSON comes from a model, so
  // nothing about its shape is guaranteed until checked below.
  const parsed = parseJsonLoose<{
    classify?: unknown;
    freeformAction?: unknown;
  }>(raw);

  // Optional chaining tolerates a null / non-object parse result instead of
  // throwing on property access.
  const verdict: unknown = parsed?.classify;
  const rewritten: unknown = parsed?.freeformAction;

  const classify: FreeformClassify =
    verdict === "change-scene" ? "change-scene" : "insert-beat";

  // Only call trim() on real strings; a number/object/array here used to throw.
  const cleanedAction = typeof rewritten === "string" ? rewritten.trim() : "";

  return {
    classify,
    freeformAction: cleanedAction || req.freeformText,
  };
}
|
|
|
|
// ──────────────────────────────────────────────────────────────────────
|
|
// requestInsertBeat — single-agent transient beat (no image, no new
|
|
// characters). Stays single-LLM by design — the INSERT_BEAT prompt
|
|
// forbids new characters and there's nothing to render.
|
|
// ──────────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Produces the transient beat(s) for an in-scene freeform action.
 *
 * @param config Engine configuration; only `config.text` is used.
 * @param req Insert-beat request with the session and the freeform action.
 * @returns The first beat as `partial`, any further beats as `extraBeats`
 *   (`undefined` when there are none), and the session's characters
 *   unchanged.
 */
export async function requestInsertBeat(
  config: EngineConfig,
  req: InsertBeatRequest,
): Promise<InsertBeatResponse> {
  const startedAt = Date.now();

  const rawBeats = await directInsertBeat(
    config.text,
    req.session,
    req.freeformAction,
  );

  // A speaker is acceptable when it is the player ("你") or a character that
  // is already registered on the session.
  const isKnownSpeaker = (name: string): boolean =>
    name === "你" ||
    req.session.characters.some((character) => character.name === name);

  // Beats voiced by anyone else are demoted to narration: the spoken line is
  // appended to the narration text and the speaker fields are cleared.
  const guardedBeats = rawBeats.map((beat) => {
    const speaker = beat.speaker;
    if (!speaker || isKnownSpeaker(speaker)) {
      return beat;
    }

    console.warn(`[insert-beat] unregistered speaker "${speaker}" ignored`);

    const mergedText = [beat.narration, beat.line].filter(Boolean).join("\n");
    return {
      narration: mergedText || undefined,
      speaker: undefined,
      line: undefined,
      lineDelivery: undefined,
    };
  });

  // Fall back to a stock narration beat when no beats came back.
  const leadBeat = guardedBeats[0] ?? { narration: "(你停下脚步,环视片刻。)" };
  const followUps = guardedBeats.slice(1);

  tlog("[insert-beat] TOTAL", startedAt);

  return {
    partial: leadBeat,
    extraBeats: followUps.length === 0 ? undefined : followUps,
    characters: req.session.characters,
  };
}
|
|
|
|
// ──────────────────────────────────────────────────────────────────────
|
|
// requestBeatAudio — lazy per-beat synth. Returns audio:null on
|
|
// timeout / failure / TTS disabled, so the client just plays silent.
|
|
// ──────────────────────────────────────────────────────────────────────
|
|
|
|
// Resolve a synth-ready voice for the request, normalizing provider
|
|
// mismatches. The client usually sends a voice whose provider matches the
|
|
// server's TTS (the common case). The mismatch case is mainly prebaked
|
|
// homepage cards: they ship a Xiaomi voice baked at build time, but the
|
|
// server may now run StepFun — so the client skips the ~220KB reference
|
|
// audio (saving FOT) and sends stepfunVoiceId / voiceDescription instead.
|
|
// We re-provision against the SERVER's provider so the right voice synth runs.
|
|
// Returns undefined when there's nothing to synthesize from (caller plays
|
|
// silent).
|
|
/**
 * Chooses the voice to synthesize with, re-provisioning when the client's
 * voice does not belong to the server's TTS provider.
 *
 * @param config Engine configuration; only `config.tts` is used.
 * @param req Beat-audio request (voice, stepfunVoiceId, voiceDescription,
 *   characterName).
 * @returns A voice ready for synthesis, or `undefined` when there is nothing
 *   to synthesize from (the caller then plays silent).
 */
async function resolveVoice(
  config: EngineConfig,
  req: BeatAudioRequest,
): Promise<CharacterVoice | undefined> {
  const tts = config.tts;
  const serverStepfun = !!tts && isStepfun(tts);
  // Any server that is not StepFun is matched against "xiaomi" voices.
  const serverProvider = serverStepfun ? "stepfun" : "xiaomi";

  // Fast path: the client's voice already belongs to the server's provider.
  // This includes the legacy xiaomi card + xiaomi server combination, where
  // shipping the ~220KB reference audio was unavoidable anyway.
  if (req.voice && req.voice.provider === serverProvider) {
    return req.voice;
  }

  // From here on the voice is missing or mismatched and has to be
  // re-provisioned against the server's provider, which needs a TTS config.
  if (!tts) {
    return undefined;
  }

  // StepFun server: an LLM-picked / prebaked voice id is the zero-cost option
  // and wins over the description-based keyword scorer.
  if (serverStepfun && isValidStepfunVoiceId(req.stepfunVoiceId)) {
    return provisionVoice(tts, req.voiceDescription ?? "", req.characterName, {
      stepfunVoiceId: req.stepfunVoiceId,
    });
  }

  // Remaining cases provision from the description alone: the StepFun
  // keyword-scorer fallback, or a Xiaomi voicedesign re-design when the client
  // sent a StepFun voice (or nothing). No description → can't synthesize.
  //
  // NOTE: the Xiaomi re-provision runs OUTSIDE synthesizeBeat's 15s
  // withTimeout, so a hung MiMo voicedesign tail (~30-70s) could hold
  // /api/beat-audio open until the platform timeout. Accepted because:
  // (1) it only fires on a rare cross-provider replay (.infiplot carrying a
  // stepfun voice, opened on a Xiaomi-server deploy) or a mid-session provider
  // flip — NOT the common prebaked-card + stepfun-server case, which is a
  // pure-function provision with no network; (2) it degrades to silence rather
  // than crashing. If it ever bites in practice, wrap resolve+synth in one
  // withTimeout in voice.ts (requires threading an AbortSignal through
  // provisionVoice → xiaomiProvision).
  const description = req.voiceDescription;
  if (!description) {
    return undefined;
  }
  return provisionVoice(tts, description, req.characterName);
}
|
|
|
|
/**
 * Lazily synthesizes audio for a single beat.
 *
 * @param config Engine configuration; only `config.tts` is used.
 * @param req Beat-audio request with the beat and the voice hints.
 * @returns `{ audio: null }` when TTS is disabled, when no voice can be
 *   resolved, or when resolving the voice fails; otherwise the result of
 *   `synthesizeBeat` for the resolved voice.
 */
export async function requestBeatAudio(
  config: EngineConfig,
  req: BeatAudioRequest,
): Promise<BeatAudioResponse> {
  if (!config.tts) return { audio: null };

  // Voice resolution may re-provision over the network. A rejection there
  // must degrade to silence like every other failure mode of this endpoint,
  // rather than escape to the caller as an exception.
  let voice: CharacterVoice | undefined;
  try {
    voice = await resolveVoice(config, req);
  } catch (err: unknown) {
    console.warn("[beat-audio] voice resolution failed, playing silent:", err);
    return { audio: null };
  }
  if (!voice) return { audio: null };

  const audio = await synthesizeBeat(config.tts, voice, req.beat);
  return { audio };
}
|