Files
infiplot-web/lib/engine/orchestrator.ts
T
yuanzonghao 65b7daff0b fix(beat-audio): harden voice-provider validation and resolveVoice fast path
Address PR-agent review findings:

- resolveVoice fast path: replace ambiguous boolean comparison
  (voiceProvider === "stepfun") === serverStepfun with explicit
  per-provider equality checks. Prevents an undefined or unknown
  provider from matching the non-stepfun (xiaomi) branch by accident.

- /api/beat-audio route: reject requests whose voice.provider is present
  but not in the VALID_TTS_PROVIDERS whitelist (e.g. "azure"). Previously
  such a request would pass validation when fallback fields were also
  present, and resolveVoice might use the invalid voice directly instead
  of falling back to reprovision — producing a silent beat instead of a
  voiced one.
2026-06-15 14:33:46 +08:00

316 lines
13 KiB
TypeScript

import type {
BeatAudioRequest,
BeatAudioResponse,
CharacterVoice,
EngineConfig,
FreeformClassify,
FreeformClassifyRequest,
FreeformClassifyResponse,
InsertBeatRequest,
InsertBeatResponse,
Session,
SceneRequest,
SceneResponse,
StartRequest,
StartResponse,
VisionRequest,
VisionResponse,
} from "@infiplot/types";
import { coerceOrientation } from "@infiplot/types";
import { chat } from "@infiplot/ai-client";
import { isStepfun, isValidStepfunVoiceId, provisionVoice } from "@infiplot/tts-client";
import { runArchitect } from "./agents/architect";
import { selectStyle } from "./agents/styleSelector";
import { directInsertBeat, directScene } from "./director";
import { STYLE_MAP } from "@/lib/options";
import { parseJsonLoose } from "./jsonParser";
import {
FREEFORM_CLASSIFY_SYSTEM,
buildFreeformClassifyUserMessage,
} from "./prompts";
import { synthesizeBeat } from "./voice";
import { interpret } from "./vision";
/**
 * Builds a session identifier of the form `s_<epoch-ms>_<base36 fragment>`.
 *
 * The fragment is taken from characters 2..8 of a base-36 rendering of
 * `Math.random()`, so it is at most six characters long.
 */
function newSessionId(): string {
  const stamp = Date.now();
  const fragment = Math.random().toString(36).slice(2, 8);
  return "s_" + stamp + "_" + fragment;
}
/**
 * Logs how many milliseconds have elapsed since `t0`.
 *
 * @param label Prefix printed before the elapsed time.
 * @param t0 Start timestamp in epoch milliseconds (as returned by `Date.now()`).
 */
function tlog(label: string, t0: number): void {
  const elapsedMs = Date.now() - t0;
  console.log(label + ": " + elapsedMs + "ms");
}
// ──────────────────────────────────────────────────────────────────────
// startSession — initial Scene via the multi-agent pipeline.
//
// directScene internally handles: Writer → (CharacterDesigner+
// Cinematographer parallel) → Painter → upload. Voice provisioning and
// portrait generation happen inside CharacterDesigner per new character,
// so the orchestrator no longer needs to coordinate them separately.
// ──────────────────────────────────────────────────────────────────────
/**
 * Creates a fresh session and produces its opening Scene.
 *
 * Flow: build the Session from the request, run the Architect (together
 * with the style selector when the style guide is "auto"), store the
 * Architect's result as the session's story state, then delegate scene
 * generation to `directScene`.
 *
 * @param config Engine configuration; `config.text` is handed to the
 *   Architect and the style selector, the whole config to `directScene`.
 * @param req Start request (world setting, style guide, optional style
 *   reference image, orientation and player name).
 * @returns The new session id plus the scene, image URL, characters and
 *   story state returned by `directScene`.
 */
export async function startSession(
  config: EngineConfig,
  req: StartRequest,
): Promise<StartResponse> {
  const startedAt = Date.now();

  const session: Session = {
    id: newSessionId(),
    createdAt: Date.now(),
    worldSetting: req.worldSetting.trim(),
    styleGuide: req.styleGuide.trim(),
    history: [],
    characters: [],
    // Blank / whitespace-only optional strings collapse to undefined.
    styleReferenceImage: req.styleReferenceImage?.trim() || undefined,
    orientation: coerceOrientation(req.orientation),
    playerName: req.playerName?.trim() || undefined,
  };

  console.log(
    `[start] worldSetting (${session.worldSetting.length} chars):\n${session.worldSetting}`,
  );

  // "auto" is a sentinel, not a real style: swap in a placeholder before the
  // Architect is invoked so it never sees the literal word "auto".
  const isAutoStyle = session.styleGuide === "auto";
  if (isAutoStyle) {
    session.styleGuide = "由 AI 根据剧情自动匹配最佳画风";
  }

  // Stage 0 — the Architect and the (optional) style selector are started
  // back to back and awaited together so they run concurrently.
  const architectStartedAt = Date.now();
  const architectJob = runArchitect(config.text, session);
  const styleJob = isAutoStyle
    ? selectStyle(config.text, session.worldSetting).catch((err) => {
        // A failed selection must not abort the session; null triggers the
        // default-style fallback below.
        console.warn(`[styleSelector] failed, falling back to 吉卜力:`, err);
        return null;
      })
    : Promise.resolve(null);
  const [storyBible, pickedStyle] = await Promise.all([architectJob, styleJob]);

  session.storyState = storyBible;
  if (isAutoStyle) {
    session.styleGuide = pickedStyle ?? STYLE_MAP["吉卜力"]!;
    console.log(`[start] auto-selected style: ${session.styleGuide.slice(0, 60)}…`);
  }
  tlog(`[start] Architect${isAutoStyle ? " + StyleSelector" : ""}`, architectStartedAt);

  const bible = session.storyState;
  console.log(
    `[start] storyBible: logline="${bible.logline}" | genreTags="${bible.genreTags}" | synopsis="${bible.synopsis}"`,
  );

  const directed = await directScene(config, session);
  tlog("[start] TOTAL", startedAt);

  return {
    sessionId: session.id,
    scene: directed.scene,
    imageUrl: directed.sceneImageUrl,
    characters: directed.characters,
    storyState: directed.storyState,
  };
}
// ──────────────────────────────────────────────────────────────────────
// requestScene — next Scene from existing session.
// ──────────────────────────────────────────────────────────────────────
/**
 * Generates the next Scene for an already-running session.
 *
 * @param config Engine configuration forwarded to `directScene`.
 * @param req Scene request; only `req.session` is used here.
 * @returns The scene, its image URL, the characters and the story state
 *   returned by `directScene`.
 */
export async function requestScene(
  config: EngineConfig,
  req: SceneRequest,
): Promise<SceneResponse> {
  const startedAt = Date.now();
  const directed = await directScene(config, req.session);
  tlog("[scene] TOTAL", startedAt);

  // The response exposes the scene image under `imageUrl`.
  return {
    scene: directed.scene,
    imageUrl: directed.sceneImageUrl,
    characters: directed.characters,
    storyState: directed.storyState,
  };
}
// ──────────────────────────────────────────────────────────────────────
// visionDecide — interprets a background click into intent + classify.
// No change from staging — vision lives outside the scene-generation
// pipeline.
// ──────────────────────────────────────────────────────────────────────
/**
 * Interprets an annotated screenshot against the session's most recent scene.
 *
 * @param config Engine configuration; `config.vision` is passed to `interpret`.
 * @param req Vision request carrying the annotated image (base64) and the session.
 * @returns Whatever `interpret` resolves to.
 */
export async function visionDecide(
  config: EngineConfig,
  req: VisionRequest,
): Promise<VisionResponse> {
  // An empty history means there is no current scene; pass null in that case.
  const latestEntry = req.session.history.at(-1);
  const currentScene = latestEntry?.scene ?? null;
  return interpret(config.vision, req.annotatedImageBase64, currentScene);
}
// ──────────────────────────────────────────────────────────────────────
// classifyFreeform — classifies a freeform text input at a choice node
// into match-choice / insert-beat / change-scene. Single lightweight
// LLM call; no image, no scene generation.
// ──────────────────────────────────────────────────────────────────────
/**
 * Classifies a player's freeform text with a single text-model call.
 *
 * The model's reply is parsed leniently and then validated field by field,
 * because it is untrusted output:
 * - `classify` becomes "change-scene" only when the model returned exactly
 *   that string; every other value (including a missing field) is treated
 *   as "insert-beat".
 * - `freeformAction` is used only when it is a non-blank string; otherwise
 *   the player's original text is returned unchanged.
 *
 * @param config Engine configuration; `config.text` selects the chat model.
 * @param req Request carrying the freeform text and the session whose
 *   latest scene prompt gives the model context.
 * @returns The classification and the action text to act on.
 */
export async function classifyFreeform(
  config: EngineConfig,
  req: FreeformClassifyRequest,
): Promise<FreeformClassifyResponse> {
  const current = req.session.history.at(-1)?.scene ?? null;
  const userMsg = buildFreeformClassifyUserMessage(
    req.freeformText,
    current?.scenePrompt,
  );
  const raw = await chat(
    config.text,
    [
      { role: "system", content: FREEFORM_CLASSIFY_SYSTEM },
      { role: "user", content: userMsg },
    ],
    { temperature: 0, tag: "freeform-classify" },
  );
  const parsed = parseJsonLoose<{
    classify?: string;
    freeformAction?: string;
  }>(raw);

  // The generic argument above is only a compile-time hint; the runtime
  // value is whatever the model emitted. Optional chaining covers a
  // null/undefined parse result (parseJsonLoose's failure mode is not
  // visible from this file).
  const classify: FreeformClassify =
    parsed?.classify === "change-scene" ? "change-scene" : "insert-beat";

  // Narrow before trimming: a non-string value (number, object, array) has
  // no trim() and would otherwise throw a TypeError.
  const actionField: unknown = parsed?.freeformAction;
  const cleanedAction = typeof actionField === "string" ? actionField.trim() : "";

  return {
    classify,
    freeformAction: cleanedAction || req.freeformText,
  };
}
// ──────────────────────────────────────────────────────────────────────
// requestInsertBeat — single-agent transient beat (no image, no new
// characters). Stays single-LLM by design — the INSERT_BEAT prompt
// forbids new characters and there's nothing to render.
// ──────────────────────────────────────────────────────────────────────
/**
 * Produces a transient beat for a freeform player action.
 *
 * The beat comes from `directInsertBeat`. If it names a speaker that is
 * neither the player ("你") nor a character already on the session, the
 * spoken line is folded into the narration and the speaker fields are
 * cleared.
 *
 * @param config Engine configuration; `config.text` is passed to `directInsertBeat`.
 * @param req Request carrying the session and the freeform action.
 * @returns The (possibly demoted) partial beat plus the session's characters.
 */
export async function requestInsertBeat(
  config: EngineConfig,
  req: InsertBeatRequest,
): Promise<InsertBeatResponse> {
  const startedAt = Date.now();
  const partial = await directInsertBeat(
    config.text,
    req.session,
    req.freeformAction,
  );
  const roster = req.session.characters;
  const speaker = partial.speaker;

  // A beat is passed through untouched when it has no speaker, when the
  // speaker is the player ("你" — Pattern B; no Character record exists for
  // the player on purpose, and directInsertBeat has already normalized POV
  // variants to "你"), or when the speaker is a registered character.
  const speakerAllowed =
    !speaker || speaker === "你" || roster.some((c) => c.name === speaker);
  if (speakerAllowed) {
    tlog("[insert-beat] TOTAL", startedAt);
    return { partial, characters: roster };
  }

  // The INSERT_BEAT prompt forbids new NPCs. Fold the disallowed speaker's
  // line into the narration so the player still sees the text (the client
  // only renders `line` when a `speaker` is present).
  console.warn(
    `[insert-beat] unregistered speaker "${partial.speaker}" ignored`,
  );
  const visibleText = [partial.narration, partial.line]
    .filter(Boolean)
    .join("\n");
  tlog("[insert-beat] TOTAL", startedAt);
  return {
    partial: {
      narration: visibleText || undefined,
      speaker: undefined,
      line: undefined,
      lineDelivery: undefined,
    },
    characters: roster,
  };
}
// ──────────────────────────────────────────────────────────────────────
// requestBeatAudio — lazy per-beat synth. Returns audio:null on
// timeout / failure / TTS disabled, so the client just plays silent.
// ──────────────────────────────────────────────────────────────────────
// Resolve a synth-ready voice for the request, normalizing provider
// mismatches. The client usually sends a voice whose provider matches the
// server's TTS (the common case). The mismatch case is mainly prebaked
// homepage cards: they ship a Xiaomi voice baked at build time, but the
// server may now run StepFun — so the client skips the ~220KB reference
// audio (saving FOT) and sends stepfunVoiceId / voiceDescription instead.
// We re-provision against the SERVER's provider so the right voice synth runs.
// Returns undefined when there's nothing to synthesize from (caller plays
// silent).
/**
 * Chooses the voice to synthesize a beat with.
 *
 * Resolution order:
 * 1. The client's voice, when its provider is the one the server runs.
 * 2. On a StepFun server, a voice provisioned from a valid `stepfunVoiceId`.
 * 3. A voice provisioned from `voiceDescription`.
 * 4. `undefined` — nothing to synthesize from; the caller plays silent.
 *
 * @param config Engine configuration; `config.tts` identifies the server's provider.
 * @param req Beat-audio request with the optional voice and fallback fields.
 * @returns A synth-ready voice, or `undefined` when none can be resolved.
 */
async function resolveVoice(
  config: EngineConfig,
  req: BeatAudioRequest,
): Promise<CharacterVoice | undefined> {
  const tts = config.tts;
  const serverStepfun = tts ? isStepfun(tts) : false;

  // Fast path. The client's provider must equal the server's exactly, so a
  // missing or unrecognized provider never matches and falls through to
  // re-provisioning. (Also covers the legacy xiaomi card + xiaomi server
  // case.)
  const expectedProvider = serverStepfun ? "stepfun" : "xiaomi";
  const clientVoice = req.voice;
  if (clientVoice && clientVoice.provider === expectedProvider) {
    return clientVoice;
  }

  // Mismatch or no voice at all: re-provision against the server's provider,
  // which requires TTS to be configured.
  if (!tts) {
    return undefined;
  }

  // StepFun server: an LLM-picked / prebaked voice id is preferred over the
  // description-based fallback.
  if (serverStepfun && isValidStepfunVoiceId(req.stepfunVoiceId)) {
    return provisionVoice(tts, req.voiceDescription ?? "", req.characterName, {
      stepfunVoiceId: req.stepfunVoiceId,
    });
  }

  // Both providers share the last resort: provision from the description.
  // No description means there is nothing to synthesize from.
  //
  // NOTE: on a Xiaomi server this re-provision runs OUTSIDE synthesizeBeat's
  // 15s withTimeout, so a hung MiMo voicedesign call (~30-70s tail) could
  // stall /api/beat-audio until the platform timeout. Accepted because the
  // path only fires on a rare cross-provider replay or a mid-session
  // provider flip. If it ever bites, wrap resolve+synth in one withTimeout
  // in voice.ts (needs an AbortSignal threaded through provisionVoice →
  // xiaomiProvision).
  if (!req.voiceDescription) {
    return undefined;
  }
  return provisionVoice(tts, req.voiceDescription, req.characterName);
}
/**
 * Lazily synthesizes audio for a single beat.
 *
 * Resolves to `{ audio: null }` — so the client simply plays the beat
 * silently — when TTS is not configured, when no voice can be resolved,
 * or when resolving the voice fails.
 *
 * @param config Engine configuration; `config.tts` must be set for any audio.
 * @param req Beat-audio request (beat, optional voice and fallback fields).
 * @returns The audio produced by `synthesizeBeat`, or `audio: null`.
 */
export async function requestBeatAudio(
  config: EngineConfig,
  req: BeatAudioRequest,
): Promise<BeatAudioResponse> {
  if (!config.tts) return { audio: null };

  // Voice resolution may re-provision a voice, which happens before (and
  // therefore outside) synthesizeBeat. Catch a failure there so it degrades
  // to a silent beat instead of rejecting the whole request.
  const voice = await resolveVoice(config, req).catch((err: unknown) => {
    console.warn(`[beat-audio] voice resolution failed, playing silent:`, err);
    return undefined;
  });
  if (!voice) return { audio: null };

  const audio = await synthesizeBeat(config.tts, voice, req.beat);
  return { audio };
}