// infiplot-web/lib/engine/orchestrator.ts

import type {
  BeatAudioRequest,
  BeatAudioResponse,
  CharacterVoice,
  EngineConfig,
  FreeformClassify,
  FreeformClassifyRequest,
  FreeformClassifyResponse,
  InsertBeatRequest,
  InsertBeatResponse,
  Session,
  SceneRequest,
  SceneResponse,
  StartRequest,
  StartResponse,
  VisionRequest,
  VisionResponse,
} from "@infiplot/types";
import { coerceOrientation } from "@infiplot/types";
import { chat } from "@infiplot/ai-client";
import { isStepfun, isValidStepfunVoiceId, provisionVoice } from "@infiplot/tts-client";
import { runArchitect } from "./agents/architect";
import { selectStyle } from "./agents/styleSelector";
import { directInsertBeat, directScene } from "./director";
import { STYLE_MAP } from "@/lib/options";
import { parseJsonLoose } from "./jsonParser";
import {
  FREEFORM_CLASSIFY_SYSTEM,
  buildFreeformClassifyUserMessage,
} from "./prompts";
import { synthesizeBeat } from "./voice";
import { interpret } from "./vision";

/**
 * Builds a session identifier of the form `s_<epoch-ms>_<6 base-36 chars>`.
 *
 * The random suffix is right-padded with "0" to a fixed six characters.
 * `Math.random()` values with a short base-36 expansion (for example `0.5`
 * stringifies to `"0.i"`, and `0` to `"0"`) would otherwise produce a
 * one-character or even empty suffix, weakening uniqueness between sessions
 * created in the same millisecond.
 *
 * Not cryptographically secure: the suffix comes from `Math.random()`.
 *
 * @returns A new identifier string; never shorter than `s_<epoch-ms>_` plus six characters.
 */
function newSessionId(): string {
  const suffix = Math.random().toString(36).slice(2, 8).padEnd(6, "0");
  return `s_${Date.now()}_${suffix}`;
}

/**
 * Logs how long a stage took.
 *
 * @param label Prefix printed before the elapsed time.
 * @param t0 Start timestamp in epoch milliseconds (as returned by `Date.now()`).
 */
function tlog(label: string, t0: number): void {
  const elapsedMs = Date.now() - t0;
  console.log(label + ": " + elapsedMs + "ms");
}

// ──────────────────────────────────────────────────────────────────────
//  startSession — initial Scene via the multi-agent pipeline.
//
//  directScene internally handles: Writer → (CharacterDesigner+
//  Cinematographer parallel) → Painter → upload. Voice provisioning and
//  portrait generation happen inside CharacterDesigner per new character,
//  so the orchestrator no longer needs to coordinate them separately.
// ──────────────────────────────────────────────────────────────────────

/**
 * Creates a fresh session from the start request and produces its first scene.
 *
 * Steps, in order:
 *  1. Build the `Session` from the trimmed request fields.
 *  2. Run the Architect and — only when the style guide is the literal
 *     `"auto"` — the style selector, concurrently.
 *  3. Hand the populated session to `directScene` for the opening scene.
 *
 * @param config Engine configuration; `config.text` is passed to the Architect
 *   and the style selector, the whole config to `directScene`.
 * @param req Start request supplying world setting, style guide and the
 *   optional style reference image, orientation and player name.
 * @returns The new session id together with the scene, image URL, characters
 *   and story state returned by `directScene`.
 */
export async function startSession(
  config: EngineConfig,
  req: StartRequest,
): Promise<StartResponse> {
  const startedAt = Date.now();

  const session: Session = {
    id: newSessionId(),
    createdAt: Date.now(),
    worldSetting: req.worldSetting.trim(),
    styleGuide: req.styleGuide.trim(),
    history: [],
    characters: [],
    // Blank / whitespace-only optional strings collapse to undefined.
    styleReferenceImage: req.styleReferenceImage?.trim() || undefined,
    orientation: coerceOrientation(req.orientation),
    playerName: req.playerName?.trim() || undefined,
  };

  console.log(
    `[start] worldSetting (${session.worldSetting.length} chars):\n${session.worldSetting}`,
  );

  // "auto" is a sentinel meaning "let the style selector choose".
  const wantsAutoStyle = session.styleGuide === "auto";
  if (wantsAutoStyle) {
    // Swap the sentinel for a human-readable placeholder before the session is
    // handed to the Architect (presumably so it never sees the raw "auto").
    session.styleGuide = "由 AI 根据剧情自动匹配最佳画风";
  }

  // Stage 0 — Architect, plus the optional style selection. The Architect is
  // started first; both are then awaited together.
  const stageStartedAt = Date.now();
  const architectTask = runArchitect(config.text, session);
  const styleTask = wantsAutoStyle
    ? selectStyle(config.text, session.worldSetting).catch((err) => {
        // A selector failure is non-fatal: null triggers the default style below.
        console.warn(`[styleSelector] failed, falling back to 吉卜力:`, err);
        return null;
      })
    : Promise.resolve(null);
  const [storyBible, pickedStyle] = await Promise.all([architectTask, styleTask]);

  session.storyState = storyBible;
  if (wantsAutoStyle) {
    session.styleGuide = pickedStyle ?? STYLE_MAP["吉卜力"]!;
    console.log(`[start] auto-selected style: ${session.styleGuide.slice(0, 60)}…`);
  }

  const stageLabel = `[start] Architect${wantsAutoStyle ? " + StyleSelector" : ""}`;
  tlog(stageLabel, stageStartedAt);
  console.log(
    `[start] storyBible: logline="${session.storyState.logline}" | genreTags="${session.storyState.genreTags}" | synopsis="${session.storyState.synopsis}"`,
  );

  // Opening scene via the scene pipeline.
  const directed = await directScene(config, session);
  const response: StartResponse = {
    sessionId: session.id,
    scene: directed.scene,
    imageUrl: directed.sceneImageUrl,
    characters: directed.characters,
    storyState: directed.storyState,
  };

  tlog("[start] TOTAL", startedAt);
  return response;
}

// ──────────────────────────────────────────────────────────────────────
//  requestScene — next Scene from existing session.
// ──────────────────────────────────────────────────────────────────────

/**
 * Generates the next scene for an already-running session.
 *
 * @param config Engine configuration forwarded to `directScene`.
 * @param req Scene request; only `req.session` is read here.
 * @returns The scene, its image URL, the characters and the story state
 *   produced by `directScene`.
 */
export async function requestScene(
  config: EngineConfig,
  req: SceneRequest,
): Promise<SceneResponse> {
  const startedAt = Date.now();

  const directed = await directScene(config, req.session);
  const response: SceneResponse = {
    scene: directed.scene,
    imageUrl: directed.sceneImageUrl,
    characters: directed.characters,
    storyState: directed.storyState,
  };

  tlog("[scene] TOTAL", startedAt);
  return response;
}

// ──────────────────────────────────────────────────────────────────────
//  visionDecide — interprets a background click into intent + classify.
//  No change from staging — vision lives outside the scene-generation
//  pipeline.
// ──────────────────────────────────────────────────────────────────────

/**
 * Forwards an annotated screenshot to the vision interpreter together with
 * the scene the player is currently looking at.
 *
 * @param config Engine configuration; `config.vision` is passed to `interpret`.
 * @param req Vision request carrying the annotated image (base64) and the session.
 * @returns Whatever `interpret` resolves to.
 */
export async function visionDecide(
  config: EngineConfig,
  req: VisionRequest,
): Promise<VisionResponse> {
  // The current scene is the one on the latest history entry; an empty
  // history (or an entry without a scene) is passed along as null.
  const latestEntry = req.session.history.at(-1);
  const activeScene = latestEntry?.scene ?? null;

  return interpret(config.vision, req.annotatedImageBase64, activeScene);
}

// ──────────────────────────────────────────────────────────────────────
//  classifyFreeform — classifies a freeform text input at a choice node
//  into match-choice / insert-beat / change-scene. Single lightweight
//  LLM call; no image, no scene generation.
// ──────────────────────────────────────────────────────────────────────

/**
 * Classifies a player's freeform text at a choice node with a single
 * text-model call (temperature 0, tagged "freeform-classify").
 *
 * The model reply is parsed leniently and then validated field by field,
 * because it is untrusted JSON: a reply that is not an object, or whose
 * `freeformAction` is not a string, falls back to the defaults below instead
 * of throwing a TypeError.
 *
 * NOTE(review): only "change-scene" and "insert-beat" are ever produced here;
 * any other `classify` value the model emits is folded into "insert-beat".
 * Confirm that is intended for every class the prompt can return.
 *
 * @param config Engine configuration; `config.text` is passed to `chat`.
 * @param req Supplies `freeformText` and the session whose latest history
 *   entry provides the current scene prompt, if any.
 * @returns `classify` ("change-scene" only on an exact match, otherwise
 *   "insert-beat") and `freeformAction` (the model's trimmed rewrite when it
 *   is a non-empty string, otherwise the player's original text).
 * @throws Whatever `chat` or `parseJsonLoose` throw; those are not caught here.
 */
export async function classifyFreeform(
  config: EngineConfig,
  req: FreeformClassifyRequest,
): Promise<FreeformClassifyResponse> {
  const current = req.session.history.at(-1)?.scene ?? null;
  const userMsg = buildFreeformClassifyUserMessage(
    req.freeformText,
    current?.scenePrompt,
  );

  const raw = await chat(config.text, [
    { role: "system", content: FREEFORM_CLASSIFY_SYSTEM },
    { role: "user", content: userMsg },
  ], { temperature: 0, tag: "freeform-classify" });

  // The generic argument only describes the shape we hope for; nothing
  // enforces it at runtime, so treat the result as unknown and narrow.
  const parsed: unknown = parseJsonLoose<{
    classify?: string;
    freeformAction?: string;
  }>(raw);
  const fields: Record<string, unknown> =
    typeof parsed === "object" && parsed !== null
      ? (parsed as Record<string, unknown>)
      : {};

  const classify: FreeformClassify =
    fields["classify"] === "change-scene" ? "change-scene" : "insert-beat";

  // Only a string can be trimmed; anything else counts as "no rewrite".
  const modelAction = fields["freeformAction"];
  const rewritten = typeof modelAction === "string" ? modelAction.trim() : "";

  return {
    classify,
    freeformAction: rewritten || req.freeformText,
  };
}

// ──────────────────────────────────────────────────────────────────────
//  requestInsertBeat — single-agent transient beat (no image, no new
//  characters). Stays single-LLM by design — the INSERT_BEAT prompt
//  forbids new characters and there's nothing to render.
// ──────────────────────────────────────────────────────────────────────

/**
 * Produces a transient beat in response to a freeform player action.
 *
 * The beat comes from `directInsertBeat`. If it names a speaker that is
 * neither the player ("你") nor a character already on the session, the spoken
 * line is folded into the narration and the speaker fields are cleared.
 *
 * @param config Engine configuration; `config.text` is passed to `directInsertBeat`.
 * @param req Insert-beat request carrying the session and the freeform action.
 * @returns The (possibly rewritten) partial beat plus the session's characters,
 *   returned unchanged.
 */
export async function requestInsertBeat(
  config: EngineConfig,
  req: InsertBeatRequest,
): Promise<InsertBeatResponse> {
  const startedAt = Date.now();
  const roster = req.session.characters;

  const partial = await directInsertBeat(
    config.text,
    req.session,
    req.freeformAction,
  );

  // A beat passes through untouched when it has no speaker, when the speaker
  // is the player ("你" — no Character record exists for the player, and per
  // the original notes directInsertBeat has already normalized POV variants
  // to "你"), or when the speaker is a registered character.
  const speakerAllowed =
    !partial.speaker ||
    partial.speaker === "你" ||
    roster.some((member) => member.name === partial.speaker);

  let response: InsertBeatResponse;
  if (speakerAllowed) {
    response = { partial, characters: roster };
  } else {
    console.warn(
      `[insert-beat] unregistered speaker "${partial.speaker}" ignored`,
    );
    // The client only renders `line` when a `speaker` is present, so move the
    // text into narration to keep it visible to the player.
    const visibleText = [partial.narration, partial.line].filter(Boolean);
    const promotedNarration = visibleText.join("\n") || undefined;
    response = {
      partial: {
        narration: promotedNarration,
        speaker: undefined,
        line: undefined,
        lineDelivery: undefined,
      },
      characters: roster,
    };
  }

  tlog("[insert-beat] TOTAL", startedAt);
  return response;
}

// ──────────────────────────────────────────────────────────────────────
//  requestBeatAudio — lazy per-beat synth. Returns audio:null on
//  timeout / failure / TTS disabled, so the client just plays silent.
// ──────────────────────────────────────────────────────────────────────

// Resolve a synth-ready voice for the request, normalizing provider
// mismatches. The client usually sends a voice whose provider matches the
// server's TTS (the common case). The mismatch case is mainly prebaked
// homepage cards: they ship a Xiaomi voice baked at build time, but the
// server may now run StepFun — so the client skips the ~220KB reference
// audio (saving FOT) and sends stepfunVoiceId / voiceDescription instead.
// We re-provision against the SERVER's provider so the right voice synth runs.
// Returns undefined when there's nothing to synthesize from (caller plays
// silent).
/**
 * Picks the voice to synthesize with, re-provisioning whenever the voice sent
 * by the client does not belong to the server's TTS provider.
 *
 * @param config Engine configuration; `config.tts` identifies the server provider.
 * @param req Beat-audio request carrying the optional `voice`,
 *   `stepfunVoiceId`, `voiceDescription` and `characterName`.
 * @returns A voice to synthesize with, or `undefined` when there is nothing
 *   to synthesize from.
 */
async function resolveVoice(
  config: EngineConfig,
  req: BeatAudioRequest,
): Promise<CharacterVoice | undefined> {
  const tts = config.tts;
  const onStepfun = !!tts && isStepfun(tts);

  // Fast path: the client's voice already belongs to the server's provider
  // (stepfun voice on a StepFun server, xiaomi voice otherwise).
  const clientVoice = req.voice;
  if (clientVoice) {
    const sameProvider = onStepfun
      ? clientVoice.provider === "stepfun"
      : clientVoice.provider === "xiaomi";
    if (sameProvider) return clientVoice;
  }

  // From here on the voice is missing or mismatched and must be
  // re-provisioned, which is impossible without a TTS config.
  if (!tts) return undefined;

  // StepFun server: a valid LLM-picked / prebaked voice id wins (zero-cost).
  if (onStepfun && isValidStepfunVoiceId(req.stepfunVoiceId)) {
    return provisionVoice(tts, req.voiceDescription ?? "", req.characterName, {
      stepfunVoiceId: req.stepfunVoiceId,
    });
  }

  // Both providers share the remaining fallback: provision from the
  // description (keyword scorer on StepFun, voicedesign on Xiaomi). Without a
  // description there is nothing to synthesize from.
  //
  // NOTE: on a Xiaomi server this re-provision runs OUTSIDE synthesizeBeat's
  // 15s withTimeout — a hung MiMo voicedesign tail (~30-70s) could hang
  // /api/beat-audio until the platform timeout. Accepted because: (1) this
  // path only fires on a rare cross-provider replay (.infiplot carrying a
  // stepfun voice, opened on a Xiaomi-server deploy) or a mid-session provider
  // flip — NOT the common prebaked-card + stepfun-server case, which is a
  // pure-function provision with no network; (2) it degrades to silence rather
  // than crashing. If it ever bites in practice, wrap resolve+synth in one
  // withTimeout in voice.ts (requires threading an AbortSignal through
  // provisionVoice → xiaomiProvision).
  if (!req.voiceDescription) return undefined;
  return provisionVoice(tts, req.voiceDescription, req.characterName);
}

/**
 * Lazily synthesizes audio for a single beat.
 *
 * Resolves to `{ audio: null }` — so the client plays the beat silently —
 * when TTS is not configured, when no voice can be resolved for the request,
 * or when resolving the voice throws. The last case matters because voice
 * re-provisioning happens before `synthesizeBeat` is called; without the guard
 * a provisioning failure would reject this function instead of degrading to
 * silence.
 *
 * @param config Engine configuration; `config.tts` enables synthesis.
 * @param req Beat-audio request carrying the beat and the voice hints.
 * @returns The synthesized audio, or `audio: null` in the cases above.
 */
export async function requestBeatAudio(
  config: EngineConfig,
  req: BeatAudioRequest,
): Promise<BeatAudioResponse> {
  if (!config.tts) return { audio: null };

  let voice: CharacterVoice | undefined;
  try {
    voice = await resolveVoice(config, req);
  } catch (err) {
    // Best-effort by contract: log the cause and fall back to silence.
    console.warn("[beat-audio] voice resolution failed, playing silent:", err);
    return { audio: null };
  }
  if (!voice) return { audio: null };

  const audio = await synthesizeBeat(config.tts, voice, req.beat);
  return { audio };
}