infiplot-web/scripts/estimate-token-budget.mjs

#!/usr/bin/env node
/**
 * Task 23: Token 预算估算
 *
 * 通过对比新旧 prompt 文本长度来估算 token 增量。
 *
 * 旧版本（1ae5ab1 之前）：
 * - WRITER_STREAM_SYSTEM: 约 140 行硬编码模板字符串
 *
 * 新版本（当前 prompt 架构改造后）：
 * - 8 个段落文件 + Context segments
 */

import { promises as fs } from "fs";
import { fileURLToPath } from "url";
import { dirname, join } from "path";

const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// 粗略估算：英文 ~4 chars/token，中文 ~1.5-2 chars/token
// 使用保守估算：混合文本 ~2.5 chars/token
const CHARS_PER_TOKEN = 2.5;

function estimateTokens(textOrLength) {
  const length = typeof textOrLength === "string" ? textOrLength.length : textOrLength;
  return Math.ceil(length / CHARS_PER_TOKEN);
}

async function readSegmentFiles() {
  const segmentDir = join(__dirname, "../lib/engine/prompts/segments/writer");
  const files = [
    "identity.ts",
    "cot.ts",
    "style-base.ts",
    "narrative-rules.ts",
    "dialogue.ts",
    "guardrails.ts",
    "pacing.ts",
    "format.ts"
  ];

  let totalChars = 0;
  const segments = [];

  for (const file of files) {
    const filePath = join(segmentDir, file);
    const content = await fs.readFile(filePath, "utf-8");

    // 提取 content 字段（多行模板字符串）
    const match = content.match(/content:\s*`([^`]*)`/s);
    if (match) {
      const segmentContent = match[1].trim();
      const chars = segmentContent.length;
      const tokens = estimateTokens(segmentContent);

      segments.push({
        file,
        chars,
        tokens,
        enabled: !file.includes("cot") // COT 默认关闭
      });

      if (!file.includes("cot")) {
        totalChars += chars;
      }
    }
  }

  return { segments, totalChars, totalTokens: estimateTokens(totalChars) };
}

async function estimateContextSegments() {
  // 估算 Context segments 的典型大小
  const estimates = {
    "world-style": 150,        // 世界观 + 画风
    "story-spine": 300,         // 故事骨架（logline + genreTags + protagonist）
    "character-cards": 500,     // 3个角色卡 * ~150 chars
    "prior-sceneKeys": 100,     // 5个 sceneKey
    "archived-history": 800,    // 2个已完结场景摘要
    "lore-constant": 200,       // 2-3个恒定知识条目
    "story-dynamic": 400,       // synopsis + openThreads + relationships + nextHook
    "last-beat": 200,           // 上一刻文本
    "transition-hint": 150,     // 转场提示
    "lore-triggered": 150       // 1-2个触发条目
  };

  const stableChars = estimates["world-style"] + estimates["story-spine"] +
                      estimates["character-cards"] + estimates["prior-sceneKeys"] +
                      estimates["archived-history"] + estimates["lore-constant"];

  const dynamicChars = estimates["story-dynamic"] + estimates["last-beat"] +
                       estimates["transition-hint"] + estimates["lore-triggered"];

  return {
    stable: { chars: stableChars, tokens: estimateTokens(stableChars) },
    dynamic: { chars: dynamicChars, tokens: estimateTokens(dynamicChars) }
  };
}

async function estimateOldPrompt() {
  // 旧版本 WRITER_STREAM_SYSTEM（已删除）的估算
  // 从 git history 可知约 140 行，平均每行 ~60 chars（中英混合）
  const estimatedLines = 140;
  const avgCharsPerLine = 60;
  const totalChars = estimatedLines * avgCharsPerLine;

  return {
    chars: totalChars,
    tokens: estimateTokens(totalChars)
  };
}

console.log("📊 Task 23: Token 预算估算\n");
console.log("═".repeat(60));

// 新版本 Prompt 段落
const { segments, totalChars: segmentChars, totalTokens: segmentTokens } = await readSegmentFiles();

console.log("\n【新版本：8 个 Prompt 段落】");
console.log("-".repeat(60));
for (const seg of segments) {
  const status = seg.enabled ? "✓" : "✗ (disabled)";
  console.log(`${status} ${seg.file.padEnd(25)} ${seg.chars.toString().padStart(5)} chars  ~${seg.tokens} tokens`);
}
console.log("-".repeat(60));
console.log(`启用段落总计:                     ${segmentChars.toString().padStart(5)} chars  ~${segmentTokens} tokens\n`);

// Context segments
const context = await estimateContextSegments();
console.log("【新版本：Context Segments 估算】");
console.log("-".repeat(60));
console.log(`Stable 区 (cached):               ${context.stable.chars.toString().padStart(5)} chars  ~${context.stable.tokens} tokens`);
console.log(`Dynamic 区 (每次变化):             ${context.dynamic.chars.toString().padStart(5)} chars  ~${context.dynamic.tokens} tokens`);
console.log("-".repeat(60));
console.log(`Context 总计:                     ${(context.stable.chars + context.dynamic.chars).toString().padStart(5)} chars  ~${context.stable.tokens + context.dynamic.tokens} tokens\n`);

// 新版本总计
const newTotalTokens = segmentTokens + context.stable.tokens + context.dynamic.tokens;
console.log("【新版本总计】");
console.log("-".repeat(60));
console.log(`Prompt 段落 + Context:            ~${newTotalTokens} tokens\n`);

// 旧版本估算
const oldPrompt = await estimateOldPrompt();
console.log("【旧版本估算（WRITER_STREAM_SYSTEM）】");
console.log("-".repeat(60));
console.log(`硬编码模板字符串 (~140 lines):     ${oldPrompt.chars.toString().padStart(5)} chars  ~${oldPrompt.tokens} tokens`);
console.log(`Context (buildWriterContext):     估算与新版本相近，~${context.stable.tokens + context.dynamic.tokens} tokens\n`);

const oldTotalTokens = oldPrompt.tokens + context.stable.tokens + context.dynamic.tokens;

// 对比
console.log("【对比结果】");
console.log("═".repeat(60));
console.log(`旧版本总计:                       ~${oldTotalTokens} tokens`);
console.log(`新版本总计:                       ~${newTotalTokens} tokens`);
const delta = newTotalTokens - oldTotalTokens;
console.log(`增量 (Δ):                         ~${delta > 0 ? '+' : ''}${delta} tokens`);
console.log();

if (Math.abs(delta) <= 1500) {
  console.log(`✅ Token 增量在可控范围内 (|Δ| ≤ 1500)`);
} else {
  console.log(`⚠️  Token 增量超出预期 (|Δ| > 1500)`);
}

console.log("\n💡 注意事项:");
console.log("   - 此估算基于文本长度，实际 token 数取决于 tokenizer");
console.log("   - Context segments 使用典型场景估算（3角色，2场景历史）");
console.log("   - 禁词表（10个词）增加 ~20 tokens");
console.log("   - 实际 token 消耗需通过 Anthropic API usage 统计验证");
console.log("\n📄 建议通过 wrangler tail 监控实际 token 消耗");