chore(engine): log prompt-cache hit/miss per chat call
Add a `tag` option to chat() and have it print one `[cache] <tag> hit=X miss=Y rate=Z%` line per call.

Three Usage-shape variants are probed in order so the same logger works across providers:
- DeepSeek (v3+): usage.prompt_cache_hit_tokens / usage.prompt_cache_miss_tokens
- OpenAI / o-series: usage.prompt_tokens_details.cached_tokens
- Anthropic: usage.cache_read_input_tokens / usage.cache_creation_input_tokens

When none of them are present (MiMo / local Ollama / others) we still print prompt + completion totals so the cost baseline is visible.

Tag every callsite so the log is greppable: architect / writer / character-designer / cinematographer / insert-beat.

This is the prerequisite for the prefix-cache reordering work that follows — without per-agent visibility there's no way to tell if a prompt rearrangement actually moved the needle.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+62
-2
@@ -6,10 +6,65 @@ export type ChatMessage = {
|
|||||||
content: string;
|
content: string;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Different providers expose prompt-cache stats under different keys. We probe
// for the three forms we've seen in the wild and fall back to total tokens
// when no cache field exists.
//
//   DeepSeek (v3+)       usage.prompt_cache_hit_tokens / prompt_cache_miss_tokens
//   OpenAI / o-series    usage.prompt_tokens_details.cached_tokens
//   Anthropic / others   usage.cache_read_input_tokens / cache_creation_input_tokens
//   No-cache (MiMo,
//     local Ollama, …)   only prompt_tokens / completion_tokens — print those
//                        so we still get a rough cost baseline.
//
// Every field is optional because no single provider reports all of them.
type Usage = {
  prompt_tokens?: number;
  completion_tokens?: number;
  // Anthropic-native spellings of the two totals above; used only as a
  // fallback when the OpenAI-style names are absent.
  input_tokens?: number;
  output_tokens?: number;
  prompt_cache_hit_tokens?: number;
  prompt_cache_miss_tokens?: number;
  prompt_tokens_details?: { cached_tokens?: number };
  cache_read_input_tokens?: number;
  cache_creation_input_tokens?: number;
};

/**
 * Formats `hit / total` as a percentage with one decimal place.
 *
 * @returns e.g. `"75.0%"`, or `"n/a"` (without a `%` suffix) when `total` is
 *   not positive and no meaningful rate exists.
 */
function formatHitRate(hit: number, total: number): string {
  return total > 0 ? `${((hit / total) * 100).toFixed(1)}%` : "n/a";
}

/**
 * Builds the single `[cache] …` log line for one chat call.
 *
 * The provider-specific shapes are probed in a fixed order (DeepSeek, OpenAI,
 * Anthropic); the first one whose marker field is a number wins. When none
 * matches, only the prompt/completion totals are reported.
 *
 * @param tag   Label identifying the caller, echoed verbatim in the line.
 * @param usage The `usage` object from the provider response, if any.
 * @returns The formatted log line; never throws for a missing/partial usage.
 */
function summarizeUsage(tag: string, usage: Usage | undefined): string {
  if (!usage) return `[cache] ${tag} no-usage`;

  // Prefer the OpenAI-style totals, then the Anthropic-native names, then 0.
  const prompt = usage.prompt_tokens ?? usage.input_tokens ?? 0;
  const completion = usage.completion_tokens ?? usage.output_tokens ?? 0;

  // DeepSeek-style: explicit hit and (usually) miss counters.
  if (typeof usage.prompt_cache_hit_tokens === "number") {
    const hit = usage.prompt_cache_hit_tokens;
    // If the miss counter is absent, derive it from the prompt total and
    // clamp at zero so an inconsistent payload never logs a negative miss.
    const miss = usage.prompt_cache_miss_tokens ?? Math.max(0, prompt - hit);
    const rate = formatHitRate(hit, hit + miss);
    return `[cache] ${tag} hit=${hit} miss=${miss} rate=${rate} completion=${completion}`;
  }

  // OpenAI-style: cached tokens are reported as a subset of the prompt total.
  const oaiCached = usage.prompt_tokens_details?.cached_tokens;
  if (typeof oaiCached === "number") {
    const miss = Math.max(0, prompt - oaiCached);
    const rate = formatHitRate(oaiCached, prompt);
    return `[cache] ${tag} hit=${oaiCached} miss=${miss} rate=${rate} completion=${completion}`;
  }

  // Anthropic-style: cache reads and cache writes are counted separately and
  // added to the prompt total to form the denominator.
  if (typeof usage.cache_read_input_tokens === "number") {
    const hit = usage.cache_read_input_tokens;
    const create = usage.cache_creation_input_tokens ?? 0;
    const rate = formatHitRate(hit, hit + create + prompt);
    return `[cache] ${tag} hit=${hit} create=${create} miss=${prompt} rate=${rate} completion=${completion}`;
  }

  // No cache field at all
  return `[cache] ${tag} prompt=${prompt} completion=${completion} (provider didn't report cache stats)`;
}
|
||||||
|
|
||||||
export async function chat(
|
export async function chat(
|
||||||
config: ProviderConfig,
|
config: ProviderConfig,
|
||||||
messages: ChatMessage[],
|
messages: ChatMessage[],
|
||||||
opts?: { temperature?: number; responseFormat?: "json_object" | "text" },
|
opts?: {
|
||||||
|
temperature?: number;
|
||||||
|
responseFormat?: "json_object" | "text";
|
||||||
|
tag?: string;
|
||||||
|
},
|
||||||
): Promise<string> {
|
): Promise<string> {
|
||||||
const url = `${config.baseUrl.replace(/\/$/, "")}/chat/completions`;
|
const url = `${config.baseUrl.replace(/\/$/, "")}/chat/completions`;
|
||||||
const body: Record<string, unknown> = {
|
const body: Record<string, unknown> = {
|
||||||
@@ -35,7 +90,10 @@ export async function chat(
|
|||||||
throw new Error(`Chat API error ${res.status}: ${text}`);
|
throw new Error(`Chat API error ${res.status}: ${text}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
let json: { choices: { message: { content: string } }[] };
|
let json: {
|
||||||
|
choices: { message: { content: string } }[];
|
||||||
|
usage?: Usage;
|
||||||
|
};
|
||||||
try {
|
try {
|
||||||
json = JSON.parse(text);
|
json = JSON.parse(text);
|
||||||
} catch {
|
} catch {
|
||||||
@@ -50,5 +108,7 @@ export async function chat(
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
console.log(summarizeUsage(opts?.tag ?? "chat", json.usage));
|
||||||
|
|
||||||
return content;
|
return content;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -53,7 +53,7 @@ export async function runArchitect(
|
|||||||
{ role: "system", content: ARCHITECT_SYSTEM },
|
{ role: "system", content: ARCHITECT_SYSTEM },
|
||||||
{ role: "user", content: buildArchitectUserMessage(session) },
|
{ role: "user", content: buildArchitectUserMessage(session) },
|
||||||
],
|
],
|
||||||
{ temperature: 0.85, responseFormat: "json_object" },
|
{ temperature: 0.85, responseFormat: "json_object", tag: "architect" },
|
||||||
);
|
);
|
||||||
|
|
||||||
const parsed = parseJsonLoose<RawStoryState>(raw);
|
const parsed = parseJsonLoose<RawStoryState>(raw);
|
||||||
|
|||||||
@@ -56,7 +56,7 @@ async function runDesignLLM(
|
|||||||
content: buildCharacterDesignerUserMessage(charName, session),
|
content: buildCharacterDesignerUserMessage(charName, session),
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
{ temperature: 0.7, responseFormat: "json_object" },
|
{ temperature: 0.7, responseFormat: "json_object", tag: "character-designer" },
|
||||||
);
|
);
|
||||||
return parseJsonLoose<CharacterDesignOutput>(raw);
|
return parseJsonLoose<CharacterDesignOutput>(raw);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -67,7 +67,7 @@ export async function runCinematographer(
|
|||||||
),
|
),
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
{ temperature: 0.6, responseFormat: "json_object" },
|
{ temperature: 0.6, responseFormat: "json_object", tag: "cinematographer" },
|
||||||
);
|
);
|
||||||
|
|
||||||
const parsed = parseJsonLoose<RawCinematographerOutput>(raw);
|
const parsed = parseJsonLoose<RawCinematographerOutput>(raw);
|
||||||
|
|||||||
@@ -369,7 +369,7 @@ export async function runWriter(
|
|||||||
{ role: "system", content: WRITER_SYSTEM },
|
{ role: "system", content: WRITER_SYSTEM },
|
||||||
{ role: "user", content: buildWriterUserMessage(session) },
|
{ role: "user", content: buildWriterUserMessage(session) },
|
||||||
],
|
],
|
||||||
{ temperature: 0.9, responseFormat: "json_object" },
|
{ temperature: 0.9, responseFormat: "json_object", tag: "writer" },
|
||||||
);
|
);
|
||||||
|
|
||||||
const parsed = parseJsonLoose<RawScene>(raw);
|
const parsed = parseJsonLoose<RawScene>(raw);
|
||||||
|
|||||||
@@ -405,7 +405,7 @@ export async function directInsertBeat(
|
|||||||
content: buildInsertBeatUserMessage(session, freeformAction),
|
content: buildInsertBeatUserMessage(session, freeformAction),
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
{ temperature: 0.9, responseFormat: "json_object" },
|
{ temperature: 0.9, responseFormat: "json_object", tag: "insert-beat" },
|
||||||
);
|
);
|
||||||
|
|
||||||
const parsed = parseJsonLoose<InsertBeatPartial>(raw);
|
const parsed = parseJsonLoose<InsertBeatPartial>(raw);
|
||||||
|
|||||||
Reference in New Issue
Block a user