fix(ai-client): clean up regressions from OpenAI SDK migration and canvas frame fix (#74)

Three follow-ups to ef3b579 (OpenAI SDK migration) and ebe39ef (canvas frame):

- .env.example / config.ts / AGENTS.md: the anthropic & google native protocols were removed along with the Vercel AI SDK, but .env.example and AGENTS.md still advertised them. Rewrite the docs to point Claude/Gemini at their OpenAI-compatible endpoints (api.anthropic.com/v1, generativelanguage.googleapis.com/v1beta/openai), drop the dead Gemini "Nano Banana" image example, sync AGENTS.md (text/vision protocol list, image protocol list, the "OpenAI/Gemini via AI SDK" reference note), and append a short hint to the readProvider() error message that guides anthropic/google users to openai_compatible instead of issuing a bare rejection.
- chat.ts: drop the unsafe `as { prompt_tokens_details?: ... }` cast; read cached_tokens straight off the SDK's CompletionUsage type. Add a comment noting that the OpenAI usage object reports cache reads only (no cache-write count), so the cache-creation cost that the old AI SDK path logged can no longer be recovered.
- PlayCanvas.tsx: revert <img key={imageUrl}> to key={imageUrl.slice(-48)}. The gpt-image/mock paths emit multi-MB data URIs; using the full string as React's reconciliation key adds avoidable diff overhead during the frequent re-renders. This matches the existing <audio> element's key convention.

Validation: pnpm typecheck passes. (pnpm lint fails on a pre-existing Next 16 `next lint` CLI issue, identical on staging — unrelated to this change.)
2026-06-14 13:36:19 +08:00
parent 9157454b46
commit 0dea2f8e36
5 changed files with 43 additions and 27 deletions
@@ -3,18 +3,22 @@
 # Recommended setup: Xiaomi MiMo Token Plan for TEXT / VISION / TTS
 # (one API key covers all three) + Runware for IMAGE (FLUX.2 [klein]).
 #
-# TEXT / VISION default to any OpenAI-compatible endpoint, and can switch to
-# native Anthropic or Google Gemini via TEXT_PROVIDER / VISION_PROVIDER.
+# TEXT / VISION speak the OpenAI wire format (IMAGE can too; see below).
+# Anthropic Claude and Google Gemini are reachable through their own
+# OpenAI-compatible endpoints (see TEXT_PROVIDER notes below) — no native
+# protocol switch is needed.
 # TTS uses Xiaomi MiMo's own voice design / clone protocol
 # (not OpenAI-compatible; appends -voicedesign / -voiceclone).
 #
-# IMAGE supports Runware (its own task-array protocol), OpenAI (gpt-image),
-# and Google Gemini (Nano Banana) via IMAGE_PROVIDER.
+# IMAGE supports Runware (its own task-array protocol) and OpenAI (gpt-image)
+# via IMAGE_PROVIDER.
 #
 # *_PROVIDER (optional) selects the wire protocol; leave unset for the
-# OpenAI-compatible default (image is auto-detected from the URL). Base URLs
-# tolerate a missing or extra /v1 (or a trailing /chat/completions) — the
-# engine normalizes them.
+# OpenAI-compatible default (image is auto-detected from the URL). Valid
+# values are openai_compatible, plus openai / runware for IMAGE only — the
+# native "anthropic" / "google" protocols went away with the Vercel AI SDK.
+# Base URLs tolerate a missing or extra /v1 (or a trailing /chat/completions)
+# — the engine normalizes them.
 # =============================================================

 # ---- 1. Text LLM · scene director ----------------------------------
@@ -30,9 +34,11 @@
 TEXT_BASE_URL=https://api.deepseek.com/v1
 TEXT_API_KEY=sk-xxx
 TEXT_MODEL=deepseek-v4-flash
-# TEXT_PROVIDER: openai_compatible (default) | anthropic | google
-#   anthropic → TEXT_BASE_URL=https://api.anthropic.com  TEXT_MODEL=claude-sonnet-4-6
-#   google    → TEXT_BASE_URL=https://generativelanguage.googleapis.com  TEXT_MODEL=gemini-3.5-flash
+# TEXT_PROVIDER: openai_compatible (default). This is the ONLY supported text
+# protocol. To use Claude or Gemini, leave TEXT_PROVIDER unset and point at
+# their OpenAI-compatible endpoints:
+#   Claude  → TEXT_BASE_URL=https://api.anthropic.com/v1  TEXT_MODEL=claude-sonnet-4-6
+#   Gemini  → TEXT_BASE_URL=https://generativelanguage.googleapis.com/v1beta/openai  TEXT_MODEL=gemini-3.5-flash
 # TEXT_PROVIDER=openai_compatible

 # ---- 2. Image generator (renders the scene background) -------------
@@ -44,14 +50,10 @@ TEXT_MODEL=deepseek-v4-flash
 IMAGE_BASE_URL=https://api.runware.ai/v1
 IMAGE_API_KEY=runware-xxx
 IMAGE_MODEL=runware:400@6
-# IMAGE_PROVIDER: runware (auto-detected for runware.ai) | openai_compatible
-#                 | openai | google
+# IMAGE_PROVIDER: runware (auto-detected for runware.ai) | openai_compatible | openai
 #   openai → gpt-image, supports referenceImages (character/scene continuity).
 #            IMAGE_BASE_URL=https://api.openai.com  IMAGE_MODEL=gpt-image-1
-#   google → Gemini "Nano Banana" (Imagen is EOL 2026-06-24, do not use it).
-#            IMAGE_BASE_URL=https://generativelanguage.googleapis.com
-#            IMAGE_MODEL=gemini-2.5-flash-image
-# NOTE: openai/google return raw bytes → inlined as a data: URI for the session
+# NOTE: openai returns raw bytes → inlined as a data: URI for the session
 # (heavier per-call transport than Runware's UUID re-reference loop). Runware
 # stays fastest + cheapest for the scene-by-scene flow.
 # IMAGE_PROVIDER=runware
@@ -77,9 +79,9 @@ IMAGE_MODEL=runware:400@6
 VISION_BASE_URL=https://token-plan-sgp.xiaomimimo.com/v1
 VISION_API_KEY=tp-xxx
 VISION_MODEL=mimo-v2.5
-# VISION_PROVIDER: openai_compatible (default) | anthropic | google
-#   anthropic → VISION_BASE_URL=https://api.anthropic.com  VISION_MODEL=claude-sonnet-4-6
-#   google    → VISION_BASE_URL=https://generativelanguage.googleapis.com  VISION_MODEL=gemini-3.5-flash
+# VISION_PROVIDER: openai_compatible (default). Only openai_compatible is
+# supported — reach Claude/Gemini via their OpenAI-compatible endpoints
+# (same base URLs as TEXT above). Leave unset to use the default.
 # VISION_PROVIDER=openai_compatible

 # ---- 4. TTS (optional — leave blank to disable) --------------------
@@ -79,7 +79,7 @@ Maintain graceful degradation. Existing flows tolerate malformed AI JSON, failed

 `sceneKey` identifies a physical space such as `"classroom-dusk"`. If a new scene shares a key with prior history, the prior scene image should be reused as a reference. Character portraits are also references.

-Runware allows at most 4 references. Preserve the priority: style reference image, prior scene, speaker portrait, then other NPCs. Prefer image URLs for `referenceImages` when needed because Runware can fail to recognize UUIDs. The OpenAI/Gemini image paths can also accept references through the AI SDK, but they return data URIs and synthetic UUIDs, so repeated session transport is heavier than Runware's URL/UUID loop.
+Runware allows at most 4 references. Preserve the priority: style reference image, prior scene, speaker portrait, then other NPCs. Prefer image URLs for `referenceImages` when needed because Runware can fail to recognize UUIDs. The native OpenAI image path (gpt-image) can also accept references via `images.edit`, but returns data URIs and synthetic UUIDs, so repeated session transport is heavier than Runware's URL/UUID loop.

 Writer prompt caching depends on `buildWriterPlanUserMessage()` and `buildWriterBeatsUserMessage()` keeping their stable prefixes intact: world, style, story spine, archived history, known scene keys, and character list. The dynamic suffix contains current state, last beat, exit hint, and the current plan. Do not reorder or reformat stable prefix sections casually; it can destroy cache hit rates.

@@ -136,8 +136,8 @@ Comment only non-obvious sequencing, provider quirks, fallback behavior, or arch

 Use `.env.example` as the source of truth. Never commit `.env.local`, API keys, uploaded user content, or generated secrets.

-- Text and Vision use `TEXT_*` and `VISION_*`; default protocol is `openai_compatible`, with native `anthropic` and `google` available via `TEXT_PROVIDER` / `VISION_PROVIDER`.
-- Image uses `IMAGE_*`; supported protocols are `runware`, `openai_compatible`, native `openai`, and native `google`. When `IMAGE_PROVIDER` is unset, Runware is inferred from `*.runware.ai` URLs and otherwise falls back to OpenAI-compatible image generations.
+- Text and Vision use `TEXT_*` and `VISION_*` over the `openai_compatible` protocol (the only supported text/vision protocol); Claude and Gemini are reached via their own OpenAI-compatible endpoints with the `*_PROVIDER` var unset.
+- Image uses `IMAGE_*`; supported protocols are `runware`, `openai_compatible`, and native `openai`. When `IMAGE_PROVIDER` is unset, Runware is inferred from `*.runware.ai` URLs and otherwise falls back to OpenAI-compatible image generations.
 - `IMAGE_TIMEOUT_MS` (per-attempt hard deadline) and `IMAGE_HEDGE_MS` (Painter scene-paint hedging: race a second request when the first is still pending after the threshold) are both OFF when unset — the default path must stay byte-identical to historical behavior. Hedging applies only to the Tier-A scene paint, never to portraits, and never fires after a fast failure (saturation guard). Client-side engine configs (`resolveEngineConfig`) intentionally do not set these fields.
 - TTS supports Xiaomi MiMo (voicedesign + voiceclone) or StepFun (preset voices auto-selected by keyword scoring), inferred from `TTS_BASE_URL` (host containing `stepfun.com` → StepFun, otherwise → MiMo). `CharacterVoice` is a discriminated union on `provider`; synth dispatches on the voice's own tag so a session may carry both shapes through a provider switch. Blank config means silent mode.
 - `MOCK_IMAGE=true` skips image generation and returns a placeholder for cheap local iteration.
@@ -399,9 +399,12 @@ export function PlayCanvas({
        >
          {/* The stable wrapper owns the frame size. Keeping overlay geometry
              independent of <img> decode/source swaps prevents controls from
-              jumping when a newly generated image is committed. */}
+              jumping when a newly generated image is committed. The key uses
+              a short high-entropy slice (matching the <audio> element) so data
+              URIs from the gpt-image/mock paths — which can be several MB —
+              don't become React's reconciliation key. */}
          <img
-            key={imageUrl}
+            key={imageUrl.slice(-48)}
            ref={imgRef}
            src={imageUrl}
            width={intrinsicW}
@@ -7,6 +7,12 @@ export type ChatMessage = {
  content: string;
 };

+// Cache observability for the prompt-prefix caching that the Writer stable
+// prefix relies on. The OpenAI usage object reports only cached READS
+// (prompt_tokens_details.cached_tokens) and has no field for cache WRITES
+// (tokens written to the cache on a cold pass), so unlike the old AI SDK
+// path we can show the hit rate but not the create cost. cached_tokens lives
+// directly on the SDK's CompletionUsage type — no cast needed.
 function summarizeSdkUsage(
  tag: string,
  usage: OpenAI.Completions.CompletionUsage | undefined,
@@ -14,8 +20,7 @@ function summarizeSdkUsage(
  if (!usage) return `[cache] ${tag} no-usage`;
  const input = usage.prompt_tokens ?? 0;
  const output = usage.completion_tokens ?? 0;
-  const details = (usage as { prompt_tokens_details?: { cached_tokens?: number } }).prompt_tokens_details;
-  const cached = details?.cached_tokens;
+  const cached = usage.prompt_tokens_details?.cached_tokens;
  if (typeof cached === "number") {
    const rate = input > 0 ? ((cached / input) * 100).toFixed(1) : "n/a";
    return `[cache] ${tag} hit=${cached} input=${input} rate=${rate}% completion=${output}`;
@@ -40,8 +40,14 @@ function readProvider(name: string): ProviderProtocol | undefined {
  if ((VALID_PROTOCOLS as readonly string[]).includes(v)) {
    return v as ProviderProtocol;
  }
+  // anthropic/google were removed with the Vercel AI SDK — nudge users who
+  // still set them toward the OpenAI-compatible endpoints (see .env.example).
+  const hint =
+    v === "anthropic" || v === "google"
+      ? ` — use openai_compatible with their OpenAI-compatible endpoint instead`
+      : "";
  throw new Error(
-    `Invalid ${name}: "${v}". Must be one of: ${VALID_PROTOCOLS.join(", ")}`,
+    `Invalid ${name}: "${v}". Must be one of: ${VALID_PROTOCOLS.join(", ")}${hint}`,
  );
 }