fix(ai-client): clean up regressions from OpenAI SDK migration and canvas frame fix (#74)

Three follow-ups to ef3b579 (OpenAI SDK migration) and ebe39ef (canvas frame):

- .env.example / config.ts / AGENTS.md: the anthropic & google native protocols were removed along with the Vercel AI SDK, but .env.example and AGENTS.md still advertised them. Rewrite the docs to point Claude/Gemini at their OpenAI-compatible endpoints (api.anthropic.com/v1, generativelanguage.googleapis.com/v1beta/openai), drop the dead Gemini "Nano Banana" image example, sync AGENTS.md (text/vision protocol list, image protocol list, the "OpenAI/Gemini via AI SDK" reference note), and append a short hint to the readProvider() error message that guides anthropic/google users to openai_compatible instead of issuing a bare rejection.
- chat.ts: drop the unsafe `as { prompt_tokens_details?: ... }` cast; read cached_tokens straight off the SDK's CompletionUsage type. Add a comment noting that the OpenAI usage object reports cache reads only (no cache-write count), so the create cost that the old AI SDK path logged is unrecoverable.
- PlayCanvas.tsx: revert <img key={imageUrl}> to key={imageUrl.slice(-48)}. The gpt-image/mock paths emit multi-MB data URIs; using the full string as React's reconciliation key adds avoidable diff overhead during the frequent re-renders. Matches the existing <audio> element's key convention.

Validation: pnpm typecheck passes. (pnpm lint fails on a pre-existing Next 16 `next lint` CLI issue, identical on staging — unrelated to this change.)
2026-06-14 13:36:19 +08:00
parent 9157454b46
commit 0dea2f8e36
5 changed files with 43 additions and 27 deletions
@@ -3,18 +3,22 @@
 # Recommended setup: Xiaomi MiMo Token Plan for TEXT / VISION / TTS
 # (one API key covers all three) + Runware for IMAGE (FLUX.2 [klein]).
 #
-# TEXT / VISION default to any OpenAI-compatible endpoint, and can switch to
+# TEXT / VISION / IMAGE all speak the OpenAI wire format. Anthropic Claude
-# native Anthropic or Google Gemini via TEXT_PROVIDER / VISION_PROVIDER.
+# and Google Gemini are reachable through their own OpenAI-compatible
+# endpoints (see TEXT_PROVIDER notes below) — no native protocol switch is
+# needed.
 # TTS uses Xiaomi MiMo's own voice design / clone protocol
 # (not OpenAI-compatible; appends -voicedesign / -voiceclone).
 #
-# IMAGE supports Runware (its own task-array protocol), OpenAI (gpt-image),
+# IMAGE supports Runware (its own task-array protocol) and OpenAI (gpt-image)
-# and Google Gemini (Nano Banana) via IMAGE_PROVIDER.
+# via IMAGE_PROVIDER.
 #
 # *_PROVIDER (optional) selects the wire protocol; leave unset for the
-# OpenAI-compatible default (image is auto-detected from the URL). Base URLs
+# OpenAI-compatible default (image is auto-detected from the URL). Valid
-# tolerate a missing or extra /v1 (or a trailing /chat/completions) — the
+# values are openai_compatible / openai / runware — native "anthropic" /
-# engine normalizes them.
+# "google" protocols were removed when the Vercel AI SDK was dropped.
+# Base URLs tolerate a missing or extra /v1 (or a trailing /chat/completions)
+# — the engine normalizes them.
 # =============================================================
 # ---- 1. Text LLM · scene director ----------------------------------
@@ -30,9 +34,11 @@
 TEXT_BASE_URL=https://api.deepseek.com/v1
 TEXT_API_KEY=sk-xxx
 TEXT_MODEL=deepseek-v4-flash
-# TEXT_PROVIDER: openai_compatible (default) | anthropic | google
+# TEXT_PROVIDER: openai_compatible (default). This is the ONLY supported text
-#   anthropic → TEXT_BASE_URL=https://api.anthropic.com  TEXT_MODEL=claude-sonnet-4-6
+# protocol. To use Claude or Gemini, leave TEXT_PROVIDER unset and point at
-#   google    → TEXT_BASE_URL=https://generativelanguage.googleapis.com  TEXT_MODEL=gemini-3.5-flash
+# their OpenAI-compatible endpoints:
+#   Claude  → TEXT_BASE_URL=https://api.anthropic.com/v1  TEXT_MODEL=claude-sonnet-4-6
+#   Gemini  → TEXT_BASE_URL=https://generativelanguage.googleapis.com/v1beta/openai  TEXT_MODEL=gemini-3.5-flash
 # TEXT_PROVIDER=openai_compatible
 # ---- 2. Image generator (renders the scene background) -------------
@@ -44,14 +50,10 @@ TEXT_MODEL=deepseek-v4-flash
 IMAGE_BASE_URL=https://api.runware.ai/v1
 IMAGE_API_KEY=runware-xxx
 IMAGE_MODEL=runware:400@6
-# IMAGE_PROVIDER: runware (auto-detected for runware.ai) | openai_compatible
+# IMAGE_PROVIDER: runware (auto-detected for runware.ai) | openai_compatible | openai
-#                 | openai | google
 #   openai → gpt-image, supports referenceImages (character/scene continuity).
 #            IMAGE_BASE_URL=https://api.openai.com  IMAGE_MODEL=gpt-image-1
-#   google → Gemini "Nano Banana" (Imagen is EOL 2026-06-24, do not use it).
+# NOTE: openai returns raw bytes → inlined as a data: URI for the session
-#            IMAGE_BASE_URL=https://generativelanguage.googleapis.com
-#            IMAGE_MODEL=gemini-2.5-flash-image
-# NOTE: openai/google return raw bytes → inlined as a data: URI for the session
 # (heavier per-call transport than Runware's UUID re-reference loop). Runware
 # stays fastest + cheapest for the scene-by-scene flow.
 # IMAGE_PROVIDER=runware
@@ -77,9 +79,9 @@ IMAGE_MODEL=runware:400@6
 VISION_BASE_URL=https://token-plan-sgp.xiaomimimo.com/v1
 VISION_API_KEY=tp-xxx
 VISION_MODEL=mimo-v2.5
-# VISION_PROVIDER: openai_compatible (default) | anthropic | google
+# VISION_PROVIDER: openai_compatible (default). Only openai_compatible is
-#   anthropic → VISION_BASE_URL=https://api.anthropic.com  VISION_MODEL=claude-sonnet-4-6
+# supported — reach Claude/Gemini via their OpenAI-compatible endpoints
-#   google    → VISION_BASE_URL=https://generativelanguage.googleapis.com  VISION_MODEL=gemini-3.5-flash
+# (same base URLs as TEXT above). Leave unset to use the default.
 # VISION_PROVIDER=openai_compatible
 # ---- 4. TTS (optional — leave blank to disable) --------------------
@@ -79,7 +79,7 @@ Maintain graceful degradation. Existing flows tolerate malformed AI JSON, failed
 `sceneKey` identifies a physical space such as `"classroom-dusk"`. If a new scene shares a key with prior history, the prior scene image should be reused as a reference. Character portraits are also references.
-Runware allows at most 4 references. Preserve the priority: style reference image, prior scene, speaker portrait, then other NPCs. Prefer image URLs for `referenceImages` when needed because Runware can fail to recognize UUIDs. The OpenAI/Gemini image paths can also accept references through the AI SDK, but they return data URIs and synthetic UUIDs, so repeated session transport is heavier than Runware's URL/UUID loop.
+Runware allows at most 4 references. Preserve the priority: style reference image, prior scene, speaker portrait, then other NPCs. Prefer image URLs for `referenceImages` when needed because Runware can fail to recognize UUIDs. The native OpenAI image path (gpt-image) can also accept references via `images.edit`, but returns data URIs and synthetic UUIDs, so repeated session transport is heavier than Runware's URL/UUID loop.
 Writer prompt caching depends on `buildWriterPlanUserMessage()` and `buildWriterBeatsUserMessage()` keeping their stable prefixes intact: world, style, story spine, archived history, known scene keys, and character list. The dynamic suffix contains current state, last beat, exit hint, and the current plan. Do not reorder or reformat stable prefix sections casually; it can destroy cache hit rates.
@@ -136,8 +136,8 @@ Comment only non-obvious sequencing, provider quirks, fallback behavior, or arch
 Use `.env.example` as the source of truth. Never commit `.env.local`, API keys, uploaded user content, or generated secrets.
-- Text and Vision use `TEXT_*` and `VISION_*`; default protocol is `openai_compatible`, with native `anthropic` and `google` available via `TEXT_PROVIDER` / `VISION_PROVIDER`.
+- Text and Vision use `TEXT_*` and `VISION_*` over the `openai_compatible` protocol (the only supported text/vision protocol); Claude and Gemini are reached via their own OpenAI-compatible endpoints with the `*_PROVIDER` var unset.
-- Image uses `IMAGE_*`; supported protocols are `runware`, `openai_compatible`, native `openai`, and native `google`. When `IMAGE_PROVIDER` is unset, Runware is inferred from `*.runware.ai` URLs and otherwise falls back to OpenAI-compatible image generations.
+- Image uses `IMAGE_*`; supported protocols are `runware`, `openai_compatible`, and native `openai`. When `IMAGE_PROVIDER` is unset, Runware is inferred from `*.runware.ai` URLs and otherwise falls back to OpenAI-compatible image generations.
 - `IMAGE_TIMEOUT_MS` (per-attempt hard deadline) and `IMAGE_HEDGE_MS` (Painter scene-paint hedging: race a second request when the first is still pending after the threshold) are both OFF when unset — the default path must stay byte-identical to historical behavior. Hedging applies only to the Tier-A scene paint, never to portraits, and never fires after a fast failure (saturation guard). Client-side engine configs (`resolveEngineConfig`) intentionally do not set these fields.
 - TTS supports Xiaomi MiMo (voicedesign + voiceclone) or StepFun (preset voices auto-selected by keyword scoring), inferred from `TTS_BASE_URL` (host containing `stepfun.com` → StepFun, otherwise → MiMo). `CharacterVoice` is a discriminated union on `provider`; synth dispatches on the voice's own tag so a session may carry both shapes through a provider switch. Blank config means silent mode.
 - `MOCK_IMAGE=true` skips image generation and returns a placeholder for cheap local iteration.
@@ -399,9 +399,12 @@ export function PlayCanvas({
        >
          {/* The stable wrapper owns the frame size. Keeping overlay geometry
              independent of <img> decode/source swaps prevents controls from
-              jumping when a newly generated image is committed. */}
+              jumping when a newly generated image is committed. The key uses
+              a short high-entropy slice (matching the <audio> element) so data
+              URIs from the gpt-image/mock paths — which can be several MB —
+              don't become React's reconciliation key. */}
          <img
-            key={imageUrl}
+            key={imageUrl.slice(-48)}
            ref={imgRef}
            src={imageUrl}
            width={intrinsicW}
@@ -7,6 +7,12 @@ export type ChatMessage = {
  content: string;
 };
+// Cache observability for the prompt-prefix caching that the Writer stable
+// prefix relies on. The OpenAI usage object reports only cached READS
+// (prompt_tokens_details.cached_tokens) and has no field for cache WRITES
+// (tokens written to the cache on a cold pass), so unlike the old AI SDK
+// path we can show the hit rate but not the create cost. cached_tokens lives
+// directly on the SDK's CompletionUsage type — no cast needed.
 function summarizeSdkUsage(
  tag: string,
  usage: OpenAI.Completions.CompletionUsage | undefined,
@@ -14,8 +20,7 @@ function summarizeSdkUsage(
  if (!usage) return `[cache] ${tag} no-usage`;
  const input = usage.prompt_tokens ?? 0;
  const output = usage.completion_tokens ?? 0;
-  const details = (usage as { prompt_tokens_details?: { cached_tokens?: number } }).prompt_tokens_details;
+  const cached = usage.prompt_tokens_details?.cached_tokens;
-  const cached = details?.cached_tokens;
  if (typeof cached === "number") {
    const rate = input > 0 ? ((cached / input) * 100).toFixed(1) : "n/a";
    return `[cache] ${tag} hit=${cached} input=${input} rate=${rate}% completion=${output}`;
@@ -40,8 +40,14 @@ function readProvider(name: string): ProviderProtocol | undefined {
  if ((VALID_PROTOCOLS as readonly string[]).includes(v)) {
    return v as ProviderProtocol;
  }
+  // anthropic/google were removed with the Vercel AI SDK — nudge users who
+  // still set them toward the OpenAI-compatible endpoints (see .env.example).
+  const hint =
+    v === "anthropic" || v === "google"
+      ? ` — use openai_compatible with their OpenAI-compatible endpoint instead`
+      : "";
  throw new Error(
-    `Invalid ${name}: "${v}". Must be one of: ${VALID_PROTOCOLS.join(", ")}`,
+    `Invalid ${name}: "${v}". Must be one of: ${VALID_PROTOCOLS.join(", ")}${hint}`,
  );
 }