# =============================================================
# 云梦 — AI 视觉小说
# Recommended setup: Xiaomi MiMo Token Plan for TEXT / VISION / TTS
# (one API key covers all three) + any image provider for IMAGE.
#
# Any OpenAI-compatible endpoint works for any slot — OpenRouter,
# OpenAI, Anthropic via OpenAI-compat proxy, Gemini, DeepSeek, etc.
# Image generation uses the chat-completions + modalities API
# (OpenRouter-style), NOT the legacy /images/generations endpoint.
# =============================================================

# ---- 1. Text LLM · scene director ----------------------------------
# Recommended: MiMo V2.5 Pro (1M context, native JSON-mode, strong CN)
# Token Plan host: https://token-plan-sgp.xiaomimimo.com/v1
# Pay-as-you-go host: https://api.xiaomimimo.com/v1 (sk- keys)
TEXT_BASE_URL=https://token-plan-sgp.xiaomimimo.com/v1
TEXT_API_KEY=tp-xxx
TEXT_MODEL=mimo-v2.5-pro

# ---- 2. Image generator (renders the scene background) -------------
# Any provider supporting chat-completions + modalities image output.
IMAGE_BASE_URL=https://openrouter.ai/api/v1
IMAGE_API_KEY=sk-or-v1-xxx
IMAGE_MODEL=openai/gpt-5.4-image-2

# ---- 3. Vision model · multimodal click interpretation -------------
# Recommended: MiMo V2.5 omni — multimodal.
# ⚠️ DO NOT use mimo-v2.5-pro for this slot — Pro is text-only and
#    rejects image_url content parts.
VISION_BASE_URL=https://token-plan-sgp.xiaomimimo.com/v1
VISION_API_KEY=tp-xxx
VISION_MODEL=mimo-v2.5

# ---- 4. TTS · Xiaomi MiMo (optional — leave blank to disable) ------
# Per-character voice design → clone, with per-line delivery direction.
# Voice identity = the reference audio kept in the session (no server expiry).
# The adapter appends -voicedesign / -voiceclone to TTS_SPEECH_MODEL.
TTS_BASE_URL=https://token-plan-sgp.xiaomimimo.com/v1
TTS_API_KEY=tp-xxx
TTS_SPEECH_MODEL=mimo-v2.5-tts

# ---- 5. MOCK_IMAGE — skip image generation (cheap TTS testing) -----
# true → return a placeholder image instead of calling the image model.
# Text/story/voice still run normally. Great for iterating on TTS.
MOCK_IMAGE=false