# =============================================================
#  云梦 (Yunmeng) — AI visual novel
#  Recommended setup: Xiaomi MiMo Token Plan for TEXT / VISION / TTS
#  (one API key covers all three) + any image provider for IMAGE.
#
#  Any OpenAI-compatible endpoint works for any slot — OpenRouter,
#  OpenAI, Anthropic via OpenAI-compat proxy, Gemini, DeepSeek, etc.
#  Image generation uses the chat-completions + modalities API
#  (OpenRouter-style), NOT the legacy /images/generations endpoint.
#
#  Format: one KEY=value assignment per line; comments go on their
#  own line starting with '#'.
# =============================================================

# ---- 1. Text LLM · scene director ----------------------------------
# Recommended: MiMo V2.5 Pro (1M context, native JSON-mode, strong CN)
# Token Plan host:     https://token-plan-sgp.xiaomimimo.com/v1
# Pay-as-you-go host:  https://api.xiaomimimo.com/v1  (sk- keys)
TEXT_BASE_URL=https://token-plan-sgp.xiaomimimo.com/v1
TEXT_API_KEY=tp-xxx
TEXT_MODEL=mimo-v2.5-pro

# ---- 2. Image generator (renders the scene background) -------------
# Any provider supporting chat-completions + modalities image output.
IMAGE_BASE_URL=https://openrouter.ai/api/v1
IMAGE_API_KEY=sk-or-v1-xxx
IMAGE_MODEL=openai/gpt-5.4-image-2

# ---- 3. Vision model · multimodal click interpretation -------------
# Recommended: MiMo V2.5 omni — multimodal.
# ⚠️ DO NOT use mimo-v2.5-pro for this slot — Pro is text-only and
#    rejects image_url content parts.
VISION_BASE_URL=https://token-plan-sgp.xiaomimimo.com/v1
VISION_API_KEY=tp-xxx
VISION_MODEL=mimo-v2.5

# ---- 4. TTS · Xiaomi MiMo (optional — leave blank to disable) ------
# Per-character voice design → clone, with per-line delivery direction.
# Voice identity = the reference audio kept in the session (no server expiry).
# The adapter appends -voicedesign / -voiceclone to TTS_SPEECH_MODEL.
TTS_BASE_URL=https://token-plan-sgp.xiaomimimo.com/v1
TTS_API_KEY=tp-xxx
TTS_SPEECH_MODEL=mimo-v2.5-tts

# ---- 5. MOCK_IMAGE — skip image generation (cheap TTS testing) -----
# true → return a placeholder image instead of calling the image model.
# Text/story/voice still run normally. Great for iterating on TTS.
MOCK_IMAGE=false