diff --git a/services/mana-image-gen/.env.example b/services/mana-image-gen/.env.example
new file mode 100644
index 000000000..4171593cf
--- /dev/null
+++ b/services/mana-image-gen/.env.example
@@ -0,0 +1,25 @@
+# Mana Image Generation — Windows GPU server only
+
+# Server
+PORT=3023
+
+# Model
+IMAGE_MODEL_ID=black-forest-labs/FLUX.1-schnell
+
+# Generation defaults
+DEFAULT_STEPS=4
+DEFAULT_WIDTH=1024
+DEFAULT_HEIGHT=1024
+MAX_STEPS=8
+GUIDANCE_SCALE=0.0
+GENERATION_TIMEOUT=120
+
+# Output (where generated images are written)
+OUTPUT_DIR=C:\mana\services\mana-image-gen\outputs
+
+# CORS
+CORS_ORIGINS=https://mana.how,https://chat.mana.how,http://localhost:5173
+
+# Cross-service auth — enforced by ApiKeyMiddleware in app/api_auth.py.
+# Same key as mana-llm. Generate with: openssl rand -hex 32
+GPU_API_KEY=
diff --git a/services/mana-image-gen/CLAUDE.md b/services/mana-image-gen/CLAUDE.md
index 1cd1e0504..37d21c27d 100644
--- a/services/mana-image-gen/CLAUDE.md
+++ b/services/mana-image-gen/CLAUDE.md
@@ -1,200 +1,147 @@
-# CLAUDE.md - Mana Image Generation Service
+# mana-image-gen
 
-## Service Overview
+AI image generation microservice using FLUX models via HuggingFace `diffusers` on NVIDIA CUDA. Lives on the Windows GPU server (`mana-server-gpu`, RTX 3090).
 
-AI image generation microservice using FLUX.2 klein 4B model via flux2.c:
+> ⚠️ **Earlier history**: this directory used to contain a Mac Mini–only
+> implementation built on `flux2.c` (MPS, Apple Silicon arm64). That
+> version was removed when the service moved fully onto the Windows GPU.
+> If you're looking for the old code, see git history before this commit.
 
-- **Port**: 3025
-- **Host**: Mac Mini only — `setup.sh` hard-fails on anything other than macOS arm64
-- **Framework**: Python + FastAPI
-- **Model**: FLUX.2 klein 4B (Black Forest Labs)
-- **Backend**: flux2.c (Pure C, MPS accelerated)
+## Tech Stack
 
-> ⚠️ **Two image-gen services exist with the same name.** This one is the
-> Mac Mini implementation in the repo (flux2.c, MPS, Apple Silicon only).
-> The Windows GPU server runs a *separate* image-gen on `gpu-img.mana.how`
-> (port 3023, PyTorch + diffusers + CUDA) whose code lives outside the
-> repo at `C:\mana\services\mana-image-gen\` on the GPU box. See
-> `docs/WINDOWS_GPU_SERVER_SETUP.md` for that one.
+| Layer | Technology |
+|-------|------------|
+| **Runtime** | Python 3.11 + uvicorn (Windows) |
+| **Framework** | FastAPI |
+| **Inference** | HuggingFace `diffusers` + PyTorch CUDA |
+| **Default model** | FLUX.1-schnell (BFL, Apache 2.0, 4-step distilled) |
+| **GPU** | NVIDIA RTX 3090 (24 GB VRAM) |
+| **Auth** | `GPU_API_KEY` middleware (`app/api_auth.py`) |
+| **Process supervision** | Windows Scheduled Task `ManaImageGen` (AtLogOn) |
 
-## Features
+## Port: 3023
 
-- **Sub-second generation** on Apple Silicon (M4)
-- **Memory efficient**: ~4-5 GB RAM usage (memory-mapped weights)
-- **Apache 2.0 license**: Commercially usable
-- **4 sampling steps**: Optimized for speed
-- **1024x1024 default resolution**
+## Where it runs
 
-## Commands
+| Host | Path on disk | Entrypoint |
+|------|--------------|------------|
+| Windows GPU server (`192.168.178.11`) | `C:\mana\services\mana-image-gen\` | `service.pyw` via Scheduled Task `ManaImageGen` |
 
-```bash
-# Setup (installs flux2.c + downloads model)
-./setup.sh
+The service is exposed publicly via Cloudflare Tunnel + the Mac Mini TCP-proxy (`gpu-proxy.py`):
 
-# Development
-source .venv/bin/activate
-FLUX_BINARY=/opt/flux2/flux FLUX_MODEL_DIR=/opt/flux2/model \
-  uvicorn app.main:app --host 0.0.0.0 --port 3025 --reload
-
-# Production
-../../scripts/mac-mini/setup-image-gen.sh
-
-# Test
-curl http://localhost:3025/health
-curl -X POST http://localhost:3025/generate \
-  -H "Content-Type: application/json" \
-  -d '{"prompt": "A cat in space"}' | jq
+```
+Internet → Cloudflare → Mac Mini (gpu-proxy.py) → 192.168.178.11:3023
 ```
 
-## File Structure
+Public URL: `https://gpu-img.mana.how`
+
+## Quick Start (Windows GPU)
+
+```powershell
+# As tills on mana-server-gpu
+cd C:\mana\services\mana-image-gen
+C:\mana\venvs\image-gen\Scripts\python.exe service.pyw
+
+# Or kick the scheduled task
+Start-ScheduledTask -TaskName "ManaImageGen"
+
+# Health
+curl http://localhost:3023/health
+```
+
+The Scheduled Task runs:
+```
+Execute:    C:\mana\venvs\image-gen\Scripts\python.exe
+Arguments:  C:\mana\services\mana-image-gen\service.pyw
+WorkingDir: C:\mana\services\mana-image-gen
+```
+
+## API Endpoints
+
+| Method | Path | Description |
+|--------|------|-------------|
+| GET | `/health` | Liveness + GPU + model status |
+| GET | `/models` | Loaded model info |
+| POST | `/generate` | Generate an image (returns `{image_url, ...}`) |
+| GET | `/images/{filename}` | Serve a generated image |
+| DELETE | `/images/{filename}` | Delete a generated image |
+| POST | `/cleanup?max_age_hours=24` | Sweep old images |
+
+All endpoints except `/health` (plus the docs/metrics paths listed in `PUBLIC_PATHS`) are gated by `ApiKeyMiddleware` — clients must send the key in an `X-API-Key: $GPU_API_KEY` header, or as an `?api_key=` query parameter (verification details in `app/api_auth.py`).
+
+### Generate request
+
+```json
+{
+  "prompt": "A futuristic city skyline at sunset",
+  "width": 1024,
+  "height": 1024,
+  "steps": 4,
+  "seed": -1
+}
+```
+
+## Code layout
 
 ```
 services/mana-image-gen/
 ├── app/
 │   ├── __init__.py
-│   ├── main.py              # FastAPI endpoints
-│   └── flux_service.py      # flux2.c subprocess wrapper
-├── setup.sh                 # Setup script
-├── requirements.txt
-├── CLAUDE.md
-└── README.md
+│   ├── main.py            # FastAPI endpoints
+│   ├── flux_service.py    # diffusers pipeline + generate_image()
+│   ├── api_auth.py        # ApiKeyMiddleware (GPU_API_KEY)
+│   └── vram_manager.py    # shared VRAM accounting helper
+└── service.pyw            # Windows runner (used by Scheduled Task)
 ```
 
-## API Endpoints
+## Configuration (`.env` on the Windows GPU box)
 
-| Endpoint | Method | Purpose |
-|----------|--------|---------|
-| `/health` | GET | Health check |
-| `/models` | GET | Model info |
-| `/generate` | POST | Generate image |
-| `/images/{filename}` | GET | Serve generated image |
-| `/images/{filename}` | DELETE | Delete image |
-| `/cleanup` | POST | Clean old images |
-
-## Generate Request
-
-```json
-{
-  "prompt": "A beautiful sunset over mountains",
-  "width": 1024,
-  "height": 1024,
-  "steps": 4,
-  "seed": -1,
-  "output_format": "png"
-}
+```env
+PORT=3023
+IMAGE_MODEL_ID=black-forest-labs/FLUX.1-schnell
+DEFAULT_STEPS=4
+DEFAULT_WIDTH=1024
+DEFAULT_HEIGHT=1024
+MAX_STEPS=8
+GUIDANCE_SCALE=0.0
+GENERATION_TIMEOUT=120
+OUTPUT_DIR=C:\mana\services\mana-image-gen\outputs
+CORS_ORIGINS=https://mana.how,https://chat.mana.how
+GPU_API_KEY=...                # cross-service auth, also used by mana-llm
 ```
 
-## Generate Response
+The `service.pyw` runner loads `.env` from the service directory before
+starting uvicorn.
 
-```json
-{
-  "success": true,
-  "image_url": "/images/abc123.png",
-  "prompt": "A beautiful sunset over mountains",
-  "width": 1024,
-  "height": 1024,
-  "steps": 4,
-  "seed": 42,
-  "generation_time": 0.85
-}
+## Operations
+
+```powershell
+# Status
+Get-ScheduledTask -TaskName "ManaImageGen" | Format-List TaskName, State
+Get-NetTCPConnection -LocalPort 3023 -State Listen
+
+# Restart
+Stop-ScheduledTask -TaskName "ManaImageGen"
+Start-ScheduledTask -TaskName "ManaImageGen"
+
+# Logs
+Get-Content C:\mana\services\mana-image-gen\service.log -Tail 50
 ```
 
-## Environment Variables
+## Model details
 
-| Variable | Default | Description |
-|----------|---------|-------------|
-| `PORT` | `3025` | Service port |
-| `FLUX_BINARY` | `/opt/flux2/flux` | Path to flux2.c binary |
-| `FLUX_MODEL_DIR` | `/opt/flux2/model` | Path to model weights |
-| `DEFAULT_STEPS` | `4` | Default sampling steps |
-| `DEFAULT_WIDTH` | `1024` | Default image width |
-| `DEFAULT_HEIGHT` | `1024` | Default image height |
-| `GENERATION_TIMEOUT` | `120` | Timeout in seconds |
-| `MAX_PROMPT_LENGTH` | `2000` | Max prompt chars |
-| `CORS_ORIGINS` | (production URLs) | CORS config |
+| Field | Value |
+|-------|-------|
+| Model | `black-forest-labs/FLUX.1-schnell` |
+| Parameters | ~12B |
+| License | Apache 2.0 (commercial use OK) |
+| Weights size | ~24 GB on disk |
+| VRAM footprint | ~12 GB (with the default precision/optimization settings) |
+| Optimal sampling steps | 4 (distilled "schnell" variant) |
+| HuggingFace gate | Requires HF login + license accept |
 
-## Model Details
+## Reference
 
-### FLUX.2 klein 4B
-
-- **Parameters**: 4 billion
-- **License**: Apache 2.0 (commercial use allowed)
-- **Download size**: ~16 GB
-- **RAM usage**: ~4-5 GB (memory-mapped)
-- **Optimal steps**: 4 (distilled model)
-- **Release**: January 2026
-
-## Integration with Other Apps
-
-The service is designed to be used by:
-
-- **Picture App** (`apps/picture/`) - AI image generation platform
-- **Chat App** (`apps/chat/`) - Inline image generation
-- **Matrix Bots** - Image generation via chat commands
-- **API Gateway** - Public API access
-
-### Example Integration (TypeScript)
-
-```typescript
-const response = await fetch('http://localhost:3025/generate', {
-  method: 'POST',
-  headers: { 'Content-Type': 'application/json' },
-  body: JSON.stringify({
-    prompt: 'A futuristic city at night',
-    width: 1024,
-    height: 1024,
-  }),
-});
-
-const result = await response.json();
-const imageUrl = `http://localhost:3025${result.image_url}`;
-```
-
-## Dependencies
-
-- `fastapi` - Web framework
-- `uvicorn` - ASGI server
-- `pillow` - Image processing
-- `flux2.c` - Native binary (installed separately)
-
-## Performance
-
-On Mac Mini M4 (16 GB):
-
-| Resolution | Steps | Time |
-|------------|-------|------|
-| 512x512 | 4 | ~0.3s |
-| 1024x1024 | 4 | ~0.8s |
-| 1024x1024 | 8 | ~1.5s |
-
-## Troubleshooting
-
-### flux2.c not found
-```bash
-# Verify installation
-ls -la /opt/flux2/flux
-
-# Reinstall
-sudo rm -rf /opt/flux2
-./setup.sh
-```
-
-### Model not found
-```bash
-# Check model directory
-ls -la /opt/flux2/model/
-
-# Re-download
-cd /opt/flux2/src
-./download-model.sh /opt/flux2/model
-```
-
-### Out of memory
-- Reduce resolution to 512x512
-- Close other applications
-- The 16 GB Mac Mini should handle 1024x1024 fine
-
-### Slow generation
-- Ensure MPS build was used: `make mps`
-- Check Metal GPU is being used
-- Reduce steps (4 is optimal for klein)
+- `docs/WINDOWS_GPU_SERVER_SETUP.md` — full Windows GPU box setup, all
+  AI services, scheduled task setup, firewall rules, Cloudflare tunnel
+- `docs/PORT_SCHEMA.md` — port assignments across services
diff --git a/services/mana-image-gen/README.md b/services/mana-image-gen/README.md
index efe66dd52..60cb01bbf 100644
--- a/services/mana-image-gen/README.md
+++ b/services/mana-image-gen/README.md
@@ -1,109 +1,31 @@
 # Mana Image Generation Service
 
-Local AI image generation using **FLUX.2 klein 4B** model via flux2.c.
+AI image generation via **FLUX.1-schnell** (HuggingFace `diffusers` + PyTorch CUDA). Runs on the Windows GPU server (`mana-server-gpu`, NVIDIA RTX 3090).
 
-## Features
+For architecture, deployment, and operations, see [`CLAUDE.md`](./CLAUDE.md) and [`docs/WINDOWS_GPU_SERVER_SETUP.md`](../../docs/WINDOWS_GPU_SERVER_SETUP.md).
 
-- **Fast**: Sub-second generation on Apple Silicon
-- **Efficient**: ~4-5 GB RAM (memory-mapped weights)
-- **Open**: Apache 2.0 license (commercial use)
-- **Local**: 100% on-device, no API keys needed
+## Port: 3023
 
-## Requirements
+## Public URL
 
-- macOS with Apple Silicon (M1/M2/M3/M4)
-- 16 GB RAM minimum
-- ~20 GB disk space (model + binary)
-- Python 3.11+
+`https://gpu-img.mana.how` (via Cloudflare Tunnel + Mac Mini gpu-proxy)
 
-## Quick Start
+## Quick check
 
 ```bash
-# 1. Run setup (installs flux2.c + downloads model)
-./setup.sh
+curl https://gpu-img.mana.how/health
 
-# 2. Start the service
-source .venv/bin/activate
-FLUX_BINARY=/opt/flux2/flux FLUX_MODEL_DIR=/opt/flux2/model \
-  uvicorn app.main:app --host 0.0.0.0 --port 3025
-
-# 3. Generate an image
-curl -X POST http://localhost:3025/generate \
+curl -X POST https://gpu-img.mana.how/generate \
+  -H "X-API-Key: $GPU_API_KEY" \
   -H "Content-Type: application/json" \
-  -d '{"prompt": "A cat wearing sunglasses"}' | jq
+  -d '{"prompt":"A serene mountain lake at dawn","width":1024,"height":1024,"steps":4}'
 ```
 
-## API
-
-### Generate Image
-
-```bash
-POST /generate
-Content-Type: application/json
-
-{
-  "prompt": "A beautiful mountain landscape",
-  "width": 1024,
-  "height": 1024,
-  "steps": 4,
-  "seed": -1,
-  "output_format": "png"
-}
-```
-
-Response:
-```json
-{
-  "success": true,
-  "image_url": "/images/abc123.png",
-  "prompt": "A beautiful mountain landscape",
-  "width": 1024,
-  "height": 1024,
-  "steps": 4,
-  "seed": 42,
-  "generation_time": 0.85
-}
-```
-
-### Get Image
-
-```bash
-GET /images/{filename}
-```
-
-### Health Check
-
-```bash
-GET /health
-```
-
-### Model Info
-
-```bash
-GET /models
-```
-
-## Environment Variables
-
-| Variable | Default | Description |
-|----------|---------|-------------|
-| `PORT` | `3025` | Service port |
-| `FLUX_BINARY` | `/opt/flux2/flux` | flux2.c binary path |
-| `FLUX_MODEL_DIR` | `/opt/flux2/model` | Model weights path |
-| `DEFAULT_STEPS` | `4` | Sampling steps |
-| `DEFAULT_WIDTH` | `1024` | Default width |
-| `DEFAULT_HEIGHT` | `1024` | Default height |
-
 ## Model
 
-**FLUX.2 klein 4B** by Black Forest Labs (January 2026)
-
-- 4 billion parameters
-- Apache 2.0 license
-- Optimized for 4 sampling steps
-- Sub-second inference on consumer GPUs
-
-## Credits
-
-- [flux2.c](https://github.com/antirez/flux2.c) - Pure C implementation by antirez
-- [Black Forest Labs](https://bfl.ai) - FLUX.2 model
+| Field | Value |
+|-------|-------|
+| Model | `black-forest-labs/FLUX.1-schnell` |
+| License | Apache 2.0 |
+| Sampling | 4 steps (distilled) |
+| VRAM | ~12 GB |
diff --git a/services/mana-image-gen/app/api_auth.py b/services/mana-image-gen/app/api_auth.py
new file mode 100644
index 000000000..0f5813735
--- /dev/null
+++ b/services/mana-image-gen/app/api_auth.py
@@ -0,0 +1,53 @@
+"""
+Simple API Key Authentication Middleware for GPU Services.
+
+Checks X-API-Key header or ?api_key query parameter.
+Skips auth for /health, /docs, /openapi.json, /redoc, /metrics endpoints.
+
+Environment variables:
+  GPU_API_KEY: Required API key (if empty, auth is disabled)
+  GPU_REQUIRE_AUTH: Enable/disable auth (default: true if GPU_API_KEY is set)
+"""
+
+import os
+import logging
+from fastapi import Request
+from fastapi.responses import JSONResponse
+from starlette.middleware.base import BaseHTTPMiddleware
+
+logger = logging.getLogger(__name__)
+
+GPU_API_KEY = os.getenv("GPU_API_KEY", "")
+GPU_REQUIRE_AUTH = os.getenv("GPU_REQUIRE_AUTH", "true" if GPU_API_KEY else "false").lower() == "true"
+
+# Endpoints that don't require auth
+PUBLIC_PATHS = {"/health", "/docs", "/openapi.json", "/redoc", "/metrics"}
+
+
+class ApiKeyMiddleware(BaseHTTPMiddleware):
+    """Require GPU_API_KEY (X-API-Key header or ?api_key query param) on non-public paths."""
+
+    async def dispatch(self, request: Request, call_next):
+        """Forward the request when auth passes or does not apply; otherwise answer 401."""
+        import hmac  # function-scope import keeps this fix self-contained
+
+        # Let CORS preflights through: browsers send OPTIONS without custom headers,
+        # and main.py registers this middleware after CORSMiddleware, so a 401 here
+        # would be returned before the CORS layer could answer the preflight.
+        if not GPU_REQUIRE_AUTH or not GPU_API_KEY or request.method == "OPTIONS":
+            return await call_next(request)
+
+        # Skip auth for public endpoints
+        if request.url.path in PUBLIC_PATHS:
+            return await call_next(request)
+
+        api_key = request.headers.get("X-API-Key") or request.query_params.get("api_key")
+        if not api_key:
+            return JSONResponse(status_code=401, content={"detail": "Missing API key. Provide X-API-Key header."})
+
+        # Constant-time comparison so response timing does not reveal how much of the key matched.
+        if not hmac.compare_digest(api_key.encode(), GPU_API_KEY.encode()):
+            logger.warning(f"Invalid API key attempt from {request.client.host if request.client else 'unknown'}")
+            return JSONResponse(status_code=401, content={"detail": "Invalid API key."})
+
+        return await call_next(request)
diff --git a/services/mana-image-gen/app/flux_service.py b/services/mana-image-gen/app/flux_service.py
index 03232d890..e74a7881b 100644
--- a/services/mana-image-gen/app/flux_service.py
+++ b/services/mana-image-gen/app/flux_service.py
@@ -1,14 +1,18 @@
 """
-FLUX.2 klein Image Generation Service
+Image Generation Service - CUDA version
 
-Uses flux2.c (Pure C implementation) for image generation.
-Optimized for Apple Silicon with MPS acceleration.
+Supports multiple models via HuggingFace diffusers:
+- FLUX.2 klein 4B (default): Fast, ~13GB VRAM, best quality/speed ratio
+- SDXL-Turbo: Fast fallback, 6GB, ungated
+- FLUX.1-schnell: 12B params, 23GB, gated
+
+Optimized for NVIDIA RTX 3090 (24GB VRAM).
 """
 
 import asyncio
 import logging
 import os
-import tempfile
+import time
 import uuid
 from dataclasses import dataclass
 from pathlib import Path
@@ -17,23 +21,83 @@ from typing import Optional
 logger = logging.getLogger(__name__)
 
 # Configuration
-FLUX_BINARY = os.getenv("FLUX_BINARY", os.path.expanduser("~/flux2/flux"))
-FLUX_MODEL_DIR = os.getenv("FLUX_MODEL_DIR", os.path.expanduser("~/flux2/model"))
+MODEL_ID = os.getenv("IMAGE_MODEL_ID", "black-forest-labs/FLUX.2-klein-4B")
 DEFAULT_STEPS = int(os.getenv("DEFAULT_STEPS", "4"))
 DEFAULT_WIDTH = int(os.getenv("DEFAULT_WIDTH", "1024"))
 DEFAULT_HEIGHT = int(os.getenv("DEFAULT_HEIGHT", "1024"))
-DEFAULT_SEED = int(os.getenv("DEFAULT_SEED", "-1"))  # -1 = random
-GENERATION_TIMEOUT = int(os.getenv("GENERATION_TIMEOUT", "300"))  # seconds (first load takes ~90s)
+GENERATION_TIMEOUT = int(os.getenv("GENERATION_TIMEOUT", "300"))
+GUIDANCE_SCALE = float(os.getenv("GUIDANCE_SCALE", "0.0"))
 
 # Output directory for generated images
-OUTPUT_DIR = Path(os.getenv("OUTPUT_DIR", "/tmp/mana-image-gen"))
+OUTPUT_DIR = Path(os.getenv("OUTPUT_DIR", "C:/mana/services/mana-image-gen/output"))
 OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
 
+# Known model configs
+MODEL_CONFIGS = {
+    "black-forest-labs/FLUX.2-klein-4B": {
+        "pipeline_class": "Flux2KleinPipeline",
+        "model_name": "FLUX.2-klein-4B",
+        "parameters": "4 billion",
+        "license": "FLUX.2 Community License",
+        "torch_dtype": "bfloat16",
+        "guidance_scale": 4.0,
+        "default_steps": 4,
+    },
+    "black-forest-labs/FLUX.2-klein-9B": {
+        "pipeline_class": "Flux2KleinPipeline",
+        "model_name": "FLUX.2-klein-9B",
+        "parameters": "9 billion",
+        "license": "FLUX.2 Community License",
+        "torch_dtype": "bfloat16",
+        "guidance_scale": 4.0,
+        "default_steps": 4,
+    },
+    "stabilityai/sdxl-turbo": {
+        "pipeline_class": "AutoPipelineForText2Image",
+        "model_name": "SDXL-Turbo",
+        "parameters": "3.5 billion",
+        "license": "Stability AI Community License",
+        "torch_dtype": "float16",
+        "guidance_scale": 0.0,
+        "default_steps": 4,
+    },
+    "black-forest-labs/FLUX.1-schnell": {
+        "pipeline_class": "FluxPipeline",
+        "model_name": "FLUX.1-schnell",
+        "parameters": "12 billion",
+        "license": "Apache 2.0",
+        "torch_dtype": "float16",
+        "guidance_scale": 0.0,
+        "default_steps": 4,
+    },
+}
+
+# Global pipeline instance (lazy loaded)
+_pipeline = None
+
+# VRAM management — unload FLUX after 5 min idle (frees ~13GB)
+from app.vram_manager import VramManager
+_vram = VramManager(
+    idle_timeout=int(os.getenv("VRAM_IDLE_TIMEOUT", "300")),
+    service_name="mana-image-gen",
+)
+
+
+def unload_pipeline():
+    """Drop the loaded pipeline and release cached CUDA memory; does nothing when no pipeline is loaded."""
+    global _pipeline
+    if _pipeline is None:
+        return
+    import torch
+    _pipeline = None  # rebinding releases the module's reference to the pipeline
+    torch.cuda.empty_cache()
+    _vram.mark_unloaded()
+    logger.info("FLUX pipeline unloaded, VRAM freed")
+
 
 @dataclass
 class GenerationResult:
     """Result of image generation."""
-
     image_path: str
     prompt: str
     width: int
@@ -43,25 +107,99 @@ class GenerationResult:
     generation_time: float
 
 
+def _load_pipeline():
+    """Return the shared diffusers pipeline, building it for MODEL_ID on first use.
+
+    Model IDs missing from MODEL_CONFIGS fall back to AutoPipelineForText2Image in float16.
+    """
+    global _pipeline
+
+    # Fast path: reuse the instance that is already resident.
+    if _pipeline is not None:
+        return _pipeline
+
+    logger.info(f"Loading model: {MODEL_ID}")
+    started_at = time.time()
+
+    import torch
+
+    model_cfg = MODEL_CONFIGS.get(MODEL_ID, {})
+    kind = model_cfg.get("pipeline_class", "AutoPipelineForText2Image")
+    # Anything other than an explicit "bfloat16" is loaded as float16.
+    if model_cfg.get("torch_dtype", "float16") == "bfloat16":
+        dtype = torch.bfloat16
+    else:
+        dtype = torch.float16
+
+    # Each pipeline class is imported lazily inside its own branch.
+    if kind == "Flux2KleinPipeline":
+        from diffusers import Flux2KleinPipeline
+        _pipeline = Flux2KleinPipeline.from_pretrained(MODEL_ID, torch_dtype=dtype)
+        _pipeline.to("cuda")
+    elif kind == "FluxPipeline":
+        from diffusers import FluxPipeline
+        _pipeline = FluxPipeline.from_pretrained(MODEL_ID, torch_dtype=dtype)
+        # This branch enables model CPU offload instead of calling .to("cuda").
+        _pipeline.enable_model_cpu_offload()
+    else:
+        from diffusers import AutoPipelineForText2Image
+        _pipeline = AutoPipelineForText2Image.from_pretrained(
+            MODEL_ID, torch_dtype=dtype, variant="fp16"
+        )
+        _pipeline.to("cuda")
+
+    logger.info(f"Model loaded in {time.time() - started_at:.1f}s")
+    _vram.mark_loaded()
+
+    return _pipeline
+
+
 def is_flux_available() -> bool:
-    """Check if flux2.c binary and model are available."""
-    binary_exists = Path(FLUX_BINARY).exists()
-    model_exists = Path(FLUX_MODEL_DIR).exists()
-    return binary_exists and model_exists
+    """Return True when torch and diffusers import successfully and torch.cuda.is_available() is true."""
+    try:
+        import torch
+        import diffusers  # imported only to confirm the package is installed
+        return torch.cuda.is_available()
+    except ImportError:
+        return False
 
 
 def get_flux_info() -> dict:
-    """Get information about the flux installation."""
+    """Return metadata for the configured model plus GPU name, allocated VRAM in GiB and VramManager status."""
+    import torch
+    loaded = _pipeline is not None
+    gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "N/A"
+    vram_used = torch.cuda.memory_allocated(0) / (1024**3) if torch.cuda.is_available() else 0  # bytes -> GiB
+
+    config = MODEL_CONFIGS.get(MODEL_ID, {})  # empty dict for model IDs without a known config
+
     return {
-        "binary": FLUX_BINARY,
-        "binary_exists": Path(FLUX_BINARY).exists(),
-        "model_dir": FLUX_MODEL_DIR,
-        "model_exists": Path(FLUX_MODEL_DIR).exists(),
-        "model_name": "FLUX.2-klein-4B",
-        "parameters": "4 billion",
-        "license": "Apache 2.0",
+        "model_id": MODEL_ID,
+        "model_name": config.get("model_name", MODEL_ID.split("/")[-1]),
+        "parameters": config.get("parameters", "unknown"),
+        "license": config.get("license", "unknown"),
+        "backend": "diffusers (CUDA)",
+        "gpu": gpu_name,
+        "gpu_vram_used_gb": round(vram_used, 2),
+        "loaded": loaded,
         "default_steps": DEFAULT_STEPS,
         "default_resolution": f"{DEFAULT_WIDTH}x{DEFAULT_HEIGHT}",
+        "vram": _vram.status(),
+    }
+
+
+def get_vram_status() -> dict:
+    """Return allocated/reserved/total CUDA memory in GiB (zeros without CUDA) plus VramManager status."""
+    import torch
+    vram_allocated = torch.cuda.memory_allocated(0) / (1024**3) if torch.cuda.is_available() else 0
+    vram_reserved = torch.cuda.memory_reserved(0) / (1024**3) if torch.cuda.is_available() else 0
+    # The device-properties field is `total_memory`; `total_mem` raised AttributeError on CUDA hosts.
+    vram_total = torch.cuda.get_device_properties(0).total_memory / (1024**3) if torch.cuda.is_available() else 0
+    return {
+        "gpu_vram_allocated_gb": round(vram_allocated, 2),
+        "gpu_vram_reserved_gb": round(vram_reserved, 2),
+        "gpu_vram_total_gb": round(vram_total, 2),
+        "model": _vram.status(),
     }
 
 
@@ -73,110 +211,76 @@ async def generate_image(
     seed: Optional[int] = None,
     output_format: str = "png",
 ) -> GenerationResult:
-    """
-    Generate an image using FLUX.2 klein via flux2.c.
+    """Generate an image from a text prompt."""
+    import torch
 
-    Args:
-        prompt: Text prompt for image generation
-        width: Image width (default 1024)
-        height: Image height (default 1024)
-        steps: Number of sampling steps (default 4)
-        seed: Random seed (-1 for random)
-        output_format: Output format (png, jpg)
+    # Check idle unload first
+    _vram.check_and_unload(unload_pipeline)
 
-    Returns:
-        GenerationResult with image path and metadata
-
-    Raises:
-        RuntimeError: If flux2.c is not available or generation fails
-    """
-    if not is_flux_available():
-        raise RuntimeError(
-            f"flux2.c not available. Binary: {FLUX_BINARY}, Model: {FLUX_MODEL_DIR}"
-        )
+    # Load pipeline (lazy — reloads if previously unloaded)
+    loop = asyncio.get_event_loop()
+    pipe = await loop.run_in_executor(None, _load_pipeline)
 
     # Generate unique output filename
     image_id = str(uuid.uuid4())[:8]
     output_path = OUTPUT_DIR / f"{image_id}.{output_format}"
 
-    # Use provided seed or generate random
-    actual_seed = seed if seed is not None and seed >= 0 else -1
+    # Set seed
+    if seed is not None and seed >= 0:
+        generator = torch.Generator("cuda").manual_seed(seed)
+        actual_seed = seed
+    else:
+        actual_seed = torch.randint(0, 2**32, (1,)).item()
+        generator = torch.Generator("cuda").manual_seed(actual_seed)
 
-    # Build flux2.c command
-    cmd = [
-        FLUX_BINARY,
-        "-d", FLUX_MODEL_DIR,
-        "-p", prompt,
-        "-o", str(output_path),
-        "-W", str(width),
-        "-H", str(height),
-        "-s", str(steps),
-    ]
+    # Get guidance scale from config
+    config = MODEL_CONFIGS.get(MODEL_ID, {})
+    guidance = GUIDANCE_SCALE if GUIDANCE_SCALE > 0 else config.get("guidance_scale", 0.0)
 
-    if actual_seed >= 0:
-        cmd.extend(["-S", str(actual_seed)])
+    logger.info(f"Generating: {width}x{height}, {steps} steps, seed={actual_seed}")
 
-    logger.info(f"Running flux2.c: {' '.join(cmd[:6])}...")
-
-    import time
     start_time = time.time()
 
-    try:
-        # Run flux2.c as subprocess
-        process = await asyncio.create_subprocess_exec(
-            *cmd,
-            stdout=asyncio.subprocess.PIPE,
-            stderr=asyncio.subprocess.PIPE,
-        )
+    def _generate():
+        with torch.inference_mode():
+            result = pipe(
+                prompt=prompt,
+                width=width,
+                height=height,
+                num_inference_steps=steps,
+                generator=generator,
+                guidance_scale=guidance,
+            )
+            return result.images[0]
 
-        stdout, stderr = await asyncio.wait_for(
-            process.communicate(),
+    try:
+        image = await asyncio.wait_for(
+            loop.run_in_executor(None, _generate),
             timeout=GENERATION_TIMEOUT,
         )
-
-        generation_time = time.time() - start_time
-
-        if process.returncode != 0:
-            error_msg = stderr.decode() if stderr else "Unknown error"
-            logger.error(f"flux2.c failed: {error_msg}")
-            raise RuntimeError(f"Image generation failed: {error_msg}")
-
-        # Verify output file exists
-        if not output_path.exists():
-            raise RuntimeError("Image generation completed but output file not found")
-
-        # Parse seed from output if random
-        parsed_seed = actual_seed
-        if stdout:
-            output_text = stdout.decode()
-            # flux2.c outputs "seed: 12345" when using random seed
-            for line in output_text.split("\n"):
-                if line.startswith("seed:"):
-                    try:
-                        parsed_seed = int(line.split(":")[1].strip())
-                    except (ValueError, IndexError):
-                        pass
-
-        logger.info(
-            f"Image generated: {output_path} ({width}x{height}, {steps} steps, {generation_time:.2f}s)"
-        )
-
-        return GenerationResult(
-            image_path=str(output_path),
-            prompt=prompt,
-            width=width,
-            height=height,
-            steps=steps,
-            seed=parsed_seed,
-            generation_time=generation_time,
-        )
-
     except asyncio.TimeoutError:
-        logger.error(f"Image generation timed out after {GENERATION_TIMEOUT}s")
-        raise RuntimeError(f"Generation timed out after {GENERATION_TIMEOUT} seconds")
-    except Exception as e:
-        logger.error(f"Image generation error: {e}")
-        raise
+        raise RuntimeError(f"Generation timed out after {GENERATION_TIMEOUT}s")
+
+    generation_time = time.time() - start_time
+
+    # Save image
+    if output_format == "jpg":
+        image.save(output_path, "JPEG", quality=95)
+    else:
+        image.save(output_path, "PNG")
+
+    _vram.touch()
+    logger.info(f"Generated: {output_path} ({width}x{height}, {steps} steps, {generation_time:.2f}s)")
+
+    return GenerationResult(
+        image_path=str(output_path),
+        prompt=prompt,
+        width=width,
+        height=height,
+        steps=steps,
+        seed=actual_seed,
+        generation_time=generation_time,
+    )
 
 
 def cleanup_image(image_path: str) -> bool:
@@ -193,8 +297,6 @@ def cleanup_image(image_path: str) -> bool:
 
 def cleanup_old_images(max_age_hours: int = 24) -> int:
     """Clean up images older than max_age_hours."""
-    import time
-
     cleaned = 0
     cutoff = time.time() - (max_age_hours * 3600)
 
diff --git a/services/mana-image-gen/app/main.py b/services/mana-image-gen/app/main.py
index 5380623d1..04ac33f42 100644
--- a/services/mana-image-gen/app/main.py
+++ b/services/mana-image-gen/app/main.py
@@ -21,6 +21,7 @@ from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import FileResponse
 from pydantic import BaseModel, Field
 
+from .api_auth import ApiKeyMiddleware
 from .flux_service import (
     generate_image,
     is_flux_available,
@@ -40,7 +41,7 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)
 
 # Configuration from environment
-PORT = int(os.getenv("PORT", "3025"))
+PORT = int(os.getenv("PORT", "3023"))
 MAX_PROMPT_LENGTH = int(os.getenv("MAX_PROMPT_LENGTH", "2000"))
 MIN_DIMENSION = int(os.getenv("MIN_DIMENSION", "256"))
 MAX_DIMENSION = int(os.getenv("MAX_DIMENSION", "2048"))
@@ -87,6 +88,7 @@ app.add_middleware(
     allow_methods=["*"],
     allow_headers=["*"],
 )
+app.add_middleware(ApiKeyMiddleware)
 
 
 # ============================================================================
diff --git a/services/mana-image-gen/app/vram_manager.py b/services/mana-image-gen/app/vram_manager.py
new file mode 100644
index 000000000..89b5656ae
--- /dev/null
+++ b/services/mana-image-gen/app/vram_manager.py
@@ -0,0 +1,114 @@
+"""
+VRAM Manager — Automatic model unloading after idle timeout.
+
+Tracks the last usage time of one model and flags it as idle after a configurable timeout; the service calls check_and_unload() to free VRAM.
+Designed for shared GPU environments (multiple services on one RTX 3090).
+
+Usage in a service:
+    from vram_manager import VramManager
+
+    vram = VramManager(idle_timeout=300)  # 5 min
+
+    # Before using a model
+    vram.touch()
+
+    # Call periodically (e.g., from health check or background task)
+    vram.check_and_unload(unload_fn=my_unload_function)
+"""
+
+import os
+import time
+import logging
+import threading
+from typing import Optional, Callable
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_IDLE_TIMEOUT = int(os.getenv("VRAM_IDLE_TIMEOUT", "300"))  # 5 minutes
+
+
+class VramManager:
+    """Idle tracker for one loaded GPU model; the owning service performs the actual unload."""
+
+    def __init__(self, idle_timeout: int = DEFAULT_IDLE_TIMEOUT, service_name: str = "unknown"):
+        self.idle_timeout = idle_timeout  # seconds without use before is_idle() turns True
+        self.service_name = service_name  # tag placed in front of every log message
+        self.last_used: float = 0.0  # time.time() of the latest touch()/mark_loaded()
+        self.model_loaded: bool = False
+        self._lock = threading.Lock()  # guards last_used, model_loaded and _timer
+        self._timer: Optional[threading.Timer] = None  # pending log-only idle check
+
+    def touch(self) -> None:
+        """Record a use right now; also flags the model as loaded and restarts the idle timer."""
+        with self._lock:
+            self.last_used = time.time()
+            self.model_loaded = True
+            self._schedule_check()
+
+    def mark_loaded(self) -> None:
+        """Same bookkeeping as touch(), plus an info log announcing the load."""
+        with self._lock:
+            self.model_loaded = True
+            self.last_used = time.time()
+            self._schedule_check()
+            logger.info(f"[{self.service_name}] Model loaded, idle timeout: {self.idle_timeout}s")
+
+    def mark_unloaded(self) -> None:
+        """Flag the model as unloaded and cancel any pending idle timer."""
+        with self._lock:
+            self.model_loaded = False
+            if self._timer:
+                self._timer.cancel()
+                self._timer = None
+            logger.info(f"[{self.service_name}] Model unloaded, VRAM freed")
+
+    def is_idle(self) -> bool:
+        """Return True when a model is loaded and unused for more than idle_timeout seconds."""
+        if not self.model_loaded:
+            return False
+        return (time.time() - self.last_used) > self.idle_timeout
+
+    def seconds_until_unload(self) -> Optional[float]:
+        """Seconds left before is_idle() turns True (never negative); None if no model is loaded."""
+        if not self.model_loaded:
+            return None
+        remaining = self.idle_timeout - (time.time() - self.last_used)
+        return max(0, remaining)
+
+    def check_and_unload(self, unload_fn: Callable[[], None]) -> bool:
+        """Run unload_fn() if the model is idle; True only when it ran without raising (errors are logged)."""
+        if self.is_idle():
+            logger.info(f"[{self.service_name}] Idle for >{self.idle_timeout}s, unloading model...")
+            try:
+                unload_fn()
+                self.mark_unloaded()
+                return True
+            except Exception as e:
+                logger.exception(f"[{self.service_name}] Failed to unload: {e}")
+        return False
+
+    def _schedule_check(self) -> None:
+        """Restart the timer that fires _auto_check(); callers hold self._lock."""
+        if self._timer:
+            self._timer.cancel()
+        # Small buffer past the timeout so is_idle() is already True when the timer fires.
+        self._timer = threading.Timer(self.idle_timeout + 5, self._auto_check)
+        self._timer.daemon = True  # a pending timer must not keep the process alive
+        self._timer.start()
+
+    def _auto_check(self) -> None:
+        """Timer callback. Log-only: unloading needs the service's own unload_fn (see check_and_unload)."""
+        if self.is_idle():
+            logger.info(f"[{self.service_name}] Model idle for >{self.idle_timeout}s — ready to unload")
+
+    def status(self) -> dict:
+        """Return a snapshot dict for monitoring; the idle fields are None while no model is loaded."""
+        # Single read under the lock, so a concurrent mark_unloaded() cannot lead to round(None, 1).
+        with self._lock:
+            loaded, idle = self.model_loaded, time.time() - self.last_used
+        return {
+            "model_loaded": loaded,
+            "idle_seconds": round(idle, 1) if loaded else None,
+            "idle_timeout": self.idle_timeout,
+            "seconds_until_unload": round(max(0, self.idle_timeout - idle), 1) if loaded else None,
+        }
diff --git a/services/mana-image-gen/service.pyw b/services/mana-image-gen/service.pyw
new file mode 100644
index 000000000..1cc67bdd4
--- /dev/null
+++ b/services/mana-image-gen/service.pyw
@@ -0,0 +1,17 @@
+"""mana-image-gen service runner."""
+import os
+import sys
+os.chdir(r"C:\mana\services\mana-image-gen")
+sys.path.insert(0, r"C:\mana\services\mana-image-gen")  # makes "app.main" importable
+
+# Load .env before anything reads os.environ (PORT below, settings in app.main)
+from dotenv import load_dotenv
+load_dotenv(r"C:\mana\services\mana-image-gen\.env")
+
+# Redirect stdout/stderr to a line-buffered log file; UTF-8 so any text can be written
+log = open(r"C:\mana\services\mana-image-gen\service.log", "w", buffering=1, encoding="utf-8")
+sys.stdout = log
+sys.stderr = log
+
+import uvicorn
+uvicorn.run("app.main:app", host="0.0.0.0", port=int(os.getenv("PORT", "3023")), log_level="info")
diff --git a/services/mana-image-gen/setup.sh b/services/mana-image-gen/setup.sh
deleted file mode 100755
index 2565d29ec..000000000
--- a/services/mana-image-gen/setup.sh
+++ /dev/null
@@ -1,227 +0,0 @@
-#!/bin/bash
-# Setup script for Mana Image Generation service
-# Installs flux2.c and FLUX.2 klein 4B model
-# Optimized for Apple Silicon (MPS)
-
-set -e
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-VENV_DIR="$SCRIPT_DIR/.venv"
-FLUX_DIR="/opt/flux2"
-MODEL_DIR="$FLUX_DIR/model"
-
-echo "=========================================="
-echo "Mana Image Generation Setup"
-echo "=========================================="
-echo ""
-
-# Check platform
-if [[ "$(uname)" != "Darwin" ]]; then
-    echo "Error: This service requires macOS with Apple Silicon."
-    echo "flux2.c uses MPS (Metal Performance Shaders) for acceleration."
-    exit 1
-fi
-
-# Check for Apple Silicon
-if [[ "$(uname -m)" != "arm64" ]]; then
-    echo "Error: This service requires Apple Silicon (arm64)."
-    echo "flux2.c is optimized for M1/M2/M3/M4 chips."
-    exit 1
-fi
-
-echo "Platform: macOS $(sw_vers -productVersion) on $(uname -m)"
-echo ""
-
-# ============================================
-# Step 1: Install flux2.c
-# ============================================
-
-echo "Step 1: Installing flux2.c"
-echo "----------------------------------------"
-
-# Check if flux2.c already exists
-if [[ -f "$FLUX_DIR/flux" ]]; then
-    echo "flux2.c already installed at $FLUX_DIR/flux"
-    echo "To reinstall, remove the directory first: sudo rm -rf $FLUX_DIR"
-else
-    echo "Creating installation directory..."
-    sudo mkdir -p "$FLUX_DIR"
-    sudo chown $(whoami) "$FLUX_DIR"
-
-    # Clone flux2.c repository
-    echo "Cloning flux2.c repository..."
-    cd "$FLUX_DIR"
-    git clone https://github.com/antirez/flux2.c.git src
-    cd src
-
-    # Build with MPS support (Apple Silicon optimized)
-    echo "Building flux2.c with MPS acceleration..."
-    make mps
-
-    # Move binary to parent directory
-    cp flux "$FLUX_DIR/flux"
-    chmod +x "$FLUX_DIR/flux"
-
-    echo "flux2.c installed successfully!"
-fi
-
-# Verify binary
-if [[ -x "$FLUX_DIR/flux" ]]; then
-    echo "Binary: $FLUX_DIR/flux"
-else
-    echo "Error: flux2.c binary not found or not executable"
-    exit 1
-fi
-
-echo ""
-
-# ============================================
-# Step 2: Download FLUX.2 klein 4B model
-# ============================================
-
-echo "Step 2: Downloading FLUX.2 klein 4B model"
-echo "----------------------------------------"
-echo "Note: This will download ~16GB of model weights"
-echo ""
-
-if [[ -d "$MODEL_DIR" ]] && [[ -f "$MODEL_DIR/flux.safetensors" ]]; then
-    echo "Model already downloaded at $MODEL_DIR"
-else
-    mkdir -p "$MODEL_DIR"
-    cd "$FLUX_DIR/src"
-
-    # Run the model download script
-    if [[ -f "./download-model.sh" ]]; then
-        echo "Running download script..."
-        ./download-model.sh "$MODEL_DIR"
-    else
-        echo "Downloading model manually..."
-        # flux2.c expects the model in a specific format
-        # The model includes:
-        # - flux.safetensors (main weights)
-        # - qwen3-4b.safetensors (text encoder)
-        # - ae.safetensors (autoencoder)
-
-        echo "Please run the following commands manually:"
-        echo ""
-        echo "  cd $FLUX_DIR/src"
-        echo "  ./download-model.sh $MODEL_DIR"
-        echo ""
-        echo "Or download from Hugging Face:"
-        echo "  https://huggingface.co/black-forest-labs/FLUX.2-klein-4B"
-        echo ""
-    fi
-fi
-
-echo ""
-
-# ============================================
-# Step 3: Setup Python environment
-# ============================================
-
-echo "Step 3: Setting up Python environment"
-echo "----------------------------------------"
-
-# Find Python
-if command -v python3.11 &> /dev/null; then
-    PYTHON_CMD="python3.11"
-elif command -v python3 &> /dev/null; then
-    PYTHON_CMD="python3"
-else
-    echo "Error: Python 3 not found. Please install Python 3.11 or later."
-    exit 1
-fi
-
-echo "Using Python: $PYTHON_CMD"
-$PYTHON_CMD --version
-echo ""
-
-# Create virtual environment
-if [[ -d "$VENV_DIR" ]]; then
-    echo "Virtual environment exists at $VENV_DIR"
-    read -p "Recreate it? (y/N) " -n 1 -r
-    echo ""
-    if [[ $REPLY =~ ^[Yy]$ ]]; then
-        rm -rf "$VENV_DIR"
-        $PYTHON_CMD -m venv "$VENV_DIR"
-    fi
-else
-    echo "Creating virtual environment..."
-    $PYTHON_CMD -m venv "$VENV_DIR"
-fi
-
-# Activate and install dependencies
-source "$VENV_DIR/bin/activate"
-pip install --upgrade pip
-pip install -r "$SCRIPT_DIR/requirements.txt"
-
-echo ""
-
-# ============================================
-# Step 4: Create output directory
-# ============================================
-
-echo "Step 4: Creating output directory"
-echo "----------------------------------------"
-
-OUTPUT_DIR="/tmp/mana-image-gen"
-mkdir -p "$OUTPUT_DIR"
-echo "Output directory: $OUTPUT_DIR"
-
-echo ""
-
-# ============================================
-# Step 5: Test flux2.c
-# ============================================
-
-echo "Step 5: Testing flux2.c"
-echo "----------------------------------------"
-
-if [[ -x "$FLUX_DIR/flux" ]] && [[ -d "$MODEL_DIR" ]]; then
-    echo "Testing image generation..."
-    TEST_OUTPUT="$OUTPUT_DIR/test_setup.png"
-
-    # Quick test with low resolution
-    "$FLUX_DIR/flux" -d "$MODEL_DIR" -p "A simple test image" -o "$TEST_OUTPUT" -W 256 -H 256 -s 2 2>/dev/null && {
-        echo "Test successful! Generated: $TEST_OUTPUT"
-        rm -f "$TEST_OUTPUT"
-    } || {
-        echo "Warning: Test generation failed. Model may not be fully downloaded."
-        echo "Please ensure the model is complete before using the service."
-    }
-else
-    echo "Skipping test - flux2.c or model not ready"
-fi
-
-echo ""
-
-# ============================================
-# Done
-# ============================================
-
-echo "=========================================="
-echo "Setup Complete!"
-echo "=========================================="
-echo ""
-echo "Configuration:"
-echo "  FLUX_BINARY: $FLUX_DIR/flux"
-echo "  FLUX_MODEL_DIR: $MODEL_DIR"
-echo "  OUTPUT_DIR: $OUTPUT_DIR"
-echo ""
-echo "To start the service:"
-echo ""
-echo "  cd $SCRIPT_DIR"
-echo "  source .venv/bin/activate"
-echo "  FLUX_BINARY=$FLUX_DIR/flux FLUX_MODEL_DIR=$MODEL_DIR uvicorn app.main:app --host 0.0.0.0 --port 3025"
-echo ""
-echo "Or for development with auto-reload:"
-echo ""
-echo "  FLUX_BINARY=$FLUX_DIR/flux FLUX_MODEL_DIR=$MODEL_DIR uvicorn app.main:app --host 0.0.0.0 --port 3025 --reload"
-echo ""
-echo "Test the service:"
-echo ""
-echo "  curl http://localhost:3025/health"
-echo "  curl -X POST http://localhost:3025/generate \\"
-echo "    -H 'Content-Type: application/json' \\"
-echo "    -d '{\"prompt\": \"A cat wearing sunglasses\"}'"
-echo ""