From 3c4a6d4f6978c92b644dd56920c1fc04a59993cf Mon Sep 17 00:00:00 2001
From: Till JS
Date: Fri, 8 May 2026 18:53:53 +0200
Subject: [PATCH] =?UTF-8?q?chore(cutover):=20remove=20services/mana-stt/?=
 =?UTF-8?q?=20=E2=80=94=20moved=20to=20mana-platform?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Live containers on the Mac Mini build out of `../mana/services/mana-stt/`
since the 8-Doppel-Cutover commit (774852ba2). Smoke test green 2026-05-08:
health endpoints, JWKS, login flow, and Stripe webhook are all reachable from
the new build path. Removing the now-stale duplicate (132K in this repo).
Active code lives in `Code/mana/services/mana-stt/` (see ../mana/CLAUDE.md).

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 services/mana-stt/.env.example               |  70 --
 services/mana-stt/CLAUDE.md                  |  96 ---
 services/mana-stt/README.md                  |  31 -
 services/mana-stt/app/__init__.py            |   1 -
 services/mana-stt/app/auth.py                | 271 -------
 services/mana-stt/app/external_auth.py       | 145 ----
 services/mana-stt/app/main.py                | 392 ----------
 services/mana-stt/app/vllm_service.py        | 178 -----
 services/mana-stt/app/voxtral_api_service.py | 213 ------
 services/mana-stt/app/voxtral_service.py     | 267 -------
 services/mana-stt/app/vram_manager.py        | 114 ---
 services/mana-stt/app/whisper_service.py     | 358 ---------
 services/mana-stt/grafana-dashboard.json     | 740 -------------------
 services/mana-stt/requirements-cuda.txt      |  35 -
 services/mana-stt/requirements.txt           |  28 -
 services/mana-stt/service.pyw                |  34 -
 16 files changed, 2973 deletions(-)
 delete mode 100644 services/mana-stt/.env.example
 delete mode 100644 services/mana-stt/CLAUDE.md
 delete mode 100644 services/mana-stt/README.md
 delete mode 100644 services/mana-stt/app/__init__.py
 delete mode 100644 services/mana-stt/app/auth.py
 delete mode 100644 services/mana-stt/app/external_auth.py
 delete mode 100644 services/mana-stt/app/main.py
 delete mode 100644 services/mana-stt/app/vllm_service.py
 delete mode 100644 services/mana-stt/app/voxtral_api_service.py
 delete mode 100644 services/mana-stt/app/voxtral_service.py
 delete mode 100644 services/mana-stt/app/vram_manager.py
 delete mode 100644 services/mana-stt/app/whisper_service.py
 delete mode 100644 services/mana-stt/grafana-dashboard.json
 delete mode 100644 services/mana-stt/requirements-cuda.txt
 delete mode 100644 services/mana-stt/requirements.txt
 delete mode 100644 services/mana-stt/service.pyw

diff --git a/services/mana-stt/.env.example b/services/mana-stt/.env.example
deleted file mode 100644
index 3a435c073..000000000
--- a/services/mana-stt/.env.example
+++ /dev/null
@@ -1,70 +0,0 @@
-# Mana STT Service Configuration
-# Copy to .env and adjust values as needed
-
-# Server
-PORT=3020
-
-# Whisper (Lightning MLX)
-WHISPER_MODEL=large-v3
-
-# Voxtral (Local Models)
-# Options: voxtral-mini-3b, voxtral-realtime-4b, voxtral-small-24b
-VOXTRAL_MODEL=voxtral-realtime-4b
-
-# WhisperX (CUDA GPU Server)
-# Enable WhisperX for rich transcription (diarization, word alignment)
-# Requires NVIDIA GPU + requirements-cuda.txt
-USE_WHISPERX=false
-
-# WhisperX batch size (higher = faster but more VRAM, 16 works well for RTX 3090)
-WHISPERX_BATCH_SIZE=16
-
-# Device and compute type for CUDA
-# WHISPER_DEVICE=cuda
-# WHISPER_COMPUTE_TYPE=float16
-
-# HuggingFace token for pyannote speaker diarization models
-# Required for diarization. Accept terms at:
-# https://huggingface.co/pyannote/speaker-diarization-3.1
-# https://huggingface.co/pyannote/segmentation-3.0
-HF_TOKEN=
-
-# Model Loading
-# Set to true to preload models on startup (slower startup, faster first request)
-PRELOAD_MODELS=false
-
-# Load Management
-# Maximum concurrent transcription requests before API fallback
-MAX_CONCURRENT_REQUESTS=3
-
-# API Fallback
-# Enable automatic fallback to Mistral API when overloaded
-API_FALLBACK_ENABLED=true
-
-# Mistral API Key (required for API fallback)
-# Get your key at https://console.mistral.ai/
-MISTRAL_API_KEY=
-
-# CORS Origins (comma-separated)
-CORS_ORIGINS=https://mana.how,https://chat.mana.how,http://localhost:5173
-
-# ===========================================
-# Authentication
-# ===========================================
-
-# Enable API key authentication (default: true for production)
-REQUIRE_AUTH=true
-
-# API Keys (comma-separated, format: key:name)
-# Example: sk-abc123:myapp,sk-def456:testuser
-API_KEYS=
-
-# Internal API key (no rate limit, for internal services)
-# Generate with: openssl rand -hex 32
-INTERNAL_API_KEY=
-
-# Rate Limiting
-# Requests per window per API key
-RATE_LIMIT_REQUESTS=60
-# Window size in seconds
-RATE_LIMIT_WINDOW=60

diff --git a/services/mana-stt/CLAUDE.md b/services/mana-stt/CLAUDE.md
deleted file mode 100644
index 0a98c2386..000000000
--- a/services/mana-stt/CLAUDE.md
+++ /dev/null
@@ -1,96 +0,0 @@
-# mana-stt
-
-Speech-to-Text microservice. Wraps Whisper (CUDA, with WhisperX for word-level
-timestamps + diarization), local Voxtral via vLLM, and Mistral's hosted Voxtral
-API behind a small FastAPI surface. Lives on the Windows GPU server
-(`mana-server-gpu`, RTX 3090).
-
-> ⚠️ **Earlier history**: this directory used to contain Mac-Mini–targeted
-> code (Whisper Lightning MLX, com.mana.mana-stt.plist launchd setup,
-> setup.sh with Apple-Silicon checks). That all moved to the Windows
-> GPU box and was removed from the repo. If you're looking for the MLX
-> path, see git history.
-
-## Tech Stack
-
-| Layer | Technology |
-|-------|------------|
-| **Runtime** | Python 3.11 + uvicorn (Windows) |
-| **Framework** | FastAPI |
-| **Whisper** | `whisperx` on CUDA (large-v3 + word alignment + pyannote diarization) |
-| **Voxtral (local)** | vLLM serving Voxtral 3B/4B/24B (`vllm_service.py`) |
-| **Voxtral (cloud)** | Mistral API (`voxtral_api_service.py`) |
-| **Auth** | Per-key + internal-key API auth (`app/auth.py`, JWT via mana-auth in `app/external_auth.py`) |
-| **VRAM** | Shared `vram_manager.py` accountant — coordinated with mana-tts and mana-image-gen so multiple GPU services don't OOM each other |
-| **Process supervision** | Windows Scheduled Task `ManaSTT` (AtLogOn) |
-
-## Port: 3020
-
-## Where it runs
-
-| Host | Path on disk | Entrypoint |
-|------|--------------|------------|
-| Windows GPU server (`192.168.178.11`) | `C:\mana\services\mana-stt\` | `service.pyw` via Scheduled Task `ManaSTT` |
-
-Public URL: `https://gpu-stt.mana.how` (via Cloudflare Tunnel + Mac Mini gpu-proxy).
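-
-A quick reachability probe for the public endpoint (a sketch; `httpx` is
-assumed to be available since the service itself depends on it):
-
-```python
-import httpx
-
-# /health is the one route that needs no API key (see API Endpoints below)
-resp = httpx.get("https://gpu-stt.mana.how/health", timeout=5.0)
-print(resp.status_code, resp.json()["status"])
-```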
-
-## API Endpoints
-
-| Method | Path | Description |
-|--------|------|-------------|
-| GET | `/health` | Liveness + which backends are loaded |
-| GET | `/models` | Available STT models |
-| POST | `/transcribe` | Whisper (WhisperX, default) — multipart `file` + optional `language` |
-| POST | `/transcribe/voxtral` | Local Voxtral via vLLM |
-| POST | `/transcribe/auto` | Routing helper — picks the best backend for the input |
-
-All endpoints (except `/health`) require `Authorization: Bearer <key>`. Tokens
-are validated against `API_KEYS` (per-app keys) or `INTERNAL_API_KEY` (no rate
-limit), and JWTs from mana-auth are also accepted via `external_auth.py`.
-
-## Backends (`app/`)
-
-| File | What it loads |
-|------|---------------|
-| `whisper_service.py` | WhisperX on CUDA (large-v3 + alignment + pyannote diarization) |
-| `voxtral_service.py` | Local Voxtral via vLLM (slower start, richer multilingual) |
-| `voxtral_api_service.py` | Mistral hosted Voxtral API (cloud, no GPU needed) |
-| `vllm_service.py` | vLLM client primitives shared by Voxtral |
-| `vram_manager.py` | Shared VRAM accounting — same module also used by mana-tts and mana-image-gen |
-| `auth.py` | API-key auth (internal + per-app keys) |
-| `external_auth.py` | JWT validation via mana-auth |
-
-Backends are loaded lazily during the FastAPI lifespan and reported by `/health`.
-
-## Configuration (`.env` on the Windows GPU box)
-
-```env
-PORT=3020
-WHISPER_MODEL=large-v3
-WHISPER_DEVICE=cuda
-WHISPER_COMPUTE_TYPE=float16
-WHISPER_DEFAULT_LANGUAGE=de
-PRELOAD_MODELS=true
-USE_VLLM=false
-HF_TOKEN=...          # required for pyannote diarization models
-REQUIRE_AUTH=true
-API_KEYS=sk-app1:app1,sk-app2:app2
-INTERNAL_API_KEY=...  # cross-service, no rate limit
-CORS_ORIGINS=https://mana.how,https://chat.mana.how
-```
-
-## Operations
-
-```powershell
-# Status
-Get-ScheduledTask -TaskName "ManaSTT" | Format-List TaskName, State
-Get-NetTCPConnection -LocalPort 3020 -State Listen
-
-# Restart
-Stop-ScheduledTask -TaskName "ManaSTT"
-Start-ScheduledTask -TaskName "ManaSTT"
-
-# Logs
-Get-Content C:\mana\services\mana-stt\service.log -Tail 50
-```
-
-## Reference
-
-- `docs/WINDOWS_GPU_SERVER_SETUP.md` — Windows box setup, scheduled tasks, firewall, Cloudflare tunnel
-- `docs/LOCAL_STT_MODELS.md` — model comparisons (WER, latency, language coverage)
-- `services/mana-stt/grafana-dashboard.json` — Prometheus metrics dashboard

diff --git a/services/mana-stt/README.md b/services/mana-stt/README.md
deleted file mode 100644
index 8e4abf5f1..000000000
--- a/services/mana-stt/README.md
+++ /dev/null
@@ -1,31 +0,0 @@
-# Mana STT Service
-
-Speech-to-Text API service running on the Windows GPU server
-(`mana-server-gpu`, RTX 3090). Wraps **WhisperX** (CUDA, large-v3 + word
-alignment + pyannote diarization), local **Voxtral via vLLM**, and the hosted
-**Mistral Voxtral API**.
-
-For architecture, deployment, configuration, and operations see
-[`CLAUDE.md`](./CLAUDE.md) and
-[`docs/WINDOWS_GPU_SERVER_SETUP.md`](../../docs/WINDOWS_GPU_SERVER_SETUP.md).
-
-## Port: 3020
-
-## Public URL
-
-`https://gpu-stt.mana.how` (via Cloudflare Tunnel + Mac Mini gpu-proxy)
-
-## API Endpoints
-
-| Endpoint | Method | Description |
-|----------|--------|-------------|
-| `/health` | GET | Health check + which backends are loaded |
-| `/models` | GET | List available models |
-| `/transcribe` | POST | Whisper / WhisperX transcription |
-| `/transcribe/voxtral` | POST | Voxtral transcription (local vLLM) |
-| `/transcribe/auto` | POST | Auto-select best backend for the input |
-
-All endpoints (except `/health`) require `Authorization: Bearer <key>`.
-
-## Quick Test
-
-```bash
-curl -F "file=@audio.wav" -F "language=de" \
-  -H "Authorization: Bearer $INTERNAL_API_KEY" \
-  https://gpu-stt.mana.how/transcribe
-```

diff --git a/services/mana-stt/app/__init__.py b/services/mana-stt/app/__init__.py
deleted file mode 100644
index e5b57f4cc..000000000
--- a/services/mana-stt/app/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-# Mana STT Service

diff --git a/services/mana-stt/app/auth.py b/services/mana-stt/app/auth.py
deleted file mode 100644
index 40258c730..000000000
--- a/services/mana-stt/app/auth.py
+++ /dev/null
@@ -1,271 +0,0 @@
-"""
-API Key Authentication for ManaCore STT Service
-
-Supports two authentication modes:
-1. Local API keys: Configured via environment variables
-2. External API keys: Validated via mana-core-auth service (when EXTERNAL_AUTH_ENABLED=true)
-
-Usage:
-    # Local keys
-    API_KEYS=sk-key1:name1,sk-key2:name2
-    INTERNAL_API_KEY=sk-internal-xxx
-
-    # External auth (for user-created keys via mana.how)
-    EXTERNAL_AUTH_ENABLED=true
-    MANA_CORE_AUTH_URL=http://localhost:3001
-"""
-
-import os
-import time
-import logging
-from typing import Optional
-from collections import defaultdict
-from dataclasses import dataclass, field
-
-from fastapi import HTTPException, Security, Request
-from fastapi.security import APIKeyHeader
-
-from .external_auth import (
-    is_external_auth_enabled,
-    validate_api_key_external,
-    ExternalValidationResult,
-)
-
-logger = logging.getLogger(__name__)
-
-# Configuration
-API_KEYS_ENV = os.getenv("API_KEYS", "")  # Format: "sk-key1:name1,sk-key2:name2"
-INTERNAL_API_KEY = os.getenv("INTERNAL_API_KEY", "")  # Unlimited internal key
-REQUIRE_AUTH = os.getenv("REQUIRE_AUTH", "true").lower() == "true"
-RATE_LIMIT_REQUESTS = int(os.getenv("RATE_LIMIT_REQUESTS", "60"))  # Per minute
-RATE_LIMIT_WINDOW = int(os.getenv("RATE_LIMIT_WINDOW", "60"))  # Seconds
-
-
-@dataclass
-class APIKey:
-    """API Key with metadata."""
-    key: str
-    name: str
-    is_internal: bool = False
-    rate_limit: int = RATE_LIMIT_REQUESTS  # Requests per window
-
-
-@dataclass
-class RateLimitInfo:
-    """Rate limit tracking per key."""
-    requests: list = field(default_factory=list)
-
-    def is_allowed(self, limit: int, window: int) -> bool:
-        """Check if request is allowed within rate limit."""
-        now = time.time()
-        # Remove old requests outside window
-        self.requests = [t for t in self.requests if now - t < window]
-
-        if len(self.requests) >= limit:
-            return False
-
-        self.requests.append(now)
-        return True
-
-    def remaining(self, limit: int, window: int) -> int:
-        """Get remaining requests in current window."""
-        now = time.time()
-        self.requests = [t for t in self.requests if now - t < window]
-        return max(0, limit - len(self.requests))
-
-
-# Parse API keys from environment
-def _parse_api_keys() -> dict[str, APIKey]:
-    """Parse API keys from environment variables."""
-    keys = {}
-
-    # Parse comma-separated keys
-    if API_KEYS_ENV:
-        for entry in
API_KEYS_ENV.split(","): - entry = entry.strip() - if ":" in entry: - key, name = entry.split(":", 1) - else: - key, name = entry, "default" - keys[key.strip()] = APIKey(key=key.strip(), name=name.strip()) - - # Add internal key with no rate limit - if INTERNAL_API_KEY: - keys[INTERNAL_API_KEY] = APIKey( - key=INTERNAL_API_KEY, - name="internal", - is_internal=True, - rate_limit=999999, # Effectively unlimited - ) - - return keys - - -# Global state -_api_keys = _parse_api_keys() -_rate_limits: dict[str, RateLimitInfo] = defaultdict(RateLimitInfo) - -# Security scheme -api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False) - - -@dataclass -class AuthResult: - """Result of authentication check.""" - authenticated: bool - key_name: Optional[str] = None - is_internal: bool = False - rate_limit_remaining: Optional[int] = None - user_id: Optional[str] = None # Set when using external auth - - -async def verify_api_key( - request: Request, - api_key: Optional[str] = Security(api_key_header), -) -> AuthResult: - """ - Verify API key and check rate limits. - - Supports two authentication modes: - 1. External auth via mana-core-auth (for sk_live_ keys) - 2. Local auth via environment variables - - Returns AuthResult with authentication status. - Raises HTTPException if auth fails or rate limited. - """ - # Skip auth for health and docs endpoints - path = request.url.path - if path in ["/health", "/docs", "/openapi.json", "/redoc"]: - return AuthResult(authenticated=True, key_name="public") - - # If auth not required, allow all - if not REQUIRE_AUTH: - return AuthResult(authenticated=True, key_name="anonymous") - - # Check for API key - if not api_key: - logger.warning(f"Missing API key for {path} from {request.client.host if request.client else 'unknown'}") - raise HTTPException( - status_code=401, - detail="Missing API key. Provide X-API-Key header.", - headers={"WWW-Authenticate": "ApiKey"}, - ) - - # Try external auth first for sk_live_ keys (user-created keys via mana.how) - if api_key.startswith("sk_live_") and is_external_auth_enabled(): - external_result = await validate_api_key_external(api_key, "stt") - - if external_result is not None: - if external_result.valid: - # Use rate limits from external auth - rate_info = _rate_limits[api_key] - limit = external_result.rate_limit_requests - window = external_result.rate_limit_window - - if not rate_info.is_allowed(limit, window): - remaining = rate_info.remaining(limit, window) - logger.warning(f"Rate limit exceeded for external key") - raise HTTPException( - status_code=429, - detail=f"Rate limit exceeded. 
Try again in {window} seconds.", - headers={ - "X-RateLimit-Limit": str(limit), - "X-RateLimit-Remaining": str(remaining), - "X-RateLimit-Reset": str(int(time.time()) + window), - "Retry-After": str(window), - }, - ) - - remaining = rate_info.remaining(limit, window) - logger.debug(f"Authenticated external request from user {external_result.user_id} to {path}") - - return AuthResult( - authenticated=True, - key_name="external", - is_internal=False, - rate_limit_remaining=remaining, - user_id=external_result.user_id, - ) - else: - # External auth returned invalid - logger.warning(f"External auth failed: {external_result.error}") - raise HTTPException( - status_code=401, - detail=external_result.error or "Invalid API key.", - headers={"WWW-Authenticate": "ApiKey"}, - ) - # If external_result is None, fall through to local auth - - # Local auth: Validate key against environment variables - if api_key not in _api_keys: - logger.warning(f"Invalid API key attempt for {path}") - raise HTTPException( - status_code=401, - detail="Invalid API key.", - headers={"WWW-Authenticate": "ApiKey"}, - ) - - key_info = _api_keys[api_key] - - # Check rate limit (skip for internal keys) - if not key_info.is_internal: - rate_info = _rate_limits[api_key] - if not rate_info.is_allowed(key_info.rate_limit, RATE_LIMIT_WINDOW): - remaining = rate_info.remaining(key_info.rate_limit, RATE_LIMIT_WINDOW) - logger.warning(f"Rate limit exceeded for key '{key_info.name}'") - raise HTTPException( - status_code=429, - detail=f"Rate limit exceeded. Try again in {RATE_LIMIT_WINDOW} seconds.", - headers={ - "X-RateLimit-Limit": str(key_info.rate_limit), - "X-RateLimit-Remaining": str(remaining), - "X-RateLimit-Reset": str(int(time.time()) + RATE_LIMIT_WINDOW), - "Retry-After": str(RATE_LIMIT_WINDOW), - }, - ) - remaining = rate_info.remaining(key_info.rate_limit, RATE_LIMIT_WINDOW) - else: - remaining = None - - logger.debug(f"Authenticated request from '{key_info.name}' to {path}") - - return AuthResult( - authenticated=True, - key_name=key_info.name, - is_internal=key_info.is_internal, - rate_limit_remaining=remaining, - ) - - -def get_api_key_stats() -> dict: - """Get statistics about API keys (for admin endpoint).""" - stats = { - "total_keys": len(_api_keys), - "auth_required": REQUIRE_AUTH, - "rate_limit": { - "requests_per_window": RATE_LIMIT_REQUESTS, - "window_seconds": RATE_LIMIT_WINDOW, - }, - "keys": [], - } - - for key, info in _api_keys.items(): - # Don't expose actual keys, just metadata - masked_key = key[:8] + "..." if len(key) > 8 else "***" - rate_info = _rate_limits.get(key, RateLimitInfo()) - stats["keys"].append({ - "name": info.name, - "key_prefix": masked_key, - "is_internal": info.is_internal, - "requests_in_window": len(rate_info.requests), - "remaining": rate_info.remaining(info.rate_limit, RATE_LIMIT_WINDOW), - }) - - return stats - - -def reload_api_keys(): - """Reload API keys from environment (for runtime updates).""" - global _api_keys - _api_keys = _parse_api_keys() - logger.info(f"Reloaded {len(_api_keys)} API keys") diff --git a/services/mana-stt/app/external_auth.py b/services/mana-stt/app/external_auth.py deleted file mode 100644 index 6f64bd315..000000000 --- a/services/mana-stt/app/external_auth.py +++ /dev/null @@ -1,145 +0,0 @@ -""" -External API Key Validation via mana-core-auth - -When EXTERNAL_AUTH_ENABLED=true, API keys are validated against the -central mana-core-auth service. This allows users to create and manage -API keys from the mana.how web interface. 
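-
-Example configuration (a sketch; the variable names are the ones read below):
-
-    EXTERNAL_AUTH_ENABLED=true
-    MANA_CORE_AUTH_URL=http://localhost:3001
-    API_KEY_CACHE_TTL=300      # seconds
-    EXTERNAL_AUTH_TIMEOUT=5.0  # seconds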
- -Results are cached for 5 minutes to reduce load on the auth service. -""" - -import os -import time -import logging -import httpx -from typing import Optional -from dataclasses import dataclass - -logger = logging.getLogger(__name__) - -# Configuration -EXTERNAL_AUTH_ENABLED = os.getenv("EXTERNAL_AUTH_ENABLED", "false").lower() == "true" -MANA_CORE_AUTH_URL = os.getenv("MANA_CORE_AUTH_URL", "http://localhost:3001") -API_KEY_CACHE_TTL = int(os.getenv("API_KEY_CACHE_TTL", "300")) # 5 minutes -EXTERNAL_AUTH_TIMEOUT = float(os.getenv("EXTERNAL_AUTH_TIMEOUT", "5.0")) # seconds - - -@dataclass -class ExternalValidationResult: - """Result from external API key validation.""" - valid: bool - user_id: Optional[str] = None - scopes: Optional[list] = None - rate_limit_requests: int = 60 - rate_limit_window: int = 60 - error: Optional[str] = None - cached_at: float = 0.0 - - -# In-memory cache for validation results -# Key: API key, Value: ExternalValidationResult -_validation_cache: dict[str, ExternalValidationResult] = {} - - -def is_external_auth_enabled() -> bool: - """Check if external authentication is enabled.""" - return EXTERNAL_AUTH_ENABLED - - -def _get_cached_result(api_key: str) -> Optional[ExternalValidationResult]: - """Get cached validation result if still valid.""" - result = _validation_cache.get(api_key) - if result and (time.time() - result.cached_at) < API_KEY_CACHE_TTL: - return result - return None - - -def _cache_result(api_key: str, result: ExternalValidationResult): - """Cache a validation result.""" - result.cached_at = time.time() - _validation_cache[api_key] = result - - # Clean up old entries periodically (keep cache size manageable) - if len(_validation_cache) > 1000: - now = time.time() - expired_keys = [ - k for k, v in _validation_cache.items() - if (now - v.cached_at) >= API_KEY_CACHE_TTL - ] - for k in expired_keys: - del _validation_cache[k] - - -async def validate_api_key_external(api_key: str, scope: str) -> Optional[ExternalValidationResult]: - """ - Validate an API key against mana-core-auth service. - - Args: - api_key: The API key to validate (e.g., "sk_live_...") - scope: The required scope (e.g., "stt" or "tts") - - Returns: - ExternalValidationResult if external auth is enabled and the key was validated. - None if external auth is disabled or the service is unavailable (fallback to local). 
- """ - if not EXTERNAL_AUTH_ENABLED: - return None - - # Check cache first - cached = _get_cached_result(api_key) - if cached: - logger.debug(f"Using cached validation result for key prefix: {api_key[:12]}...") - # Check scope against cached result - if cached.valid and cached.scopes and scope not in cached.scopes: - return ExternalValidationResult( - valid=False, - error=f"API key does not have scope: {scope}", - ) - return cached - - # Call mana-core-auth validation endpoint - try: - async with httpx.AsyncClient(timeout=EXTERNAL_AUTH_TIMEOUT) as client: - response = await client.post( - f"{MANA_CORE_AUTH_URL}/api/v1/api-keys/validate", - json={"apiKey": api_key, "scope": scope}, - ) - - if response.status_code == 200: - data = response.json() - result = ExternalValidationResult( - valid=data.get("valid", False), - user_id=data.get("userId"), - scopes=data.get("scopes", []), - rate_limit_requests=data.get("rateLimit", {}).get("requests", 60), - rate_limit_window=data.get("rateLimit", {}).get("window", 60), - error=data.get("error"), - ) - _cache_result(api_key, result) - return result - else: - logger.warning( - f"External auth returned status {response.status_code}: {response.text}" - ) - # Don't cache errors - allow retry - return ExternalValidationResult( - valid=False, - error=f"Auth service returned {response.status_code}", - ) - - except httpx.TimeoutException: - logger.warning("External auth service timeout - falling back to local auth") - return None - except httpx.ConnectError: - logger.warning("Cannot connect to external auth service - falling back to local auth") - return None - except Exception as e: - logger.error(f"External auth error: {e}") - return None - - -def clear_cache(): - """Clear the validation cache (for testing or runtime updates).""" - global _validation_cache - _validation_cache.clear() - logger.info("External auth cache cleared") diff --git a/services/mana-stt/app/main.py b/services/mana-stt/app/main.py deleted file mode 100644 index f07e33a0e..000000000 --- a/services/mana-stt/app/main.py +++ /dev/null @@ -1,392 +0,0 @@ -""" -ManaCore STT API Service (WhisperX Edition) -Speech-to-Text with WhisperX: transcription, word timestamps, speaker diarization. 
- -Run with: uvicorn app.main:app --host 0.0.0.0 --port 3020 -""" - -import os -import logging -import time -from typing import Optional -from contextlib import asynccontextmanager - -from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Depends, Response -from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import JSONResponse -from pydantic import BaseModel - -from app.auth import verify_api_key, AuthResult, get_api_key_stats, REQUIRE_AUTH - -# Configure logging -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", -) -logger = logging.getLogger(__name__) - -# Environment -PORT = int(os.getenv("PORT", "3020")) -DEFAULT_WHISPER_MODEL = os.getenv("WHISPER_MODEL", "large-v3") -PRELOAD_MODELS = os.getenv("PRELOAD_MODELS", "false").lower() == "true" -CORS_ORIGINS = os.getenv( - "CORS_ORIGINS", - "https://mana.how,https://chat.mana.how,http://localhost:5173" -).split(",") - -# vLLM configuration -VLLM_URL = os.getenv("VLLM_URL", "http://localhost:8100") -USE_VLLM = os.getenv("USE_VLLM", "false").lower() == "true" - - -# Response models -class WordInfo(BaseModel): - word: str - start: float - end: float - score: Optional[float] = None - speaker: Optional[str] = None - - -class SegmentInfo(BaseModel): - start: float - end: float - text: str - speaker: Optional[str] = None - - -class TranscriptionResponse(BaseModel): - text: str - language: Optional[str] = None - model: str - latency_ms: Optional[float] = None - duration_seconds: Optional[float] = None - words: Optional[list[WordInfo]] = None - segments: Optional[list[SegmentInfo]] = None - speakers: Optional[list[str]] = None - - -class HealthResponse(BaseModel): - status: str - whisper_loaded: bool - whisperx: bool - vllm_available: bool - vllm_url: Optional[str] = None - mistral_api_available: bool - auth_required: bool - models: dict - - -class ModelsResponse(BaseModel): - whisper: list - voxtral_vllm: list - default_whisper: str - - -# Track loaded models -models_status = { - "whisper_loaded": False, - "vllm_available": False, -} - - -@asynccontextmanager -async def lifespan(app: FastAPI): - """Startup and shutdown events.""" - logger.info("Starting ManaCore STT Service (WhisperX Edition)...") - - # Check vLLM availability - if USE_VLLM: - from app.vllm_service import check_health - health = await check_health() - models_status["vllm_available"] = health.get("status") == "healthy" - - # Check Mistral API - from app.voxtral_api_service import is_available as api_available - if api_available(): - logger.info("Mistral API fallback configured") - - # Always preload WhisperX model at startup (avoids timeout on first request) - logger.info("Preloading WhisperX model...") - try: - from app.whisper_service import get_whisper_model - get_whisper_model(DEFAULT_WHISPER_MODEL) - models_status["whisper_loaded"] = True - logger.info("WhisperX model preloaded successfully") - except Exception as e: - logger.warning(f"Failed to preload WhisperX: {e}") - - logger.info(f"STT Service ready on port {PORT}") - yield - logger.info("Shutting down STT Service...") - - -# Create FastAPI app -app = FastAPI( - title="ManaCore STT Service", - description="Speech-to-Text API with WhisperX (word timestamps + speaker diarization)", - version="3.0.0", - lifespan=lifespan, -) - -# CORS middleware -app.add_middleware( - CORSMiddleware, - allow_origins=CORS_ORIGINS, - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) - - -@app.get("/health", 
response_model=HealthResponse) -async def health_check(): - """Health check endpoint.""" - from app.voxtral_api_service import is_available as api_available - from app.vllm_service import check_health - - vllm_health = await check_health() - - return HealthResponse( - status="healthy", - whisper_loaded=models_status["whisper_loaded"], - whisperx=True, - vllm_available=vllm_health.get("status") == "healthy", - vllm_url=VLLM_URL if USE_VLLM else None, - mistral_api_available=api_available(), - auth_required=REQUIRE_AUTH, - models={ - "default_whisper": DEFAULT_WHISPER_MODEL, - "engine": "whisperx", - "features": ["transcription", "word_timestamps", "speaker_diarization"], - }, - ) - - -@app.get("/models", response_model=ModelsResponse) -async def list_models(auth: AuthResult = Depends(verify_api_key)): - """List available models.""" - from app.whisper_service import AVAILABLE_MODELS as whisper_models - from app.vllm_service import get_models - - vllm_models = await get_models() - - return ModelsResponse( - whisper=whisper_models, - voxtral_vllm=vllm_models, - default_whisper=DEFAULT_WHISPER_MODEL, - ) - - -@app.post("/transcribe", response_model=TranscriptionResponse) -async def transcribe_whisper( - response: Response, - file: UploadFile = File(..., description="Audio file to transcribe"), - language: Optional[str] = Form(None, description="Language code (auto-detect if not provided)"), - model: Optional[str] = Form(None, description="Whisper model to use"), - align: bool = Form(True, description="Enable word-level timestamp alignment"), - diarize: bool = Form(False, description="Enable speaker diarization"), - min_speakers: Optional[int] = Form(None, description="Min expected speakers (helps diarization)"), - max_speakers: Optional[int] = Form(None, description="Max expected speakers"), - auth: AuthResult = Depends(verify_api_key), -): - """ - Transcribe audio using WhisperX. - - Features: - - Word-level timestamps (align=true, default) - - Speaker diarization (diarize=true, opt-in) - - Supported formats: mp3, wav, m4a, flac, ogg, webm, mp4 - Max file size: 100MB - """ - if auth.rate_limit_remaining is not None: - response.headers["X-RateLimit-Remaining"] = str(auth.rate_limit_remaining) - - if not file.filename: - raise HTTPException(status_code=400, detail="No file provided") - - allowed_extensions = {".mp3", ".wav", ".m4a", ".flac", ".ogg", ".webm", ".mp4"} - ext = os.path.splitext(file.filename)[1].lower() - if ext not in allowed_extensions: - raise HTTPException( - status_code=400, - detail=f"Unsupported file type: {ext}. 
Allowed: {allowed_extensions}" - ) - - start_time = time.time() - - try: - from app.whisper_service import transcribe_audio_bytes - - audio_bytes = await file.read() - if len(audio_bytes) > 100 * 1024 * 1024: - raise HTTPException(status_code=400, detail="File too large (max 100MB)") - - model_name = model or DEFAULT_WHISPER_MODEL - - result = await transcribe_audio_bytes( - audio_bytes=audio_bytes, - filename=file.filename, - language=language, - model_name=model_name, - align=align, - diarize=diarize, - min_speakers=min_speakers, - max_speakers=max_speakers, - ) - - models_status["whisper_loaded"] = True - latency_ms = (time.time() - start_time) * 1000 - - # Build response - resp = TranscriptionResponse( - text=result.text, - language=result.language, - model=f"whisperx-{model_name}", - latency_ms=latency_ms, - duration_seconds=result.duration, - ) - - # Add word timestamps if available - if result.words: - resp.words = [ - WordInfo( - word=w.word, - start=w.start, - end=w.end, - score=w.score, - speaker=w.speaker, - ) - for w in result.words - ] - - # Add segments - if result.segments: - resp.segments = [ - SegmentInfo(**s) for s in result.segments - ] - - # Add speakers - if result.speakers: - resp.speakers = result.speakers - - return resp - - except Exception as e: - logger.error(f"WhisperX transcription error: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - -@app.post("/transcribe/voxtral", response_model=TranscriptionResponse) -async def transcribe_voxtral( - response: Response, - file: UploadFile = File(..., description="Audio file to transcribe"), - language: str = Form("de", description="Language code"), - use_realtime: bool = Form(False, description="Use Realtime 4B model"), - auth: AuthResult = Depends(verify_api_key), -): - """Transcribe audio using Voxtral via vLLM or Mistral API.""" - if auth.rate_limit_remaining is not None: - response.headers["X-RateLimit-Remaining"] = str(auth.rate_limit_remaining) - - if not file.filename: - raise HTTPException(status_code=400, detail="No file provided") - - from app.vllm_service import ( - SUPPORTED_LANGUAGES, - is_available as vllm_available, - transcribe_audio_bytes as vllm_transcribe, - transcribe_with_realtime, - check_health, - ) - from app.voxtral_api_service import ( - is_available as api_available, - transcribe_audio_bytes as api_transcribe, - ) - - if language not in SUPPORTED_LANGUAGES: - raise HTTPException( - status_code=400, - detail=f"Unsupported language: {language}. 
Supported: {SUPPORTED_LANGUAGES}" - ) - - try: - audio_bytes = await file.read() - if len(audio_bytes) > 100 * 1024 * 1024: - raise HTTPException(status_code=400, detail="File too large (max 100MB)") - - # Try vLLM first - if USE_VLLM: - health = await check_health() - if health.get("status") == "healthy": - if use_realtime: - result = await transcribe_with_realtime( - audio_bytes=audio_bytes, filename=file.filename, language=language, - ) - else: - result = await vllm_transcribe( - audio_bytes=audio_bytes, filename=file.filename, language=language, - ) - return TranscriptionResponse( - text=result.text, language=result.language, model=result.model, - latency_ms=result.latency_ms, duration_seconds=result.duration_seconds, - ) - - # Fallback to Mistral API - if api_available(): - result = await api_transcribe( - audio_bytes=audio_bytes, filename=file.filename, language=language, - ) - return TranscriptionResponse( - text=result.text, language=result.language, model=result.model, - duration_seconds=result.duration_seconds, - ) - - raise HTTPException(status_code=503, detail="Voxtral not available.") - - except HTTPException: - raise - except Exception as e: - logger.error(f"Voxtral transcription error: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - -@app.post("/transcribe/auto", response_model=TranscriptionResponse) -async def transcribe_auto( - response: Response, - file: UploadFile = File(..., description="Audio file to transcribe"), - language: Optional[str] = Form(None, description="Language hint"), - prefer: str = Form("whisper", description="Preferred: 'whisper' or 'voxtral'"), - auth: AuthResult = Depends(verify_api_key), -): - """Auto-select best model with fallback chain.""" - if auth.rate_limit_remaining is not None: - response.headers["X-RateLimit-Remaining"] = str(auth.rate_limit_remaining) - - if prefer == "voxtral": - try: - return await transcribe_voxtral(response, file, language or "de", False, auth) - except Exception: - await file.seek(0) - return await transcribe_whisper(response, file, language, None, True, False, None, None, auth) - else: - try: - return await transcribe_whisper(response, file, language, None, True, False, None, None, auth) - except Exception: - await file.seek(0) - return await transcribe_voxtral(response, file, language or "de", False, auth) - - -@app.exception_handler(Exception) -async def global_exception_handler(request, exc): - logger.error(f"Unhandled error: {exc}") - return JSONResponse( - status_code=500, - content={"detail": "Internal server error", "error": str(exc)}, - ) - - -if __name__ == "__main__": - import uvicorn - uvicorn.run("app.main:app", host="0.0.0.0", port=PORT, reload=False) diff --git a/services/mana-stt/app/vllm_service.py b/services/mana-stt/app/vllm_service.py deleted file mode 100644 index 4ca1857a1..000000000 --- a/services/mana-stt/app/vllm_service.py +++ /dev/null @@ -1,178 +0,0 @@ -""" -vLLM Voxtral Service - Proxy to vLLM server for Voxtral transcription - -vLLM provides optimized inference for Voxtral models with an OpenAI-compatible API. -This service proxies requests to the vLLM server. 
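-
-The proxied call is the OpenAI-compatible transcription request; the raw
-equivalent looks like this (a sketch; URL, route, and form fields as used in
-transcribe_audio_bytes below):
-
-    import httpx
-
-    with open("audio.wav", "rb") as f:
-        resp = httpx.post(
-            "http://localhost:8100/v1/audio/transcriptions",
-            files={"file": ("audio.wav", f, "audio/wav")},
-            data={"model": "mistralai/Voxtral-Mini-3B-2507", "language": "de"},
-        )
-    print(resp.json()["text"])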
- -Requirements: -- vLLM server running on VLLM_URL (default: http://localhost:8100) -- Model loaded: Voxtral-Mini-3B-2507 or Voxtral-Mini-4B-Realtime-2602 -""" - -import os -import logging -import time -import tempfile -import httpx -from pathlib import Path -from typing import Optional -from dataclasses import dataclass - -logger = logging.getLogger(__name__) - -# vLLM server configuration -VLLM_URL = os.getenv("VLLM_URL", "http://localhost:8100") -VLLM_TIMEOUT = int(os.getenv("VLLM_TIMEOUT", "300")) # 5 minutes for long audio - -# Model IDs -VOXTRAL_3B = "mistralai/Voxtral-Mini-3B-2507" -VOXTRAL_4B_REALTIME = "mistralai/Voxtral-Mini-4B-Realtime-2602" - - -@dataclass -class VllmTranscriptionResult: - text: str - language: Optional[str] = None - model: str = "voxtral-vllm" - latency_ms: Optional[float] = None - duration_seconds: Optional[float] = None - - -async def check_health() -> dict: - """Check if vLLM server is healthy.""" - try: - async with httpx.AsyncClient(timeout=5.0) as client: - response = await client.get(f"{VLLM_URL}/health") - if response.status_code == 200: - return {"status": "healthy", "url": VLLM_URL} - return {"status": "unhealthy", "url": VLLM_URL, "code": response.status_code} - except Exception as e: - return {"status": "unavailable", "url": VLLM_URL, "error": str(e)} - - -async def get_models() -> list: - """Get available models from vLLM server.""" - try: - async with httpx.AsyncClient(timeout=5.0) as client: - response = await client.get(f"{VLLM_URL}/v1/models") - if response.status_code == 200: - data = response.json() - return [m["id"] for m in data.get("data", [])] - return [] - except Exception: - return [] - - -def is_available() -> bool: - """Check if vLLM server is configured.""" - return bool(VLLM_URL) - - -async def transcribe_audio_bytes( - audio_bytes: bytes, - filename: str, - language: Optional[str] = "de", - model: Optional[str] = None, -) -> VllmTranscriptionResult: - """ - Transcribe audio using vLLM Voxtral server. - - Args: - audio_bytes: Raw audio bytes - filename: Original filename (for format detection) - language: Language code (de, en, fr, etc.) 
- model: Model to use (defaults to Voxtral-Mini-3B-2507) - - Returns: - VllmTranscriptionResult with transcription - """ - start_time = time.time() - model_id = model or VOXTRAL_3B - - logger.info(f"Transcribing via vLLM: {filename} ({len(audio_bytes)} bytes)") - - # Save to temp file (vLLM API accepts file uploads) - ext = Path(filename).suffix or ".wav" - with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp: - tmp.write(audio_bytes) - tmp_path = tmp.name - - try: - async with httpx.AsyncClient(timeout=VLLM_TIMEOUT) as client: - # Use OpenAI-compatible transcription endpoint - with open(tmp_path, "rb") as f: - files = {"file": (filename, f, "audio/wav")} - data = { - "model": model_id, - "language": language or "de", - "response_format": "json", - "temperature": 0.0, # Deterministic for transcription - } - - response = await client.post( - f"{VLLM_URL}/v1/audio/transcriptions", - files=files, - data=data, - ) - - if response.status_code != 200: - error_detail = response.text - logger.error(f"vLLM error: {response.status_code} - {error_detail}") - raise RuntimeError(f"vLLM transcription failed: {error_detail}") - - result = response.json() - text = result.get("text", "") - duration = result.get("duration") - - latency_ms = (time.time() - start_time) * 1000 - logger.info(f"vLLM transcription complete: {len(text)} chars in {latency_ms:.0f}ms") - - return VllmTranscriptionResult( - text=text.strip(), - language=language, - model=f"vllm-{model_id.split('/')[-1]}", - latency_ms=latency_ms, - duration_seconds=duration, - ) - - finally: - try: - os.unlink(tmp_path) - except Exception: - pass - - -async def transcribe_with_realtime( - audio_bytes: bytes, - filename: str, - language: Optional[str] = "de", -) -> VllmTranscriptionResult: - """ - Transcribe using Voxtral 4B Realtime model. - - Optimized for low latency (<500ms). - """ - return await transcribe_audio_bytes( - audio_bytes=audio_bytes, - filename=filename, - language=language, - model=VOXTRAL_4B_REALTIME, - ) - - -# Supported languages (same as Voxtral) -SUPPORTED_LANGUAGES = [ - "en", # English - "zh", # Chinese - "hi", # Hindi - "es", # Spanish - "ar", # Arabic - "fr", # French - "pt", # Portuguese - "ru", # Russian - "de", # German - "ja", # Japanese - "ko", # Korean - "it", # Italian - "nl", # Dutch -] diff --git a/services/mana-stt/app/voxtral_api_service.py b/services/mana-stt/app/voxtral_api_service.py deleted file mode 100644 index 53d78f808..000000000 --- a/services/mana-stt/app/voxtral_api_service.py +++ /dev/null @@ -1,213 +0,0 @@ -""" -Voxtral API Service - Mistral Cloud API Fallback -Uses Mistral's hosted Voxtral Mini Transcribe V2 when local service is overloaded. 
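-
-Typical use from an async caller (a sketch; the coroutine and its parameters
-are defined below):
-
-    from app.voxtral_api_service import is_available, transcribe_audio_bytes
-
-    if is_available():
-        result = await transcribe_audio_bytes(
-            audio_bytes, "audio.wav", language="de", diarization=True,
-        )
-        print(result.text, result.speakers)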
- -Features: -- Speaker diarization -- Word-level timestamps -- Context biasing for domain-specific terms -- 13 language support -""" - -import os -import logging -import tempfile -from pathlib import Path -from typing import Optional, Literal -from dataclasses import dataclass, field - -logger = logging.getLogger(__name__) - -# Lazy load client -_mistral_client = None - -MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY") -DEFAULT_MODEL = "voxtral-mini-latest" # voxtral-mini-2602 - - -@dataclass -class Speaker: - """Speaker information from diarization.""" - id: str - start: float - end: float - - -@dataclass -class WordTimestamp: - """Word-level timestamp.""" - word: str - start: float - end: float - - -@dataclass -class SegmentTimestamp: - """Segment-level timestamp.""" - text: str - start: float - end: float - speaker: Optional[str] = None - - -@dataclass -class VoxtralApiResult: - """Result from Voxtral API transcription.""" - text: str - language: Optional[str] = None - model: str = "voxtral-api" - duration_seconds: Optional[float] = None - words: list[WordTimestamp] = field(default_factory=list) - segments: list[SegmentTimestamp] = field(default_factory=list) - speakers: list[Speaker] = field(default_factory=list) - - -def get_mistral_client(): - """Get or create Mistral client instance.""" - global _mistral_client - - if _mistral_client is None: - if not MISTRAL_API_KEY: - raise RuntimeError( - "MISTRAL_API_KEY environment variable not set. " - "Get your API key at https://console.mistral.ai/" - ) - - try: - from mistralai import Mistral - _mistral_client = Mistral(api_key=MISTRAL_API_KEY) - logger.info("Mistral API client initialized") - except ImportError: - raise RuntimeError( - "mistralai package not installed. " - "Run: pip install mistralai" - ) - - return _mistral_client - - -def is_available() -> bool: - """Check if Mistral API is configured and available.""" - return bool(MISTRAL_API_KEY) - - -async def transcribe_audio_bytes( - audio_bytes: bytes, - filename: str, - language: Optional[str] = None, - timestamp_granularity: Optional[Literal["word", "segment"]] = None, - diarization: bool = False, - context_bias: Optional[list[str]] = None, -) -> VoxtralApiResult: - """ - Transcribe audio using Mistral's Voxtral API. - - Args: - audio_bytes: Raw audio bytes - filename: Original filename (for extension detection) - language: Language code (de, en, fr, etc.) 
- auto-detect if None - timestamp_granularity: "word" or "segment" for timestamps - diarization: Enable speaker diarization - context_bias: List of domain-specific terms to improve accuracy (max 100) - - Returns: - VoxtralApiResult with transcription and optional metadata - """ - client = get_mistral_client() - - logger.info(f"Transcribing via Mistral API: {filename} ({len(audio_bytes)} bytes)") - - try: - # Build request parameters - request_params = { - "model": DEFAULT_MODEL, - "file": { - "content": audio_bytes, - "file_name": filename, - }, - } - - # Language and timestamps are mutually exclusive in current API - if language and not timestamp_granularity: - request_params["language"] = language - - if timestamp_granularity: - request_params["timestamp_granularities"] = [timestamp_granularity] - - if diarization: - request_params["diarization"] = True - - if context_bias: - # API accepts comma-separated string, max 100 terms - bias_terms = context_bias[:100] - request_params["context_bias"] = ",".join(bias_terms) - - # Make API call - response = client.audio.transcriptions.complete(**request_params) - - # Parse response - result = VoxtralApiResult( - text=response.text, - language=getattr(response, "language", language), - model=f"voxtral-api-{DEFAULT_MODEL}", - duration_seconds=getattr(response, "duration", None), - ) - - # Parse word timestamps if present - if hasattr(response, "words") and response.words: - result.words = [ - WordTimestamp( - word=w.word, - start=w.start, - end=w.end, - ) - for w in response.words - ] - - # Parse segment timestamps if present - if hasattr(response, "segments") and response.segments: - result.segments = [ - SegmentTimestamp( - text=s.text, - start=s.start, - end=s.end, - speaker=getattr(s, "speaker", None), - ) - for s in response.segments - ] - - # Parse speakers if diarization enabled - if hasattr(response, "speakers") and response.speakers: - result.speakers = [ - Speaker( - id=sp.id, - start=sp.start, - end=sp.end, - ) - for sp in response.speakers - ] - - logger.info(f"Mistral API transcription complete: {len(result.text)} characters") - return result - - except Exception as e: - logger.error(f"Mistral API transcription failed: {e}") - raise - - -# Supported languages by Voxtral API (13 languages) -SUPPORTED_LANGUAGES = [ - "en", # English - "zh", # Chinese - "hi", # Hindi - "es", # Spanish - "ar", # Arabic - "fr", # French - "pt", # Portuguese - "ru", # Russian - "de", # German - "ja", # Japanese - "ko", # Korean - "it", # Italian - "nl", # Dutch -] diff --git a/services/mana-stt/app/voxtral_service.py b/services/mana-stt/app/voxtral_service.py deleted file mode 100644 index 320e5020d..000000000 --- a/services/mana-stt/app/voxtral_service.py +++ /dev/null @@ -1,267 +0,0 @@ -""" -Voxtral STT Service using Hugging Face Transformers -Mistral AI's Speech-to-Text model (Apache 2.0 License) - -Uses VoxtralForConditionalGeneration with apply_transcription_request -as per official HuggingFace documentation. 
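-
-Typical use (a sketch; the function and result fields are defined below):
-
-    from app.voxtral_service import transcribe_audio
-
-    result = transcribe_audio("meeting.wav", language="de")
-    print(result.text, result.latency_ms)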
-""" - -import os -import tempfile -import logging -import time -from pathlib import Path -from typing import Optional -from dataclasses import dataclass - -logger = logging.getLogger(__name__) - -# Lazy load to avoid import errors -_voxtral_model = None -_voxtral_processor = None -_model_name = None - -# Default model -DEFAULT_MODEL = "mistralai/Voxtral-Mini-3B-2507" - - -@dataclass -class VoxtralTranscriptionResult: - text: str - language: Optional[str] = None - model: str = "voxtral-mini-3b" - latency_ms: Optional[float] = None - - -def get_voxtral_model(model_name: str = DEFAULT_MODEL): - """ - Get or create Voxtral model instance. - - Uses VoxtralForConditionalGeneration (the correct class for Voxtral). - """ - global _voxtral_model, _voxtral_processor, _model_name - - # Reload if different model requested - if _voxtral_model is not None and _model_name != model_name: - logger.info(f"Switching model from {_model_name} to {model_name}") - _voxtral_model = None - _voxtral_processor = None - - if _voxtral_model is None: - logger.info(f"Loading Voxtral model: {model_name}") - try: - import torch - from transformers import VoxtralForConditionalGeneration, AutoProcessor - - # Determine device and dtype - if torch.backends.mps.is_available(): - device = "mps" - # MPS works better with float16 - torch_dtype = torch.float16 - elif torch.cuda.is_available(): - device = "cuda" - torch_dtype = torch.bfloat16 - else: - device = "cpu" - torch_dtype = torch.float32 - - logger.info(f"Using device: {device}, dtype: {torch_dtype}") - - # Load processor - _voxtral_processor = AutoProcessor.from_pretrained(model_name) - - # Load model with VoxtralForConditionalGeneration - if device == "mps": - # MPS doesn't support device_map, load to CPU first then move - _voxtral_model = VoxtralForConditionalGeneration.from_pretrained( - model_name, - torch_dtype=torch_dtype, - ) - _voxtral_model = _voxtral_model.to(device) - else: - _voxtral_model = VoxtralForConditionalGeneration.from_pretrained( - model_name, - torch_dtype=torch_dtype, - device_map=device, - ) - - _model_name = model_name - logger.info(f"Voxtral model loaded successfully on {device}") - - except ImportError as e: - logger.error(f"Failed to import transformers: {e}") - raise RuntimeError( - "transformers >= 4.54.0 required. " - "Run: pip install --upgrade transformers" - ) - except Exception as e: - logger.error(f"Failed to load Voxtral model: {e}") - raise - - return _voxtral_model, _voxtral_processor - - -def transcribe_audio( - audio_path: str, - language: Optional[str] = "de", - model_name: str = DEFAULT_MODEL, -) -> VoxtralTranscriptionResult: - """ - Transcribe audio file using Voxtral. - - Uses the official apply_transcription_request method. - - Args: - audio_path: Path to audio file - language: Language code (de, en, fr, etc.) 
- model_name: Hugging Face model ID - - Returns: - VoxtralTranscriptionResult with transcribed text - """ - import torch - - model, processor = get_voxtral_model(model_name) - device = next(model.parameters()).device - dtype = next(model.parameters()).dtype - - logger.info(f"Transcribing with Voxtral: {audio_path}") - start_time = time.time() - - try: - # Use apply_transcription_request (official method) - # This handles audio loading and preprocessing internally - inputs = processor.apply_transcription_request( - language=language or "en", - audio=audio_path, - model_id=model_name, - ) - - # Move inputs to device and dtype - inputs = inputs.to(device, dtype=dtype) - - # Generate transcription - with torch.no_grad(): - outputs = model.generate( - **inputs, - max_new_tokens=500, - do_sample=False, - ) - - # Decode - skip input tokens - input_len = inputs.input_ids.shape[1] - decoded = processor.batch_decode( - outputs[:, input_len:], - skip_special_tokens=True, - ) - - text = decoded[0] if decoded else "" - latency_ms = (time.time() - start_time) * 1000 - - logger.info(f"Voxtral transcription complete: {len(text)} chars in {latency_ms:.0f}ms") - - return VoxtralTranscriptionResult( - text=text.strip(), - language=language, - model=model_name.split("/")[-1], - latency_ms=latency_ms, - ) - - except Exception as e: - logger.error(f"Voxtral transcription failed: {e}") - raise - - -async def transcribe_audio_bytes( - audio_bytes: bytes, - filename: str, - language: Optional[str] = "de", - model_name: str = DEFAULT_MODEL, -) -> VoxtralTranscriptionResult: - """ - Transcribe audio from bytes (for API uploads). - """ - ext = Path(filename).suffix or ".wav" - - with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp: - tmp.write(audio_bytes) - tmp_path = tmp.name - - try: - result = transcribe_audio( - audio_path=tmp_path, - language=language, - model_name=model_name, - ) - return result - finally: - try: - os.unlink(tmp_path) - except Exception: - pass - - -def unload_model(): - """Unload model to free memory.""" - global _voxtral_model, _voxtral_processor, _model_name - - if _voxtral_model is not None: - del _voxtral_model - del _voxtral_processor - _voxtral_model = None - _voxtral_processor = None - _model_name = None - - import gc - gc.collect() - - try: - import torch - if torch.backends.mps.is_available(): - torch.mps.empty_cache() - elif torch.cuda.is_available(): - torch.cuda.empty_cache() - except Exception: - pass - - logger.info("Voxtral model unloaded") - - -def is_loaded() -> bool: - """Check if model is currently loaded.""" - return _voxtral_model is not None - - -def get_loaded_model_name() -> Optional[str]: - """Get name of currently loaded model.""" - return _model_name - - -# Supported languages (13 languages as per Mistral docs) -SUPPORTED_LANGUAGES = [ - "en", # English - "zh", # Chinese - "hi", # Hindi - "es", # Spanish - "ar", # Arabic - "fr", # French - "pt", # Portuguese - "ru", # Russian - "de", # German - "ja", # Japanese - "ko", # Korean - "it", # Italian - "nl", # Dutch -] - -# Available models -AVAILABLE_MODELS = [ - { - "id": "voxtral-mini-3b", - "name": "Voxtral-Mini-3B-2507", - "huggingface_id": "mistralai/Voxtral-Mini-3B-2507", - "params": "3B", - "vram": "~6GB", - "description": "Balanced quality and speed for local deployment", - }, -] diff --git a/services/mana-stt/app/vram_manager.py b/services/mana-stt/app/vram_manager.py deleted file mode 100644 index 89b5656ae..000000000 --- a/services/mana-stt/app/vram_manager.py +++ /dev/null @@ -1,114 +0,0 @@ -""" 
VRAM Manager — Automatic model unloading after idle timeout.
-
-Tracks last usage time per model and unloads after configurable timeout.
-Designed for shared GPU environments (multiple services on one RTX 3090).
-
-Usage in a service:
-    from vram_manager import VramManager
-
-    vram = VramManager(idle_timeout=300)  # 5 min
-
-    # Before using a model
-    vram.touch()
-
-    # Call periodically (e.g., from health check or background task)
-    vram.check_and_unload(unload_fn=my_unload_function)
-"""
-
-import os
-import time
-import logging
-import threading
-from typing import Optional, Callable
-
-logger = logging.getLogger(__name__)
-
-DEFAULT_IDLE_TIMEOUT = int(os.getenv("VRAM_IDLE_TIMEOUT", "300"))  # 5 minutes
-
-
-class VramManager:
-    def __init__(self, idle_timeout: int = DEFAULT_IDLE_TIMEOUT, service_name: str = "unknown"):
-        self.idle_timeout = idle_timeout
-        self.service_name = service_name
-        self.last_used: float = 0.0
-        self.model_loaded: bool = False
-        self._lock = threading.Lock()
-        self._timer: Optional[threading.Timer] = None
-
-    def touch(self):
-        """Mark the model as recently used. Call before/after each inference."""
-        with self._lock:
-            self.last_used = time.time()
-            self.model_loaded = True
-            self._schedule_check()
-
-    def mark_loaded(self):
-        """Mark that a model has been loaded into VRAM."""
-        with self._lock:
-            self.model_loaded = True
-            self.last_used = time.time()
-            self._schedule_check()
-        logger.info(f"[{self.service_name}] Model loaded, idle timeout: {self.idle_timeout}s")
-
-    def mark_unloaded(self):
-        """Mark that a model has been unloaded from VRAM."""
-        with self._lock:
-            self.model_loaded = False
-            if self._timer:
-                self._timer.cancel()
-                self._timer = None
-        logger.info(f"[{self.service_name}] Model unloaded, VRAM freed")
-
-    def is_idle(self) -> bool:
-        """Check if the model has been idle longer than the timeout."""
-        if not self.model_loaded:
-            return False
-        return (time.time() - self.last_used) > self.idle_timeout
-
-    def seconds_until_unload(self) -> Optional[float]:
-        """Seconds until the model will be unloaded, or None if not loaded."""
-        if not self.model_loaded:
-            return None
-        remaining = self.idle_timeout - (time.time() - self.last_used)
-        return max(0, remaining)
-
-    def check_and_unload(self, unload_fn: Callable[[], None]) -> bool:
-        """Check if idle and unload if so. Returns True if unloaded."""
-        if self.is_idle():
-            logger.info(f"[{self.service_name}] Idle for >{self.idle_timeout}s, unloading model...")
-            try:
-                unload_fn()
-                self.mark_unloaded()
-                return True
-            except Exception as e:
-                logger.error(f"[{self.service_name}] Failed to unload: {e}")
-        return False
-
-    def _schedule_check(self):
-        """Schedule an idle check after the timeout period."""
-        if self._timer:
-            self._timer.cancel()
-
-        self._timer = threading.Timer(
-            self.idle_timeout + 5,  # Small buffer
-            self._auto_check,
-        )
-        self._timer.daemon = True
-        self._timer.start()
-
-    def _auto_check(self):
-        """Auto-triggered idle check (called by timer)."""
-        # This is just a log — actual unloading needs the unload_fn,
-        # which depends on the service. The service should call check_and_unload.
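-        # Example service-side wiring (a sketch; whisper_service.py in this
-        # same package does exactly this in its transcribe path):
-        #     _vram.check_and_unload(unload_models)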
- if self.is_idle(): - logger.info(f"[{self.service_name}] Model idle for >{self.idle_timeout}s — ready to unload") - - def status(self) -> dict: - """Get current VRAM manager status.""" - return { - "model_loaded": self.model_loaded, - "idle_seconds": round(time.time() - self.last_used, 1) if self.model_loaded else None, - "idle_timeout": self.idle_timeout, - "seconds_until_unload": round(self.seconds_until_unload(), 1) if self.model_loaded else None, - } diff --git a/services/mana-stt/app/whisper_service.py b/services/mana-stt/app/whisper_service.py deleted file mode 100644 index 821e22d9b..000000000 --- a/services/mana-stt/app/whisper_service.py +++ /dev/null @@ -1,358 +0,0 @@ -""" -Whisper STT Service using WhisperX (CUDA) -Provides: transcription, word-level timestamps, speaker diarization. - -WhisperX pipeline: -1. faster-whisper for transcription -2. wav2vec2 for forced alignment (precise word timestamps) -3. pyannote-audio for speaker diarization -""" - -import os -import tempfile -import logging -from pathlib import Path -from typing import Optional -from dataclasses import dataclass, field - -logger = logging.getLogger(__name__) - -# Lazy load -_whisperx_model = None -_align_model = None -_align_metadata = None -_diarize_pipeline = None - -# Config -HF_TOKEN = os.getenv("HF_TOKEN", "") - -# VRAM management — unload after 10 min idle (STT uses ~3GB) -from app.vram_manager import VramManager -_vram = VramManager( - idle_timeout=int(os.getenv("VRAM_IDLE_TIMEOUT", "600")), - service_name="mana-stt", -) - - -def unload_models(): - """Unload all WhisperX models from GPU to free VRAM.""" - global _whisperx_model, _align_model, _align_metadata, _diarize_pipeline - import torch - - if _whisperx_model is not None: - del _whisperx_model - _whisperx_model = None - if _align_model is not None: - del _align_model - _align_model = None - _align_metadata = None - if _diarize_pipeline is not None: - del _diarize_pipeline - _diarize_pipeline = None - - torch.cuda.empty_cache() - _vram.mark_unloaded() - logger.info("WhisperX models unloaded, VRAM freed") - - -@dataclass -class WordSegment: - word: str - start: float - end: float - score: Optional[float] = None - speaker: Optional[str] = None - - -@dataclass -class TranscriptionResult: - text: str - language: Optional[str] = None - duration: Optional[float] = None - segments: Optional[list] = None - words: Optional[list[WordSegment]] = field(default_factory=list) - speakers: Optional[list[str]] = field(default_factory=list) - - -def get_whisper_model(model_name: str = "large-v3", **kwargs): - """Get or create WhisperX model instance (singleton).""" - global _whisperx_model - - if _whisperx_model is not None: - return _whisperx_model - - logger.info(f"Loading WhisperX model: {model_name}") - try: - import whisperx - - device = os.getenv("WHISPER_DEVICE", "cuda") - compute_type = os.getenv("WHISPER_COMPUTE_TYPE", "float16") - - default_language = os.getenv("WHISPER_DEFAULT_LANGUAGE", "de") - _whisperx_model = whisperx.load_model( - model_name, - device=device, - compute_type=compute_type, - language=default_language, - ) - logger.info(f"WhisperX model loaded: {model_name} on {device} ({compute_type})") - _vram.mark_loaded() - except ImportError as e: - logger.error(f"Failed to import whisperx: {e}") - raise RuntimeError("whisperx not installed. 
Run: pip install whisperx")
-    except Exception as e:
-        logger.error(f"Failed to load WhisperX model: {e}")
-        raise
-
-    return _whisperx_model
-
-
-def _get_align_model(language: str, device: str = "cuda"):
-    """Get or create alignment model for a language."""
-    global _align_model, _align_metadata
-
-    import whisperx
-
-    # Reload if the language changed (alignment models are language-specific)
-    if _align_model is None or (_align_metadata or {}).get("language") != language:
-        logger.info(f"Loading alignment model for language: {language}")
-        _align_model, _align_metadata = whisperx.load_align_model(
-            language_code=language,
-            device=device,
-        )
-        logger.info("Alignment model loaded")
-
-    return _align_model, _align_metadata
-
-
-def _get_diarize_pipeline(device: str = "cuda"):
-    """Get or create speaker diarization pipeline."""
-    global _diarize_pipeline
-
-    if _diarize_pipeline is not None:
-        return _diarize_pipeline
-
-    import torch
-    from pyannote.audio import Pipeline
-
-    token = HF_TOKEN or os.getenv("HUGGING_FACE_HUB_TOKEN", "")
-    if not token:
-        logger.warning("No HF_TOKEN set — speaker diarization may fail for gated models")
-
-    logger.info("Loading speaker diarization pipeline (pyannote)...")
-    _diarize_pipeline = Pipeline.from_pretrained(
-        "pyannote/speaker-diarization-3.1",
-        token=token,
-    )
-    _diarize_pipeline.to(torch.device(device))
-    logger.info("Diarization pipeline loaded")
-    return _diarize_pipeline
-
-
-def transcribe_audio(
-    audio_path: str,
-    language: Optional[str] = None,
-    model_name: str = "large-v3",
-    align: bool = True,
-    diarize: bool = False,
-    min_speakers: Optional[int] = None,
-    max_speakers: Optional[int] = None,
-) -> TranscriptionResult:
-    """
-    Transcribe audio using WhisperX with optional alignment and diarization.
-
-    Args:
-        audio_path: Path to audio file
-        language: Language code (auto-detect if None)
-        model_name: Whisper model to use
-        align: Enable word-level timestamp alignment
-        diarize: Enable speaker diarization
-        min_speakers: Minimum expected speakers (helps diarization)
-        max_speakers: Maximum expected speakers
-
-    Returns:
-        TranscriptionResult with text, word timestamps, and speaker info
-    """
-    import whisperx
-
-    device = os.getenv("WHISPER_DEVICE", "cuda")
-
-    logger.info(f"Transcribing: {audio_path} (align={align}, diarize={diarize})")
-
-    # Unload first if the current model has sat idle past the timeout, so a
-    # stale model is not kept alive by the local reference below; then
-    # (re)load it and mark it as in use.
-    _vram.check_and_unload(unload_models)
-    model = get_whisper_model(model_name)
-    _vram.touch()
-
-    # Step 1: Load audio
-    audio = whisperx.load_audio(audio_path)
-
-    # Step 2: Transcribe with faster-whisper (batch size from WHISPERX_BATCH_SIZE)
-    transcribe_kwargs = {"batch_size": int(os.getenv("WHISPERX_BATCH_SIZE", "16"))}
-    if language:
-        transcribe_kwargs["language"] = language
-    result = model.transcribe(audio, **transcribe_kwargs)
-    detected_language = result.get("language", language or "en")
-
-    # Step 3: Align (word-level timestamps)
-    if align and result["segments"]:
-        try:
-            align_model, metadata = _get_align_model(detected_language, device)
-            result = whisperx.align(
-                result["segments"],
-                align_model,
-                metadata,
-                audio,
-                device,
-                return_char_alignments=False,
-            )
-            logger.info("Word alignment complete")
-        except Exception as e:
-            logger.warning(f"Alignment failed (continuing without): {e}")
-
-    # Step 4: Diarize (speaker identification)
-    if diarize:
-        try:
-            import torch
-            import torchaudio
-
-            diarize_pipe = _get_diarize_pipeline(device)
-
-            # pyannote needs waveform as tensor, not the whisperx audio array
-            waveform = torch.from_numpy(audio).unsqueeze(0).float()
-            diarize_input = {"waveform": waveform, "sample_rate": 16000}
-
-            diarize_kwargs = {}
-            if min_speakers is 
not None: - diarize_kwargs["min_speakers"] = min_speakers - if max_speakers is not None: - diarize_kwargs["max_speakers"] = max_speakers - - diarize_output = diarize_pipe(diarize_input, **diarize_kwargs) - - # pyannote 4.x returns DiarizeOutput, extract the Annotation - if hasattr(diarize_output, "speaker_diarization"): - diarize_annotation = diarize_output.speaker_diarization - else: - diarize_annotation = diarize_output - - # Convert pyannote output to DataFrame for whisperx - import pandas as pd - diarize_rows = [] - for turn, _, speaker in diarize_annotation.itertracks(yield_label=True): - diarize_rows.append({ - "start": turn.start, - "end": turn.end, - "speaker": speaker, - }) - - diarize_df = pd.DataFrame(diarize_rows) - result = whisperx.assign_word_speakers(diarize_df, result) - logger.info("Speaker diarization complete") - except Exception as e: - logger.warning(f"Diarization failed (continuing without): {e}") - import traceback - traceback.print_exc() - - # Build response - segments = result.get("segments", []) - full_text_parts = [] - all_words = [] - speaker_set = set() - - for seg in segments: - full_text_parts.append(seg.get("text", "")) - speaker = seg.get("speaker") - if speaker: - speaker_set.add(speaker) - - for word_info in seg.get("words", []): - all_words.append(WordSegment( - word=word_info.get("word", ""), - start=word_info.get("start", 0.0), - end=word_info.get("end", 0.0), - score=word_info.get("score"), - speaker=word_info.get("speaker", speaker), - )) - - text = " ".join(full_text_parts) - - _vram.touch() - logger.info( - f"Transcription complete: {len(text)} chars, " - f"{len(all_words)} words, {len(speaker_set)} speakers" - ) - - return TranscriptionResult( - text=text.strip(), - language=detected_language, - segments=[{ - "start": s.get("start", 0), - "end": s.get("end", 0), - "text": s.get("text", ""), - "speaker": s.get("speaker"), - } for s in segments], - words=all_words, - speakers=sorted(speaker_set), - ) - - -async def transcribe_audio_bytes( - audio_bytes: bytes, - filename: str, - language: Optional[str] = None, - model_name: str = "large-v3", - align: bool = True, - diarize: bool = False, - min_speakers: Optional[int] = None, - max_speakers: Optional[int] = None, -) -> TranscriptionResult: - """Transcribe audio from bytes (for API uploads).""" - import asyncio - - ext = Path(filename).suffix or ".wav" - - with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp: - tmp.write(audio_bytes) - tmp_path = tmp.name - - try: - # Run in thread pool to avoid blocking the event loop - loop = asyncio.get_event_loop() - result = await loop.run_in_executor( - None, - lambda: transcribe_audio( - audio_path=tmp_path, - language=language, - model_name=model_name, - align=align, - diarize=diarize, - min_speakers=min_speakers, - max_speakers=max_speakers, - ), - ) - return result - finally: - try: - os.unlink(tmp_path) - except Exception: - pass - - -# Available models -AVAILABLE_MODELS = [ - "tiny", - "tiny.en", - "base", - "base.en", - "small", - "small.en", - "medium", - "medium.en", - "large-v1", - "large-v2", - "large-v3", - "large-v3-turbo", - "distil-large-v2", - "distil-large-v3", -] diff --git a/services/mana-stt/grafana-dashboard.json b/services/mana-stt/grafana-dashboard.json deleted file mode 100644 index 4b98ba93f..000000000 --- a/services/mana-stt/grafana-dashboard.json +++ /dev/null @@ -1,740 +0,0 @@ -{ - "annotations": { - "list": [] - }, - "description": "Mana Speech-to-Text Service Monitoring", - "editable": true, - "fiscalYearStartMonth": 
0, - "graphTooltip": 1, - "links": [], - "panels": [ - { - "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, - "id": 100, - "panels": [], - "title": "Overview", - "type": "row" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, - "mappings": [ - { - "options": { - "0": { "color": "red", "index": 1, "text": "DOWN" }, - "1": { "color": "green", "index": 0, "text": "UP" } - }, - "type": "value" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "red", "value": null }, - { "color": "green", "value": 1 } - ] - } - }, - "overrides": [] - }, - "gridPos": { "h": 4, "w": 3, "x": 0, "y": 1 }, - "id": 1, - "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "textMode": "auto" - }, - "pluginVersion": "10.4.1", - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "up{job=\"mana-stt\"}", - "refId": "A" - } - ], - "title": "Service Status", - "type": "stat" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, - "mappings": [ - { - "options": { - "0": { "color": "yellow", "index": 0, "text": "Not Loaded" }, - "1": { "color": "green", "index": 1, "text": "Loaded" } - }, - "type": "value" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "yellow", "value": null }, - { "color": "green", "value": 1 } - ] - } - }, - "overrides": [] - }, - "gridPos": { "h": 4, "w": 3, "x": 3, "y": 1 }, - "id": 2, - "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "textMode": "auto" - }, - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "mana_stt_model_loaded{model=\"whisper\"}", - "refId": "A" - } - ], - "title": "Whisper Model", - "type": "stat" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, - "mappings": [ - { - "options": { - "0": { "color": "yellow", "index": 0, "text": "Not Loaded" }, - "1": { "color": "green", "index": 1, "text": "Loaded" } - }, - "type": "value" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "yellow", "value": null }, - { "color": "green", "value": 1 } - ] - } - }, - "overrides": [] - }, - "gridPos": { "h": 4, "w": 3, "x": 6, "y": 1 }, - "id": 3, - "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "textMode": "auto" - }, - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "mana_stt_model_loaded{model=\"voxtral\"}", - "refId": "A" - } - ], - "title": "Voxtral Model", - "type": "stat" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "mappings": [], - "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { "h": 4, "w": 3, "x": 9, "y": 1 }, - "id": 4, - "options": { - "colorMode": "value", - 
"graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "textMode": "auto" - }, - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "sum(mana_stt_requests_total{status=\"success\"})", - "refId": "A" - } - ], - "title": "Total Transcriptions", - "type": "stat" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "mappings": [], - "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { "h": 4, "w": 3, "x": 12, "y": 1 }, - "id": 5, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "textMode": "auto" - }, - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "sum(mana_stt_characters_transcribed_total)", - "refId": "A" - } - ], - "title": "Characters Transcribed", - "type": "stat" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "yellow", "value": 1 }, - { "color": "red", "value": 3 } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { "h": 4, "w": 3, "x": 15, "y": 1 }, - "id": 6, - "options": { - "colorMode": "background", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "textMode": "auto" - }, - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "sum(mana_stt_active_requests)", - "refId": "A" - } - ], - "title": "Active Requests", - "type": "stat" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "red", "value": 1 } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { "h": 4, "w": 3, "x": 18, "y": 1 }, - "id": 7, - "options": { - "colorMode": "background", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "textMode": "auto" - }, - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "sum(mana_stt_requests_total{status=\"error\"})", - "refId": "A" - } - ], - "title": "Total Errors", - "type": "stat" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "mappings": [], - "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, - "unit": "s" - }, - "overrides": [] - }, - "gridPos": { "h": 4, "w": 3, "x": 21, "y": 1 }, - "id": 8, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "textMode": "auto" - }, - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - 
"expr": "histogram_quantile(0.50, sum(rate(mana_stt_transcription_duration_seconds_bucket[5m])) by (le))", - "refId": "A" - } - ], - "title": "Median Duration", - "type": "stat" - }, - { - "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, - "id": 101, - "panels": [], - "title": "Performance", - "type": "row" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "insertNulls": false, - "lineInterpolation": "smooth", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { "group": "A", "mode": "none" }, - "thresholdsStyle": { "mode": "off" } - }, - "mappings": [], - "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, - "unit": "s" - }, - "overrides": [] - }, - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }, - "id": 10, - "options": { - "legend": { - "calcs": ["mean", "max"], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { "mode": "multi", "sort": "desc" } - }, - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.50, sum(rate(mana_stt_transcription_duration_seconds_bucket{model=\"whisper\"}[5m])) by (le))", - "legendFormat": "Whisper p50", - "refId": "A" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, sum(rate(mana_stt_transcription_duration_seconds_bucket{model=\"whisper\"}[5m])) by (le))", - "legendFormat": "Whisper p95", - "refId": "B" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.50, sum(rate(mana_stt_transcription_duration_seconds_bucket{model=\"voxtral\"}[5m])) by (le))", - "legendFormat": "Voxtral p50", - "refId": "C" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, sum(rate(mana_stt_transcription_duration_seconds_bucket{model=\"voxtral\"}[5m])) by (le))", - "legendFormat": "Voxtral p95", - "refId": "D" - } - ], - "title": "Transcription Duration (p50 / p95)", - "type": "timeseries" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "insertNulls": false, - "lineInterpolation": "smooth", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { "group": "A", "mode": "none" }, - "thresholdsStyle": { "mode": "off" } - }, - "mappings": [], - "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, - "unit": "reqps" - }, - "overrides": [] - }, - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }, - "id": 11, - "options": { - "legend": { - "calcs": ["mean", "sum"], - "displayMode": 
"table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { "mode": "multi", "sort": "desc" } - }, - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "sum(rate(mana_stt_requests_total{model=\"whisper\", status=\"success\"}[5m]))", - "legendFormat": "Whisper Success", - "refId": "A" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "sum(rate(mana_stt_requests_total{model=\"voxtral\", status=\"success\"}[5m]))", - "legendFormat": "Voxtral Success", - "refId": "B" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "sum(rate(mana_stt_requests_total{status=\"error\"}[5m]))", - "legendFormat": "Errors", - "refId": "C" - } - ], - "title": "Request Rate", - "type": "timeseries" - }, - { - "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 }, - "id": 102, - "panels": [], - "title": "Details", - "type": "row" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { "hideFrom": { "legend": false, "tooltip": false, "viz": false } }, - "mappings": [] - }, - "overrides": [] - }, - "gridPos": { "h": 8, "w": 6, "x": 0, "y": 15 }, - "id": 12, - "options": { - "legend": { "displayMode": "list", "placement": "right", "showLegend": true }, - "pieType": "pie", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "tooltip": { "mode": "single", "sort": "none" } - }, - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "sum(mana_stt_requests_total{status=\"success\"}) by (model)", - "legendFormat": "{{model}}", - "refId": "A" - } - ], - "title": "Requests by Model", - "type": "piechart" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { "hideFrom": { "legend": false, "tooltip": false, "viz": false } }, - "mappings": [] - }, - "overrides": [] - }, - "gridPos": { "h": 8, "w": 6, "x": 6, "y": 15 }, - "id": 13, - "options": { - "legend": { "displayMode": "list", "placement": "right", "showLegend": true }, - "pieType": "pie", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "tooltip": { "mode": "single", "sort": "none" } - }, - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "sum(mana_stt_requests_total{status=\"success\"}) by (language)", - "legendFormat": "{{language}}", - "refId": "A" - } - ], - "title": "Requests by Language", - "type": "piechart" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "bars", - "fillOpacity": 80, - "gradientMode": "none", - "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { "type": "linear" }, - "showPoints": "never", - "spanNulls": false, - "stacking": { "group": "A", "mode": "normal" }, - "thresholdsStyle": { "mode": "off" } - }, - "mappings": [], - "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, - "unit": "decbytes" - }, - "overrides": [] - }, - "gridPos": 
{ "h": 8, "w": 12, "x": 12, "y": 15 }, - "id": 14, - "options": { - "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, - "tooltip": { "mode": "multi", "sort": "desc" } - }, - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "sum(rate(mana_stt_file_size_mb_sum{model=\"whisper\"}[5m])) * 1024 * 1024", - "legendFormat": "Whisper", - "refId": "A" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "sum(rate(mana_stt_file_size_mb_sum{model=\"voxtral\"}[5m])) * 1024 * 1024", - "legendFormat": "Voxtral", - "refId": "B" - } - ], - "title": "Data Processed", - "type": "timeseries" - }, - { - "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 }, - "id": 103, - "panels": [], - "title": "Model Loading", - "type": "row" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "yellow", "value": 30 }, - { "color": "red", "value": 60 } - ] - }, - "unit": "s" - }, - "overrides": [] - }, - "gridPos": { "h": 6, "w": 8, "x": 0, "y": 24 }, - "id": 15, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "textMode": "auto" - }, - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "mana_stt_model_load_duration_seconds_sum{model=\"whisper\"} / mana_stt_model_load_duration_seconds_count{model=\"whisper\"}", - "legendFormat": "Whisper", - "refId": "A" - } - ], - "title": "Whisper Load Time", - "type": "stat" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "yellow", "value": 60 }, - { "color": "red", "value": 120 } - ] - }, - "unit": "s" - }, - "overrides": [] - }, - "gridPos": { "h": 6, "w": 8, "x": 8, "y": 24 }, - "id": 16, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "textMode": "auto" - }, - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "mana_stt_model_load_duration_seconds_sum{model=\"voxtral\"} / mana_stt_model_load_duration_seconds_count{model=\"voxtral\"}", - "legendFormat": "Voxtral", - "refId": "A" - } - ], - "title": "Voxtral Load Time", - "type": "stat" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "insertNulls": false, - "lineInterpolation": "smooth", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { "group": "A", "mode": "none" }, - "thresholdsStyle": { "mode": "off" } - }, - "mappings": [], - 
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { "h": 6, "w": 8, "x": 16, "y": 24 }, - "id": 17, - "options": { - "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, - "tooltip": { "mode": "multi", "sort": "desc" } - }, - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "sum(rate(mana_stt_characters_transcribed_total{model=\"whisper\"}[5m]))", - "legendFormat": "Whisper", - "refId": "A" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "sum(rate(mana_stt_characters_transcribed_total{model=\"voxtral\"}[5m]))", - "legendFormat": "Voxtral", - "refId": "B" - } - ], - "title": "Characters/sec Transcribed", - "type": "timeseries" - } - ], - "refresh": "30s", - "schemaVersion": 39, - "tags": ["mana", "stt", "ai"], - "templating": { "list": [] }, - "time": { "from": "now-1h", "to": "now" }, - "timepicker": {}, - "timezone": "browser", - "title": "Mana STT Service", - "uid": "mana-stt-dashboard", - "version": 1, - "weekStart": "monday" -} diff --git a/services/mana-stt/requirements-cuda.txt b/services/mana-stt/requirements-cuda.txt deleted file mode 100644 index e9bc9f95a..000000000 --- a/services/mana-stt/requirements-cuda.txt +++ /dev/null @@ -1,35 +0,0 @@ -# Mana STT Service Dependencies -# For GPU Server (NVIDIA RTX 3090 / CUDA) - -# Web Framework -fastapi==0.115.6 -uvicorn[standard]==0.34.0 -python-multipart==0.0.20 - -# Audio Processing -pydub==0.25.1 -soundfile==0.13.1 - -# WhisperX (CUDA) — includes faster-whisper + alignment -whisperx @ git+https://github.com/m-bain/whisperX.git - -# faster-whisper with CTranslate2 (CUDA backend) -faster-whisper>=1.1.0 - -# Speaker Diarization (pyannote.audio) -# Requires HF_TOKEN with accepted terms: -# https://huggingface.co/pyannote/speaker-diarization-3.1 -# https://huggingface.co/pyannote/segmentation-3.0 -pyannote.audio>=3.3.0 - -# PyTorch CUDA — install separately for your CUDA version: -# pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu121 -torch>=2.5.0 -torchaudio>=2.5.0 - -# Utilities -numpy>=1.26.0 -tqdm>=4.67.0 - -# External Auth (mana-core-auth integration) -httpx>=0.27.0 diff --git a/services/mana-stt/requirements.txt b/services/mana-stt/requirements.txt deleted file mode 100644 index b98f07c3c..000000000 --- a/services/mana-stt/requirements.txt +++ /dev/null @@ -1,28 +0,0 @@ -# Mana STT Service Dependencies -# For Mac Mini M4 (Apple Silicon) - -# Web Framework -fastapi==0.115.6 -uvicorn[standard]==0.34.0 -python-multipart==0.0.20 - -# Audio Processing -pydub==0.25.1 -soundfile==0.13.1 - -# Whisper (Apple Silicon optimized) -lightning-whisper-mlx==0.0.10 -mlx>=0.21.0 - -# Voxtral (Hugging Face Transformers) -transformers>=4.47.0 -torch>=2.5.0 -accelerate>=1.2.0 -sentencepiece>=0.2.0 - -# Utilities -numpy>=1.26.0 -tqdm>=4.67.0 - -# External Auth (mana-core-auth integration) -httpx>=0.27.0 diff --git a/services/mana-stt/service.pyw b/services/mana-stt/service.pyw deleted file mode 100644 index 056059e98..000000000 --- a/services/mana-stt/service.pyw +++ /dev/null @@ -1,34 +0,0 @@ -"""mana-stt service runner.""" -import os -import sys - -os.chdir(r"C:\mana\services\mana-stt") -sys.path.insert(0, r"C:\mana\services\mana-stt") - -# Redirect stdout/stderr to log file FIRST (before any imports that warn) -log = open(r"C:\mana\services\mana-stt\service.log", "w", buffering=1) -sys.stdout = log -sys.stderr = log - 
-# Load .env file -from dotenv import load_dotenv -load_dotenv(r"C:\mana\services\mana-stt\.env") - -# Ensure FFmpeg is in PATH -ffmpeg_dir = r"C:\Users\tills\AppData\Local\Microsoft\WinGet\Links" -if ffmpeg_dir not in os.environ.get("PATH", ""): - os.environ["PATH"] = ffmpeg_dir + os.pathsep + os.environ.get("PATH", "") - -# Set HF token -hf_token = os.environ.get("HF_TOKEN", "") -if hf_token: - os.environ["HUGGING_FACE_HUB_TOKEN"] = hf_token - -# Pre-initialize CUDA before importing whisperx (avoids hangs) -import torch -if torch.cuda.is_available(): - torch.cuda.init() - print(f"CUDA initialized: {torch.cuda.get_device_name(0)}", flush=True) - -import uvicorn -uvicorn.run("app.main:app", host="0.0.0.0", port=3020, log_level="info")
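-
-# Quick smoke check once the service is up. A sketch: it assumes the FastAPI
-# app exposes a /health route on the port above (adjust if the real route
-# differs); httpx is already in the requirements:
-#
-#   python -c "import httpx; print(httpx.get('http://localhost:3020/health').text)"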