From 3c4a6d4f6978c92b644dd56920c1fc04a59993cf Mon Sep 17 00:00:00 2001
From: Till JS
Date: Fri, 8 May 2026 18:53:53 +0200
Subject: [PATCH] =?UTF-8?q?chore(cutover):=20remove=20services/mana-stt/?=
 =?UTF-8?q?=20=E2=80=94=20moved=20to=20mana-platform?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Live containers on the Mac Mini build out of `../mana/services/mana-stt/`
since the 8-Doppel-Cutover commit (774852ba2). Smoke test green 2026-05-08:
health endpoints, JWKS, login flow, and Stripe webhook are all reachable from
the new build path. Removing the now-stale duplicate (132K in this repo).
Active code lives in `Code/mana/services/mana-stt/` (see ../mana/CLAUDE.md).

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 services/mana-stt/.env.example               |  70 --
 services/mana-stt/CLAUDE.md                  |  96 ---
 services/mana-stt/README.md                  |  31 -
 services/mana-stt/app/__init__.py            |   1 -
 services/mana-stt/app/auth.py                | 271 -------
 services/mana-stt/app/external_auth.py       | 145 ----
 services/mana-stt/app/main.py                | 392 ----------
 services/mana-stt/app/vllm_service.py        | 178 -----
 services/mana-stt/app/voxtral_api_service.py | 213 ------
 services/mana-stt/app/voxtral_service.py     | 267 -------
 services/mana-stt/app/vram_manager.py        | 114 ---
 services/mana-stt/app/whisper_service.py     | 358 ---------
 services/mana-stt/grafana-dashboard.json     | 740 -------------------
 services/mana-stt/requirements-cuda.txt      |  35 -
 services/mana-stt/requirements.txt           |  28 -
 services/mana-stt/service.pyw                |  34 -
 16 files changed, 2973 deletions(-)
 delete mode 100644 services/mana-stt/.env.example
 delete mode 100644 services/mana-stt/CLAUDE.md
 delete mode 100644 services/mana-stt/README.md
 delete mode 100644 services/mana-stt/app/__init__.py
 delete mode 100644 services/mana-stt/app/auth.py
 delete mode 100644 services/mana-stt/app/external_auth.py
 delete mode 100644 services/mana-stt/app/main.py
 delete mode 100644 services/mana-stt/app/vllm_service.py
 delete mode 100644 services/mana-stt/app/voxtral_api_service.py
 delete mode 100644 services/mana-stt/app/voxtral_service.py
 delete mode 100644 services/mana-stt/app/vram_manager.py
 delete mode 100644 services/mana-stt/app/whisper_service.py
 delete mode 100644 services/mana-stt/grafana-dashboard.json
 delete mode 100644 services/mana-stt/requirements-cuda.txt
 delete mode 100644 services/mana-stt/requirements.txt
 delete mode 100644 services/mana-stt/service.pyw

diff --git a/services/mana-stt/.env.example b/services/mana-stt/.env.example
deleted file mode 100644
index 3a435c073..000000000
--- a/services/mana-stt/.env.example
+++ /dev/null
@@ -1,70 +0,0 @@
-# Mana STT Service Configuration
-# Copy to .env and adjust values as needed
-
-# Server
-PORT=3020
-
-# Whisper (Lightning MLX)
-WHISPER_MODEL=large-v3
-
-# Voxtral (Local Models)
-# Options: voxtral-mini-3b, voxtral-realtime-4b, voxtral-small-24b
-VOXTRAL_MODEL=voxtral-realtime-4b
-
-# WhisperX (CUDA GPU Server)
-# Enable WhisperX for rich transcription (diarization, word alignment)
-# Requires NVIDIA GPU + requirements-cuda.txt
-USE_WHISPERX=false
-
-# WhisperX batch size (higher = faster but more VRAM, 16 works well for RTX 3090)
-WHISPERX_BATCH_SIZE=16
-
-# Device and compute type for CUDA
-# WHISPER_DEVICE=cuda
-# WHISPER_COMPUTE_TYPE=float16
-
-# HuggingFace token for pyannote speaker diarization models
-# Required for diarization. Accept terms at:
-# https://huggingface.co/pyannote/speaker-diarization-3.1
-# https://huggingface.co/pyannote/segmentation-3.0
-HF_TOKEN=
-
-# Model Loading
-# Set to true to preload models on startup (slower startup, faster first request)
-PRELOAD_MODELS=false
-
-# Load Management
-# Maximum concurrent transcription requests before API fallback
-MAX_CONCURRENT_REQUESTS=3
-
-# API Fallback
-# Enable automatic fallback to Mistral API when overloaded
-API_FALLBACK_ENABLED=true
-
-# Mistral API Key (required for API fallback)
-# Get your key at https://console.mistral.ai/
-MISTRAL_API_KEY=
-
-# CORS Origins (comma-separated)
-CORS_ORIGINS=https://mana.how,https://chat.mana.how,http://localhost:5173
-
-# ===========================================
-# Authentication
-# ===========================================
-
-# Enable API key authentication (default: true for production)
-REQUIRE_AUTH=true
-
-# API Keys (comma-separated, format: key:name)
-# Example: sk-abc123:myapp,sk-def456:testuser
-API_KEYS=
-
-# Internal API key (no rate limit, for internal services)
-# Generate with: openssl rand -hex 32
-INTERNAL_API_KEY=
-
-# Rate Limiting
-# Requests per window per API key
-RATE_LIMIT_REQUESTS=60
-# Window size in seconds
-RATE_LIMIT_WINDOW=60

diff --git a/services/mana-stt/CLAUDE.md b/services/mana-stt/CLAUDE.md
deleted file mode 100644
index 0a98c2386..000000000
--- a/services/mana-stt/CLAUDE.md
+++ /dev/null
@@ -1,96 +0,0 @@
-# mana-stt
-
-Speech-to-Text microservice. Wraps Whisper (CUDA, with WhisperX for word-level
-timestamps + diarization), local Voxtral via vLLM, and Mistral's hosted Voxtral
-API behind a small FastAPI surface. Lives on the Windows GPU server
-(`mana-server-gpu`, RTX 3090).
-
-> ⚠️ **Earlier history**: this directory used to contain Mac-Mini–targeted
-> code (Whisper Lightning MLX, com.mana.mana-stt.plist launchd setup,
-> setup.sh with Apple-Silicon checks). That all moved to the Windows
-> GPU box and was removed from the repo. If you're looking for the MLX
-> path, see git history.
-
-## Tech Stack
-
-| Layer | Technology |
-|-------|------------|
-| **Runtime** | Python 3.11 + uvicorn (Windows) |
-| **Framework** | FastAPI |
-| **Whisper** | `whisperx` on CUDA (large-v3 + word alignment + pyannote diarization) |
-| **Voxtral (local)** | vLLM serving Voxtral 3B/4B/24B (`vllm_service.py`) |
-| **Voxtral (cloud)** | Mistral API (`voxtral_api_service.py`) |
-| **Auth** | Per-key + internal-key API auth (`app/auth.py`, JWT via mana-auth in `app/external_auth.py`) |
-| **VRAM** | Shared `vram_manager.py` accountant — coordinated with mana-tts and mana-image-gen so multiple GPU services don't OOM each other |
-| **Process supervision** | Windows Scheduled Task `ManaSTT` (AtLogOn) |
-
-## Port: 3020
-
-## Where it runs
-
-| Host | Path on disk | Entrypoint |
-|------|--------------|------------|
-| Windows GPU server (`192.168.178.11`) | `C:\mana\services\mana-stt\` | `service.pyw` via Scheduled Task `ManaSTT` |
-
-Public URL: `https://gpu-stt.mana.how` (via Cloudflare Tunnel + Mac Mini gpu-proxy).
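-
-A quick reachability probe for the public endpoint (a sketch; `httpx` is
-assumed to be available since the service itself depends on it):
-
-```python
-import httpx
-
-# /health is the one route that needs no API key (see API Endpoints below)
-resp = httpx.get("https://gpu-stt.mana.how/health", timeout=5.0)
-print(resp.status_code, resp.json()["status"])
-```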
-
-## API Endpoints
-
-| Method | Path | Description |
-|--------|------|-------------|
-| GET | `/health` | Liveness + which backends are loaded |
-| GET | `/models` | Available STT models |
-| POST | `/transcribe` | Whisper (WhisperX, default) — multipart `file` + optional `language` |
-| POST | `/transcribe/voxtral` | Local Voxtral via vLLM |
-| POST | `/transcribe/auto` | Routing helper — picks the best backend for the input |
-
-All endpoints (except `/health`) require `Authorization: Bearer <key>`. Tokens
-are validated against `API_KEYS` (per-app keys) or `INTERNAL_API_KEY` (no rate
-limit), and JWTs from mana-auth are also accepted via `external_auth.py`.
-
-## Backends (`app/`)
-
-| File | What it loads |
-|------|---------------|
-| `whisper_service.py` | WhisperX on CUDA (large-v3 + alignment + pyannote diarization) |
-| `voxtral_service.py` | Local Voxtral via vLLM (slower start, richer multilingual) |
-| `voxtral_api_service.py` | Mistral hosted Voxtral API (cloud, no GPU needed) |
-| `vllm_service.py` | vLLM client primitives shared by Voxtral |
-| `vram_manager.py` | Shared VRAM accounting — same module also used by mana-tts and mana-image-gen |
-| `auth.py` | API-key auth (internal + per-app keys) |
-| `external_auth.py` | JWT validation via mana-auth |
-
-Backends are loaded lazily during the FastAPI lifespan and reported by `/health`.
-
-## Configuration (`.env` on the Windows GPU box)
-
-```env
-PORT=3020
-WHISPER_MODEL=large-v3
-WHISPER_DEVICE=cuda
-WHISPER_COMPUTE_TYPE=float16
-WHISPER_DEFAULT_LANGUAGE=de
-PRELOAD_MODELS=true
-USE_VLLM=false
-HF_TOKEN=...          # required for pyannote diarization models
-REQUIRE_AUTH=true
-API_KEYS=sk-app1:app1,sk-app2:app2
-INTERNAL_API_KEY=...  # cross-service, no rate limit
-CORS_ORIGINS=https://mana.how,https://chat.mana.how
-```
-
-## Operations
-
-```powershell
-# Status
-Get-ScheduledTask -TaskName "ManaSTT" | Format-List TaskName, State
-Get-NetTCPConnection -LocalPort 3020 -State Listen
-
-# Restart
-Stop-ScheduledTask -TaskName "ManaSTT"
-Start-ScheduledTask -TaskName "ManaSTT"
-
-# Logs
-Get-Content C:\mana\services\mana-stt\service.log -Tail 50
-```
-
-## Reference
-
-- `docs/WINDOWS_GPU_SERVER_SETUP.md` — Windows box setup, scheduled tasks, firewall, Cloudflare tunnel
-- `docs/LOCAL_STT_MODELS.md` — model comparisons (WER, latency, language coverage)
-- `services/mana-stt/grafana-dashboard.json` — Prometheus metrics dashboard

diff --git a/services/mana-stt/README.md b/services/mana-stt/README.md
deleted file mode 100644
index 8e4abf5f1..000000000
--- a/services/mana-stt/README.md
+++ /dev/null
@@ -1,31 +0,0 @@
-# Mana STT Service
-
-Speech-to-Text API service running on the Windows GPU server
-(`mana-server-gpu`, RTX 3090). Wraps **WhisperX** (CUDA, large-v3 + word
-alignment + pyannote diarization), local **Voxtral via vLLM**, and the hosted
-**Mistral Voxtral API**.
-
-For architecture, deployment, configuration, and operations see
-[`CLAUDE.md`](./CLAUDE.md) and
-[`docs/WINDOWS_GPU_SERVER_SETUP.md`](../../docs/WINDOWS_GPU_SERVER_SETUP.md).
-
-## Port: 3020
-
-## Public URL
-
-`https://gpu-stt.mana.how` (via Cloudflare Tunnel + Mac Mini gpu-proxy)
-
-## API Endpoints
-
-| Endpoint | Method | Description |
-|----------|--------|-------------|
-| `/health` | GET | Health check + which backends are loaded |
-| `/models` | GET | List available models |
-| `/transcribe` | POST | Whisper / WhisperX transcription |
-| `/transcribe/voxtral` | POST | Voxtral transcription (local vLLM) |
-| `/transcribe/auto` | POST | Auto-select best backend for the input |
-
-All endpoints (except `/health`) require `Authorization: Bearer <key>`.
-
-## Quick Test
-
-```bash
-curl -F "file=@audio.wav" -F "language=de" \
-  -H "Authorization: Bearer $INTERNAL_API_KEY" \
-  https://gpu-stt.mana.how/transcribe
-```

diff --git a/services/mana-stt/app/__init__.py b/services/mana-stt/app/__init__.py
deleted file mode 100644
index e5b57f4cc..000000000
--- a/services/mana-stt/app/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-# Mana STT Service

diff --git a/services/mana-stt/app/auth.py b/services/mana-stt/app/auth.py
deleted file mode 100644
index 40258c730..000000000
--- a/services/mana-stt/app/auth.py
+++ /dev/null
@@ -1,271 +0,0 @@
-"""
-API Key Authentication for ManaCore STT Service
-
-Supports two authentication modes:
-1. Local API keys: Configured via environment variables
-2. External API keys: Validated via mana-core-auth service (when EXTERNAL_AUTH_ENABLED=true)
-
-Usage:
-    # Local keys
-    API_KEYS=sk-key1:name1,sk-key2:name2
-    INTERNAL_API_KEY=sk-internal-xxx
-
-    # External auth (for user-created keys via mana.how)
-    EXTERNAL_AUTH_ENABLED=true
-    MANA_CORE_AUTH_URL=http://localhost:3001
-"""
-
-import os
-import time
-import logging
-from typing import Optional
-from collections import defaultdict
-from dataclasses import dataclass, field
-
-from fastapi import HTTPException, Security, Request
-from fastapi.security import APIKeyHeader
-
-from .external_auth import (
-    is_external_auth_enabled,
-    validate_api_key_external,
-    ExternalValidationResult,
-)
-
-logger = logging.getLogger(__name__)
-
-# Configuration
-API_KEYS_ENV = os.getenv("API_KEYS", "")  # Format: "sk-key1:name1,sk-key2:name2"
-INTERNAL_API_KEY = os.getenv("INTERNAL_API_KEY", "")  # Unlimited internal key
-REQUIRE_AUTH = os.getenv("REQUIRE_AUTH", "true").lower() == "true"
-RATE_LIMIT_REQUESTS = int(os.getenv("RATE_LIMIT_REQUESTS", "60"))  # Per minute
-RATE_LIMIT_WINDOW = int(os.getenv("RATE_LIMIT_WINDOW", "60"))  # Seconds
-
-
-@dataclass
-class APIKey:
-    """API Key with metadata."""
-    key: str
-    name: str
-    is_internal: bool = False
-    rate_limit: int = RATE_LIMIT_REQUESTS  # Requests per window
-
-
-@dataclass
-class RateLimitInfo:
-    """Rate limit tracking per key."""
-    requests: list = field(default_factory=list)
-
-    def is_allowed(self, limit: int, window: int) -> bool:
-        """Check if request is allowed within rate limit."""
-        now = time.time()
-        # Remove old requests outside window
-        self.requests = [t for t in self.requests if now - t < window]
-
-        if len(self.requests) >= limit:
-            return False
-
-        self.requests.append(now)
-        return True
-
-    def remaining(self, limit: int, window: int) -> int:
-        """Get remaining requests in current window."""
-        now = time.time()
-        self.requests = [t for t in self.requests if now - t < window]
-        return max(0, limit - len(self.requests))
-
-
-# Parse API keys from environment
-def _parse_api_keys() -> dict[str, APIKey]:
-    """Parse API keys from environment variables."""
-    keys = {}
-
-    # Parse comma-separated keys
-    if API_KEYS_ENV:
-        for entry in
API_KEYS_ENV.split(","): - entry = entry.strip() - if ":" in entry: - key, name = entry.split(":", 1) - else: - key, name = entry, "default" - keys[key.strip()] = APIKey(key=key.strip(), name=name.strip()) - - # Add internal key with no rate limit - if INTERNAL_API_KEY: - keys[INTERNAL_API_KEY] = APIKey( - key=INTERNAL_API_KEY, - name="internal", - is_internal=True, - rate_limit=999999, # Effectively unlimited - ) - - return keys - - -# Global state -_api_keys = _parse_api_keys() -_rate_limits: dict[str, RateLimitInfo] = defaultdict(RateLimitInfo) - -# Security scheme -api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False) - - -@dataclass -class AuthResult: - """Result of authentication check.""" - authenticated: bool - key_name: Optional[str] = None - is_internal: bool = False - rate_limit_remaining: Optional[int] = None - user_id: Optional[str] = None # Set when using external auth - - -async def verify_api_key( - request: Request, - api_key: Optional[str] = Security(api_key_header), -) -> AuthResult: - """ - Verify API key and check rate limits. - - Supports two authentication modes: - 1. External auth via mana-core-auth (for sk_live_ keys) - 2. Local auth via environment variables - - Returns AuthResult with authentication status. - Raises HTTPException if auth fails or rate limited. - """ - # Skip auth for health and docs endpoints - path = request.url.path - if path in ["/health", "/docs", "/openapi.json", "/redoc"]: - return AuthResult(authenticated=True, key_name="public") - - # If auth not required, allow all - if not REQUIRE_AUTH: - return AuthResult(authenticated=True, key_name="anonymous") - - # Check for API key - if not api_key: - logger.warning(f"Missing API key for {path} from {request.client.host if request.client else 'unknown'}") - raise HTTPException( - status_code=401, - detail="Missing API key. Provide X-API-Key header.", - headers={"WWW-Authenticate": "ApiKey"}, - ) - - # Try external auth first for sk_live_ keys (user-created keys via mana.how) - if api_key.startswith("sk_live_") and is_external_auth_enabled(): - external_result = await validate_api_key_external(api_key, "stt") - - if external_result is not None: - if external_result.valid: - # Use rate limits from external auth - rate_info = _rate_limits[api_key] - limit = external_result.rate_limit_requests - window = external_result.rate_limit_window - - if not rate_info.is_allowed(limit, window): - remaining = rate_info.remaining(limit, window) - logger.warning(f"Rate limit exceeded for external key") - raise HTTPException( - status_code=429, - detail=f"Rate limit exceeded. 
Try again in {window} seconds.", - headers={ - "X-RateLimit-Limit": str(limit), - "X-RateLimit-Remaining": str(remaining), - "X-RateLimit-Reset": str(int(time.time()) + window), - "Retry-After": str(window), - }, - ) - - remaining = rate_info.remaining(limit, window) - logger.debug(f"Authenticated external request from user {external_result.user_id} to {path}") - - return AuthResult( - authenticated=True, - key_name="external", - is_internal=False, - rate_limit_remaining=remaining, - user_id=external_result.user_id, - ) - else: - # External auth returned invalid - logger.warning(f"External auth failed: {external_result.error}") - raise HTTPException( - status_code=401, - detail=external_result.error or "Invalid API key.", - headers={"WWW-Authenticate": "ApiKey"}, - ) - # If external_result is None, fall through to local auth - - # Local auth: Validate key against environment variables - if api_key not in _api_keys: - logger.warning(f"Invalid API key attempt for {path}") - raise HTTPException( - status_code=401, - detail="Invalid API key.", - headers={"WWW-Authenticate": "ApiKey"}, - ) - - key_info = _api_keys[api_key] - - # Check rate limit (skip for internal keys) - if not key_info.is_internal: - rate_info = _rate_limits[api_key] - if not rate_info.is_allowed(key_info.rate_limit, RATE_LIMIT_WINDOW): - remaining = rate_info.remaining(key_info.rate_limit, RATE_LIMIT_WINDOW) - logger.warning(f"Rate limit exceeded for key '{key_info.name}'") - raise HTTPException( - status_code=429, - detail=f"Rate limit exceeded. Try again in {RATE_LIMIT_WINDOW} seconds.", - headers={ - "X-RateLimit-Limit": str(key_info.rate_limit), - "X-RateLimit-Remaining": str(remaining), - "X-RateLimit-Reset": str(int(time.time()) + RATE_LIMIT_WINDOW), - "Retry-After": str(RATE_LIMIT_WINDOW), - }, - ) - remaining = rate_info.remaining(key_info.rate_limit, RATE_LIMIT_WINDOW) - else: - remaining = None - - logger.debug(f"Authenticated request from '{key_info.name}' to {path}") - - return AuthResult( - authenticated=True, - key_name=key_info.name, - is_internal=key_info.is_internal, - rate_limit_remaining=remaining, - ) - - -def get_api_key_stats() -> dict: - """Get statistics about API keys (for admin endpoint).""" - stats = { - "total_keys": len(_api_keys), - "auth_required": REQUIRE_AUTH, - "rate_limit": { - "requests_per_window": RATE_LIMIT_REQUESTS, - "window_seconds": RATE_LIMIT_WINDOW, - }, - "keys": [], - } - - for key, info in _api_keys.items(): - # Don't expose actual keys, just metadata - masked_key = key[:8] + "..." if len(key) > 8 else "***" - rate_info = _rate_limits.get(key, RateLimitInfo()) - stats["keys"].append({ - "name": info.name, - "key_prefix": masked_key, - "is_internal": info.is_internal, - "requests_in_window": len(rate_info.requests), - "remaining": rate_info.remaining(info.rate_limit, RATE_LIMIT_WINDOW), - }) - - return stats - - -def reload_api_keys(): - """Reload API keys from environment (for runtime updates).""" - global _api_keys - _api_keys = _parse_api_keys() - logger.info(f"Reloaded {len(_api_keys)} API keys") diff --git a/services/mana-stt/app/external_auth.py b/services/mana-stt/app/external_auth.py deleted file mode 100644 index 6f64bd315..000000000 --- a/services/mana-stt/app/external_auth.py +++ /dev/null @@ -1,145 +0,0 @@ -""" -External API Key Validation via mana-core-auth - -When EXTERNAL_AUTH_ENABLED=true, API keys are validated against the -central mana-core-auth service. This allows users to create and manage -API keys from the mana.how web interface. 
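-
-Example configuration (a sketch; the variable names are the ones read below):
-
-    EXTERNAL_AUTH_ENABLED=true
-    MANA_CORE_AUTH_URL=http://localhost:3001
-    API_KEY_CACHE_TTL=300      # seconds
-    EXTERNAL_AUTH_TIMEOUT=5.0  # seconds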
- -Results are cached for 5 minutes to reduce load on the auth service. -""" - -import os -import time -import logging -import httpx -from typing import Optional -from dataclasses import dataclass - -logger = logging.getLogger(__name__) - -# Configuration -EXTERNAL_AUTH_ENABLED = os.getenv("EXTERNAL_AUTH_ENABLED", "false").lower() == "true" -MANA_CORE_AUTH_URL = os.getenv("MANA_CORE_AUTH_URL", "http://localhost:3001") -API_KEY_CACHE_TTL = int(os.getenv("API_KEY_CACHE_TTL", "300")) # 5 minutes -EXTERNAL_AUTH_TIMEOUT = float(os.getenv("EXTERNAL_AUTH_TIMEOUT", "5.0")) # seconds - - -@dataclass -class ExternalValidationResult: - """Result from external API key validation.""" - valid: bool - user_id: Optional[str] = None - scopes: Optional[list] = None - rate_limit_requests: int = 60 - rate_limit_window: int = 60 - error: Optional[str] = None - cached_at: float = 0.0 - - -# In-memory cache for validation results -# Key: API key, Value: ExternalValidationResult -_validation_cache: dict[str, ExternalValidationResult] = {} - - -def is_external_auth_enabled() -> bool: - """Check if external authentication is enabled.""" - return EXTERNAL_AUTH_ENABLED - - -def _get_cached_result(api_key: str) -> Optional[ExternalValidationResult]: - """Get cached validation result if still valid.""" - result = _validation_cache.get(api_key) - if result and (time.time() - result.cached_at) < API_KEY_CACHE_TTL: - return result - return None - - -def _cache_result(api_key: str, result: ExternalValidationResult): - """Cache a validation result.""" - result.cached_at = time.time() - _validation_cache[api_key] = result - - # Clean up old entries periodically (keep cache size manageable) - if len(_validation_cache) > 1000: - now = time.time() - expired_keys = [ - k for k, v in _validation_cache.items() - if (now - v.cached_at) >= API_KEY_CACHE_TTL - ] - for k in expired_keys: - del _validation_cache[k] - - -async def validate_api_key_external(api_key: str, scope: str) -> Optional[ExternalValidationResult]: - """ - Validate an API key against mana-core-auth service. - - Args: - api_key: The API key to validate (e.g., "sk_live_...") - scope: The required scope (e.g., "stt" or "tts") - - Returns: - ExternalValidationResult if external auth is enabled and the key was validated. - None if external auth is disabled or the service is unavailable (fallback to local). 
- """ - if not EXTERNAL_AUTH_ENABLED: - return None - - # Check cache first - cached = _get_cached_result(api_key) - if cached: - logger.debug(f"Using cached validation result for key prefix: {api_key[:12]}...") - # Check scope against cached result - if cached.valid and cached.scopes and scope not in cached.scopes: - return ExternalValidationResult( - valid=False, - error=f"API key does not have scope: {scope}", - ) - return cached - - # Call mana-core-auth validation endpoint - try: - async with httpx.AsyncClient(timeout=EXTERNAL_AUTH_TIMEOUT) as client: - response = await client.post( - f"{MANA_CORE_AUTH_URL}/api/v1/api-keys/validate", - json={"apiKey": api_key, "scope": scope}, - ) - - if response.status_code == 200: - data = response.json() - result = ExternalValidationResult( - valid=data.get("valid", False), - user_id=data.get("userId"), - scopes=data.get("scopes", []), - rate_limit_requests=data.get("rateLimit", {}).get("requests", 60), - rate_limit_window=data.get("rateLimit", {}).get("window", 60), - error=data.get("error"), - ) - _cache_result(api_key, result) - return result - else: - logger.warning( - f"External auth returned status {response.status_code}: {response.text}" - ) - # Don't cache errors - allow retry - return ExternalValidationResult( - valid=False, - error=f"Auth service returned {response.status_code}", - ) - - except httpx.TimeoutException: - logger.warning("External auth service timeout - falling back to local auth") - return None - except httpx.ConnectError: - logger.warning("Cannot connect to external auth service - falling back to local auth") - return None - except Exception as e: - logger.error(f"External auth error: {e}") - return None - - -def clear_cache(): - """Clear the validation cache (for testing or runtime updates).""" - global _validation_cache - _validation_cache.clear() - logger.info("External auth cache cleared") diff --git a/services/mana-stt/app/main.py b/services/mana-stt/app/main.py deleted file mode 100644 index f07e33a0e..000000000 --- a/services/mana-stt/app/main.py +++ /dev/null @@ -1,392 +0,0 @@ -""" -ManaCore STT API Service (WhisperX Edition) -Speech-to-Text with WhisperX: transcription, word timestamps, speaker diarization. 
- -Run with: uvicorn app.main:app --host 0.0.0.0 --port 3020 -""" - -import os -import logging -import time -from typing import Optional -from contextlib import asynccontextmanager - -from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Depends, Response -from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import JSONResponse -from pydantic import BaseModel - -from app.auth import verify_api_key, AuthResult, get_api_key_stats, REQUIRE_AUTH - -# Configure logging -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", -) -logger = logging.getLogger(__name__) - -# Environment -PORT = int(os.getenv("PORT", "3020")) -DEFAULT_WHISPER_MODEL = os.getenv("WHISPER_MODEL", "large-v3") -PRELOAD_MODELS = os.getenv("PRELOAD_MODELS", "false").lower() == "true" -CORS_ORIGINS = os.getenv( - "CORS_ORIGINS", - "https://mana.how,https://chat.mana.how,http://localhost:5173" -).split(",") - -# vLLM configuration -VLLM_URL = os.getenv("VLLM_URL", "http://localhost:8100") -USE_VLLM = os.getenv("USE_VLLM", "false").lower() == "true" - - -# Response models -class WordInfo(BaseModel): - word: str - start: float - end: float - score: Optional[float] = None - speaker: Optional[str] = None - - -class SegmentInfo(BaseModel): - start: float - end: float - text: str - speaker: Optional[str] = None - - -class TranscriptionResponse(BaseModel): - text: str - language: Optional[str] = None - model: str - latency_ms: Optional[float] = None - duration_seconds: Optional[float] = None - words: Optional[list[WordInfo]] = None - segments: Optional[list[SegmentInfo]] = None - speakers: Optional[list[str]] = None - - -class HealthResponse(BaseModel): - status: str - whisper_loaded: bool - whisperx: bool - vllm_available: bool - vllm_url: Optional[str] = None - mistral_api_available: bool - auth_required: bool - models: dict - - -class ModelsResponse(BaseModel): - whisper: list - voxtral_vllm: list - default_whisper: str - - -# Track loaded models -models_status = { - "whisper_loaded": False, - "vllm_available": False, -} - - -@asynccontextmanager -async def lifespan(app: FastAPI): - """Startup and shutdown events.""" - logger.info("Starting ManaCore STT Service (WhisperX Edition)...") - - # Check vLLM availability - if USE_VLLM: - from app.vllm_service import check_health - health = await check_health() - models_status["vllm_available"] = health.get("status") == "healthy" - - # Check Mistral API - from app.voxtral_api_service import is_available as api_available - if api_available(): - logger.info("Mistral API fallback configured") - - # Always preload WhisperX model at startup (avoids timeout on first request) - logger.info("Preloading WhisperX model...") - try: - from app.whisper_service import get_whisper_model - get_whisper_model(DEFAULT_WHISPER_MODEL) - models_status["whisper_loaded"] = True - logger.info("WhisperX model preloaded successfully") - except Exception as e: - logger.warning(f"Failed to preload WhisperX: {e}") - - logger.info(f"STT Service ready on port {PORT}") - yield - logger.info("Shutting down STT Service...") - - -# Create FastAPI app -app = FastAPI( - title="ManaCore STT Service", - description="Speech-to-Text API with WhisperX (word timestamps + speaker diarization)", - version="3.0.0", - lifespan=lifespan, -) - -# CORS middleware -app.add_middleware( - CORSMiddleware, - allow_origins=CORS_ORIGINS, - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) - - -@app.get("/health", 
response_model=HealthResponse) -async def health_check(): - """Health check endpoint.""" - from app.voxtral_api_service import is_available as api_available - from app.vllm_service import check_health - - vllm_health = await check_health() - - return HealthResponse( - status="healthy", - whisper_loaded=models_status["whisper_loaded"], - whisperx=True, - vllm_available=vllm_health.get("status") == "healthy", - vllm_url=VLLM_URL if USE_VLLM else None, - mistral_api_available=api_available(), - auth_required=REQUIRE_AUTH, - models={ - "default_whisper": DEFAULT_WHISPER_MODEL, - "engine": "whisperx", - "features": ["transcription", "word_timestamps", "speaker_diarization"], - }, - ) - - -@app.get("/models", response_model=ModelsResponse) -async def list_models(auth: AuthResult = Depends(verify_api_key)): - """List available models.""" - from app.whisper_service import AVAILABLE_MODELS as whisper_models - from app.vllm_service import get_models - - vllm_models = await get_models() - - return ModelsResponse( - whisper=whisper_models, - voxtral_vllm=vllm_models, - default_whisper=DEFAULT_WHISPER_MODEL, - ) - - -@app.post("/transcribe", response_model=TranscriptionResponse) -async def transcribe_whisper( - response: Response, - file: UploadFile = File(..., description="Audio file to transcribe"), - language: Optional[str] = Form(None, description="Language code (auto-detect if not provided)"), - model: Optional[str] = Form(None, description="Whisper model to use"), - align: bool = Form(True, description="Enable word-level timestamp alignment"), - diarize: bool = Form(False, description="Enable speaker diarization"), - min_speakers: Optional[int] = Form(None, description="Min expected speakers (helps diarization)"), - max_speakers: Optional[int] = Form(None, description="Max expected speakers"), - auth: AuthResult = Depends(verify_api_key), -): - """ - Transcribe audio using WhisperX. - - Features: - - Word-level timestamps (align=true, default) - - Speaker diarization (diarize=true, opt-in) - - Supported formats: mp3, wav, m4a, flac, ogg, webm, mp4 - Max file size: 100MB - """ - if auth.rate_limit_remaining is not None: - response.headers["X-RateLimit-Remaining"] = str(auth.rate_limit_remaining) - - if not file.filename: - raise HTTPException(status_code=400, detail="No file provided") - - allowed_extensions = {".mp3", ".wav", ".m4a", ".flac", ".ogg", ".webm", ".mp4"} - ext = os.path.splitext(file.filename)[1].lower() - if ext not in allowed_extensions: - raise HTTPException( - status_code=400, - detail=f"Unsupported file type: {ext}. 
Allowed: {allowed_extensions}" - ) - - start_time = time.time() - - try: - from app.whisper_service import transcribe_audio_bytes - - audio_bytes = await file.read() - if len(audio_bytes) > 100 * 1024 * 1024: - raise HTTPException(status_code=400, detail="File too large (max 100MB)") - - model_name = model or DEFAULT_WHISPER_MODEL - - result = await transcribe_audio_bytes( - audio_bytes=audio_bytes, - filename=file.filename, - language=language, - model_name=model_name, - align=align, - diarize=diarize, - min_speakers=min_speakers, - max_speakers=max_speakers, - ) - - models_status["whisper_loaded"] = True - latency_ms = (time.time() - start_time) * 1000 - - # Build response - resp = TranscriptionResponse( - text=result.text, - language=result.language, - model=f"whisperx-{model_name}", - latency_ms=latency_ms, - duration_seconds=result.duration, - ) - - # Add word timestamps if available - if result.words: - resp.words = [ - WordInfo( - word=w.word, - start=w.start, - end=w.end, - score=w.score, - speaker=w.speaker, - ) - for w in result.words - ] - - # Add segments - if result.segments: - resp.segments = [ - SegmentInfo(**s) for s in result.segments - ] - - # Add speakers - if result.speakers: - resp.speakers = result.speakers - - return resp - - except Exception as e: - logger.error(f"WhisperX transcription error: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - -@app.post("/transcribe/voxtral", response_model=TranscriptionResponse) -async def transcribe_voxtral( - response: Response, - file: UploadFile = File(..., description="Audio file to transcribe"), - language: str = Form("de", description="Language code"), - use_realtime: bool = Form(False, description="Use Realtime 4B model"), - auth: AuthResult = Depends(verify_api_key), -): - """Transcribe audio using Voxtral via vLLM or Mistral API.""" - if auth.rate_limit_remaining is not None: - response.headers["X-RateLimit-Remaining"] = str(auth.rate_limit_remaining) - - if not file.filename: - raise HTTPException(status_code=400, detail="No file provided") - - from app.vllm_service import ( - SUPPORTED_LANGUAGES, - is_available as vllm_available, - transcribe_audio_bytes as vllm_transcribe, - transcribe_with_realtime, - check_health, - ) - from app.voxtral_api_service import ( - is_available as api_available, - transcribe_audio_bytes as api_transcribe, - ) - - if language not in SUPPORTED_LANGUAGES: - raise HTTPException( - status_code=400, - detail=f"Unsupported language: {language}. 
Supported: {SUPPORTED_LANGUAGES}" - ) - - try: - audio_bytes = await file.read() - if len(audio_bytes) > 100 * 1024 * 1024: - raise HTTPException(status_code=400, detail="File too large (max 100MB)") - - # Try vLLM first - if USE_VLLM: - health = await check_health() - if health.get("status") == "healthy": - if use_realtime: - result = await transcribe_with_realtime( - audio_bytes=audio_bytes, filename=file.filename, language=language, - ) - else: - result = await vllm_transcribe( - audio_bytes=audio_bytes, filename=file.filename, language=language, - ) - return TranscriptionResponse( - text=result.text, language=result.language, model=result.model, - latency_ms=result.latency_ms, duration_seconds=result.duration_seconds, - ) - - # Fallback to Mistral API - if api_available(): - result = await api_transcribe( - audio_bytes=audio_bytes, filename=file.filename, language=language, - ) - return TranscriptionResponse( - text=result.text, language=result.language, model=result.model, - duration_seconds=result.duration_seconds, - ) - - raise HTTPException(status_code=503, detail="Voxtral not available.") - - except HTTPException: - raise - except Exception as e: - logger.error(f"Voxtral transcription error: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - -@app.post("/transcribe/auto", response_model=TranscriptionResponse) -async def transcribe_auto( - response: Response, - file: UploadFile = File(..., description="Audio file to transcribe"), - language: Optional[str] = Form(None, description="Language hint"), - prefer: str = Form("whisper", description="Preferred: 'whisper' or 'voxtral'"), - auth: AuthResult = Depends(verify_api_key), -): - """Auto-select best model with fallback chain.""" - if auth.rate_limit_remaining is not None: - response.headers["X-RateLimit-Remaining"] = str(auth.rate_limit_remaining) - - if prefer == "voxtral": - try: - return await transcribe_voxtral(response, file, language or "de", False, auth) - except Exception: - await file.seek(0) - return await transcribe_whisper(response, file, language, None, True, False, None, None, auth) - else: - try: - return await transcribe_whisper(response, file, language, None, True, False, None, None, auth) - except Exception: - await file.seek(0) - return await transcribe_voxtral(response, file, language or "de", False, auth) - - -@app.exception_handler(Exception) -async def global_exception_handler(request, exc): - logger.error(f"Unhandled error: {exc}") - return JSONResponse( - status_code=500, - content={"detail": "Internal server error", "error": str(exc)}, - ) - - -if __name__ == "__main__": - import uvicorn - uvicorn.run("app.main:app", host="0.0.0.0", port=PORT, reload=False) diff --git a/services/mana-stt/app/vllm_service.py b/services/mana-stt/app/vllm_service.py deleted file mode 100644 index 4ca1857a1..000000000 --- a/services/mana-stt/app/vllm_service.py +++ /dev/null @@ -1,178 +0,0 @@ -""" -vLLM Voxtral Service - Proxy to vLLM server for Voxtral transcription - -vLLM provides optimized inference for Voxtral models with an OpenAI-compatible API. -This service proxies requests to the vLLM server. 
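-
-The proxied call is the OpenAI-compatible transcription request; the raw
-equivalent looks like this (a sketch; URL, route, and form fields as used in
-transcribe_audio_bytes below):
-
-    import httpx
-
-    with open("audio.wav", "rb") as f:
-        resp = httpx.post(
-            "http://localhost:8100/v1/audio/transcriptions",
-            files={"file": ("audio.wav", f, "audio/wav")},
-            data={"model": "mistralai/Voxtral-Mini-3B-2507", "language": "de"},
-        )
-    print(resp.json()["text"])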
- -Requirements: -- vLLM server running on VLLM_URL (default: http://localhost:8100) -- Model loaded: Voxtral-Mini-3B-2507 or Voxtral-Mini-4B-Realtime-2602 -""" - -import os -import logging -import time -import tempfile -import httpx -from pathlib import Path -from typing import Optional -from dataclasses import dataclass - -logger = logging.getLogger(__name__) - -# vLLM server configuration -VLLM_URL = os.getenv("VLLM_URL", "http://localhost:8100") -VLLM_TIMEOUT = int(os.getenv("VLLM_TIMEOUT", "300")) # 5 minutes for long audio - -# Model IDs -VOXTRAL_3B = "mistralai/Voxtral-Mini-3B-2507" -VOXTRAL_4B_REALTIME = "mistralai/Voxtral-Mini-4B-Realtime-2602" - - -@dataclass -class VllmTranscriptionResult: - text: str - language: Optional[str] = None - model: str = "voxtral-vllm" - latency_ms: Optional[float] = None - duration_seconds: Optional[float] = None - - -async def check_health() -> dict: - """Check if vLLM server is healthy.""" - try: - async with httpx.AsyncClient(timeout=5.0) as client: - response = await client.get(f"{VLLM_URL}/health") - if response.status_code == 200: - return {"status": "healthy", "url": VLLM_URL} - return {"status": "unhealthy", "url": VLLM_URL, "code": response.status_code} - except Exception as e: - return {"status": "unavailable", "url": VLLM_URL, "error": str(e)} - - -async def get_models() -> list: - """Get available models from vLLM server.""" - try: - async with httpx.AsyncClient(timeout=5.0) as client: - response = await client.get(f"{VLLM_URL}/v1/models") - if response.status_code == 200: - data = response.json() - return [m["id"] for m in data.get("data", [])] - return [] - except Exception: - return [] - - -def is_available() -> bool: - """Check if vLLM server is configured.""" - return bool(VLLM_URL) - - -async def transcribe_audio_bytes( - audio_bytes: bytes, - filename: str, - language: Optional[str] = "de", - model: Optional[str] = None, -) -> VllmTranscriptionResult: - """ - Transcribe audio using vLLM Voxtral server. - - Args: - audio_bytes: Raw audio bytes - filename: Original filename (for format detection) - language: Language code (de, en, fr, etc.) 
- model: Model to use (defaults to Voxtral-Mini-3B-2507) - - Returns: - VllmTranscriptionResult with transcription - """ - start_time = time.time() - model_id = model or VOXTRAL_3B - - logger.info(f"Transcribing via vLLM: {filename} ({len(audio_bytes)} bytes)") - - # Save to temp file (vLLM API accepts file uploads) - ext = Path(filename).suffix or ".wav" - with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp: - tmp.write(audio_bytes) - tmp_path = tmp.name - - try: - async with httpx.AsyncClient(timeout=VLLM_TIMEOUT) as client: - # Use OpenAI-compatible transcription endpoint - with open(tmp_path, "rb") as f: - files = {"file": (filename, f, "audio/wav")} - data = { - "model": model_id, - "language": language or "de", - "response_format": "json", - "temperature": 0.0, # Deterministic for transcription - } - - response = await client.post( - f"{VLLM_URL}/v1/audio/transcriptions", - files=files, - data=data, - ) - - if response.status_code != 200: - error_detail = response.text - logger.error(f"vLLM error: {response.status_code} - {error_detail}") - raise RuntimeError(f"vLLM transcription failed: {error_detail}") - - result = response.json() - text = result.get("text", "") - duration = result.get("duration") - - latency_ms = (time.time() - start_time) * 1000 - logger.info(f"vLLM transcription complete: {len(text)} chars in {latency_ms:.0f}ms") - - return VllmTranscriptionResult( - text=text.strip(), - language=language, - model=f"vllm-{model_id.split('/')[-1]}", - latency_ms=latency_ms, - duration_seconds=duration, - ) - - finally: - try: - os.unlink(tmp_path) - except Exception: - pass - - -async def transcribe_with_realtime( - audio_bytes: bytes, - filename: str, - language: Optional[str] = "de", -) -> VllmTranscriptionResult: - """ - Transcribe using Voxtral 4B Realtime model. - - Optimized for low latency (<500ms). - """ - return await transcribe_audio_bytes( - audio_bytes=audio_bytes, - filename=filename, - language=language, - model=VOXTRAL_4B_REALTIME, - ) - - -# Supported languages (same as Voxtral) -SUPPORTED_LANGUAGES = [ - "en", # English - "zh", # Chinese - "hi", # Hindi - "es", # Spanish - "ar", # Arabic - "fr", # French - "pt", # Portuguese - "ru", # Russian - "de", # German - "ja", # Japanese - "ko", # Korean - "it", # Italian - "nl", # Dutch -] diff --git a/services/mana-stt/app/voxtral_api_service.py b/services/mana-stt/app/voxtral_api_service.py deleted file mode 100644 index 53d78f808..000000000 --- a/services/mana-stt/app/voxtral_api_service.py +++ /dev/null @@ -1,213 +0,0 @@ -""" -Voxtral API Service - Mistral Cloud API Fallback -Uses Mistral's hosted Voxtral Mini Transcribe V2 when local service is overloaded. 
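-
-Typical use from an async caller (a sketch; the coroutine and its parameters
-are defined below):
-
-    from app.voxtral_api_service import is_available, transcribe_audio_bytes
-
-    if is_available():
-        result = await transcribe_audio_bytes(
-            audio_bytes, "audio.wav", language="de", diarization=True,
-        )
-        print(result.text, result.speakers)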
- -Features: -- Speaker diarization -- Word-level timestamps -- Context biasing for domain-specific terms -- 13 language support -""" - -import os -import logging -import tempfile -from pathlib import Path -from typing import Optional, Literal -from dataclasses import dataclass, field - -logger = logging.getLogger(__name__) - -# Lazy load client -_mistral_client = None - -MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY") -DEFAULT_MODEL = "voxtral-mini-latest" # voxtral-mini-2602 - - -@dataclass -class Speaker: - """Speaker information from diarization.""" - id: str - start: float - end: float - - -@dataclass -class WordTimestamp: - """Word-level timestamp.""" - word: str - start: float - end: float - - -@dataclass -class SegmentTimestamp: - """Segment-level timestamp.""" - text: str - start: float - end: float - speaker: Optional[str] = None - - -@dataclass -class VoxtralApiResult: - """Result from Voxtral API transcription.""" - text: str - language: Optional[str] = None - model: str = "voxtral-api" - duration_seconds: Optional[float] = None - words: list[WordTimestamp] = field(default_factory=list) - segments: list[SegmentTimestamp] = field(default_factory=list) - speakers: list[Speaker] = field(default_factory=list) - - -def get_mistral_client(): - """Get or create Mistral client instance.""" - global _mistral_client - - if _mistral_client is None: - if not MISTRAL_API_KEY: - raise RuntimeError( - "MISTRAL_API_KEY environment variable not set. " - "Get your API key at https://console.mistral.ai/" - ) - - try: - from mistralai import Mistral - _mistral_client = Mistral(api_key=MISTRAL_API_KEY) - logger.info("Mistral API client initialized") - except ImportError: - raise RuntimeError( - "mistralai package not installed. " - "Run: pip install mistralai" - ) - - return _mistral_client - - -def is_available() -> bool: - """Check if Mistral API is configured and available.""" - return bool(MISTRAL_API_KEY) - - -async def transcribe_audio_bytes( - audio_bytes: bytes, - filename: str, - language: Optional[str] = None, - timestamp_granularity: Optional[Literal["word", "segment"]] = None, - diarization: bool = False, - context_bias: Optional[list[str]] = None, -) -> VoxtralApiResult: - """ - Transcribe audio using Mistral's Voxtral API. - - Args: - audio_bytes: Raw audio bytes - filename: Original filename (for extension detection) - language: Language code (de, en, fr, etc.) 
- auto-detect if None - timestamp_granularity: "word" or "segment" for timestamps - diarization: Enable speaker diarization - context_bias: List of domain-specific terms to improve accuracy (max 100) - - Returns: - VoxtralApiResult with transcription and optional metadata - """ - client = get_mistral_client() - - logger.info(f"Transcribing via Mistral API: {filename} ({len(audio_bytes)} bytes)") - - try: - # Build request parameters - request_params = { - "model": DEFAULT_MODEL, - "file": { - "content": audio_bytes, - "file_name": filename, - }, - } - - # Language and timestamps are mutually exclusive in current API - if language and not timestamp_granularity: - request_params["language"] = language - - if timestamp_granularity: - request_params["timestamp_granularities"] = [timestamp_granularity] - - if diarization: - request_params["diarization"] = True - - if context_bias: - # API accepts comma-separated string, max 100 terms - bias_terms = context_bias[:100] - request_params["context_bias"] = ",".join(bias_terms) - - # Make API call - response = client.audio.transcriptions.complete(**request_params) - - # Parse response - result = VoxtralApiResult( - text=response.text, - language=getattr(response, "language", language), - model=f"voxtral-api-{DEFAULT_MODEL}", - duration_seconds=getattr(response, "duration", None), - ) - - # Parse word timestamps if present - if hasattr(response, "words") and response.words: - result.words = [ - WordTimestamp( - word=w.word, - start=w.start, - end=w.end, - ) - for w in response.words - ] - - # Parse segment timestamps if present - if hasattr(response, "segments") and response.segments: - result.segments = [ - SegmentTimestamp( - text=s.text, - start=s.start, - end=s.end, - speaker=getattr(s, "speaker", None), - ) - for s in response.segments - ] - - # Parse speakers if diarization enabled - if hasattr(response, "speakers") and response.speakers: - result.speakers = [ - Speaker( - id=sp.id, - start=sp.start, - end=sp.end, - ) - for sp in response.speakers - ] - - logger.info(f"Mistral API transcription complete: {len(result.text)} characters") - return result - - except Exception as e: - logger.error(f"Mistral API transcription failed: {e}") - raise - - -# Supported languages by Voxtral API (13 languages) -SUPPORTED_LANGUAGES = [ - "en", # English - "zh", # Chinese - "hi", # Hindi - "es", # Spanish - "ar", # Arabic - "fr", # French - "pt", # Portuguese - "ru", # Russian - "de", # German - "ja", # Japanese - "ko", # Korean - "it", # Italian - "nl", # Dutch -] diff --git a/services/mana-stt/app/voxtral_service.py b/services/mana-stt/app/voxtral_service.py deleted file mode 100644 index 320e5020d..000000000 --- a/services/mana-stt/app/voxtral_service.py +++ /dev/null @@ -1,267 +0,0 @@ -""" -Voxtral STT Service using Hugging Face Transformers -Mistral AI's Speech-to-Text model (Apache 2.0 License) - -Uses VoxtralForConditionalGeneration with apply_transcription_request -as per official HuggingFace documentation. 
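-
-Typical use (a sketch; the function and result fields are defined below):
-
-    from app.voxtral_service import transcribe_audio
-
-    result = transcribe_audio("meeting.wav", language="de")
-    print(result.text, result.latency_ms)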
-""" - -import os -import tempfile -import logging -import time -from pathlib import Path -from typing import Optional -from dataclasses import dataclass - -logger = logging.getLogger(__name__) - -# Lazy load to avoid import errors -_voxtral_model = None -_voxtral_processor = None -_model_name = None - -# Default model -DEFAULT_MODEL = "mistralai/Voxtral-Mini-3B-2507" - - -@dataclass -class VoxtralTranscriptionResult: - text: str - language: Optional[str] = None - model: str = "voxtral-mini-3b" - latency_ms: Optional[float] = None - - -def get_voxtral_model(model_name: str = DEFAULT_MODEL): - """ - Get or create Voxtral model instance. - - Uses VoxtralForConditionalGeneration (the correct class for Voxtral). - """ - global _voxtral_model, _voxtral_processor, _model_name - - # Reload if different model requested - if _voxtral_model is not None and _model_name != model_name: - logger.info(f"Switching model from {_model_name} to {model_name}") - _voxtral_model = None - _voxtral_processor = None - - if _voxtral_model is None: - logger.info(f"Loading Voxtral model: {model_name}") - try: - import torch - from transformers import VoxtralForConditionalGeneration, AutoProcessor - - # Determine device and dtype - if torch.backends.mps.is_available(): - device = "mps" - # MPS works better with float16 - torch_dtype = torch.float16 - elif torch.cuda.is_available(): - device = "cuda" - torch_dtype = torch.bfloat16 - else: - device = "cpu" - torch_dtype = torch.float32 - - logger.info(f"Using device: {device}, dtype: {torch_dtype}") - - # Load processor - _voxtral_processor = AutoProcessor.from_pretrained(model_name) - - # Load model with VoxtralForConditionalGeneration - if device == "mps": - # MPS doesn't support device_map, load to CPU first then move - _voxtral_model = VoxtralForConditionalGeneration.from_pretrained( - model_name, - torch_dtype=torch_dtype, - ) - _voxtral_model = _voxtral_model.to(device) - else: - _voxtral_model = VoxtralForConditionalGeneration.from_pretrained( - model_name, - torch_dtype=torch_dtype, - device_map=device, - ) - - _model_name = model_name - logger.info(f"Voxtral model loaded successfully on {device}") - - except ImportError as e: - logger.error(f"Failed to import transformers: {e}") - raise RuntimeError( - "transformers >= 4.54.0 required. " - "Run: pip install --upgrade transformers" - ) - except Exception as e: - logger.error(f"Failed to load Voxtral model: {e}") - raise - - return _voxtral_model, _voxtral_processor - - -def transcribe_audio( - audio_path: str, - language: Optional[str] = "de", - model_name: str = DEFAULT_MODEL, -) -> VoxtralTranscriptionResult: - """ - Transcribe audio file using Voxtral. - - Uses the official apply_transcription_request method. - - Args: - audio_path: Path to audio file - language: Language code (de, en, fr, etc.) 
- model_name: Hugging Face model ID - - Returns: - VoxtralTranscriptionResult with transcribed text - """ - import torch - - model, processor = get_voxtral_model(model_name) - device = next(model.parameters()).device - dtype = next(model.parameters()).dtype - - logger.info(f"Transcribing with Voxtral: {audio_path}") - start_time = time.time() - - try: - # Use apply_transcription_request (official method) - # This handles audio loading and preprocessing internally - inputs = processor.apply_transcription_request( - language=language or "en", - audio=audio_path, - model_id=model_name, - ) - - # Move inputs to device and dtype - inputs = inputs.to(device, dtype=dtype) - - # Generate transcription - with torch.no_grad(): - outputs = model.generate( - **inputs, - max_new_tokens=500, - do_sample=False, - ) - - # Decode - skip input tokens - input_len = inputs.input_ids.shape[1] - decoded = processor.batch_decode( - outputs[:, input_len:], - skip_special_tokens=True, - ) - - text = decoded[0] if decoded else "" - latency_ms = (time.time() - start_time) * 1000 - - logger.info(f"Voxtral transcription complete: {len(text)} chars in {latency_ms:.0f}ms") - - return VoxtralTranscriptionResult( - text=text.strip(), - language=language, - model=model_name.split("/")[-1], - latency_ms=latency_ms, - ) - - except Exception as e: - logger.error(f"Voxtral transcription failed: {e}") - raise - - -async def transcribe_audio_bytes( - audio_bytes: bytes, - filename: str, - language: Optional[str] = "de", - model_name: str = DEFAULT_MODEL, -) -> VoxtralTranscriptionResult: - """ - Transcribe audio from bytes (for API uploads). - """ - ext = Path(filename).suffix or ".wav" - - with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp: - tmp.write(audio_bytes) - tmp_path = tmp.name - - try: - result = transcribe_audio( - audio_path=tmp_path, - language=language, - model_name=model_name, - ) - return result - finally: - try: - os.unlink(tmp_path) - except Exception: - pass - - -def unload_model(): - """Unload model to free memory.""" - global _voxtral_model, _voxtral_processor, _model_name - - if _voxtral_model is not None: - del _voxtral_model - del _voxtral_processor - _voxtral_model = None - _voxtral_processor = None - _model_name = None - - import gc - gc.collect() - - try: - import torch - if torch.backends.mps.is_available(): - torch.mps.empty_cache() - elif torch.cuda.is_available(): - torch.cuda.empty_cache() - except Exception: - pass - - logger.info("Voxtral model unloaded") - - -def is_loaded() -> bool: - """Check if model is currently loaded.""" - return _voxtral_model is not None - - -def get_loaded_model_name() -> Optional[str]: - """Get name of currently loaded model.""" - return _model_name - - -# Supported languages (13 languages as per Mistral docs) -SUPPORTED_LANGUAGES = [ - "en", # English - "zh", # Chinese - "hi", # Hindi - "es", # Spanish - "ar", # Arabic - "fr", # French - "pt", # Portuguese - "ru", # Russian - "de", # German - "ja", # Japanese - "ko", # Korean - "it", # Italian - "nl", # Dutch -] - -# Available models -AVAILABLE_MODELS = [ - { - "id": "voxtral-mini-3b", - "name": "Voxtral-Mini-3B-2507", - "huggingface_id": "mistralai/Voxtral-Mini-3B-2507", - "params": "3B", - "vram": "~6GB", - "description": "Balanced quality and speed for local deployment", - }, -] diff --git a/services/mana-stt/app/vram_manager.py b/services/mana-stt/app/vram_manager.py deleted file mode 100644 index 89b5656ae..000000000 --- a/services/mana-stt/app/vram_manager.py +++ /dev/null @@ -1,114 +0,0 @@ -""" 
VRAM Manager — Automatic model unloading after idle timeout.
-
-Tracks last usage time per model and unloads after configurable timeout.
-Designed for shared GPU environments (multiple services on one RTX 3090).
-
-Usage in a service:
-    from vram_manager import VramManager
-
-    vram = VramManager(idle_timeout=300)  # 5 min
-
-    # Before using a model
-    vram.touch()
-
-    # Call periodically (e.g., from health check or background task)
-    vram.check_and_unload(unload_fn=my_unload_function)
-"""
-
-import os
-import time
-import logging
-import threading
-from typing import Optional, Callable
-
-logger = logging.getLogger(__name__)
-
-DEFAULT_IDLE_TIMEOUT = int(os.getenv("VRAM_IDLE_TIMEOUT", "300"))  # 5 minutes
-
-
-class VramManager:
-    def __init__(self, idle_timeout: int = DEFAULT_IDLE_TIMEOUT, service_name: str = "unknown"):
-        self.idle_timeout = idle_timeout
-        self.service_name = service_name
-        self.last_used: float = 0.0
-        self.model_loaded: bool = False
-        self._lock = threading.Lock()
-        self._timer: Optional[threading.Timer] = None
-
-    def touch(self):
-        """Mark the model as recently used. Call before/after each inference."""
-        with self._lock:
-            self.last_used = time.time()
-            self.model_loaded = True
-            self._schedule_check()
-
-    def mark_loaded(self):
-        """Mark that a model has been loaded into VRAM."""
-        with self._lock:
-            self.model_loaded = True
-            self.last_used = time.time()
-            self._schedule_check()
-        logger.info(f"[{self.service_name}] Model loaded, idle timeout: {self.idle_timeout}s")
-
-    def mark_unloaded(self):
-        """Mark that a model has been unloaded from VRAM."""
-        with self._lock:
-            self.model_loaded = False
-            if self._timer:
-                self._timer.cancel()
-                self._timer = None
-        logger.info(f"[{self.service_name}] Model unloaded, VRAM freed")
-
-    def is_idle(self) -> bool:
-        """Check if the model has been idle longer than the timeout."""
-        if not self.model_loaded:
-            return False
-        return (time.time() - self.last_used) > self.idle_timeout
-
-    def seconds_until_unload(self) -> Optional[float]:
-        """Seconds until the model will be unloaded, or None if not loaded."""
-        if not self.model_loaded:
-            return None
-        remaining = self.idle_timeout - (time.time() - self.last_used)
-        return max(0, remaining)
-
-    def check_and_unload(self, unload_fn: Callable[[], None]) -> bool:
-        """Check if idle and unload if so. Returns True if unloaded."""
-        if self.is_idle():
-            logger.info(f"[{self.service_name}] Idle for >{self.idle_timeout}s, unloading model...")
-            try:
-                unload_fn()
-                self.mark_unloaded()
-                return True
-            except Exception as e:
-                logger.error(f"[{self.service_name}] Failed to unload: {e}")
-        return False
-
-    def _schedule_check(self):
-        """Schedule an idle check after the timeout period."""
-        if self._timer:
-            self._timer.cancel()
-
-        self._timer = threading.Timer(
-            self.idle_timeout + 5,  # Small buffer
-            self._auto_check,
-        )
-        self._timer.daemon = True
-        self._timer.start()
-
-    def _auto_check(self):
-        """Auto-triggered idle check (called by timer)."""
-        # This is just a log — actual unloading needs the unload_fn,
-        # which depends on the service. The service should call check_and_unload.
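-        # Example service-side wiring (a sketch; whisper_service.py in this
-        # same package does exactly this in its transcribe path):
-        #     _vram.check_and_unload(unload_models)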
- if self.is_idle(): - logger.info(f"[{self.service_name}] Model idle for >{self.idle_timeout}s — ready to unload") - - def status(self) -> dict: - """Get current VRAM manager status.""" - return { - "model_loaded": self.model_loaded, - "idle_seconds": round(time.time() - self.last_used, 1) if self.model_loaded else None, - "idle_timeout": self.idle_timeout, - "seconds_until_unload": round(self.seconds_until_unload(), 1) if self.model_loaded else None, - } diff --git a/services/mana-stt/app/whisper_service.py b/services/mana-stt/app/whisper_service.py deleted file mode 100644 index 821e22d9b..000000000 --- a/services/mana-stt/app/whisper_service.py +++ /dev/null @@ -1,358 +0,0 @@ -""" -Whisper STT Service using WhisperX (CUDA) -Provides: transcription, word-level timestamps, speaker diarization. - -WhisperX pipeline: -1. faster-whisper for transcription -2. wav2vec2 for forced alignment (precise word timestamps) -3. pyannote-audio for speaker diarization -""" - -import os -import tempfile -import logging -from pathlib import Path -from typing import Optional -from dataclasses import dataclass, field - -logger = logging.getLogger(__name__) - -# Lazy load -_whisperx_model = None -_align_model = None -_align_metadata = None -_diarize_pipeline = None - -# Config -HF_TOKEN = os.getenv("HF_TOKEN", "") - -# VRAM management — unload after 10 min idle (STT uses ~3GB) -from app.vram_manager import VramManager -_vram = VramManager( - idle_timeout=int(os.getenv("VRAM_IDLE_TIMEOUT", "600")), - service_name="mana-stt", -) - - -def unload_models(): - """Unload all WhisperX models from GPU to free VRAM.""" - global _whisperx_model, _align_model, _align_metadata, _diarize_pipeline - import torch - - if _whisperx_model is not None: - del _whisperx_model - _whisperx_model = None - if _align_model is not None: - del _align_model - _align_model = None - _align_metadata = None - if _diarize_pipeline is not None: - del _diarize_pipeline - _diarize_pipeline = None - - torch.cuda.empty_cache() - _vram.mark_unloaded() - logger.info("WhisperX models unloaded, VRAM freed") - - -@dataclass -class WordSegment: - word: str - start: float - end: float - score: Optional[float] = None - speaker: Optional[str] = None - - -@dataclass -class TranscriptionResult: - text: str - language: Optional[str] = None - duration: Optional[float] = None - segments: Optional[list] = None - words: Optional[list[WordSegment]] = field(default_factory=list) - speakers: Optional[list[str]] = field(default_factory=list) - - -def get_whisper_model(model_name: str = "large-v3", **kwargs): - """Get or create WhisperX model instance (singleton).""" - global _whisperx_model - - if _whisperx_model is not None: - return _whisperx_model - - logger.info(f"Loading WhisperX model: {model_name}") - try: - import whisperx - - device = os.getenv("WHISPER_DEVICE", "cuda") - compute_type = os.getenv("WHISPER_COMPUTE_TYPE", "float16") - - default_language = os.getenv("WHISPER_DEFAULT_LANGUAGE", "de") - _whisperx_model = whisperx.load_model( - model_name, - device=device, - compute_type=compute_type, - language=default_language, - ) - logger.info(f"WhisperX model loaded: {model_name} on {device} ({compute_type})") - _vram.mark_loaded() - except ImportError as e: - logger.error(f"Failed to import whisperx: {e}") - raise RuntimeError("whisperx not installed. 
Run: pip install whisperx")
-    except Exception as e:
-        logger.error(f"Failed to load WhisperX model: {e}")
-        raise
-
-    return _whisperx_model
-
-
-def _get_align_model(language: str, device: str = "cuda"):
-    """Get or create alignment model for a language."""
-    global _align_model, _align_metadata
-
-    import whisperx
-
-    # Reload if the language changed (alignment models are language-specific)
-    if _align_model is None or (_align_metadata or {}).get("language") != language:
-        logger.info(f"Loading alignment model for language: {language}")
-        _align_model, _align_metadata = whisperx.load_align_model(
-            language_code=language,
-            device=device,
-        )
-        logger.info("Alignment model loaded")
-
-    return _align_model, _align_metadata
-
-
-def _get_diarize_pipeline(device: str = "cuda"):
-    """Get or create speaker diarization pipeline."""
-    global _diarize_pipeline
-
-    if _diarize_pipeline is not None:
-        return _diarize_pipeline
-
-    import torch
-    from pyannote.audio import Pipeline
-
-    token = HF_TOKEN or os.getenv("HUGGING_FACE_HUB_TOKEN", "")
-    if not token:
-        logger.warning("No HF_TOKEN set — speaker diarization may fail for gated models")
-
-    logger.info("Loading speaker diarization pipeline (pyannote)...")
-    _diarize_pipeline = Pipeline.from_pretrained(
-        "pyannote/speaker-diarization-3.1",
-        token=token,
-    )
-    _diarize_pipeline.to(torch.device(device))
-    logger.info("Diarization pipeline loaded")
-    return _diarize_pipeline
-
-
-def transcribe_audio(
-    audio_path: str,
-    language: Optional[str] = None,
-    model_name: str = "large-v3",
-    align: bool = True,
-    diarize: bool = False,
-    min_speakers: Optional[int] = None,
-    max_speakers: Optional[int] = None,
-) -> TranscriptionResult:
-    """
-    Transcribe audio using WhisperX with optional alignment and diarization.
-
-    Args:
-        audio_path: Path to audio file
-        language: Language code (auto-detect if None)
-        model_name: Whisper model to use
-        align: Enable word-level timestamp alignment
-        diarize: Enable speaker diarization
-        min_speakers: Minimum expected speakers (helps diarization)
-        max_speakers: Maximum expected speakers
-
-    Returns:
-        TranscriptionResult with text, word timestamps, and speaker info
-    """
-    import whisperx
-
-    device = os.getenv("WHISPER_DEVICE", "cuda")
-
-    logger.info(f"Transcribing: {audio_path} (align={align}, diarize={diarize})")
-
-    # Unload first if the current model has sat idle past the timeout, so a
-    # stale model is not kept alive by the local reference below; then
-    # (re)load it and mark it as in use.
-    _vram.check_and_unload(unload_models)
-    model = get_whisper_model(model_name)
-    _vram.touch()
-
-    # Step 1: Load audio
-    audio = whisperx.load_audio(audio_path)
-
-    # Step 2: Transcribe with faster-whisper (batch size from WHISPERX_BATCH_SIZE)
-    transcribe_kwargs = {"batch_size": int(os.getenv("WHISPERX_BATCH_SIZE", "16"))}
-    if language:
-        transcribe_kwargs["language"] = language
-    result = model.transcribe(audio, **transcribe_kwargs)
-    detected_language = result.get("language", language or "en")
-
-    # Step 3: Align (word-level timestamps)
-    if align and result["segments"]:
-        try:
-            align_model, metadata = _get_align_model(detected_language, device)
-            result = whisperx.align(
-                result["segments"],
-                align_model,
-                metadata,
-                audio,
-                device,
-                return_char_alignments=False,
-            )
-            logger.info("Word alignment complete")
-        except Exception as e:
-            logger.warning(f"Alignment failed (continuing without): {e}")
-
-    # Step 4: Diarize (speaker identification)
-    if diarize:
-        try:
-            import torch
-            import torchaudio
-
-            diarize_pipe = _get_diarize_pipeline(device)
-
-            # pyannote needs waveform as tensor, not the whisperx audio array
-            waveform = torch.from_numpy(audio).unsqueeze(0).float()
-            diarize_input = {"waveform": waveform, "sample_rate": 16000}
-
-            diarize_kwargs = {}
-            if min_speakers is 
not None: - diarize_kwargs["min_speakers"] = min_speakers - if max_speakers is not None: - diarize_kwargs["max_speakers"] = max_speakers - - diarize_output = diarize_pipe(diarize_input, **diarize_kwargs) - - # pyannote 4.x returns DiarizeOutput, extract the Annotation - if hasattr(diarize_output, "speaker_diarization"): - diarize_annotation = diarize_output.speaker_diarization - else: - diarize_annotation = diarize_output - - # Convert pyannote output to DataFrame for whisperx - import pandas as pd - diarize_rows = [] - for turn, _, speaker in diarize_annotation.itertracks(yield_label=True): - diarize_rows.append({ - "start": turn.start, - "end": turn.end, - "speaker": speaker, - }) - - diarize_df = pd.DataFrame(diarize_rows) - result = whisperx.assign_word_speakers(diarize_df, result) - logger.info("Speaker diarization complete") - except Exception as e: - logger.warning(f"Diarization failed (continuing without): {e}") - import traceback - traceback.print_exc() - - # Build response - segments = result.get("segments", []) - full_text_parts = [] - all_words = [] - speaker_set = set() - - for seg in segments: - full_text_parts.append(seg.get("text", "")) - speaker = seg.get("speaker") - if speaker: - speaker_set.add(speaker) - - for word_info in seg.get("words", []): - all_words.append(WordSegment( - word=word_info.get("word", ""), - start=word_info.get("start", 0.0), - end=word_info.get("end", 0.0), - score=word_info.get("score"), - speaker=word_info.get("speaker", speaker), - )) - - text = " ".join(full_text_parts) - - _vram.touch() - logger.info( - f"Transcription complete: {len(text)} chars, " - f"{len(all_words)} words, {len(speaker_set)} speakers" - ) - - return TranscriptionResult( - text=text.strip(), - language=detected_language, - segments=[{ - "start": s.get("start", 0), - "end": s.get("end", 0), - "text": s.get("text", ""), - "speaker": s.get("speaker"), - } for s in segments], - words=all_words, - speakers=sorted(speaker_set), - ) - - -async def transcribe_audio_bytes( - audio_bytes: bytes, - filename: str, - language: Optional[str] = None, - model_name: str = "large-v3", - align: bool = True, - diarize: bool = False, - min_speakers: Optional[int] = None, - max_speakers: Optional[int] = None, -) -> TranscriptionResult: - """Transcribe audio from bytes (for API uploads).""" - import asyncio - - ext = Path(filename).suffix or ".wav" - - with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp: - tmp.write(audio_bytes) - tmp_path = tmp.name - - try: - # Run in thread pool to avoid blocking the event loop - loop = asyncio.get_event_loop() - result = await loop.run_in_executor( - None, - lambda: transcribe_audio( - audio_path=tmp_path, - language=language, - model_name=model_name, - align=align, - diarize=diarize, - min_speakers=min_speakers, - max_speakers=max_speakers, - ), - ) - return result - finally: - try: - os.unlink(tmp_path) - except Exception: - pass - - -# Available models -AVAILABLE_MODELS = [ - "tiny", - "tiny.en", - "base", - "base.en", - "small", - "small.en", - "medium", - "medium.en", - "large-v1", - "large-v2", - "large-v3", - "large-v3-turbo", - "distil-large-v2", - "distil-large-v3", -] diff --git a/services/mana-stt/grafana-dashboard.json b/services/mana-stt/grafana-dashboard.json deleted file mode 100644 index 4b98ba93f..000000000 --- a/services/mana-stt/grafana-dashboard.json +++ /dev/null @@ -1,740 +0,0 @@ -{ - "annotations": { - "list": [] - }, - "description": "Mana Speech-to-Text Service Monitoring", - "editable": true, - "fiscalYearStartMonth": 
0, - "graphTooltip": 1, - "links": [], - "panels": [ - { - "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, - "id": 100, - "panels": [], - "title": "Overview", - "type": "row" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, - "mappings": [ - { - "options": { - "0": { "color": "red", "index": 1, "text": "DOWN" }, - "1": { "color": "green", "index": 0, "text": "UP" } - }, - "type": "value" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "red", "value": null }, - { "color": "green", "value": 1 } - ] - } - }, - "overrides": [] - }, - "gridPos": { "h": 4, "w": 3, "x": 0, "y": 1 }, - "id": 1, - "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "textMode": "auto" - }, - "pluginVersion": "10.4.1", - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "up{job=\"mana-stt\"}", - "refId": "A" - } - ], - "title": "Service Status", - "type": "stat" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, - "mappings": [ - { - "options": { - "0": { "color": "yellow", "index": 0, "text": "Not Loaded" }, - "1": { "color": "green", "index": 1, "text": "Loaded" } - }, - "type": "value" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "yellow", "value": null }, - { "color": "green", "value": 1 } - ] - } - }, - "overrides": [] - }, - "gridPos": { "h": 4, "w": 3, "x": 3, "y": 1 }, - "id": 2, - "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "textMode": "auto" - }, - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "mana_stt_model_loaded{model=\"whisper\"}", - "refId": "A" - } - ], - "title": "Whisper Model", - "type": "stat" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, - "mappings": [ - { - "options": { - "0": { "color": "yellow", "index": 0, "text": "Not Loaded" }, - "1": { "color": "green", "index": 1, "text": "Loaded" } - }, - "type": "value" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "yellow", "value": null }, - { "color": "green", "value": 1 } - ] - } - }, - "overrides": [] - }, - "gridPos": { "h": 4, "w": 3, "x": 6, "y": 1 }, - "id": 3, - "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "textMode": "auto" - }, - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "mana_stt_model_loaded{model=\"voxtral\"}", - "refId": "A" - } - ], - "title": "Voxtral Model", - "type": "stat" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "mappings": [], - "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { "h": 4, "w": 3, "x": 9, "y": 1 }, - "id": 4, - "options": { - "colorMode": "value", - 
"graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "textMode": "auto" - }, - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "sum(mana_stt_requests_total{status=\"success\"})", - "refId": "A" - } - ], - "title": "Total Transcriptions", - "type": "stat" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "mappings": [], - "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { "h": 4, "w": 3, "x": 12, "y": 1 }, - "id": 5, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "textMode": "auto" - }, - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "sum(mana_stt_characters_transcribed_total)", - "refId": "A" - } - ], - "title": "Characters Transcribed", - "type": "stat" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "yellow", "value": 1 }, - { "color": "red", "value": 3 } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { "h": 4, "w": 3, "x": 15, "y": 1 }, - "id": 6, - "options": { - "colorMode": "background", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "textMode": "auto" - }, - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "sum(mana_stt_active_requests)", - "refId": "A" - } - ], - "title": "Active Requests", - "type": "stat" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "red", "value": 1 } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { "h": 4, "w": 3, "x": 18, "y": 1 }, - "id": 7, - "options": { - "colorMode": "background", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "textMode": "auto" - }, - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "sum(mana_stt_requests_total{status=\"error\"})", - "refId": "A" - } - ], - "title": "Total Errors", - "type": "stat" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "mappings": [], - "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, - "unit": "s" - }, - "overrides": [] - }, - "gridPos": { "h": 4, "w": 3, "x": 21, "y": 1 }, - "id": 8, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "textMode": "auto" - }, - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - 
"expr": "histogram_quantile(0.50, sum(rate(mana_stt_transcription_duration_seconds_bucket[5m])) by (le))", - "refId": "A" - } - ], - "title": "Median Duration", - "type": "stat" - }, - { - "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, - "id": 101, - "panels": [], - "title": "Performance", - "type": "row" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "insertNulls": false, - "lineInterpolation": "smooth", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { "group": "A", "mode": "none" }, - "thresholdsStyle": { "mode": "off" } - }, - "mappings": [], - "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, - "unit": "s" - }, - "overrides": [] - }, - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }, - "id": 10, - "options": { - "legend": { - "calcs": ["mean", "max"], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { "mode": "multi", "sort": "desc" } - }, - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.50, sum(rate(mana_stt_transcription_duration_seconds_bucket{model=\"whisper\"}[5m])) by (le))", - "legendFormat": "Whisper p50", - "refId": "A" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, sum(rate(mana_stt_transcription_duration_seconds_bucket{model=\"whisper\"}[5m])) by (le))", - "legendFormat": "Whisper p95", - "refId": "B" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.50, sum(rate(mana_stt_transcription_duration_seconds_bucket{model=\"voxtral\"}[5m])) by (le))", - "legendFormat": "Voxtral p50", - "refId": "C" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, sum(rate(mana_stt_transcription_duration_seconds_bucket{model=\"voxtral\"}[5m])) by (le))", - "legendFormat": "Voxtral p95", - "refId": "D" - } - ], - "title": "Transcription Duration (p50 / p95)", - "type": "timeseries" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "insertNulls": false, - "lineInterpolation": "smooth", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { "group": "A", "mode": "none" }, - "thresholdsStyle": { "mode": "off" } - }, - "mappings": [], - "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, - "unit": "reqps" - }, - "overrides": [] - }, - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }, - "id": 11, - "options": { - "legend": { - "calcs": ["mean", "sum"], - "displayMode": 
"table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { "mode": "multi", "sort": "desc" } - }, - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "sum(rate(mana_stt_requests_total{model=\"whisper\", status=\"success\"}[5m]))", - "legendFormat": "Whisper Success", - "refId": "A" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "sum(rate(mana_stt_requests_total{model=\"voxtral\", status=\"success\"}[5m]))", - "legendFormat": "Voxtral Success", - "refId": "B" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "sum(rate(mana_stt_requests_total{status=\"error\"}[5m]))", - "legendFormat": "Errors", - "refId": "C" - } - ], - "title": "Request Rate", - "type": "timeseries" - }, - { - "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 }, - "id": 102, - "panels": [], - "title": "Details", - "type": "row" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { "hideFrom": { "legend": false, "tooltip": false, "viz": false } }, - "mappings": [] - }, - "overrides": [] - }, - "gridPos": { "h": 8, "w": 6, "x": 0, "y": 15 }, - "id": 12, - "options": { - "legend": { "displayMode": "list", "placement": "right", "showLegend": true }, - "pieType": "pie", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "tooltip": { "mode": "single", "sort": "none" } - }, - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "sum(mana_stt_requests_total{status=\"success\"}) by (model)", - "legendFormat": "{{model}}", - "refId": "A" - } - ], - "title": "Requests by Model", - "type": "piechart" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { "hideFrom": { "legend": false, "tooltip": false, "viz": false } }, - "mappings": [] - }, - "overrides": [] - }, - "gridPos": { "h": 8, "w": 6, "x": 6, "y": 15 }, - "id": 13, - "options": { - "legend": { "displayMode": "list", "placement": "right", "showLegend": true }, - "pieType": "pie", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "tooltip": { "mode": "single", "sort": "none" } - }, - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "sum(mana_stt_requests_total{status=\"success\"}) by (language)", - "legendFormat": "{{language}}", - "refId": "A" - } - ], - "title": "Requests by Language", - "type": "piechart" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "bars", - "fillOpacity": 80, - "gradientMode": "none", - "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { "type": "linear" }, - "showPoints": "never", - "spanNulls": false, - "stacking": { "group": "A", "mode": "normal" }, - "thresholdsStyle": { "mode": "off" } - }, - "mappings": [], - "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, - "unit": "decbytes" - }, - "overrides": [] - }, - "gridPos": 
{ "h": 8, "w": 12, "x": 12, "y": 15 }, - "id": 14, - "options": { - "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, - "tooltip": { "mode": "multi", "sort": "desc" } - }, - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "sum(rate(mana_stt_file_size_mb_sum{model=\"whisper\"}[5m])) * 1024 * 1024", - "legendFormat": "Whisper", - "refId": "A" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "sum(rate(mana_stt_file_size_mb_sum{model=\"voxtral\"}[5m])) * 1024 * 1024", - "legendFormat": "Voxtral", - "refId": "B" - } - ], - "title": "Data Processed", - "type": "timeseries" - }, - { - "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 }, - "id": 103, - "panels": [], - "title": "Model Loading", - "type": "row" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "yellow", "value": 30 }, - { "color": "red", "value": 60 } - ] - }, - "unit": "s" - }, - "overrides": [] - }, - "gridPos": { "h": 6, "w": 8, "x": 0, "y": 24 }, - "id": 15, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "textMode": "auto" - }, - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "mana_stt_model_load_duration_seconds_sum{model=\"whisper\"} / mana_stt_model_load_duration_seconds_count{model=\"whisper\"}", - "legendFormat": "Whisper", - "refId": "A" - } - ], - "title": "Whisper Load Time", - "type": "stat" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "yellow", "value": 60 }, - { "color": "red", "value": 120 } - ] - }, - "unit": "s" - }, - "overrides": [] - }, - "gridPos": { "h": 6, "w": 8, "x": 8, "y": 24 }, - "id": 16, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "textMode": "auto" - }, - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "mana_stt_model_load_duration_seconds_sum{model=\"voxtral\"} / mana_stt_model_load_duration_seconds_count{model=\"voxtral\"}", - "legendFormat": "Voxtral", - "refId": "A" - } - ], - "title": "Voxtral Load Time", - "type": "stat" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "insertNulls": false, - "lineInterpolation": "smooth", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { "group": "A", "mode": "none" }, - "thresholdsStyle": { "mode": "off" } - }, - "mappings": [], - 
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { "h": 6, "w": 8, "x": 16, "y": 24 }, - "id": 17, - "options": { - "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, - "tooltip": { "mode": "multi", "sort": "desc" } - }, - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "sum(rate(mana_stt_characters_transcribed_total{model=\"whisper\"}[5m]))", - "legendFormat": "Whisper", - "refId": "A" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "sum(rate(mana_stt_characters_transcribed_total{model=\"voxtral\"}[5m]))", - "legendFormat": "Voxtral", - "refId": "B" - } - ], - "title": "Characters/sec Transcribed", - "type": "timeseries" - } - ], - "refresh": "30s", - "schemaVersion": 39, - "tags": ["mana", "stt", "ai"], - "templating": { "list": [] }, - "time": { "from": "now-1h", "to": "now" }, - "timepicker": {}, - "timezone": "browser", - "title": "Mana STT Service", - "uid": "mana-stt-dashboard", - "version": 1, - "weekStart": "monday" -} diff --git a/services/mana-stt/requirements-cuda.txt b/services/mana-stt/requirements-cuda.txt deleted file mode 100644 index e9bc9f95a..000000000 --- a/services/mana-stt/requirements-cuda.txt +++ /dev/null @@ -1,35 +0,0 @@ -# Mana STT Service Dependencies -# For GPU Server (NVIDIA RTX 3090 / CUDA) - -# Web Framework -fastapi==0.115.6 -uvicorn[standard]==0.34.0 -python-multipart==0.0.20 - -# Audio Processing -pydub==0.25.1 -soundfile==0.13.1 - -# WhisperX (CUDA) — includes faster-whisper + alignment -whisperx @ git+https://github.com/m-bain/whisperX.git - -# faster-whisper with CTranslate2 (CUDA backend) -faster-whisper>=1.1.0 - -# Speaker Diarization (pyannote.audio) -# Requires HF_TOKEN with accepted terms: -# https://huggingface.co/pyannote/speaker-diarization-3.1 -# https://huggingface.co/pyannote/segmentation-3.0 -pyannote.audio>=3.3.0 - -# PyTorch CUDA — install separately for your CUDA version: -# pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu121 -torch>=2.5.0 -torchaudio>=2.5.0 - -# Utilities -numpy>=1.26.0 -tqdm>=4.67.0 - -# External Auth (mana-core-auth integration) -httpx>=0.27.0 diff --git a/services/mana-stt/requirements.txt b/services/mana-stt/requirements.txt deleted file mode 100644 index b98f07c3c..000000000 --- a/services/mana-stt/requirements.txt +++ /dev/null @@ -1,28 +0,0 @@ -# Mana STT Service Dependencies -# For Mac Mini M4 (Apple Silicon) - -# Web Framework -fastapi==0.115.6 -uvicorn[standard]==0.34.0 -python-multipart==0.0.20 - -# Audio Processing -pydub==0.25.1 -soundfile==0.13.1 - -# Whisper (Apple Silicon optimized) -lightning-whisper-mlx==0.0.10 -mlx>=0.21.0 - -# Voxtral (Hugging Face Transformers) -transformers>=4.47.0 -torch>=2.5.0 -accelerate>=1.2.0 -sentencepiece>=0.2.0 - -# Utilities -numpy>=1.26.0 -tqdm>=4.67.0 - -# External Auth (mana-core-auth integration) -httpx>=0.27.0 diff --git a/services/mana-stt/service.pyw b/services/mana-stt/service.pyw deleted file mode 100644 index 056059e98..000000000 --- a/services/mana-stt/service.pyw +++ /dev/null @@ -1,34 +0,0 @@ -"""mana-stt service runner.""" -import os -import sys - -os.chdir(r"C:\mana\services\mana-stt") -sys.path.insert(0, r"C:\mana\services\mana-stt") - -# Redirect stdout/stderr to log file FIRST (before any imports that warn) -log = open(r"C:\mana\services\mana-stt\service.log", "w", buffering=1) -sys.stdout = log -sys.stderr = log - 
-# Load .env file -from dotenv import load_dotenv -load_dotenv(r"C:\mana\services\mana-stt\.env") - -# Ensure FFmpeg is in PATH -ffmpeg_dir = r"C:\Users\tills\AppData\Local\Microsoft\WinGet\Links" -if ffmpeg_dir not in os.environ.get("PATH", ""): - os.environ["PATH"] = ffmpeg_dir + os.pathsep + os.environ.get("PATH", "") - -# Set HF token -hf_token = os.environ.get("HF_TOKEN", "") -if hf_token: - os.environ["HUGGING_FACE_HUB_TOKEN"] = hf_token - -# Pre-initialize CUDA before importing whisperx (avoids hangs) -import torch -if torch.cuda.is_available(): - torch.cuda.init() - print(f"CUDA initialized: {torch.cuda.get_device_name(0)}", flush=True) - -import uvicorn -uvicorn.run("app.main:app", host="0.0.0.0", port=3020, log_level="info")
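-
-# Quick smoke check once the service is up. A sketch: it assumes the FastAPI
-# app exposes a /health route on the port above (adjust if the real route
-# differs); httpx is already in the requirements:
-#
-#   python -c "import httpx; print(httpx.get('http://localhost:3020/health').text)"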