From 60394076e5120a458165ba8c53f3281cc62d1f95 Mon Sep 17 00:00:00 2001
From: Till-JS <101404291+Till-JS@users.noreply.github.com>
Date: Wed, 11 Feb 2026 16:10:00 +0100
Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20feat(mana-stt):=20add=20vLLM=20inte?=
 =?UTF-8?q?gration=20for=20Voxtral=20transcription?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add vllm_service.py as proxy to vLLM server for Voxtral 3B/4B
- Add voxtral_api_service.py for Mistral API fallback
- Update main.py with /transcribe/voxtral endpoint using vLLM
- Add /transcribe/auto endpoint with automatic fallback chain
- Create setup-vllm.sh and start-vllm-voxtral.sh scripts
- Add launchd plist files for Mac Mini deployment
- Add install-service.sh and install-services.sh for automated service installation

Architecture:
- vLLM server runs Voxtral models on port 8100
- mana-stt proxies to vLLM with Mistral API fallback
- Fallback chain: vLLM -> Mistral API

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 services/mana-stt/.env.example                |  31 +++
 services/mana-stt/app/main.py                 | 252 +++++++++++++-----
 services/mana-stt/app/vllm_service.py         | 178 +++++++++++++
 services/mana-stt/app/voxtral_api_service.py  | 213 +++++++++++++++
 services/mana-stt/app/voxtral_service.py      | 218 ++++++++-------
 services/mana-stt/com.manacore.mana-stt.plist |  41 +++
 .../mana-stt/com.manacore.vllm-voxtral.plist  |  41 +++
 services/mana-stt/install-service.sh          |  45 ++++
 services/mana-stt/install-services.sh         |  84 ++++++
 services/mana-stt/scripts/setup-vllm.sh       |  83 ++++++
 .../mana-stt/scripts/start-vllm-voxtral.sh    |  36 +++
 11 files changed, 1060 insertions(+), 162 deletions(-)
 create mode 100644 services/mana-stt/.env.example
 create mode 100644 services/mana-stt/app/vllm_service.py
 create mode 100644 services/mana-stt/app/voxtral_api_service.py
 create mode 100644 services/mana-stt/com.manacore.mana-stt.plist
 create mode 100644 services/mana-stt/com.manacore.vllm-voxtral.plist
 create mode 100755 services/mana-stt/install-service.sh
 create mode 100755 services/mana-stt/install-services.sh
 create mode 100755 services/mana-stt/scripts/setup-vllm.sh
 create mode 100755 services/mana-stt/scripts/start-vllm-voxtral.sh

diff --git a/services/mana-stt/.env.example b/services/mana-stt/.env.example
new file mode 100644
index 000000000..df9062cbe
--- /dev/null
+++ b/services/mana-stt/.env.example
@@ -0,0 +1,31 @@
+# ManaCore STT Service Configuration
+# Copy to .env and adjust values as needed
+
+# Server
+PORT=3020
+
+# Whisper (Lightning MLX)
+WHISPER_MODEL=large-v3
+
+# Voxtral (Local Models)
+# Options: voxtral-mini-3b, voxtral-realtime-4b, voxtral-small-24b
+VOXTRAL_MODEL=voxtral-realtime-4b
+
+# Model Loading
+# Set to true to preload models on startup (slower startup, faster first request)
+PRELOAD_MODELS=false
+
+# Load Management
+# Maximum concurrent transcription requests before API fallback
+MAX_CONCURRENT_REQUESTS=3
+
+# API Fallback
+# Enable automatic fallback to Mistral API when overloaded
+API_FALLBACK_ENABLED=true
+
+# Mistral API Key (required for API fallback)
+# Get your key at https://console.mistral.ai/
+MISTRAL_API_KEY=
+
+# CORS Origins (comma-separated)
+CORS_ORIGINS=https://mana.how,https://chat.mana.how,http://localhost:5173
diff --git a/services/mana-stt/app/main.py b/services/mana-stt/app/main.py
index 717115f2f..5423f044e 100644
--- a/services/mana-stt/app/main.py
+++ b/services/mana-stt/app/main.py
@@ -1,16 +1,17 @@
 """
 ManaCore STT API Service
-Speech-to-Text with Whisper (MLX) and Voxtral
+Speech-to-Text with Whisper (MLX), Voxtral (vLLM), and Mistral API (fallback)
 
 Run with: uvicorn app.main:app --host 0.0.0.0 --port 3020
 """
 
 import os
 import logging
+import time
 from typing import Optional
 from contextlib import asynccontextmanager
 
-from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Query
+from fastapi import FastAPI, File, UploadFile, Form, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
 from pydantic import BaseModel
@@ -31,32 +32,39 @@ CORS_ORIGINS = os.getenv(
     "https://mana.how,https://chat.mana.how,http://localhost:5173"
 ).split(",")
 
+# vLLM configuration
+VLLM_URL = os.getenv("VLLM_URL", "http://localhost:8100")
+USE_VLLM = os.getenv("USE_VLLM", "true").lower() == "true"
+
 
 # Response models
 class TranscriptionResponse(BaseModel):
     text: str
     language: Optional[str] = None
     model: str
+    latency_ms: Optional[float] = None
     duration_seconds: Optional[float] = None
 
 
 class HealthResponse(BaseModel):
     status: str
     whisper_loaded: bool
-    voxtral_loaded: bool
+    vllm_available: bool
+    vllm_url: Optional[str] = None
+    mistral_api_available: bool
     models: dict
 
 
 class ModelsResponse(BaseModel):
     whisper: list
-    voxtral: list
+    voxtral_vllm: list
     default_whisper: str
 
 
 # Track loaded models
 models_status = {
     "whisper_loaded": False,
-    "voxtral_loaded": False,
+    "vllm_available": False,
 }
 
 
@@ -65,9 +73,24 @@ async def lifespan(app: FastAPI):
     """Startup and shutdown events."""
     logger.info("Starting ManaCore STT Service...")
 
-    # Optionally preload models on startup
+    # Check vLLM availability
+    if USE_VLLM:
+        from app.vllm_service import check_health
+        health = await check_health()
+        models_status["vllm_available"] = health.get("status") == "healthy"
+        if models_status["vllm_available"]:
+            logger.info(f"vLLM server available at {VLLM_URL}")
+        else:
+            logger.warning(f"vLLM server not available: {health}")
+
+    # Check Mistral API
+    from app.voxtral_api_service import is_available as api_available
+    if api_available():
+        logger.info("Mistral API fallback configured")
+
+    # Optionally preload Whisper
     if PRELOAD_MODELS:
-        logger.info("Preloading models (PRELOAD_MODELS=true)...")
+        logger.info("Preloading Whisper model...")
         try:
             from app.whisper_service import get_whisper_model
             get_whisper_model(DEFAULT_WHISPER_MODEL)
@@ -76,16 +99,6 @@ async def lifespan(app: FastAPI):
         except Exception as e:
             logger.warning(f"Failed to preload Whisper: {e}")
 
-        try:
-            from app.voxtral_service import get_voxtral_model
-            get_voxtral_model()
-            models_status["voxtral_loaded"] = True
-            logger.info("Voxtral model preloaded")
-        except Exception as e:
-            logger.warning(f"Failed to preload Voxtral: {e}")
-    else:
-        logger.info("Models will be loaded on first request (lazy loading)")
-
     logger.info(f"STT Service ready on port {PORT}")
     yield
     logger.info("Shutting down STT Service...")
@@ -94,8 +107,8 @@ async def lifespan(app: FastAPI):
 # Create FastAPI app
 app = FastAPI(
     title="ManaCore STT Service",
-    description="Speech-to-Text API with Whisper (MLX) and Voxtral",
-    version="1.0.0",
+    description="Speech-to-Text API with Whisper (MLX), Voxtral (vLLM), and Mistral API",
+    version="2.0.0",
     lifespan=lifespan,
 )
 
@@ -112,10 +125,17 @@ app.add_middleware(
 @app.get("/health", response_model=HealthResponse)
 async def health_check():
     """Health check endpoint."""
+    from app.voxtral_api_service import is_available as api_available
+    from app.vllm_service import check_health
+
+    vllm_health = await check_health()
+
     return HealthResponse(
         status="healthy",
         whisper_loaded=models_status["whisper_loaded"],
-        voxtral_loaded=models_status["voxtral_loaded"],
+        vllm_available=vllm_health.get("status") == "healthy",
+        vllm_url=VLLM_URL if USE_VLLM else None,
+        mistral_api_available=api_available(),
         models={
             "default_whisper": DEFAULT_WHISPER_MODEL,
         },
@@ -126,11 +146,13 @@ async def health_check():
 async def list_models():
     """List available models."""
     from app.whisper_service import AVAILABLE_MODELS as whisper_models
-    from app.voxtral_service import SUPPORTED_LANGUAGES as voxtral_languages
+    from app.vllm_service import get_models
+
+    vllm_models = await get_models()
 
     return ModelsResponse(
         whisper=whisper_models,
-        voxtral=voxtral_languages,
+        voxtral_vllm=vllm_models,
         default_whisper=DEFAULT_WHISPER_MODEL,
     )
 
@@ -138,25 +160,19 @@ async def list_models():
 @app.post("/transcribe", response_model=TranscriptionResponse)
 async def transcribe_whisper(
     file: UploadFile = File(..., description="Audio file to transcribe"),
-    language: Optional[str] = Form(
-        None,
-        description="Language code (e.g., 'de', 'en'). Auto-detect if not provided."
-    ),
-    model: str = Form(
-        None,
-        description="Whisper model to use (default: large-v3-turbo)"
-    ),
+    language: Optional[str] = Form(None, description="Language code (auto-detect if not provided)"),
+    model: Optional[str] = Form(None, description="Whisper model to use"),
 ):
     """
     Transcribe audio using Whisper (Lightning MLX).
 
+    Best for: General transcription, many languages
     Supported formats: mp3, wav, m4a, flac, ogg, webm
     Max file size: 100MB
     """
     if not file.filename:
         raise HTTPException(status_code=400, detail="No file provided")
 
-    # Validate file type
     allowed_extensions = {".mp3", ".wav", ".m4a", ".flac", ".ogg", ".webm", ".mp4"}
     ext = os.path.splitext(file.filename)[1].lower()
     if ext not in allowed_extensions:
@@ -165,20 +181,17 @@ async def transcribe_whisper(
             detail=f"Unsupported file type: {ext}. Allowed: {allowed_extensions}"
         )
 
+    start_time = time.time()
+
     try:
         from app.whisper_service import transcribe_audio_bytes
 
-        # Read file
         audio_bytes = await file.read()
-
-        # Check file size (100MB limit)
         if len(audio_bytes) > 100 * 1024 * 1024:
             raise HTTPException(status_code=400, detail="File too large (max 100MB)")
 
-        # Use default model if not specified
         model_name = model or DEFAULT_WHISPER_MODEL
 
-        # Transcribe
         result = await transcribe_audio_bytes(
             audio_bytes=audio_bytes,
             filename=file.filename,
@@ -187,38 +200,53 @@ async def transcribe_whisper(
         )
 
         models_status["whisper_loaded"] = True
+        latency_ms = (time.time() - start_time) * 1000
 
         return TranscriptionResponse(
             text=result.text,
             language=result.language,
             model=f"whisper-{model_name}",
+            latency_ms=latency_ms,
         )
 
     except Exception as e:
-        logger.error(f"Transcription error: {e}")
+        logger.error(f"Whisper transcription error: {e}")
         raise HTTPException(status_code=500, detail=str(e))
 
 
 @app.post("/transcribe/voxtral", response_model=TranscriptionResponse)
 async def transcribe_voxtral(
     file: UploadFile = File(..., description="Audio file to transcribe"),
-    language: str = Form(
-        "de",
-        description="Language code (de, en, fr, es, pt, it, nl, hi)"
-    ),
+    language: str = Form("de", description="Language code"),
+    use_realtime: bool = Form(False, description="Use Realtime 4B model for lower latency"),
 ):
     """
-    Transcribe audio using Voxtral Mini (Mistral AI).
+    Transcribe audio using Voxtral via vLLM server.
 
-    Best for: German, French, European languages
-    Supported formats: mp3, wav, m4a, flac
+    Models:
+    - Voxtral Mini 3B (default): Best quality
+    - Voxtral Realtime 4B: Lower latency (<500ms)
+
+    Falls back to Mistral API if vLLM is unavailable.
+
+    Supported formats: mp3, wav, m4a, flac, ogg, webm
     Max file size: 100MB
     """
     if not file.filename:
         raise HTTPException(status_code=400, detail="No file provided")
 
-    # Validate language
-    from app.voxtral_service import SUPPORTED_LANGUAGES
+    from app.vllm_service import (
+        SUPPORTED_LANGUAGES,
+        is_available as vllm_available,
+        transcribe_audio_bytes as vllm_transcribe,
+        transcribe_with_realtime,
+        check_health,
+    )
+    from app.voxtral_api_service import (
+        is_available as api_available,
+        transcribe_audio_bytes as api_transcribe,
+    )
+
     if language not in SUPPORTED_LANGUAGES:
         raise HTTPException(
             status_code=400,
@@ -226,10 +254,94 @@ async def transcribe_voxtral(
         )
 
     try:
-        from app.voxtral_service import transcribe_audio_bytes
-
         audio_bytes = await file.read()
+        if len(audio_bytes) > 100 * 1024 * 1024:
+            raise HTTPException(status_code=400, detail="File too large (max 100MB)")
 
+        # Try vLLM first
+        if USE_VLLM:
+            health = await check_health()
+            if health.get("status") == "healthy":
+                logger.info("Using vLLM for Voxtral transcription")
+                if use_realtime:
+                    result = await transcribe_with_realtime(
+                        audio_bytes=audio_bytes,
+                        filename=file.filename,
+                        language=language,
+                    )
+                else:
+                    result = await vllm_transcribe(
+                        audio_bytes=audio_bytes,
+                        filename=file.filename,
+                        language=language,
+                    )
+
+                return TranscriptionResponse(
+                    text=result.text,
+                    language=result.language,
+                    model=result.model,
+                    latency_ms=result.latency_ms,
+                    duration_seconds=result.duration_seconds,
+                )
+
+        # Fallback to Mistral API
+        if api_available():
+            logger.info("Falling back to Mistral API")
+            result = await api_transcribe(
+                audio_bytes=audio_bytes,
+                filename=file.filename,
+                language=language,
+            )
+
+            return TranscriptionResponse(
+                text=result.text,
+                language=result.language,
+                model=result.model,
+                latency_ms=None,
+                duration_seconds=result.duration_seconds,
+            )
+
+        raise HTTPException(
+            status_code=503,
+            detail="Voxtral not available. Start vLLM server or configure MISTRAL_API_KEY."
+        )
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Voxtral transcription error: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@app.post("/transcribe/voxtral/api", response_model=TranscriptionResponse)
+async def transcribe_voxtral_api(
+    file: UploadFile = File(..., description="Audio file to transcribe"),
+    language: Optional[str] = Form(None, description="Language code (auto-detect if not provided)"),
+    diarization: bool = Form(False, description="Enable speaker diarization"),
+):
+    """
+    Transcribe audio using Mistral's Voxtral API directly.
+
+    Features:
+    - Speaker diarization
+    - Auto language detection
+    - High quality (~4% WER)
+
+    Requires MISTRAL_API_KEY environment variable.
+    """
+    from app.voxtral_api_service import is_available, transcribe_audio_bytes
+
+    if not is_available():
+        raise HTTPException(
+            status_code=503,
+            detail="Mistral API not configured. Set MISTRAL_API_KEY environment variable."
+        )
+
+    if not file.filename:
+        raise HTTPException(status_code=400, detail="No file provided")
+
+    try:
+        audio_bytes = await file.read()
         if len(audio_bytes) > 100 * 1024 * 1024:
             raise HTTPException(status_code=400, detail="File too large (max 100MB)")
 
@@ -237,59 +349,64 @@ async def transcribe_voxtral(
             audio_bytes=audio_bytes,
             filename=file.filename,
             language=language,
+            diarization=diarization,
         )
 
-        models_status["voxtral_loaded"] = True
-
         return TranscriptionResponse(
             text=result.text,
             language=result.language,
             model=result.model,
+            duration_seconds=result.duration_seconds,
         )
 
+    except HTTPException:
+        # Deliberate HTTP errors (e.g. the 400 size check) must not become 500s.
+        raise
     except Exception as e:
-        logger.error(f"Voxtral transcription error: {e}")
+        logger.error(f"Mistral API error: {e}")
         raise HTTPException(status_code=500, detail=str(e))
 
 
 @app.post("/transcribe/auto", response_model=TranscriptionResponse)
 async def transcribe_auto(
     file: UploadFile = File(..., description="Audio file to transcribe"),
-    language: Optional[str] = Form(
-        None,
-        description="Language hint (optional)"
-    ),
-    prefer: str = Form(
-        "whisper",
-        description="Preferred model: 'whisper' or 'voxtral'"
-    ),
+    language: Optional[str] = Form(None, description="Language hint"),
+    prefer: str = Form("whisper", description="Preferred: 'whisper' or 'voxtral'"),
 ):
     """
-    Transcribe audio with automatic model selection.
+    Transcribe with automatic model selection and fallback.
 
-    - Uses Whisper by default (faster, more languages)
-    - Falls back to Voxtral if Whisper fails
+    Fallback chain:
+    1. Preferred model (whisper or voxtral)
+    2. Alternative model
+    3. Mistral API
     """
     if prefer == "voxtral":
-        # Try Voxtral first
         try:
-            return await transcribe_voxtral(file, language or "de")
+            return await transcribe_voxtral(file, language or "de", False)
         except Exception as e:
             logger.warning(f"Voxtral failed, trying Whisper: {e}")
-            # Reset file position
             await file.seek(0)
-            return await transcribe_whisper(file, language, None)
+            try:
+                return await transcribe_whisper(file, language, None)
+            except Exception as e2:
+                logger.warning(f"Whisper failed, trying API: {e2}")
+                await file.seek(0)
+                return await transcribe_voxtral_api(file, language, False)
     else:
-        # Try Whisper first (default)
         try:
             return await transcribe_whisper(file, language, None)
         except Exception as e:
             logger.warning(f"Whisper failed, trying Voxtral: {e}")
             await file.seek(0)
-            return await transcribe_voxtral(file, language or "de")
+            try:
+                return await transcribe_voxtral(file, language or "de", False)
+            except Exception as e2:
+                logger.warning(f"Voxtral failed, trying API: {e2}")
+                await file.seek(0)
+                return await transcribe_voxtral_api(file, language, False)
 
 
-# Error handlers
 @app.exception_handler(Exception)
 async def global_exception_handler(request, exc):
     logger.error(f"Unhandled error: {exc}")
diff --git a/services/mana-stt/app/vllm_service.py b/services/mana-stt/app/vllm_service.py
new file mode 100644
index 000000000..4ca1857a1
--- /dev/null
+++ b/services/mana-stt/app/vllm_service.py
@@ -0,0 +1,178 @@
+"""
+vLLM Voxtral Service - Proxy to vLLM server for Voxtral transcription
+
+vLLM provides optimized inference for Voxtral models with an OpenAI-compatible API.
+This service proxies requests to the vLLM server.
+
+Requirements:
+- vLLM server running on VLLM_URL (default: http://localhost:8100)
+- Model loaded: Voxtral-Mini-3B-2507 or Voxtral-Mini-4B-Realtime-2602
+"""
+
+import os
+import logging
+import time
+import tempfile
+import httpx
+from pathlib import Path
+from typing import Optional
+from dataclasses import dataclass
+
+logger = logging.getLogger(__name__)
+
+# vLLM server configuration
+VLLM_URL = os.getenv("VLLM_URL", "http://localhost:8100")
+VLLM_TIMEOUT = int(os.getenv("VLLM_TIMEOUT", "300"))  # 5 minutes for long audio
+
+# Model IDs
+VOXTRAL_3B = "mistralai/Voxtral-Mini-3B-2507"
+VOXTRAL_4B_REALTIME = "mistralai/Voxtral-Mini-4B-Realtime-2602"
+
+
+@dataclass
+class VllmTranscriptionResult:
+    text: str
+    language: Optional[str] = None
+    model: str = "voxtral-vllm"
+    latency_ms: Optional[float] = None
+    duration_seconds: Optional[float] = None
+
+
+async def check_health() -> dict:
+    """Probe the vLLM ``/health`` endpoint and report the server state as a dict."""
+    try:
+        async with httpx.AsyncClient(timeout=5.0) as http:
+            reply = await http.get(f"{VLLM_URL}/health")
+    except Exception as exc:
+        return {"status": "unavailable", "url": VLLM_URL, "error": str(exc)}
+    if reply.status_code != 200:
+        return {"status": "unhealthy", "url": VLLM_URL, "code": reply.status_code}
+    return {"status": "healthy", "url": VLLM_URL}
+
+
+async def get_models() -> list:
+    """List model ids reported by the vLLM server; empty list on any failure."""
+    model_ids: list = []
+    try:
+        async with httpx.AsyncClient(timeout=5.0) as http:
+            reply = await http.get(f"{VLLM_URL}/v1/models")
+        if reply.status_code == 200:
+            model_ids = [entry["id"] for entry in reply.json().get("data", [])]
+    except Exception:
+        return []
+    return model_ids
+
+
+def is_available() -> bool:
+    """Check if vLLM server is configured."""
+    return bool(VLLM_URL)
+
+
+async def transcribe_audio_bytes(
+    audio_bytes: bytes,
+    filename: str,
+    language: Optional[str] = "de",
+    model: Optional[str] = None,
+) -> VllmTranscriptionResult:
+    """
+    Transcribe audio using vLLM Voxtral server.
+
+    The audio is posted from memory as a multipart upload to the server's
+    OpenAI-compatible ``/v1/audio/transcriptions`` endpoint.
+
+    Args:
+        audio_bytes: Raw audio bytes
+        filename: Original filename, forwarded as the upload's file name
+        language: Language code (de, en, fr, etc.); falsy values are sent as "de"
+        model: Model to use (defaults to Voxtral-Mini-3B-2507)
+
+    Returns:
+        VllmTranscriptionResult with the stripped text, the language exactly as
+        passed in, a "vllm-<model>" label, the request latency in milliseconds
+        and the "duration" field of the server response (None when absent)
+
+    Raises:
+        RuntimeError: If the vLLM server answers with a non-200 status
+    """
+    start_time = time.time()
+    model_id = model or VOXTRAL_3B
+
+    logger.info(f"Transcribing via vLLM: {filename} ({len(audio_bytes)} bytes)")
+
+    # httpx accepts in-memory bytes for multipart uploads, so the payload is
+    # sent directly: no temporary file, no blocking disk I/O inside this
+    # coroutine, and nothing left on disk if cleanup were to fail.
+    # NOTE(review): the part is always labelled "audio/wav", whatever the real
+    # format is - confirm the server ignores the declared content type.
+    files = {"file": (filename, audio_bytes, "audio/wav")}
+    data = {
+        "model": model_id,
+        "language": language or "de",
+        "response_format": "json",
+        "temperature": 0.0,  # Deterministic for transcription
+    }
+
+    # Use OpenAI-compatible transcription endpoint
+    async with httpx.AsyncClient(timeout=VLLM_TIMEOUT) as client:
+        response = await client.post(
+            f"{VLLM_URL}/v1/audio/transcriptions",
+            files=files,
+            data=data,
+        )
+
+    # Surface server-side failures together with the response body.
+    if response.status_code != 200:
+        error_detail = response.text
+        logger.error(f"vLLM error: {response.status_code} - {error_detail}")
+        raise RuntimeError(f"vLLM transcription failed: {error_detail}")
+
+    result = response.json()
+    text = result.get("text", "")
+    duration = result.get("duration")
+
+    latency_ms = (time.time() - start_time) * 1000
+    logger.info(f"vLLM transcription complete: {len(text)} chars in {latency_ms:.0f}ms")
+
+    return VllmTranscriptionResult(
+        text=text.strip(),
+        language=language,
+        model=f"vllm-{model_id.split('/')[-1]}",
+        latency_ms=latency_ms,
+        duration_seconds=duration,
+    )
+
+
+async def transcribe_with_realtime(
+    audio_bytes: bytes,
+    filename: str,
+    language: Optional[str] = "de",
+) -> VllmTranscriptionResult:
+    """
+    Transcribe using Voxtral 4B Realtime model.
+
+    Optimized for low latency (<500ms).
+    """
+    return await transcribe_audio_bytes(
+        audio_bytes=audio_bytes,
+        filename=filename,
+        language=language,
+        model=VOXTRAL_4B_REALTIME,
+    )
+
+
+# Supported languages (same as Voxtral)
+SUPPORTED_LANGUAGES = [
+    "en",  # English
+    "zh",  # Chinese
+    "hi",  # Hindi
+    "es",  # Spanish
+    "ar",  # Arabic
+    "fr",  # French
+    "pt",  # Portuguese
+    "ru",  # Russian
+    "de",  # German
+    "ja",  # Japanese
+    "ko",  # Korean
+    "it",  # Italian
+    "nl",  # Dutch
+]
diff --git a/services/mana-stt/app/voxtral_api_service.py b/services/mana-stt/app/voxtral_api_service.py
new file mode 100644
index 000000000..53d78f808
--- /dev/null
+++ b/services/mana-stt/app/voxtral_api_service.py
@@ -0,0 +1,213 @@
+"""
+Voxtral API Service - Mistral Cloud API Fallback
+Uses Mistral's hosted Voxtral Mini Transcribe V2 when the local vLLM server is unavailable.
+
+Features:
+- Speaker diarization
+- Word-level timestamps
+- Context biasing for domain-specific terms
+- 13 language support
+"""
+
+import os
+import logging
+import tempfile
+from pathlib import Path
+from typing import Optional, Literal
+from dataclasses import dataclass, field
+
+logger = logging.getLogger(__name__)
+
+# Lazy load client
+_mistral_client = None
+
+MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
+DEFAULT_MODEL = "voxtral-mini-latest"  # voxtral-mini-2602
+
+
+@dataclass
+class Speaker:
+    """Speaker information from diarization."""
+    id: str
+    start: float
+    end: float
+
+
+@dataclass
+class WordTimestamp:
+    """Word-level timestamp."""
+    word: str
+    start: float
+    end: float
+
+
+@dataclass
+class SegmentTimestamp:
+    """Segment-level timestamp."""
+    text: str
+    start: float
+    end: float
+    speaker: Optional[str] = None
+
+
+@dataclass
+class VoxtralApiResult:
+    """Result from Voxtral API transcription."""
+    text: str
+    language: Optional[str] = None
+    model: str = "voxtral-api"
+    duration_seconds: Optional[float] = None
+    words: list[WordTimestamp] = field(default_factory=list)
+    segments: list[SegmentTimestamp] = field(default_factory=list)
+    speakers: list[Speaker] = field(default_factory=list)
+
+
+def get_mistral_client():
+    """Return the shared Mistral client, creating it on first use.
+
+    Raises:
+        RuntimeError: If MISTRAL_API_KEY is unset or mistralai is not installed.
+    """
+    global _mistral_client
+
+    if _mistral_client is None:
+        if not MISTRAL_API_KEY:
+            raise RuntimeError(
+                "MISTRAL_API_KEY environment variable not set. "
+                "Get your API key at https://console.mistral.ai/"
+            )
+        try:
+            from mistralai import Mistral
+        except ImportError as err:
+            # Chain the cause so the original import failure stays visible.
+            raise RuntimeError("mistralai package not installed. Run: pip install mistralai") from err
+        _mistral_client = Mistral(api_key=MISTRAL_API_KEY)
+        logger.info("Mistral API client initialized")
+    return _mistral_client
+
+
+def is_available() -> bool:
+    """Check if Mistral API is configured and available."""
+    return bool(MISTRAL_API_KEY)
+
+
+async def transcribe_audio_bytes(
+    audio_bytes: bytes,
+    filename: str,
+    language: Optional[str] = None,
+    timestamp_granularity: Optional[Literal["word", "segment"]] = None,
+    diarization: bool = False,
+    context_bias: Optional[list[str]] = None,
+) -> VoxtralApiResult:
+    """
+    Transcribe audio using Mistral's Voxtral API.
+
+    The SDK call is synchronous, so it is run in a worker thread to keep the
+    event loop free while the upload and transcription are in flight.
+
+    Args:
+        audio_bytes: Raw audio bytes
+        filename: Original filename (for extension detection)
+        language: Language code (de, en, fr, etc.) - auto-detect if None
+        timestamp_granularity: "word" or "segment" for timestamps
+        diarization: Enable speaker diarization
+        context_bias: List of domain-specific terms to improve accuracy (max 100)
+
+    Returns:
+        VoxtralApiResult with transcription and optional metadata
+
+    Raises:
+        RuntimeError: If MISTRAL_API_KEY is unset or mistralai is not installed
+        Exception: Any error from the API call is logged and re-raised
+    """
+    import asyncio
+
+    client = get_mistral_client()
+
+    logger.info(f"Transcribing via Mistral API: {filename} ({len(audio_bytes)} bytes)")
+
+    try:
+        # Build request parameters
+        request_params = {
+            "model": DEFAULT_MODEL,
+            "file": {
+                "content": audio_bytes,
+                "file_name": filename,
+            },
+        }
+
+        # Language and timestamps are mutually exclusive in current API
+        if language and not timestamp_granularity:
+            request_params["language"] = language
+
+        if timestamp_granularity:
+            request_params["timestamp_granularities"] = [timestamp_granularity]
+
+        if diarization:
+            request_params["diarization"] = True
+
+        if context_bias:
+            # API accepts comma-separated string, max 100 terms
+            bias_terms = context_bias[:100]
+            request_params["context_bias"] = ",".join(bias_terms)
+
+        # Make the blocking SDK call off the event loop
+        response = await asyncio.to_thread(client.audio.transcriptions.complete, **request_params)
+
+        # Parse response
+        result = VoxtralApiResult(
+            text=response.text,
+            language=getattr(response, "language", language),
+            model=f"voxtral-api-{DEFAULT_MODEL}",
+            duration_seconds=getattr(response, "duration", None),
+        )
+
+        # Parse word timestamps if present
+        words = getattr(response, "words", None)
+        if words:
+            result.words = [
+                WordTimestamp(word=w.word, start=w.start, end=w.end) for w in words
+            ]
+
+        # Parse segment timestamps if present
+        segments = getattr(response, "segments", None)
+        if segments:
+            result.segments = [
+                SegmentTimestamp(
+                    text=s.text,
+                    start=s.start,
+                    end=s.end,
+                    speaker=getattr(s, "speaker", None),
+                )
+                for s in segments
+            ]
+
+        # Parse speakers if diarization enabled
+        speakers = getattr(response, "speakers", None)
+        if speakers:
+            result.speakers = [Speaker(id=sp.id, start=sp.start, end=sp.end) for sp in speakers]
+
+        logger.info(f"Mistral API transcription complete: {len(result.text)} characters")
+        return result
+
+    except Exception as e:
+        logger.error(f"Mistral API transcription failed: {e}")
+        raise
+
+
+# Supported languages by Voxtral API (13 languages)
+SUPPORTED_LANGUAGES = [
+    "en",  # English
+    "zh",  # Chinese
+    "hi",  # Hindi
+    "es",  # Spanish
+    "ar",  # Arabic
+    "fr",  # French
+    "pt",  # Portuguese
+    "ru",  # Russian
+    "de",  # German
+    "ja",  # Japanese
+    "ko",  # Korean
+    "it",  # Italian
+    "nl",  # Dutch
+]
diff --git a/services/mana-stt/app/voxtral_service.py b/services/mana-stt/app/voxtral_service.py
index 989ef863b..320e5020d 100644
--- a/services/mana-stt/app/voxtral_service.py
+++ b/services/mana-stt/app/voxtral_service.py
@@ -1,12 +1,15 @@
 """
 Voxtral STT Service using Hugging Face Transformers
 Mistral AI's Speech-to-Text model (Apache 2.0 License)
+
+Uses VoxtralForConditionalGeneration with apply_transcription_request
+as per official HuggingFace documentation.
 """
 
 import os
 import tempfile
 import logging
-import base64
+import time
 from pathlib import Path
 from typing import Optional
 from dataclasses import dataclass
@@ -16,68 +19,80 @@ logger = logging.getLogger(__name__)
 # Lazy load to avoid import errors
 _voxtral_model = None
 _voxtral_processor = None
+_model_name = None
+
+# Default model
+DEFAULT_MODEL = "mistralai/Voxtral-Mini-3B-2507"
 
 
 @dataclass
 class VoxtralTranscriptionResult:
     text: str
     language: Optional[str] = None
-    model: str = "voxtral-mini"
+    model: str = "voxtral-mini-3b"
+    latency_ms: Optional[float] = None
 
 
-def get_voxtral_model(model_name: str = "mistralai/Voxtral-Mini-3B-2507"):
+def get_voxtral_model(model_name: str = DEFAULT_MODEL):
     """
     Get or create Voxtral model instance.
 
-    Note: Voxtral Mini (3B) is recommended for Mac Mini M4.
-    Voxtral Small (24B) requires more VRAM.
+    Uses VoxtralForConditionalGeneration (the correct class for Voxtral).
     """
-    global _voxtral_model, _voxtral_processor
+    global _voxtral_model, _voxtral_processor, _model_name
+
+    # Reload if different model requested
+    if _voxtral_model is not None and _model_name != model_name:
+        logger.info(f"Switching model from {_model_name} to {model_name}")
+        _voxtral_model = None
+        _voxtral_processor = None
 
     if _voxtral_model is None:
         logger.info(f"Loading Voxtral model: {model_name}")
         try:
             import torch
-            from transformers import AutoModel, AutoProcessor
+            from transformers import VoxtralForConditionalGeneration, AutoProcessor
 
-            # Determine device
+            # Determine device and dtype
             if torch.backends.mps.is_available():
                 device = "mps"
+                # MPS works better with float16
                 torch_dtype = torch.float16
             elif torch.cuda.is_available():
                 device = "cuda"
-                torch_dtype = torch.float16
+                torch_dtype = torch.bfloat16
             else:
                 device = "cpu"
                 torch_dtype = torch.float32
 
-            logger.info(f"Using device: {device}")
+            logger.info(f"Using device: {device}, dtype: {torch_dtype}")
 
             # Load processor
-            _voxtral_processor = AutoProcessor.from_pretrained(
-                model_name,
-                trust_remote_code=True,
-            )
+            _voxtral_processor = AutoProcessor.from_pretrained(model_name)
 
-            # Load model - Voxtral uses AutoModel, not AutoModelForSpeechSeq2Seq
-            _voxtral_model = AutoModel.from_pretrained(
-                model_name,
-                torch_dtype=torch_dtype,
-                device_map="auto" if device != "mps" else None,
-                trust_remote_code=True,
-            )
-
-            # Move to MPS if available (device_map doesn't support MPS)
+            # Load model with VoxtralForConditionalGeneration
             if device == "mps":
+                # MPS doesn't support device_map, load to CPU first then move
+                _voxtral_model = VoxtralForConditionalGeneration.from_pretrained(
+                    model_name,
+                    torch_dtype=torch_dtype,
+                )
                 _voxtral_model = _voxtral_model.to(device)
+            else:
+                _voxtral_model = VoxtralForConditionalGeneration.from_pretrained(
+                    model_name,
+                    torch_dtype=torch_dtype,
+                    device_map=device,
+                )
 
+            _model_name = model_name
             logger.info(f"Voxtral model loaded successfully on {device}")
 
         except ImportError as e:
             logger.error(f"Failed to import transformers: {e}")
             raise RuntimeError(
-                "transformers not installed. "
-                "Run: pip install transformers torch"
+                "transformers >= 4.54.0 required. "
+                "Run: pip install --upgrade transformers"
             )
         except Exception as e:
             logger.error(f"Failed to load Voxtral model: {e}")
@@ -89,17 +104,16 @@ def get_voxtral_model(model_name: str = "mistralai/Voxtral-Mini-3B-2507"):
 def transcribe_audio(
     audio_path: str,
     language: Optional[str] = "de",
-    model_name: str = "mistralai/Voxtral-Mini-3B-2507",
+    model_name: str = DEFAULT_MODEL,
 ) -> VoxtralTranscriptionResult:
     """
     Transcribe audio file using Voxtral.
 
-    Voxtral is a multimodal audio understanding model that can be prompted
-    for transcription tasks.
+    Uses the official apply_transcription_request method.
 
     Args:
         audio_path: Path to audio file
-        language: Target language for transcription
+        language: Language code (de, en, fr, etc.)
         model_name: Hugging Face model ID
 
     Returns:
@@ -108,84 +122,49 @@ def transcribe_audio(
     import torch
 
     model, processor = get_voxtral_model(model_name)
+    device = next(model.parameters()).device
+    dtype = next(model.parameters()).dtype
 
     logger.info(f"Transcribing with Voxtral: {audio_path}")
+    start_time = time.time()
 
     try:
-        # Load audio file as bytes and encode to base64
-        with open(audio_path, "rb") as f:
-            audio_bytes = f.read()
-        audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")
-
-        # Determine audio format from extension
-        ext = Path(audio_path).suffix.lower()
-        mime_types = {
-            ".wav": "audio/wav",
-            ".mp3": "audio/mpeg",
-            ".m4a": "audio/m4a",
-            ".flac": "audio/flac",
-            ".ogg": "audio/ogg",
-            ".webm": "audio/webm",
-        }
-        mime_type = mime_types.get(ext, "audio/wav")
-
-        # Language mapping for prompts
-        lang_names = {
-            "de": "German",
-            "en": "English",
-            "fr": "French",
-            "es": "Spanish",
-            "pt": "Portuguese",
-            "it": "Italian",
-            "nl": "Dutch",
-            "hi": "Hindi",
-        }
-        lang_name = lang_names.get(language, "German")
-
-        # Create transcription prompt with base64 audio
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "audio_url", "audio_url": {"url": f"data:{mime_type};base64,{audio_base64}"}},
-                    {"type": "text", "text": f"Transcribe this audio in {lang_name}. Only output the transcription, nothing else."},
-                ],
-            }
-        ]
-
-        # Apply chat template and process inputs
-        inputs = processor.apply_chat_template(
-            messages,
-            tokenize=True,
-            return_tensors="pt",
-            return_dict=True,
+        # Use apply_transcription_request (official method)
+        # This handles audio loading and preprocessing internally
+        inputs = processor.apply_transcription_request(
+            language=language or "en",
+            audio=audio_path,
+            model_id=model_name,
         )
 
-        # Move to same device as model
-        device = next(model.parameters()).device
-        inputs = {k: v.to(device) if hasattr(v, 'to') else v for k, v in inputs.items()}
+        # Move inputs to device and dtype
+        inputs = inputs.to(device, dtype=dtype)
 
         # Generate transcription
         with torch.no_grad():
-            generated_ids = model.generate(
+            outputs = model.generate(
                 **inputs,
-                max_new_tokens=512,
+                max_new_tokens=500,
                 do_sample=False,
             )
 
-        # Decode only the generated tokens (exclude input)
-        input_len = inputs["input_ids"].shape[-1]
-        text = processor.batch_decode(
-            generated_ids[:, input_len:],
+        # Decode - skip input tokens
+        input_len = inputs.input_ids.shape[1]
+        decoded = processor.batch_decode(
+            outputs[:, input_len:],
             skip_special_tokens=True,
-        )[0]
+        )
 
-        logger.info(f"Voxtral transcription complete: {len(text)} characters")
+        text = decoded[0] if decoded else ""
+        latency_ms = (time.time() - start_time) * 1000
+
+        logger.info(f"Voxtral transcription complete: {len(text)} chars in {latency_ms:.0f}ms")
 
         return VoxtralTranscriptionResult(
             text=text.strip(),
             language=language,
-            model="voxtral-mini",
+            model=model_name.split("/")[-1],
+            latency_ms=latency_ms,
         )
 
     except Exception as e:
@@ -197,7 +176,7 @@ async def transcribe_audio_bytes(
     audio_bytes: bytes,
     filename: str,
     language: Optional[str] = "de",
-    model_name: str = "mistralai/Voxtral-Mini-3B-2507",
+    model_name: str = DEFAULT_MODEL,
 ) -> VoxtralTranscriptionResult:
     """
     Transcribe audio from bytes (for API uploads).
@@ -222,14 +201,67 @@ async def transcribe_audio_bytes(
             pass
 
 
-# Supported languages by Voxtral
+def unload_model():
+    """Drop the cached model and processor, then ask torch to release cached device memory."""
+    global _voxtral_model, _voxtral_processor, _model_name
+
+    if _voxtral_model is not None:
+        # Rebinding to None drops this module's references so gc can reclaim them.
+        _voxtral_model = None
+        _voxtral_processor = None
+        _model_name = None
+
+        import gc
+        gc.collect()
+
+        # Best-effort: a failed cache flush must not abort the unload.
+        try:
+            import torch
+            if torch.backends.mps.is_available():
+                torch.mps.empty_cache()
+            elif torch.cuda.is_available():
+                torch.cuda.empty_cache()
+        except Exception as e:
+            logger.debug(f"Skipping torch cache cleanup: {e}")
+
+        logger.info("Voxtral model unloaded")
+
+
+def is_loaded() -> bool:
+    """Return True if a Voxtral model instance is currently cached in this module."""
+    return _voxtral_model is not None
+
+
+def get_loaded_model_name() -> Optional[str]:
+    """Return the model ID recorded for the loaded model, or None if none is loaded."""
+    return _model_name
+
+
+# Supported languages (13 languages as per Mistral docs)
 SUPPORTED_LANGUAGES = [
     "en",  # English
-    "de",  # German
-    "fr",  # French
+    "zh",  # Chinese
+    "hi",  # Hindi
     "es",  # Spanish
+    "ar",  # Arabic
+    "fr",  # French
     "pt",  # Portuguese
+    "ru",  # Russian
+    "de",  # German
+    "ja",  # Japanese
+    "ko",  # Korean
     "it",  # Italian
     "nl",  # Dutch
-    "hi",  # Hindi
+]
+
+# Available models
+AVAILABLE_MODELS = [
+    {
+        "id": "voxtral-mini-3b",
+        "name": "Voxtral-Mini-3B-2507",
+        "huggingface_id": "mistralai/Voxtral-Mini-3B-2507",
+        "params": "3B",
+        "vram": "~6GB",
+        "description": "Balanced quality and speed for local deployment",
+    },
 ]
diff --git a/services/mana-stt/com.manacore.mana-stt.plist b/services/mana-stt/com.manacore.mana-stt.plist
new file mode 100644
index 000000000..97ef62521
--- /dev/null
+++ b/services/mana-stt/com.manacore.mana-stt.plist
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+    <key>Label</key>
+    <string>com.manacore.mana-stt</string>
+
+    <key>ProgramArguments</key>
+    <array>
+        <string>/bin/bash</string>
+        <string>-c</string>
+        <string>cd /Users/mana/projects/manacore-monorepo/services/mana-stt &amp;&amp; .venv/bin/uvicorn app.main:app --host 0.0.0.0 --port 3020</string>
+    </array>
+
+    <key>WorkingDirectory</key>
+    <string>/Users/mana/projects/manacore-monorepo/services/mana-stt</string>
+
+    <key>EnvironmentVariables</key>
+    <dict>
+        <key>PATH</key>
+        <string>/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin</string>
+        <key>PORT</key>
+        <string>3020</string>
+    </dict>
+
+    <key>RunAtLoad</key>
+    <true/>
+
+    <key>KeepAlive</key>
+    <true/>
+
+    <key>StandardOutPath</key>
+    <string>/Users/mana/logs/mana-stt.log</string>
+
+    <key>StandardErrorPath</key>
+    <string>/Users/mana/logs/mana-stt.error.log</string>
+
+    <key>ThrottleInterval</key>
+    <integer>10</integer>
+</dict>
+</plist>
diff --git a/services/mana-stt/com.manacore.vllm-voxtral.plist b/services/mana-stt/com.manacore.vllm-voxtral.plist
new file mode 100644
index 000000000..4cf9f5711
--- /dev/null
+++ b/services/mana-stt/com.manacore.vllm-voxtral.plist
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+    <key>Label</key>
+    <string>com.manacore.vllm-voxtral</string>
+
+    <key>ProgramArguments</key>
+    <array>
+        <string>/bin/bash</string>
+        <string>-c</string>
+        <string>cd /Users/mana/projects/manacore-monorepo/services/mana-stt &amp;&amp; ./scripts/start-vllm-voxtral.sh</string>
+    </array>
+
+    <key>WorkingDirectory</key>
+    <string>/Users/mana/projects/manacore-monorepo/services/mana-stt</string>
+
+    <key>EnvironmentVariables</key>
+    <dict>
+        <key>PATH</key>
+        <string>/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin</string>
+        <key>VLLM_PORT</key>
+        <string>8100</string>
+    </dict>
+
+    <key>RunAtLoad</key>
+    <true/>
+
+    <key>KeepAlive</key>
+    <true/>
+
+    <key>StandardOutPath</key>
+    <string>/Users/mana/logs/vllm-voxtral.log</string>
+
+    <key>StandardErrorPath</key>
+    <string>/Users/mana/logs/vllm-voxtral.error.log</string>
+
+    <key>ThrottleInterval</key>
+    <integer>30</integer>
+</dict>
+</plist>
diff --git a/services/mana-stt/install-service.sh b/services/mana-stt/install-service.sh
new file mode 100755
index 000000000..6ee618cbe
--- /dev/null
+++ b/services/mana-stt/install-service.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# Install mana-stt as a launchd service on macOS
+# Run this script on the Mac Mini server
+
+set -e
+
+SERVICE_NAME="com.manacore.mana-stt"
+PLIST_FILE="$SERVICE_NAME.plist"
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+LAUNCH_AGENTS_DIR="$HOME/Library/LaunchAgents"
+LOG_DIR="$HOME/logs"
+
+echo "Installing mana-stt launchd service..."
+
+# Create logs and LaunchAgents directories (the latter may not exist yet; cp below needs it)
+mkdir -p "$LOG_DIR" "$LAUNCH_AGENTS_DIR"
+
+# Stop existing service if loaded (-F: match the label literally, dots included)
+if launchctl list | grep -qF "$SERVICE_NAME"; then
+    echo "Stopping existing service..."
+    launchctl unload "$LAUNCH_AGENTS_DIR/$PLIST_FILE" 2>/dev/null || true
+fi
+
+# Copy plist to LaunchAgents
+cp "$SCRIPT_DIR/$PLIST_FILE" "$LAUNCH_AGENTS_DIR/"
+
+# Load the service
+echo "Loading service..."
+launchctl load "$LAUNCH_AGENTS_DIR/$PLIST_FILE"
+
+# Check that the job is listed by launchctl after a short grace period
+sleep 2
+if launchctl list | grep -qF "$SERVICE_NAME"; then
+    echo "Service installed and running!"
+    echo ""
+    echo "Useful commands:"
+    echo "  View logs:    tail -f $LOG_DIR/mana-stt.log"
+    echo "  View errors:  tail -f $LOG_DIR/mana-stt.error.log"
+    echo "  Stop:         launchctl unload $LAUNCH_AGENTS_DIR/$PLIST_FILE"
+    echo "  Start:        launchctl load $LAUNCH_AGENTS_DIR/$PLIST_FILE"
+    echo "  Health check: curl http://localhost:3020/health"
+else
+    echo "ERROR: Service failed to start. Check logs at $LOG_DIR/mana-stt.error.log"
+    exit 1
+fi
diff --git a/services/mana-stt/install-services.sh b/services/mana-stt/install-services.sh
new file mode 100755
index 000000000..e863f9236
--- /dev/null
+++ b/services/mana-stt/install-services.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+# Install mana-stt and vllm-voxtral as launchd services on macOS
+# Run this script on the Mac Mini server
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+LAUNCH_AGENTS_DIR="$HOME/Library/LaunchAgents"
+LOG_DIR="$HOME/logs"
+
+echo "============================================"
+echo "Installing ManaCore STT Services"
+echo "============================================"
+echo ""
+
+# Create logs and LaunchAgents directories (the latter may not exist yet; cp below needs it)
+mkdir -p "$LOG_DIR" "$LAUNCH_AGENTS_DIR"
+
+install_service() {
+    # Usage: install_service LABEL - copy LABEL.plist into LaunchAgents and (re)load it.
+    local service_name="$1" plist_file="$1.plist"
+
+    echo "Installing $service_name..."
+
+    # Stop existing service if loaded (-F: match the label literally, dots included)
+    if launchctl list | grep -qF "$service_name"; then
+        echo "  Stopping existing service..."
+        launchctl unload "$LAUNCH_AGENTS_DIR/$plist_file" 2>/dev/null || true
+    fi
+
+    # Copy plist to LaunchAgents
+    cp "$SCRIPT_DIR/$plist_file" "$LAUNCH_AGENTS_DIR/"
+
+    # Load the service
+    echo "  Loading service..."
+    launchctl load "$LAUNCH_AGENTS_DIR/$plist_file"
+
+    sleep 2
+    if launchctl list | grep -qF "$service_name"; then
+        echo "  ✓ $service_name installed and running"
+    else
+        echo "  ✗ $service_name failed to start"
+        return 1
+    fi
+}
+
+# Install vLLM first (STT depends on it)
+install_service "com.manacore.vllm-voxtral"
+
+# Wait for vLLM to initialize (-f: an HTTP error response does not count as ready)
+echo ""
+echo "Waiting for vLLM server to initialize..."
+for i in {1..30}; do
+    if curl -sf http://localhost:8100/health > /dev/null 2>&1; then
+        echo "  ✓ vLLM server is ready"
+        break
+    fi
+    if [ "$i" -eq 30 ]; then
+        echo "  ! vLLM server not responding yet (may still be loading model)"
+    fi
+    sleep 2
+done
+
+# Install STT service
+echo ""
+install_service "com.manacore.mana-stt"
+
+echo ""
+echo "============================================"
+echo "Installation complete!"
+echo "============================================"
+echo ""
+echo "Services:"
+echo "  vLLM Voxtral: http://localhost:8100"
+echo "  ManaCore STT: http://localhost:3020"
+echo ""
+echo "Useful commands:"
+echo "  View vLLM logs:  tail -f $LOG_DIR/vllm-voxtral.log"
+echo "  View STT logs:   tail -f $LOG_DIR/mana-stt.log"
+echo "  Health check:    curl http://localhost:3020/health"
+echo ""
+echo "Stop all:"
+echo "  launchctl unload $LAUNCH_AGENTS_DIR/com.manacore.vllm-voxtral.plist"
+echo "  launchctl unload $LAUNCH_AGENTS_DIR/com.manacore.mana-stt.plist"
diff --git a/services/mana-stt/scripts/setup-vllm.sh b/services/mana-stt/scripts/setup-vllm.sh
new file mode 100755
index 000000000..c6a6ad48f
--- /dev/null
+++ b/services/mana-stt/scripts/setup-vllm.sh
@@ -0,0 +1,83 @@
+#!/bin/bash
+# Setup vLLM for Voxtral on Mac Mini M4
+#
+# vLLM runs in CPU mode on macOS (no CUDA), but still provides
+# the optimized inference pipeline for Voxtral models.
+#
+# Usage: ./scripts/setup-vllm.sh
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SERVICE_DIR="$(dirname "$SCRIPT_DIR")"
+VENV_DIR="$SERVICE_DIR/.venv-vllm"
+
+echo "============================================"
+echo "vLLM Setup for Voxtral on Mac Mini M4"
+echo "============================================"
+echo ""
+
+# Check Python version (3.10+ required); expansions are quoted to avoid word splitting
+PYTHON_VERSION=$(python3 --version 2>&1 | awk '{print $2}')
+PYTHON_MAJOR=$(echo "$PYTHON_VERSION" | cut -d. -f1)
+PYTHON_MINOR=$(echo "$PYTHON_VERSION" | cut -d. -f2)
+
+if [[ "$PYTHON_MAJOR" -lt 3 ]] || [[ "$PYTHON_MAJOR" -eq 3 && "$PYTHON_MINOR" -lt 10 ]]; then
+    echo "Error: Python 3.10+ required (found $PYTHON_VERSION)"
+    exit 1
+fi
+echo "Python version: $PYTHON_VERSION"
+
+# Create separate venv for vLLM (to avoid conflicts with whisper)
+echo ""
+echo "Creating virtual environment for vLLM..."
+python3 -m venv "$VENV_DIR"
+source "$VENV_DIR/bin/activate"
+
+# Upgrade pip
+pip install --upgrade pip --quiet
+
+# Install vLLM with audio support
+echo ""
+echo "Installing vLLM with audio support..."
+echo "This may take a few minutes..."
+
+# Install uv for faster package installation
+pip install uv --quiet
+
+# Install vLLM with audio support (nightly index first, stable as fallback)
+uv pip install "vllm[audio]>=0.10.0" --extra-index-url https://wheels.vllm.ai/nightly 2>&1 || {
+    echo "Nightly install failed, trying stable..."
+    uv pip install "vllm[audio]>=0.10.0"
+}
+
+# Install mistral-common with audio
+uv pip install "mistral-common[audio]>=1.8.1"
+
+echo ""
+echo "============================================"
+echo "Installation complete!"
+echo "============================================"
+echo ""
+echo "To start Voxtral Mini 3B server:"
+echo "  source $VENV_DIR/bin/activate"
+echo "  vllm serve mistralai/Voxtral-Mini-3B-2507 \\"
+echo "    --tokenizer_mode mistral \\"
+echo "    --config_format mistral \\"
+echo "    --load_format mistral \\"
+echo "    --host 0.0.0.0 \\"
+echo "    --port 8100"
+echo ""
+echo "To start Voxtral Realtime 4B server:"
+echo "  source $VENV_DIR/bin/activate"
+echo "  vllm serve mistralai/Voxtral-Mini-4B-Realtime-2602 \\"
+echo "    --host 0.0.0.0 \\"
+echo "    --port 8100"
+echo ""
+echo "API Endpoint: http://localhost:8100/v1/audio/transcriptions"
+echo ""
+echo "Test with:"
+echo "  curl http://localhost:8100/v1/audio/transcriptions \\"
+echo "    -F file=@test.mp3 \\"
+echo "    -F model=mistralai/Voxtral-Mini-3B-2507 \\"
+echo "    -F language=de"
diff --git a/services/mana-stt/scripts/start-vllm-voxtral.sh b/services/mana-stt/scripts/start-vllm-voxtral.sh
new file mode 100755
index 000000000..280ba1970
--- /dev/null
+++ b/services/mana-stt/scripts/start-vllm-voxtral.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Start vLLM server for Voxtral (listen port: $VLLM_PORT, default 8100)
+#
+# Usage: ./scripts/start-vllm-voxtral.sh [model]
+#   model: "3b" (default), or "4b" / "realtime" for the Realtime model
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SERVICE_DIR="$(dirname "$SCRIPT_DIR")"
+VENV_DIR="$SERVICE_DIR/.venv-vllm"
+MODEL="${1:-3b}"
+PORT="${VLLM_PORT:-8100}"
+
+# Activate the vLLM venv expected at <service dir>/.venv-vllm
+source "$VENV_DIR/bin/activate"
+
+echo "Starting vLLM Voxtral server..."
+echo "Port: $PORT"
+# Any other model value falls through to 3B; exec replaces this shell with the vllm process.
+if [[ "$MODEL" == "4b" || "$MODEL" == "realtime" ]]; then
+    echo "Model: Voxtral Mini 4B Realtime"
+    exec vllm serve mistralai/Voxtral-Mini-4B-Realtime-2602 \
+        --host 0.0.0.0 \
+        --port "$PORT" \
+        --max-model-len 8192
+else
+    echo "Model: Voxtral Mini 3B"
+    exec vllm serve mistralai/Voxtral-Mini-3B-2507 \
+        --tokenizer_mode mistral \
+        --config_format mistral \
+        --load_format mistral \
+        --host 0.0.0.0 \
+        --port "$PORT" \
+        --max-model-len 32768
+fi