diff --git a/services/mana-stt/.env.example b/services/mana-stt/.env.example
new file mode 100644
index 000000000..df9062cbe
--- /dev/null
+++ b/services/mana-stt/.env.example
@@ -0,0 +1,31 @@
+# ManaCore STT Service Configuration
+# Copy to .env and adjust values as needed
+
+# Server
+PORT=3020
+
+# Whisper (Lightning MLX)
+WHISPER_MODEL=large-v3
+
+# Voxtral (Local Models)
+# Options: voxtral-mini-3b, voxtral-realtime-4b, voxtral-small-24b
+VOXTRAL_MODEL=voxtral-realtime-4b
+
+# Model Loading
+# Set to true to preload models on startup (slower startup, faster first request)
+PRELOAD_MODELS=false
+
+# Load Management
+# Maximum concurrent transcription requests before API fallback
+MAX_CONCURRENT_REQUESTS=3
+
+# API Fallback
+# Enable automatic fallback to Mistral API when overloaded
+API_FALLBACK_ENABLED=true
+
+# Mistral API Key (required for API fallback)
+# Get your key at https://console.mistral.ai/
+MISTRAL_API_KEY=
+
+# CORS Origins (comma-separated)
+CORS_ORIGINS=https://mana.how,https://chat.mana.how,http://localhost:5173
diff --git a/services/mana-stt/app/main.py b/services/mana-stt/app/main.py
index 717115f2f..5423f044e 100644
--- a/services/mana-stt/app/main.py
+++ b/services/mana-stt/app/main.py
@@ -1,16 +1,17 @@
"""
ManaCore STT API Service
-Speech-to-Text with Whisper (MLX) and Voxtral
+Speech-to-Text with Whisper (MLX), Voxtral (vLLM), and Mistral API (fallback)
Run with: uvicorn app.main:app --host 0.0.0.0 --port 3020
"""
import os
import logging
+import time
from typing import Optional
from contextlib import asynccontextmanager
-from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Query
+from fastapi import FastAPI, File, UploadFile, Form, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from pydantic import BaseModel
@@ -31,32 +32,39 @@ CORS_ORIGINS = os.getenv(
"https://mana.how,https://chat.mana.how,http://localhost:5173"
).split(",")
+# vLLM configuration
+VLLM_URL = os.getenv("VLLM_URL", "http://localhost:8100")
+USE_VLLM = os.getenv("USE_VLLM", "true").lower() == "true"
+
# Response models
class TranscriptionResponse(BaseModel):
text: str
language: Optional[str] = None
model: str
+ latency_ms: Optional[float] = None
duration_seconds: Optional[float] = None
class HealthResponse(BaseModel):
status: str
whisper_loaded: bool
- voxtral_loaded: bool
+ vllm_available: bool
+ vllm_url: Optional[str] = None
+ mistral_api_available: bool
models: dict
class ModelsResponse(BaseModel):
whisper: list
- voxtral: list
+ voxtral_vllm: list
default_whisper: str
# Track loaded models
models_status = {
"whisper_loaded": False,
- "voxtral_loaded": False,
+ "vllm_available": False,
}
@@ -65,9 +73,24 @@ async def lifespan(app: FastAPI):
"""Startup and shutdown events."""
logger.info("Starting ManaCore STT Service...")
- # Optionally preload models on startup
+ # Check vLLM availability
+ if USE_VLLM:
+ from app.vllm_service import check_health
+ health = await check_health()
+ models_status["vllm_available"] = health.get("status") == "healthy"
+ if models_status["vllm_available"]:
+ logger.info(f"vLLM server available at {VLLM_URL}")
+ else:
+ logger.warning(f"vLLM server not available: {health}")
+
+ # Check Mistral API
+ from app.voxtral_api_service import is_available as api_available
+ if api_available():
+ logger.info("Mistral API fallback configured")
+
+ # Optionally preload Whisper
if PRELOAD_MODELS:
- logger.info("Preloading models (PRELOAD_MODELS=true)...")
+ logger.info("Preloading Whisper model...")
try:
from app.whisper_service import get_whisper_model
get_whisper_model(DEFAULT_WHISPER_MODEL)
@@ -76,16 +99,6 @@ async def lifespan(app: FastAPI):
except Exception as e:
logger.warning(f"Failed to preload Whisper: {e}")
- try:
- from app.voxtral_service import get_voxtral_model
- get_voxtral_model()
- models_status["voxtral_loaded"] = True
- logger.info("Voxtral model preloaded")
- except Exception as e:
- logger.warning(f"Failed to preload Voxtral: {e}")
- else:
- logger.info("Models will be loaded on first request (lazy loading)")
-
logger.info(f"STT Service ready on port {PORT}")
yield
logger.info("Shutting down STT Service...")
@@ -94,8 +107,8 @@ async def lifespan(app: FastAPI):
# Create FastAPI app
app = FastAPI(
title="ManaCore STT Service",
- description="Speech-to-Text API with Whisper (MLX) and Voxtral",
- version="1.0.0",
+ description="Speech-to-Text API with Whisper (MLX), Voxtral (vLLM), and Mistral API",
+ version="2.0.0",
lifespan=lifespan,
)
@@ -112,10 +125,17 @@ app.add_middleware(
@app.get("/health", response_model=HealthResponse)
async def health_check():
"""Health check endpoint."""
+ from app.voxtral_api_service import is_available as api_available
+ from app.vllm_service import check_health
+
+ vllm_health = await check_health()
+
return HealthResponse(
status="healthy",
whisper_loaded=models_status["whisper_loaded"],
- voxtral_loaded=models_status["voxtral_loaded"],
+ vllm_available=vllm_health.get("status") == "healthy",
+ vllm_url=VLLM_URL if USE_VLLM else None,
+ mistral_api_available=api_available(),
models={
"default_whisper": DEFAULT_WHISPER_MODEL,
},
@@ -126,11 +146,13 @@ async def health_check():
async def list_models():
"""List available models."""
from app.whisper_service import AVAILABLE_MODELS as whisper_models
- from app.voxtral_service import SUPPORTED_LANGUAGES as voxtral_languages
+ from app.vllm_service import get_models
+
+ vllm_models = await get_models()
return ModelsResponse(
whisper=whisper_models,
- voxtral=voxtral_languages,
+ voxtral_vllm=vllm_models,
default_whisper=DEFAULT_WHISPER_MODEL,
)
@@ -138,25 +160,19 @@ async def list_models():
@app.post("/transcribe", response_model=TranscriptionResponse)
async def transcribe_whisper(
file: UploadFile = File(..., description="Audio file to transcribe"),
- language: Optional[str] = Form(
- None,
- description="Language code (e.g., 'de', 'en'). Auto-detect if not provided."
- ),
- model: str = Form(
- None,
- description="Whisper model to use (default: large-v3-turbo)"
- ),
+ language: Optional[str] = Form(None, description="Language code (auto-detect if not provided)"),
+ model: Optional[str] = Form(None, description="Whisper model to use"),
):
"""
Transcribe audio using Whisper (Lightning MLX).
+ Best for: General transcription, many languages
Supported formats: mp3, wav, m4a, flac, ogg, webm
Max file size: 100MB
"""
if not file.filename:
raise HTTPException(status_code=400, detail="No file provided")
- # Validate file type
allowed_extensions = {".mp3", ".wav", ".m4a", ".flac", ".ogg", ".webm", ".mp4"}
ext = os.path.splitext(file.filename)[1].lower()
if ext not in allowed_extensions:
@@ -165,20 +181,17 @@ async def transcribe_whisper(
detail=f"Unsupported file type: {ext}. Allowed: {allowed_extensions}"
)
+ start_time = time.time()
+
try:
from app.whisper_service import transcribe_audio_bytes
- # Read file
audio_bytes = await file.read()
-
- # Check file size (100MB limit)
if len(audio_bytes) > 100 * 1024 * 1024:
raise HTTPException(status_code=400, detail="File too large (max 100MB)")
- # Use default model if not specified
model_name = model or DEFAULT_WHISPER_MODEL
- # Transcribe
result = await transcribe_audio_bytes(
audio_bytes=audio_bytes,
filename=file.filename,
@@ -187,38 +200,53 @@ async def transcribe_whisper(
)
models_status["whisper_loaded"] = True
+ latency_ms = (time.time() - start_time) * 1000
return TranscriptionResponse(
text=result.text,
language=result.language,
model=f"whisper-{model_name}",
+ latency_ms=latency_ms,
)
except Exception as e:
- logger.error(f"Transcription error: {e}")
+ logger.error(f"Whisper transcription error: {e}")
raise HTTPException(status_code=500, detail=str(e))
@app.post("/transcribe/voxtral", response_model=TranscriptionResponse)
async def transcribe_voxtral(
file: UploadFile = File(..., description="Audio file to transcribe"),
- language: str = Form(
- "de",
- description="Language code (de, en, fr, es, pt, it, nl, hi)"
- ),
+ language: str = Form("de", description="Language code"),
+ use_realtime: bool = Form(False, description="Use Realtime 4B model for lower latency"),
):
"""
- Transcribe audio using Voxtral Mini (Mistral AI).
+ Transcribe audio using Voxtral via vLLM server.
- Best for: German, French, European languages
- Supported formats: mp3, wav, m4a, flac
+ Models:
+ - Voxtral Mini 3B (default): Best quality
+ - Voxtral Realtime 4B: Lower latency (<500ms)
+
+ Falls back to Mistral API if vLLM is unavailable.
+
+ Supported formats: mp3, wav, m4a, flac, ogg, webm
Max file size: 100MB
"""
if not file.filename:
raise HTTPException(status_code=400, detail="No file provided")
- # Validate language
- from app.voxtral_service import SUPPORTED_LANGUAGES
+ from app.vllm_service import (
+ SUPPORTED_LANGUAGES,
+ is_available as vllm_available,
+ transcribe_audio_bytes as vllm_transcribe,
+ transcribe_with_realtime,
+ check_health,
+ )
+ from app.voxtral_api_service import (
+ is_available as api_available,
+ transcribe_audio_bytes as api_transcribe,
+ )
+
if language not in SUPPORTED_LANGUAGES:
raise HTTPException(
status_code=400,
@@ -226,10 +254,94 @@ async def transcribe_voxtral(
)
try:
- from app.voxtral_service import transcribe_audio_bytes
-
audio_bytes = await file.read()
+ if len(audio_bytes) > 100 * 1024 * 1024:
+ raise HTTPException(status_code=400, detail="File too large (max 100MB)")
+ # Try vLLM first
+ if USE_VLLM:
+ health = await check_health()
+ if health.get("status") == "healthy":
+ logger.info("Using vLLM for Voxtral transcription")
+ if use_realtime:
+ result = await transcribe_with_realtime(
+ audio_bytes=audio_bytes,
+ filename=file.filename,
+ language=language,
+ )
+ else:
+ result = await vllm_transcribe(
+ audio_bytes=audio_bytes,
+ filename=file.filename,
+ language=language,
+ )
+
+ return TranscriptionResponse(
+ text=result.text,
+ language=result.language,
+ model=result.model,
+ latency_ms=result.latency_ms,
+ duration_seconds=result.duration_seconds,
+ )
+
+ # Fallback to Mistral API
+ if api_available():
+ logger.info("Falling back to Mistral API")
+ result = await api_transcribe(
+ audio_bytes=audio_bytes,
+ filename=file.filename,
+ language=language,
+ )
+
+ return TranscriptionResponse(
+ text=result.text,
+ language=result.language,
+ model=result.model,
+ latency_ms=None,
+ duration_seconds=result.duration_seconds,
+ )
+
+ raise HTTPException(
+ status_code=503,
+ detail="Voxtral not available. Start vLLM server or configure MISTRAL_API_KEY."
+ )
+
+ except HTTPException:
+ raise
+ except Exception as e:
+ logger.error(f"Voxtral transcription error: {e}")
+ raise HTTPException(status_code=500, detail=str(e))
+
+
+@app.post("/transcribe/voxtral/api", response_model=TranscriptionResponse)
+async def transcribe_voxtral_api(
+ file: UploadFile = File(..., description="Audio file to transcribe"),
+ language: Optional[str] = Form(None, description="Language code (auto-detect if not provided)"),
+ diarization: bool = Form(False, description="Enable speaker diarization"),
+):
+ """
+ Transcribe audio using Mistral's Voxtral API directly.
+
+ Features:
+ - Speaker diarization
+ - Auto language detection
+ - High quality (~4% WER)
+
+ Requires MISTRAL_API_KEY environment variable.
+ """
+ from app.voxtral_api_service import is_available, transcribe_audio_bytes
+
+ if not is_available():
+ raise HTTPException(
+ status_code=503,
+ detail="Mistral API not configured. Set MISTRAL_API_KEY environment variable."
+ )
+
+ if not file.filename:
+ raise HTTPException(status_code=400, detail="No file provided")
+
+ try:
+ audio_bytes = await file.read()
if len(audio_bytes) > 100 * 1024 * 1024:
raise HTTPException(status_code=400, detail="File too large (max 100MB)")
@@ -237,59 +349,61 @@ async def transcribe_voxtral(
audio_bytes=audio_bytes,
filename=file.filename,
language=language,
+ diarization=diarization,
)
- models_status["voxtral_loaded"] = True
-
return TranscriptionResponse(
text=result.text,
language=result.language,
model=result.model,
+ duration_seconds=result.duration_seconds,
)
except Exception as e:
- logger.error(f"Voxtral transcription error: {e}")
+ logger.error(f"Mistral API error: {e}")
raise HTTPException(status_code=500, detail=str(e))
@app.post("/transcribe/auto", response_model=TranscriptionResponse)
async def transcribe_auto(
file: UploadFile = File(..., description="Audio file to transcribe"),
- language: Optional[str] = Form(
- None,
- description="Language hint (optional)"
- ),
- prefer: str = Form(
- "whisper",
- description="Preferred model: 'whisper' or 'voxtral'"
- ),
+ language: Optional[str] = Form(None, description="Language hint"),
+ prefer: str = Form("whisper", description="Preferred: 'whisper' or 'voxtral'"),
):
"""
- Transcribe audio with automatic model selection.
+ Transcribe with automatic model selection and fallback.
- - Uses Whisper by default (faster, more languages)
- - Falls back to Voxtral if Whisper fails
+ Fallback chain:
+ 1. Preferred model (whisper or voxtral)
+ 2. Alternative model
+ 3. Mistral API
"""
if prefer == "voxtral":
- # Try Voxtral first
try:
- return await transcribe_voxtral(file, language or "de")
+ return await transcribe_voxtral(file, language or "de", False)
except Exception as e:
logger.warning(f"Voxtral failed, trying Whisper: {e}")
- # Reset file position
await file.seek(0)
- return await transcribe_whisper(file, language, None)
+ try:
+ return await transcribe_whisper(file, language, None)
+ except Exception as e2:
+ logger.warning(f"Whisper failed, trying API: {e2}")
+ await file.seek(0)
+ return await transcribe_voxtral_api(file, language, False)
else:
- # Try Whisper first (default)
try:
return await transcribe_whisper(file, language, None)
except Exception as e:
logger.warning(f"Whisper failed, trying Voxtral: {e}")
await file.seek(0)
- return await transcribe_voxtral(file, language or "de")
+ try:
+ return await transcribe_voxtral(file, language or "de", False)
+ except Exception as e2:
+ logger.warning(f"Voxtral failed, trying API: {e2}")
+ await file.seek(0)
+ return await transcribe_voxtral_api(file, language, False)
-# Error handlers
@app.exception_handler(Exception)
async def global_exception_handler(request, exc):
logger.error(f"Unhandled error: {exc}")
diff --git a/services/mana-stt/app/vllm_service.py b/services/mana-stt/app/vllm_service.py
new file mode 100644
index 000000000..4ca1857a1
--- /dev/null
+++ b/services/mana-stt/app/vllm_service.py
@@ -0,0 +1,178 @@
+"""
+vLLM Voxtral Service - Proxy to vLLM server for Voxtral transcription
+
+vLLM provides optimized inference for Voxtral models with an OpenAI-compatible API.
+This service proxies requests to the vLLM server.
+
+Requirements:
+- vLLM server running on VLLM_URL (default: http://localhost:8100)
+- Model loaded: Voxtral-Mini-3B-2507 or Voxtral-Mini-4B-Realtime-2602
+"""
+
+import os
+import logging
+import time
+import tempfile
+import httpx
+from pathlib import Path
+from typing import Optional
+from dataclasses import dataclass
+
+logger = logging.getLogger(__name__)
+
+# vLLM server configuration
+VLLM_URL = os.getenv("VLLM_URL", "http://localhost:8100")
+VLLM_TIMEOUT = int(os.getenv("VLLM_TIMEOUT", "300")) # 5 minutes for long audio
+
+# Model IDs
+VOXTRAL_3B = "mistralai/Voxtral-Mini-3B-2507"
+VOXTRAL_4B_REALTIME = "mistralai/Voxtral-Mini-4B-Realtime-2602"
+
+
+@dataclass
+class VllmTranscriptionResult:
+ text: str
+ language: Optional[str] = None
+ model: str = "voxtral-vllm"
+ latency_ms: Optional[float] = None
+ duration_seconds: Optional[float] = None
+
+
+async def check_health() -> dict:
+ """Check if vLLM server is healthy."""
+ try:
+ async with httpx.AsyncClient(timeout=5.0) as client:
+ response = await client.get(f"{VLLM_URL}/health")
+ if response.status_code == 200:
+ return {"status": "healthy", "url": VLLM_URL}
+ return {"status": "unhealthy", "url": VLLM_URL, "code": response.status_code}
+ except Exception as e:
+ return {"status": "unavailable", "url": VLLM_URL, "error": str(e)}
+
+
+async def get_models() -> list:
+ """Get available models from vLLM server."""
+ try:
+ async with httpx.AsyncClient(timeout=5.0) as client:
+ response = await client.get(f"{VLLM_URL}/v1/models")
+ if response.status_code == 200:
+ data = response.json()
+ return [m["id"] for m in data.get("data", [])]
+ return []
+ except Exception:
+ return []
+
+
+def is_available() -> bool:
+ """Check if vLLM server is configured."""
+ return bool(VLLM_URL)
+
+
+async def transcribe_audio_bytes(
+ audio_bytes: bytes,
+ filename: str,
+ language: Optional[str] = "de",
+ model: Optional[str] = None,
+) -> VllmTranscriptionResult:
+ """
+ Transcribe audio using vLLM Voxtral server.
+
+ Args:
+ audio_bytes: Raw audio bytes
+ filename: Original filename (for format detection)
+ language: Language code (de, en, fr, etc.)
+ model: Model to use (defaults to Voxtral-Mini-3B-2507)
+
+ Returns:
+ VllmTranscriptionResult with transcription
+ """
+ start_time = time.time()
+ model_id = model or VOXTRAL_3B
+
+    logger.info(f"Transcribing via vLLM: {filename} ({len(audio_bytes)} bytes)")
+
+ # Save to temp file (vLLM API accepts file uploads)
+ ext = Path(filename).suffix or ".wav"
+ with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
+ tmp.write(audio_bytes)
+ tmp_path = tmp.name
+
+ try:
+ async with httpx.AsyncClient(timeout=VLLM_TIMEOUT) as client:
+ # Use OpenAI-compatible transcription endpoint
+ with open(tmp_path, "rb") as f:
+ files = {"file": (filename, f, "audio/wav")}
+ data = {
+ "model": model_id,
+ "language": language or "de",
+ "response_format": "json",
+ "temperature": 0.0, # Deterministic for transcription
+ }
+
+ response = await client.post(
+ f"{VLLM_URL}/v1/audio/transcriptions",
+ files=files,
+ data=data,
+ )
+
+ if response.status_code != 200:
+ error_detail = response.text
+ logger.error(f"vLLM error: {response.status_code} - {error_detail}")
+ raise RuntimeError(f"vLLM transcription failed: {error_detail}")
+
+ result = response.json()
+ text = result.get("text", "")
+ duration = result.get("duration")
+
+ latency_ms = (time.time() - start_time) * 1000
+ logger.info(f"vLLM transcription complete: {len(text)} chars in {latency_ms:.0f}ms")
+
+ return VllmTranscriptionResult(
+ text=text.strip(),
+ language=language,
+ model=f"vllm-{model_id.split('/')[-1]}",
+ latency_ms=latency_ms,
+ duration_seconds=duration,
+ )
+
+ finally:
+ try:
+ os.unlink(tmp_path)
+ except Exception:
+ pass
+
+
+async def transcribe_with_realtime(
+ audio_bytes: bytes,
+ filename: str,
+ language: Optional[str] = "de",
+) -> VllmTranscriptionResult:
+ """
+ Transcribe using Voxtral 4B Realtime model.
+
+ Optimized for low latency (<500ms).
+ """
+ return await transcribe_audio_bytes(
+ audio_bytes=audio_bytes,
+ filename=filename,
+ language=language,
+ model=VOXTRAL_4B_REALTIME,
+ )
+
+
+# Supported languages (same as Voxtral)
+SUPPORTED_LANGUAGES = [
+ "en", # English
+ "zh", # Chinese
+ "hi", # Hindi
+ "es", # Spanish
+ "ar", # Arabic
+ "fr", # French
+ "pt", # Portuguese
+ "ru", # Russian
+ "de", # German
+ "ja", # Japanese
+ "ko", # Korean
+ "it", # Italian
+ "nl", # Dutch
+]
diff --git a/services/mana-stt/app/voxtral_api_service.py b/services/mana-stt/app/voxtral_api_service.py
new file mode 100644
index 000000000..53d78f808
--- /dev/null
+++ b/services/mana-stt/app/voxtral_api_service.py
@@ -0,0 +1,213 @@
+"""
+Voxtral API Service - Mistral Cloud API Fallback
+Uses Mistral's hosted Voxtral Mini Transcribe V2 when local service is overloaded.
+
+Features:
+- Speaker diarization
+- Word-level timestamps
+- Context biasing for domain-specific terms
+- 13 language support
+"""
+
+import os
+import logging
+import tempfile
+from pathlib import Path
+from typing import Optional, Literal
+from dataclasses import dataclass, field
+
+logger = logging.getLogger(__name__)
+
+# Lazy load client
+_mistral_client = None
+
+MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
+DEFAULT_MODEL = "voxtral-mini-latest" # voxtral-mini-2602
+
+
+@dataclass
+class Speaker:
+ """Speaker information from diarization."""
+ id: str
+ start: float
+ end: float
+
+
+@dataclass
+class WordTimestamp:
+ """Word-level timestamp."""
+ word: str
+ start: float
+ end: float
+
+
+@dataclass
+class SegmentTimestamp:
+ """Segment-level timestamp."""
+ text: str
+ start: float
+ end: float
+ speaker: Optional[str] = None
+
+
+@dataclass
+class VoxtralApiResult:
+ """Result from Voxtral API transcription."""
+ text: str
+ language: Optional[str] = None
+ model: str = "voxtral-api"
+ duration_seconds: Optional[float] = None
+ words: list[WordTimestamp] = field(default_factory=list)
+ segments: list[SegmentTimestamp] = field(default_factory=list)
+ speakers: list[Speaker] = field(default_factory=list)
+
+
+def get_mistral_client():
+ """Get or create Mistral client instance."""
+ global _mistral_client
+
+ if _mistral_client is None:
+ if not MISTRAL_API_KEY:
+ raise RuntimeError(
+ "MISTRAL_API_KEY environment variable not set. "
+ "Get your API key at https://console.mistral.ai/"
+ )
+
+ try:
+ from mistralai import Mistral
+ _mistral_client = Mistral(api_key=MISTRAL_API_KEY)
+ logger.info("Mistral API client initialized")
+ except ImportError:
+ raise RuntimeError(
+ "mistralai package not installed. "
+ "Run: pip install mistralai"
+ )
+
+ return _mistral_client
+
+
+def is_available() -> bool:
+ """Check if Mistral API is configured and available."""
+ return bool(MISTRAL_API_KEY)
+
+
+async def transcribe_audio_bytes(
+ audio_bytes: bytes,
+ filename: str,
+ language: Optional[str] = None,
+ timestamp_granularity: Optional[Literal["word", "segment"]] = None,
+ diarization: bool = False,
+ context_bias: Optional[list[str]] = None,
+) -> VoxtralApiResult:
+ """
+ Transcribe audio using Mistral's Voxtral API.
+
+ Args:
+ audio_bytes: Raw audio bytes
+ filename: Original filename (for extension detection)
+ language: Language code (de, en, fr, etc.) - auto-detect if None
+ timestamp_granularity: "word" or "segment" for timestamps
+ diarization: Enable speaker diarization
+ context_bias: List of domain-specific terms to improve accuracy (max 100)
+
+ Returns:
+ VoxtralApiResult with transcription and optional metadata
+ """
+ client = get_mistral_client()
+
+    logger.info(f"Transcribing via Mistral API: {filename} ({len(audio_bytes)} bytes)")
+
+ try:
+ # Build request parameters
+ request_params = {
+ "model": DEFAULT_MODEL,
+ "file": {
+ "content": audio_bytes,
+ "file_name": filename,
+ },
+ }
+
+ # Language and timestamps are mutually exclusive in current API
+ if language and not timestamp_granularity:
+ request_params["language"] = language
+
+ if timestamp_granularity:
+ request_params["timestamp_granularities"] = [timestamp_granularity]
+
+ if diarization:
+ request_params["diarization"] = True
+
+ if context_bias:
+ # API accepts comma-separated string, max 100 terms
+ bias_terms = context_bias[:100]
+ request_params["context_bias"] = ",".join(bias_terms)
+
+ # Make API call
+ response = client.audio.transcriptions.complete(**request_params)
+
+ # Parse response
+ result = VoxtralApiResult(
+ text=response.text,
+ language=getattr(response, "language", language),
+ model=f"voxtral-api-{DEFAULT_MODEL}",
+ duration_seconds=getattr(response, "duration", None),
+ )
+
+ # Parse word timestamps if present
+ if hasattr(response, "words") and response.words:
+ result.words = [
+ WordTimestamp(
+ word=w.word,
+ start=w.start,
+ end=w.end,
+ )
+ for w in response.words
+ ]
+
+ # Parse segment timestamps if present
+ if hasattr(response, "segments") and response.segments:
+ result.segments = [
+ SegmentTimestamp(
+ text=s.text,
+ start=s.start,
+ end=s.end,
+ speaker=getattr(s, "speaker", None),
+ )
+ for s in response.segments
+ ]
+
+ # Parse speakers if diarization enabled
+ if hasattr(response, "speakers") and response.speakers:
+ result.speakers = [
+ Speaker(
+ id=sp.id,
+ start=sp.start,
+ end=sp.end,
+ )
+ for sp in response.speakers
+ ]
+
+ logger.info(f"Mistral API transcription complete: {len(result.text)} characters")
+ return result
+
+ except Exception as e:
+ logger.error(f"Mistral API transcription failed: {e}")
+ raise
+
+
+# Supported languages by Voxtral API (13 languages)
+SUPPORTED_LANGUAGES = [
+ "en", # English
+ "zh", # Chinese
+ "hi", # Hindi
+ "es", # Spanish
+ "ar", # Arabic
+ "fr", # French
+ "pt", # Portuguese
+ "ru", # Russian
+ "de", # German
+ "ja", # Japanese
+ "ko", # Korean
+ "it", # Italian
+ "nl", # Dutch
+]
diff --git a/services/mana-stt/app/voxtral_service.py b/services/mana-stt/app/voxtral_service.py
index 989ef863b..320e5020d 100644
--- a/services/mana-stt/app/voxtral_service.py
+++ b/services/mana-stt/app/voxtral_service.py
@@ -1,12 +1,15 @@
"""
Voxtral STT Service using Hugging Face Transformers
Mistral AI's Speech-to-Text model (Apache 2.0 License)
+
+Uses VoxtralForConditionalGeneration with apply_transcription_request
+as per official HuggingFace documentation.
"""
import os
import tempfile
import logging
-import base64
+import time
from pathlib import Path
from typing import Optional
from dataclasses import dataclass
@@ -16,68 +19,80 @@ logger = logging.getLogger(__name__)
# Lazy load to avoid import errors
_voxtral_model = None
_voxtral_processor = None
+_model_name = None
+
+# Default model
+DEFAULT_MODEL = "mistralai/Voxtral-Mini-3B-2507"
@dataclass
class VoxtralTranscriptionResult:
text: str
language: Optional[str] = None
- model: str = "voxtral-mini"
+ model: str = "voxtral-mini-3b"
+ latency_ms: Optional[float] = None
-def get_voxtral_model(model_name: str = "mistralai/Voxtral-Mini-3B-2507"):
+def get_voxtral_model(model_name: str = DEFAULT_MODEL):
"""
Get or create Voxtral model instance.
- Note: Voxtral Mini (3B) is recommended for Mac Mini M4.
- Voxtral Small (24B) requires more VRAM.
+ Uses VoxtralForConditionalGeneration (the correct class for Voxtral).
"""
- global _voxtral_model, _voxtral_processor
+ global _voxtral_model, _voxtral_processor, _model_name
+
+ # Reload if different model requested
+ if _voxtral_model is not None and _model_name != model_name:
+ logger.info(f"Switching model from {_model_name} to {model_name}")
+ _voxtral_model = None
+ _voxtral_processor = None
if _voxtral_model is None:
logger.info(f"Loading Voxtral model: {model_name}")
try:
import torch
- from transformers import AutoModel, AutoProcessor
+ from transformers import VoxtralForConditionalGeneration, AutoProcessor
- # Determine device
+ # Determine device and dtype
if torch.backends.mps.is_available():
device = "mps"
+ # MPS works better with float16
torch_dtype = torch.float16
elif torch.cuda.is_available():
device = "cuda"
- torch_dtype = torch.float16
+ torch_dtype = torch.bfloat16
else:
device = "cpu"
torch_dtype = torch.float32
- logger.info(f"Using device: {device}")
+ logger.info(f"Using device: {device}, dtype: {torch_dtype}")
# Load processor
- _voxtral_processor = AutoProcessor.from_pretrained(
- model_name,
- trust_remote_code=True,
- )
+ _voxtral_processor = AutoProcessor.from_pretrained(model_name)
- # Load model - Voxtral uses AutoModel, not AutoModelForSpeechSeq2Seq
- _voxtral_model = AutoModel.from_pretrained(
- model_name,
- torch_dtype=torch_dtype,
- device_map="auto" if device != "mps" else None,
- trust_remote_code=True,
- )
-
- # Move to MPS if available (device_map doesn't support MPS)
+ # Load model with VoxtralForConditionalGeneration
if device == "mps":
+ # MPS doesn't support device_map, load to CPU first then move
+ _voxtral_model = VoxtralForConditionalGeneration.from_pretrained(
+ model_name,
+ torch_dtype=torch_dtype,
+ )
_voxtral_model = _voxtral_model.to(device)
+ else:
+ _voxtral_model = VoxtralForConditionalGeneration.from_pretrained(
+ model_name,
+ torch_dtype=torch_dtype,
+ device_map=device,
+ )
+ _model_name = model_name
logger.info(f"Voxtral model loaded successfully on {device}")
except ImportError as e:
logger.error(f"Failed to import transformers: {e}")
raise RuntimeError(
- "transformers not installed. "
- "Run: pip install transformers torch"
+ "transformers >= 4.54.0 required. "
+ "Run: pip install --upgrade transformers"
)
except Exception as e:
logger.error(f"Failed to load Voxtral model: {e}")
@@ -89,17 +104,16 @@ def get_voxtral_model(model_name: str = "mistralai/Voxtral-Mini-3B-2507"):
def transcribe_audio(
audio_path: str,
language: Optional[str] = "de",
- model_name: str = "mistralai/Voxtral-Mini-3B-2507",
+ model_name: str = DEFAULT_MODEL,
) -> VoxtralTranscriptionResult:
"""
Transcribe audio file using Voxtral.
- Voxtral is a multimodal audio understanding model that can be prompted
- for transcription tasks.
+ Uses the official apply_transcription_request method.
Args:
audio_path: Path to audio file
- language: Target language for transcription
+ language: Language code (de, en, fr, etc.)
model_name: Hugging Face model ID
Returns:
@@ -108,84 +122,49 @@ def transcribe_audio(
import torch
model, processor = get_voxtral_model(model_name)
+ device = next(model.parameters()).device
+ dtype = next(model.parameters()).dtype
logger.info(f"Transcribing with Voxtral: {audio_path}")
+ start_time = time.time()
try:
- # Load audio file as bytes and encode to base64
- with open(audio_path, "rb") as f:
- audio_bytes = f.read()
- audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")
-
- # Determine audio format from extension
- ext = Path(audio_path).suffix.lower()
- mime_types = {
- ".wav": "audio/wav",
- ".mp3": "audio/mpeg",
- ".m4a": "audio/m4a",
- ".flac": "audio/flac",
- ".ogg": "audio/ogg",
- ".webm": "audio/webm",
- }
- mime_type = mime_types.get(ext, "audio/wav")
-
- # Language mapping for prompts
- lang_names = {
- "de": "German",
- "en": "English",
- "fr": "French",
- "es": "Spanish",
- "pt": "Portuguese",
- "it": "Italian",
- "nl": "Dutch",
- "hi": "Hindi",
- }
- lang_name = lang_names.get(language, "German")
-
- # Create transcription prompt with base64 audio
- messages = [
- {
- "role": "user",
- "content": [
- {"type": "audio_url", "audio_url": {"url": f"data:{mime_type};base64,{audio_base64}"}},
- {"type": "text", "text": f"Transcribe this audio in {lang_name}. Only output the transcription, nothing else."},
- ],
- }
- ]
-
- # Apply chat template and process inputs
- inputs = processor.apply_chat_template(
- messages,
- tokenize=True,
- return_tensors="pt",
- return_dict=True,
+ # Use apply_transcription_request (official method)
+ # This handles audio loading and preprocessing internally
+ inputs = processor.apply_transcription_request(
+ language=language or "en",
+ audio=audio_path,
+ model_id=model_name,
)
- # Move to same device as model
- device = next(model.parameters()).device
- inputs = {k: v.to(device) if hasattr(v, 'to') else v for k, v in inputs.items()}
+ # Move inputs to device and dtype
+ inputs = inputs.to(device, dtype=dtype)
# Generate transcription
with torch.no_grad():
- generated_ids = model.generate(
+ outputs = model.generate(
**inputs,
- max_new_tokens=512,
+ max_new_tokens=500,
do_sample=False,
)
- # Decode only the generated tokens (exclude input)
- input_len = inputs["input_ids"].shape[-1]
- text = processor.batch_decode(
- generated_ids[:, input_len:],
+ # Decode - skip input tokens
+ input_len = inputs.input_ids.shape[1]
+ decoded = processor.batch_decode(
+ outputs[:, input_len:],
skip_special_tokens=True,
- )[0]
+ )
- logger.info(f"Voxtral transcription complete: {len(text)} characters")
+ text = decoded[0] if decoded else ""
+ latency_ms = (time.time() - start_time) * 1000
+
+ logger.info(f"Voxtral transcription complete: {len(text)} chars in {latency_ms:.0f}ms")
return VoxtralTranscriptionResult(
text=text.strip(),
language=language,
- model="voxtral-mini",
+ model=model_name.split("/")[-1],
+ latency_ms=latency_ms,
)
except Exception as e:
@@ -197,7 +176,7 @@ async def transcribe_audio_bytes(
audio_bytes: bytes,
filename: str,
language: Optional[str] = "de",
- model_name: str = "mistralai/Voxtral-Mini-3B-2507",
+ model_name: str = DEFAULT_MODEL,
) -> VoxtralTranscriptionResult:
"""
Transcribe audio from bytes (for API uploads).
@@ -222,14 +201,67 @@ async def transcribe_audio_bytes(
pass
-# Supported languages by Voxtral
+def unload_model():
+ """Unload model to free memory."""
+ global _voxtral_model, _voxtral_processor, _model_name
+
+ if _voxtral_model is not None:
+ del _voxtral_model
+ del _voxtral_processor
+ _voxtral_model = None
+ _voxtral_processor = None
+ _model_name = None
+
+ import gc
+ gc.collect()
+
+ try:
+ import torch
+ if torch.backends.mps.is_available():
+ torch.mps.empty_cache()
+ elif torch.cuda.is_available():
+ torch.cuda.empty_cache()
+ except Exception:
+ pass
+
+ logger.info("Voxtral model unloaded")
+
+
+def is_loaded() -> bool:
+ """Check if model is currently loaded."""
+ return _voxtral_model is not None
+
+
+def get_loaded_model_name() -> Optional[str]:
+ """Get name of currently loaded model."""
+ return _model_name
+
+
+# Supported languages (13 languages as per Mistral docs)
SUPPORTED_LANGUAGES = [
"en", # English
- "de", # German
- "fr", # French
+ "zh", # Chinese
+ "hi", # Hindi
"es", # Spanish
+ "ar", # Arabic
+ "fr", # French
"pt", # Portuguese
+ "ru", # Russian
+ "de", # German
+ "ja", # Japanese
+ "ko", # Korean
"it", # Italian
"nl", # Dutch
- "hi", # Hindi
+]
+
+# Available models
+AVAILABLE_MODELS = [
+ {
+ "id": "voxtral-mini-3b",
+ "name": "Voxtral-Mini-3B-2507",
+ "huggingface_id": "mistralai/Voxtral-Mini-3B-2507",
+ "params": "3B",
+ "vram": "~6GB",
+ "description": "Balanced quality and speed for local deployment",
+ },
]
diff --git a/services/mana-stt/com.manacore.mana-stt.plist b/services/mana-stt/com.manacore.mana-stt.plist
new file mode 100644
index 000000000..97ef62521
--- /dev/null
+++ b/services/mana-stt/com.manacore.mana-stt.plist
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+    <key>Label</key>
+    <string>com.manacore.mana-stt</string>
+
+    <key>ProgramArguments</key>
+    <array>
+        <string>/bin/bash</string>
+        <string>-c</string>
+        <string>cd /Users/mana/projects/manacore-monorepo/services/mana-stt && .venv/bin/uvicorn app.main:app --host 0.0.0.0 --port 3020</string>
+    </array>
+
+    <key>WorkingDirectory</key>
+    <string>/Users/mana/projects/manacore-monorepo/services/mana-stt</string>
+
+    <key>EnvironmentVariables</key>
+    <dict>
+        <key>PATH</key>
+        <string>/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin</string>
+        <key>PORT</key>
+        <string>3020</string>
+    </dict>
+
+    <key>RunAtLoad</key>
+    <true/>
+
+    <key>KeepAlive</key>
+    <true/>
+
+    <key>StandardOutPath</key>
+    <string>/Users/mana/logs/mana-stt.log</string>
+
+    <key>StandardErrorPath</key>
+    <string>/Users/mana/logs/mana-stt.error.log</string>
+
+    <key>ThrottleInterval</key>
+    <integer>10</integer>
+</dict>
+</plist>
diff --git a/services/mana-stt/com.manacore.vllm-voxtral.plist b/services/mana-stt/com.manacore.vllm-voxtral.plist
new file mode 100644
index 000000000..4cf9f5711
--- /dev/null
+++ b/services/mana-stt/com.manacore.vllm-voxtral.plist
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+    <key>Label</key>
+    <string>com.manacore.vllm-voxtral</string>
+
+    <key>ProgramArguments</key>
+    <array>
+        <string>/bin/bash</string>
+        <string>-c</string>
+        <string>cd /Users/mana/projects/manacore-monorepo/services/mana-stt && ./scripts/start-vllm-voxtral.sh</string>
+    </array>
+
+    <key>WorkingDirectory</key>
+    <string>/Users/mana/projects/manacore-monorepo/services/mana-stt</string>
+
+    <key>EnvironmentVariables</key>
+    <dict>
+        <key>PATH</key>
+        <string>/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin</string>
+        <key>VLLM_PORT</key>
+        <string>8100</string>
+    </dict>
+
+    <key>RunAtLoad</key>
+    <true/>
+
+    <key>KeepAlive</key>
+    <true/>
+
+    <key>StandardOutPath</key>
+    <string>/Users/mana/logs/vllm-voxtral.log</string>
+
+    <key>StandardErrorPath</key>
+    <string>/Users/mana/logs/vllm-voxtral.error.log</string>
+
+    <key>ThrottleInterval</key>
+    <integer>30</integer>
+</dict>
+</plist>
diff --git a/services/mana-stt/install-service.sh b/services/mana-stt/install-service.sh
new file mode 100755
index 000000000..6ee618cbe
--- /dev/null
+++ b/services/mana-stt/install-service.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# Install mana-stt as a launchd service on macOS
+# Run this script on the Mac Mini server
+
+set -e
+
+SERVICE_NAME="com.manacore.mana-stt"
+PLIST_FILE="$SERVICE_NAME.plist"
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+LAUNCH_AGENTS_DIR="$HOME/Library/LaunchAgents"
+LOG_DIR="$HOME/logs"
+
+echo "Installing mana-stt launchd service..."
+
+# Create logs directory
+mkdir -p "$LOG_DIR"
+
+# Stop existing service if running
+if launchctl list | grep -q "$SERVICE_NAME"; then
+ echo "Stopping existing service..."
+ launchctl unload "$LAUNCH_AGENTS_DIR/$PLIST_FILE" 2>/dev/null || true
+fi
+
+# Copy plist to LaunchAgents
+cp "$SCRIPT_DIR/$PLIST_FILE" "$LAUNCH_AGENTS_DIR/"
+
+# Load the service
+echo "Loading service..."
+launchctl load "$LAUNCH_AGENTS_DIR/$PLIST_FILE"
+
+# Check status
+sleep 2
+if launchctl list | grep -q "$SERVICE_NAME"; then
+ echo "Service installed and running!"
+ echo ""
+ echo "Useful commands:"
+ echo " View logs: tail -f $LOG_DIR/mana-stt.log"
+ echo " View errors: tail -f $LOG_DIR/mana-stt.error.log"
+ echo " Stop: launchctl unload $LAUNCH_AGENTS_DIR/$PLIST_FILE"
+ echo " Start: launchctl load $LAUNCH_AGENTS_DIR/$PLIST_FILE"
+ echo " Health check: curl http://localhost:3020/health"
+else
+ echo "ERROR: Service failed to start. Check logs at $LOG_DIR/mana-stt.error.log"
+ exit 1
+fi
diff --git a/services/mana-stt/install-services.sh b/services/mana-stt/install-services.sh
new file mode 100755
index 000000000..e863f9236
--- /dev/null
+++ b/services/mana-stt/install-services.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+# Install mana-stt and vllm-voxtral as launchd services on macOS
+# Run this script on the Mac Mini server
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+LAUNCH_AGENTS_DIR="$HOME/Library/LaunchAgents"
+LOG_DIR="$HOME/logs"
+
+echo "============================================"
+echo "Installing ManaCore STT Services"
+echo "============================================"
+echo ""
+
+# Create logs directory
+mkdir -p "$LOG_DIR"
+
+install_service() {
+ local service_name="$1"
+ local plist_file="$service_name.plist"
+
+ echo "Installing $service_name..."
+
+ # Stop existing service if running
+ if launchctl list | grep -q "$service_name"; then
+ echo " Stopping existing service..."
+ launchctl unload "$LAUNCH_AGENTS_DIR/$plist_file" 2>/dev/null || true
+ fi
+
+ # Copy plist to LaunchAgents
+ cp "$SCRIPT_DIR/$plist_file" "$LAUNCH_AGENTS_DIR/"
+
+ # Load the service
+ echo " Loading service..."
+ launchctl load "$LAUNCH_AGENTS_DIR/$plist_file"
+
+ sleep 2
+ if launchctl list | grep -q "$service_name"; then
+ echo " ✓ $service_name installed and running"
+ else
+ echo " ✗ $service_name failed to start"
+ return 1
+ fi
+}
+
+# Install vLLM first (STT depends on it)
+install_service "com.manacore.vllm-voxtral"
+
+# Wait for vLLM to initialize
+echo ""
+echo "Waiting for vLLM server to initialize..."
+for i in {1..30}; do
+ if curl -s http://localhost:8100/health > /dev/null 2>&1; then
+ echo " ✓ vLLM server is ready"
+ break
+ fi
+ if [ $i -eq 30 ]; then
+ echo " ! vLLM server not responding yet (may still be loading model)"
+ fi
+ sleep 2
+done
+
+# Install STT service
+echo ""
+install_service "com.manacore.mana-stt"
+
+echo ""
+echo "============================================"
+echo "Installation complete!"
+echo "============================================"
+echo ""
+echo "Services:"
+echo " vLLM Voxtral: http://localhost:8100"
+echo " ManaCore STT: http://localhost:3020"
+echo ""
+echo "Useful commands:"
+echo " View vLLM logs: tail -f $LOG_DIR/vllm-voxtral.log"
+echo " View STT logs: tail -f $LOG_DIR/mana-stt.log"
+echo " Health check: curl http://localhost:3020/health"
+echo ""
+echo "Stop all:"
+echo " launchctl unload $LAUNCH_AGENTS_DIR/com.manacore.vllm-voxtral.plist"
+echo " launchctl unload $LAUNCH_AGENTS_DIR/com.manacore.mana-stt.plist"
diff --git a/services/mana-stt/scripts/setup-vllm.sh b/services/mana-stt/scripts/setup-vllm.sh
new file mode 100755
index 000000000..c6a6ad48f
--- /dev/null
+++ b/services/mana-stt/scripts/setup-vllm.sh
@@ -0,0 +1,83 @@
+#!/bin/bash
+# Setup vLLM for Voxtral on Mac Mini M4
+#
+# vLLM runs in CPU mode on macOS (no CUDA), but still provides
+# the optimized inference pipeline for Voxtral models.
+#
+# Usage: ./scripts/setup-vllm.sh
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SERVICE_DIR="$(dirname "$SCRIPT_DIR")"
+VENV_DIR="$SERVICE_DIR/.venv-vllm"
+
+echo "============================================"
+echo "vLLM Setup for Voxtral on Mac Mini M4"
+echo "============================================"
+echo ""
+
+# Check Python version
+PYTHON_VERSION=$(python3 --version 2>&1 | awk '{print $2}')
+PYTHON_MAJOR=$(echo $PYTHON_VERSION | cut -d. -f1)
+PYTHON_MINOR=$(echo $PYTHON_VERSION | cut -d. -f2)
+
+if [[ "$PYTHON_MAJOR" -lt 3 ]] || [[ "$PYTHON_MAJOR" -eq 3 && "$PYTHON_MINOR" -lt 10 ]]; then
+ echo "Error: Python 3.10+ required (found $PYTHON_VERSION)"
+ exit 1
+fi
+echo "Python version: $PYTHON_VERSION"
+
+# Create separate venv for vLLM (to avoid conflicts with whisper)
+echo ""
+echo "Creating virtual environment for vLLM..."
+python3 -m venv "$VENV_DIR"
+source "$VENV_DIR/bin/activate"
+
+# Upgrade pip
+pip install --upgrade pip --quiet
+
+# Install vLLM with audio support
+echo ""
+echo "Installing vLLM with audio support..."
+echo "This may take a few minutes..."
+
+# Install uv for faster package installation
+pip install uv --quiet
+
+# Install vLLM with audio support (nightly for best Voxtral support)
+uv pip install "vllm[audio]>=0.10.0" --extra-index-url https://wheels.vllm.ai/nightly 2>&1 || {
+ echo "Nightly install failed, trying stable..."
+ uv pip install "vllm[audio]>=0.10.0"
+}
+
+# Install mistral-common with audio
+uv pip install "mistral-common[audio]>=1.8.1"
+
+echo ""
+echo "============================================"
+echo "Installation complete!"
+echo "============================================"
+echo ""
+echo "To start Voxtral Mini 3B server:"
+echo " source $VENV_DIR/bin/activate"
+echo " vllm serve mistralai/Voxtral-Mini-3B-2507 \\"
+echo " --tokenizer_mode mistral \\"
+echo " --config_format mistral \\"
+echo " --load_format mistral \\"
+echo " --host 0.0.0.0 \\"
+echo " --port 8100"
+echo ""
+echo "To start Voxtral Realtime 4B server:"
+echo " source $VENV_DIR/bin/activate"
+echo " vllm serve mistralai/Voxtral-Mini-4B-Realtime-2602 \\"
+echo " --host 0.0.0.0 \\"
+echo " --port 8100"
+echo ""
+echo "API Endpoint: http://localhost:8100/v1/audio/transcriptions"
+echo ""
+echo "Test with:"
+echo " curl http://localhost:8100/v1/audio/transcriptions \\"
+echo " -F file=@test.mp3 \\"
+echo " -F model=mistralai/Voxtral-Mini-3B-2507 \\"
+echo " -F language=de"
diff --git a/services/mana-stt/scripts/start-vllm-voxtral.sh b/services/mana-stt/scripts/start-vllm-voxtral.sh
new file mode 100755
index 000000000..280ba1970
--- /dev/null
+++ b/services/mana-stt/scripts/start-vllm-voxtral.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Start vLLM server for Voxtral
+#
+# Usage: ./scripts/start-vllm-voxtral.sh [model]
+# model: "3b" (default) or "4b" for Realtime
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SERVICE_DIR="$(dirname "$SCRIPT_DIR")"
+VENV_DIR="$SERVICE_DIR/.venv-vllm"
+MODEL="${1:-3b}"
+PORT="${VLLM_PORT:-8100}"
+
+# Activate venv
+source "$VENV_DIR/bin/activate"
+
+echo "Starting vLLM Voxtral server..."
+echo "Port: $PORT"
+
+if [[ "$MODEL" == "4b" || "$MODEL" == "realtime" ]]; then
+ echo "Model: Voxtral Mini 4B Realtime"
+ exec vllm serve mistralai/Voxtral-Mini-4B-Realtime-2602 \
+ --host 0.0.0.0 \
+ --port "$PORT" \
+ --max-model-len 8192
+else
+ echo "Model: Voxtral Mini 3B"
+ exec vllm serve mistralai/Voxtral-Mini-3B-2507 \
+ --tokenizer_mode mistral \
+ --config_format mistral \
+ --load_format mistral \
+ --host 0.0.0.0 \
+ --port "$PORT" \
+ --max-model-len 32768
+fi