feat(mana-stt): add vLLM integration for Voxtral transcription

- Add vllm_service.py as proxy to vLLM server for Voxtral 3B/4B
- Add voxtral_api_service.py for Mistral API fallback
- Update main.py with /transcribe/voxtral endpoint using vLLM
- Add /transcribe/auto endpoint with automatic fallback chain
- Create setup-vllm.sh and start-vllm-voxtral.sh scripts
- Add launchd plist files for Mac Mini deployment
- Add install-services.sh for automated service installation

Architecture:
- vLLM server runs Voxtral models on port 8100
- mana-stt proxies to vLLM with Mistral API fallback
- Fallback chain: vLLM -> Mistral API
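
As a client-side illustration of this chain, a minimal sketch against the /transcribe/auto endpoint (host, port, and file name are assumptions, not part of this commit):

# Minimal client sketch: POST audio to /transcribe/auto and let the
# service pick a backend. Host/port and "test.mp3" are placeholders.
import httpx

def transcribe(path: str, prefer: str = "whisper") -> dict:
    with open(path, "rb") as f:
        response = httpx.post(
            "http://localhost:3020/transcribe/auto",
            files={"file": (path, f, "audio/mpeg")},
            data={"language": "de", "prefer": prefer},
            timeout=300.0,  # long audio can take minutes
        )
    response.raise_for_status()
    return response.json()  # {"text": ..., "model": ..., "latency_ms": ...}

if __name__ == "__main__":
    print(transcribe("test.mp3"))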

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Till-JS 2026-02-11 16:10:00 +01:00
parent a2e2a5b73c
commit 60394076e5
11 changed files with 1060 additions and 162 deletions


@@ -0,0 +1,31 @@
# ManaCore STT Service Configuration
# Copy to .env and adjust values as needed
# Server
PORT=3020
# Whisper (Lightning MLX)
WHISPER_MODEL=large-v3
# Voxtral (Local Models)
# Options: voxtral-mini-3b, voxtral-realtime-4b, voxtral-small-24b
VOXTRAL_MODEL=voxtral-realtime-4b
# Model Loading
# Set to true to preload models on startup (slower startup, faster first request)
PRELOAD_MODELS=false
# Load Management
# Maximum concurrent transcription requests before API fallback
MAX_CONCURRENT_REQUESTS=3
# API Fallback
# Enable automatic fallback to Mistral API when overloaded
API_FALLBACK_ENABLED=true
# Mistral API Key (required for API fallback)
# Get your key at https://console.mistral.ai/
MISTRAL_API_KEY=
# CORS Origins (comma-separated)
CORS_ORIGINS=https://mana.how,https://chat.mana.how,http://localhost:5173
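
For reference, a hedged sketch of how the service side reads these values (the PORT/CORS parsing mirrors main.py; the MAX_CONCURRENT_REQUESTS and API_FALLBACK_ENABLED parsing shown here is an assumption):

# Sketch: consuming the settings above via os.getenv with defaults.
# Boolean/int parsing for the load-management flags is assumed.
import os

PORT = int(os.getenv("PORT", "3020"))
WHISPER_MODEL = os.getenv("WHISPER_MODEL", "large-v3")
VOXTRAL_MODEL = os.getenv("VOXTRAL_MODEL", "voxtral-realtime-4b")
PRELOAD_MODELS = os.getenv("PRELOAD_MODELS", "false").lower() == "true"
MAX_CONCURRENT_REQUESTS = int(os.getenv("MAX_CONCURRENT_REQUESTS", "3"))
API_FALLBACK_ENABLED = os.getenv("API_FALLBACK_ENABLED", "true").lower() == "true"
CORS_ORIGINS = os.getenv("CORS_ORIGINS", "http://localhost:5173").split(",")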


@@ -1,16 +1,17 @@
"""
ManaCore STT API Service
Speech-to-Text with Whisper (MLX) and Voxtral
Speech-to-Text with Whisper (MLX), Voxtral (vLLM), and Mistral API (fallback)
Run with: uvicorn app.main:app --host 0.0.0.0 --port 3020
"""
import os
import logging
import time
from typing import Optional
from contextlib import asynccontextmanager
from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Query
from fastapi import FastAPI, File, UploadFile, Form, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from pydantic import BaseModel
@@ -31,32 +32,39 @@ CORS_ORIGINS = os.getenv(
"https://mana.how,https://chat.mana.how,http://localhost:5173"
).split(",")
# vLLM configuration
VLLM_URL = os.getenv("VLLM_URL", "http://localhost:8100")
USE_VLLM = os.getenv("USE_VLLM", "true").lower() == "true"
# Response models
class TranscriptionResponse(BaseModel):
text: str
language: Optional[str] = None
model: str
latency_ms: Optional[float] = None
duration_seconds: Optional[float] = None
class HealthResponse(BaseModel):
status: str
whisper_loaded: bool
voxtral_loaded: bool
vllm_available: bool
vllm_url: Optional[str] = None
mistral_api_available: bool
models: dict
class ModelsResponse(BaseModel):
whisper: list
voxtral: list
voxtral_vllm: list
default_whisper: str
# Track loaded models
models_status = {
"whisper_loaded": False,
"voxtral_loaded": False,
"vllm_available": False,
}
@@ -65,9 +73,24 @@ async def lifespan(app: FastAPI):
"""Startup and shutdown events."""
logger.info("Starting ManaCore STT Service...")
# Optionally preload models on startup
# Check vLLM availability
if USE_VLLM:
from app.vllm_service import check_health
health = await check_health()
models_status["vllm_available"] = health.get("status") == "healthy"
if models_status["vllm_available"]:
logger.info(f"vLLM server available at {VLLM_URL}")
else:
logger.warning(f"vLLM server not available: {health}")
# Check Mistral API
from app.voxtral_api_service import is_available as api_available
if api_available():
logger.info("Mistral API fallback configured")
# Optionally preload Whisper
if PRELOAD_MODELS:
logger.info("Preloading models (PRELOAD_MODELS=true)...")
logger.info("Preloading Whisper model...")
try:
from app.whisper_service import get_whisper_model
get_whisper_model(DEFAULT_WHISPER_MODEL)
@@ -76,16 +99,6 @@ async def lifespan(app: FastAPI):
except Exception as e:
logger.warning(f"Failed to preload Whisper: {e}")
try:
from app.voxtral_service import get_voxtral_model
get_voxtral_model()
models_status["voxtral_loaded"] = True
logger.info("Voxtral model preloaded")
except Exception as e:
logger.warning(f"Failed to preload Voxtral: {e}")
else:
logger.info("Models will be loaded on first request (lazy loading)")
logger.info(f"STT Service ready on port {PORT}")
yield
logger.info("Shutting down STT Service...")
@@ -94,8 +107,8 @@ async def lifespan(app: FastAPI):
# Create FastAPI app
app = FastAPI(
title="ManaCore STT Service",
description="Speech-to-Text API with Whisper (MLX) and Voxtral",
version="1.0.0",
description="Speech-to-Text API with Whisper (MLX), Voxtral (vLLM), and Mistral API",
version="2.0.0",
lifespan=lifespan,
)
@@ -112,10 +125,17 @@ app.add_middleware(
@app.get("/health", response_model=HealthResponse)
async def health_check():
"""Health check endpoint."""
from app.voxtral_api_service import is_available as api_available
from app.vllm_service import check_health
vllm_health = await check_health()
return HealthResponse(
status="healthy",
whisper_loaded=models_status["whisper_loaded"],
voxtral_loaded=models_status["voxtral_loaded"],
vllm_available=vllm_health.get("status") == "healthy",
vllm_url=VLLM_URL if USE_VLLM else None,
mistral_api_available=api_available(),
models={
"default_whisper": DEFAULT_WHISPER_MODEL,
},
@@ -126,11 +146,13 @@ async def health_check():
async def list_models():
"""List available models."""
from app.whisper_service import AVAILABLE_MODELS as whisper_models
from app.voxtral_service import SUPPORTED_LANGUAGES as voxtral_languages
from app.vllm_service import get_models
vllm_models = await get_models()
return ModelsResponse(
whisper=whisper_models,
voxtral=voxtral_languages,
voxtral_vllm=vllm_models,
default_whisper=DEFAULT_WHISPER_MODEL,
)
@@ -138,25 +160,19 @@ async def list_models():
@app.post("/transcribe", response_model=TranscriptionResponse)
async def transcribe_whisper(
file: UploadFile = File(..., description="Audio file to transcribe"),
language: Optional[str] = Form(
None,
description="Language code (e.g., 'de', 'en'). Auto-detect if not provided."
),
model: str = Form(
None,
description="Whisper model to use (default: large-v3-turbo)"
),
language: Optional[str] = Form(None, description="Language code (auto-detect if not provided)"),
model: Optional[str] = Form(None, description="Whisper model to use"),
):
"""
Transcribe audio using Whisper (Lightning MLX).
Best for: General transcription, many languages
Supported formats: mp3, wav, m4a, flac, ogg, webm
Max file size: 100MB
"""
if not file.filename:
raise HTTPException(status_code=400, detail="No file provided")
# Validate file type
allowed_extensions = {".mp3", ".wav", ".m4a", ".flac", ".ogg", ".webm", ".mp4"}
ext = os.path.splitext(file.filename)[1].lower()
if ext not in allowed_extensions:
@@ -165,20 +181,17 @@ async def transcribe_whisper(
detail=f"Unsupported file type: {ext}. Allowed: {allowed_extensions}"
)
start_time = time.time()
try:
from app.whisper_service import transcribe_audio_bytes
# Read file
audio_bytes = await file.read()
# Check file size (100MB limit)
if len(audio_bytes) > 100 * 1024 * 1024:
raise HTTPException(status_code=400, detail="File too large (max 100MB)")
# Use default model if not specified
model_name = model or DEFAULT_WHISPER_MODEL
# Transcribe
result = await transcribe_audio_bytes(
audio_bytes=audio_bytes,
filename=file.filename,
@@ -187,38 +200,53 @@
)
models_status["whisper_loaded"] = True
latency_ms = (time.time() - start_time) * 1000
return TranscriptionResponse(
text=result.text,
language=result.language,
model=f"whisper-{model_name}",
latency_ms=latency_ms,
)
except Exception as e:
logger.error(f"Transcription error: {e}")
logger.error(f"Whisper transcription error: {e}")
raise HTTPException(status_code=500, detail=str(e))
@app.post("/transcribe/voxtral", response_model=TranscriptionResponse)
async def transcribe_voxtral(
file: UploadFile = File(..., description="Audio file to transcribe"),
language: str = Form(
"de",
description="Language code (de, en, fr, es, pt, it, nl, hi)"
),
language: str = Form("de", description="Language code"),
use_realtime: bool = Form(False, description="Use Realtime 4B model for lower latency"),
):
"""
Transcribe audio using Voxtral Mini (Mistral AI).
Transcribe audio using Voxtral via vLLM server.
Best for: German, French, and other European languages
Supported formats: mp3, wav, m4a, flac
Models:
- Voxtral Mini 3B (default): Best quality
- Voxtral Realtime 4B: Lower latency (<500ms)
Falls back to Mistral API if vLLM is unavailable.
Supported formats: mp3, wav, m4a, flac, ogg, webm
Max file size: 100MB
"""
if not file.filename:
raise HTTPException(status_code=400, detail="No file provided")
# Validate language
from app.voxtral_service import SUPPORTED_LANGUAGES
from app.vllm_service import (
SUPPORTED_LANGUAGES,
is_available as vllm_available,
transcribe_audio_bytes as vllm_transcribe,
transcribe_with_realtime,
check_health,
)
from app.voxtral_api_service import (
is_available as api_available,
transcribe_audio_bytes as api_transcribe,
)
if language not in SUPPORTED_LANGUAGES:
raise HTTPException(
status_code=400,
@@ -226,10 +254,94 @@ async def transcribe_voxtral(
)
try:
from app.voxtral_service import transcribe_audio_bytes
audio_bytes = await file.read()
if len(audio_bytes) > 100 * 1024 * 1024:
raise HTTPException(status_code=400, detail="File too large (max 100MB)")
# Try vLLM first
if USE_VLLM:
health = await check_health()
if health.get("status") == "healthy":
logger.info("Using vLLM for Voxtral transcription")
if use_realtime:
result = await transcribe_with_realtime(
audio_bytes=audio_bytes,
filename=file.filename,
language=language,
)
else:
result = await vllm_transcribe(
audio_bytes=audio_bytes,
filename=file.filename,
language=language,
)
return TranscriptionResponse(
text=result.text,
language=result.language,
model=result.model,
latency_ms=result.latency_ms,
duration_seconds=result.duration_seconds,
)
# Fallback to Mistral API
if api_available():
logger.info("Falling back to Mistral API")
result = await api_transcribe(
audio_bytes=audio_bytes,
filename=file.filename,
language=language,
)
return TranscriptionResponse(
text=result.text,
language=result.language,
model=result.model,
latency_ms=None,
duration_seconds=result.duration_seconds,
)
raise HTTPException(
status_code=503,
detail="Voxtral not available. Start vLLM server or configure MISTRAL_API_KEY."
)
except HTTPException:
raise
except Exception as e:
logger.error(f"Voxtral transcription error: {e}")
raise HTTPException(status_code=500, detail=str(e))
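
A hypothetical client call for this endpoint, opting into the Realtime 4B model (host, port, and file name are placeholders):

# Hypothetical usage of /transcribe/voxtral with the Realtime 4B model.
import httpx

with open("meeting.m4a", "rb") as f:  # placeholder file
    r = httpx.post(
        "http://localhost:3020/transcribe/voxtral",
        files={"file": ("meeting.m4a", f, "audio/m4a")},
        data={"language": "de", "use_realtime": "true"},
        timeout=300.0,
    )
r.raise_for_status()
print(r.json()["model"], r.json()["text"])
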
@app.post("/transcribe/voxtral/api", response_model=TranscriptionResponse)
async def transcribe_voxtral_api(
file: UploadFile = File(..., description="Audio file to transcribe"),
language: Optional[str] = Form(None, description="Language code (auto-detect if not provided)"),
diarization: bool = Form(False, description="Enable speaker diarization"),
):
"""
Transcribe audio using Mistral's Voxtral API directly.
Features:
- Speaker diarization
- Auto language detection
- High quality (~4% WER)
Requires MISTRAL_API_KEY environment variable.
"""
from app.voxtral_api_service import is_available, transcribe_audio_bytes
if not is_available():
raise HTTPException(
status_code=503,
detail="Mistral API not configured. Set MISTRAL_API_KEY environment variable."
)
if not file.filename:
raise HTTPException(status_code=400, detail="No file provided")
try:
audio_bytes = await file.read()
if len(audio_bytes) > 100 * 1024 * 1024:
raise HTTPException(status_code=400, detail="File too large (max 100MB)")
@@ -237,59 +349,61 @@
audio_bytes=audio_bytes,
filename=file.filename,
language=language,
diarization=diarization,
)
models_status["voxtral_loaded"] = True
return TranscriptionResponse(
text=result.text,
language=result.language,
model=result.model,
duration_seconds=result.duration_seconds,
)
except Exception as e:
logger.error(f"Voxtral transcription error: {e}")
logger.error(f"Mistral API error: {e}")
raise HTTPException(status_code=500, detail=str(e))
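
And a hypothetical direct call with diarization enabled (file name is a placeholder; note the response model here surfaces only the text fields):

# Hypothetical call to /transcribe/voxtral/api with speaker diarization.
import httpx

with open("interview.wav", "rb") as f:  # placeholder file
    r = httpx.post(
        "http://localhost:3020/transcribe/voxtral/api",
        files={"file": ("interview.wav", f, "audio/wav")},
        data={"diarization": "true"},  # language omitted -> auto-detect
        timeout=300.0,
    )
r.raise_for_status()
print(r.json()["text"])
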
@app.post("/transcribe/auto", response_model=TranscriptionResponse)
async def transcribe_auto(
file: UploadFile = File(..., description="Audio file to transcribe"),
language: Optional[str] = Form(
None,
description="Language hint (optional)"
),
prefer: str = Form(
"whisper",
description="Preferred model: 'whisper' or 'voxtral'"
),
language: Optional[str] = Form(None, description="Language hint"),
prefer: str = Form("whisper", description="Preferred: 'whisper' or 'voxtral'"),
):
"""
Transcribe audio with automatic model selection.
Transcribe with automatic model selection and fallback.
- Uses Whisper by default (faster, more languages)
- Falls back to Voxtral if Whisper fails
Fallback chain:
1. Preferred model (whisper or voxtral)
2. Alternative model
3. Mistral API
"""
if prefer == "voxtral":
# Try Voxtral first
try:
return await transcribe_voxtral(file, language or "de")
return await transcribe_voxtral(file, language or "de", False)
except Exception as e:
logger.warning(f"Voxtral failed, trying Whisper: {e}")
# Reset file position
await file.seek(0)
return await transcribe_whisper(file, language, None)
try:
return await transcribe_whisper(file, language, None)
except Exception as e2:
logger.warning(f"Whisper failed, trying API: {e2}")
await file.seek(0)
return await transcribe_voxtral_api(file, language, False)
else:
# Try Whisper first (default)
try:
return await transcribe_whisper(file, language, None)
except Exception as e:
logger.warning(f"Whisper failed, trying Voxtral: {e}")
await file.seek(0)
return await transcribe_voxtral(file, language or "de")
try:
return await transcribe_voxtral(file, language or "de", False)
except Exception as e2:
logger.warning(f"Voxtral failed, trying API: {e2}")
await file.seek(0)
return await transcribe_voxtral_api(file, language, False)
# Error handlers
@app.exception_handler(Exception)
async def global_exception_handler(request, exc):
logger.error(f"Unhandled error: {exc}")


@@ -0,0 +1,178 @@
"""
vLLM Voxtral Service - Proxy to vLLM server for Voxtral transcription
vLLM provides optimized inference for Voxtral models with an OpenAI-compatible API.
This service proxies requests to the vLLM server.
Requirements:
- vLLM server running on VLLM_URL (default: http://localhost:8100)
- Model loaded: Voxtral-Mini-3B-2507 or Voxtral-Mini-4B-Realtime-2602
"""
import os
import logging
import time
import tempfile
import httpx
from pathlib import Path
from typing import Optional
from dataclasses import dataclass
logger = logging.getLogger(__name__)
# vLLM server configuration
VLLM_URL = os.getenv("VLLM_URL", "http://localhost:8100")
VLLM_TIMEOUT = int(os.getenv("VLLM_TIMEOUT", "300")) # 5 minutes for long audio
# Model IDs
VOXTRAL_3B = "mistralai/Voxtral-Mini-3B-2507"
VOXTRAL_4B_REALTIME = "mistralai/Voxtral-Mini-4B-Realtime-2602"
@dataclass
class VllmTranscriptionResult:
text: str
language: Optional[str] = None
model: str = "voxtral-vllm"
latency_ms: Optional[float] = None
duration_seconds: Optional[float] = None
async def check_health() -> dict:
"""Check if vLLM server is healthy."""
try:
async with httpx.AsyncClient(timeout=5.0) as client:
response = await client.get(f"{VLLM_URL}/health")
if response.status_code == 200:
return {"status": "healthy", "url": VLLM_URL}
return {"status": "unhealthy", "url": VLLM_URL, "code": response.status_code}
except Exception as e:
return {"status": "unavailable", "url": VLLM_URL, "error": str(e)}
async def get_models() -> list:
"""Get available models from vLLM server."""
try:
async with httpx.AsyncClient(timeout=5.0) as client:
response = await client.get(f"{VLLM_URL}/v1/models")
if response.status_code == 200:
data = response.json()
return [m["id"] for m in data.get("data", [])]
return []
except Exception:
return []
def is_available() -> bool:
"""Check if vLLM server is configured."""
return bool(VLLM_URL)
async def transcribe_audio_bytes(
audio_bytes: bytes,
filename: str,
language: Optional[str] = "de",
model: Optional[str] = None,
) -> VllmTranscriptionResult:
"""
Transcribe audio using vLLM Voxtral server.
Args:
audio_bytes: Raw audio bytes
filename: Original filename (for format detection)
language: Language code (de, en, fr, etc.)
model: Model to use (defaults to Voxtral-Mini-3B-2507)
Returns:
VllmTranscriptionResult with transcription
"""
start_time = time.time()
model_id = model or VOXTRAL_3B
logger.info(f"Transcribing via vLLM: {filename} ({len(audio_bytes)} bytes)")
# Save to temp file (vLLM API accepts file uploads)
ext = Path(filename).suffix or ".wav"
with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
tmp.write(audio_bytes)
tmp_path = tmp.name
try:
async with httpx.AsyncClient(timeout=VLLM_TIMEOUT) as client:
# Use OpenAI-compatible transcription endpoint
with open(tmp_path, "rb") as f:
files = {"file": (filename, f, "audio/wav")}
data = {
"model": model_id,
"language": language or "de",
"response_format": "json",
"temperature": 0.0, # Deterministic for transcription
}
response = await client.post(
f"{VLLM_URL}/v1/audio/transcriptions",
files=files,
data=data,
)
if response.status_code != 200:
error_detail = response.text
logger.error(f"vLLM error: {response.status_code} - {error_detail}")
raise RuntimeError(f"vLLM transcription failed: {error_detail}")
result = response.json()
text = result.get("text", "")
duration = result.get("duration")
latency_ms = (time.time() - start_time) * 1000
logger.info(f"vLLM transcription complete: {len(text)} chars in {latency_ms:.0f}ms")
return VllmTranscriptionResult(
text=text.strip(),
language=language,
model=f"vllm-{model_id.split('/')[-1]}",
latency_ms=latency_ms,
duration_seconds=duration,
)
finally:
try:
os.unlink(tmp_path)
except Exception:
pass
async def transcribe_with_realtime(
audio_bytes: bytes,
filename: str,
language: Optional[str] = "de",
) -> VllmTranscriptionResult:
"""
Transcribe using Voxtral 4B Realtime model.
Optimized for low latency (<500ms).
"""
return await transcribe_audio_bytes(
audio_bytes=audio_bytes,
filename=filename,
language=language,
model=VOXTRAL_4B_REALTIME,
)
# Supported languages (same as Voxtral)
SUPPORTED_LANGUAGES = [
"en", # English
"zh", # Chinese
"hi", # Hindi
"es", # Spanish
"ar", # Arabic
"fr", # French
"pt", # Portuguese
"ru", # Russian
"de", # German
"ja", # Japanese
"ko", # Korean
"it", # Italian
"nl", # Dutch
]
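
A short driver for this module, assuming it is run from the service root (file name is a placeholder):

# Sketch: exercising the vLLM proxy module directly with asyncio.
import asyncio
from app.vllm_service import check_health, transcribe_audio_bytes

async def main() -> None:
    health = await check_health()
    if health["status"] != "healthy":
        raise SystemExit(f"vLLM not reachable: {health}")
    with open("test.mp3", "rb") as f:  # placeholder audio file
        result = await transcribe_audio_bytes(f.read(), "test.mp3", language="de")
    print(result.model, f"{result.latency_ms:.0f}ms", result.text)

asyncio.run(main())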


@@ -0,0 +1,213 @@
"""
Voxtral API Service - Mistral Cloud API Fallback
Uses Mistral's hosted Voxtral Mini Transcribe V2 when local service is overloaded.
Features:
- Speaker diarization
- Word-level timestamps
- Context biasing for domain-specific terms
- Support for 13 languages
"""
import os
import logging
import tempfile
from pathlib import Path
from typing import Optional, Literal
from dataclasses import dataclass, field
logger = logging.getLogger(__name__)
# Lazy load client
_mistral_client = None
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
DEFAULT_MODEL = "voxtral-mini-latest" # voxtral-mini-2602
@dataclass
class Speaker:
"""Speaker information from diarization."""
id: str
start: float
end: float
@dataclass
class WordTimestamp:
"""Word-level timestamp."""
word: str
start: float
end: float
@dataclass
class SegmentTimestamp:
"""Segment-level timestamp."""
text: str
start: float
end: float
speaker: Optional[str] = None
@dataclass
class VoxtralApiResult:
"""Result from Voxtral API transcription."""
text: str
language: Optional[str] = None
model: str = "voxtral-api"
duration_seconds: Optional[float] = None
words: list[WordTimestamp] = field(default_factory=list)
segments: list[SegmentTimestamp] = field(default_factory=list)
speakers: list[Speaker] = field(default_factory=list)
def get_mistral_client():
"""Get or create Mistral client instance."""
global _mistral_client
if _mistral_client is None:
if not MISTRAL_API_KEY:
raise RuntimeError(
"MISTRAL_API_KEY environment variable not set. "
"Get your API key at https://console.mistral.ai/"
)
try:
from mistralai import Mistral
_mistral_client = Mistral(api_key=MISTRAL_API_KEY)
logger.info("Mistral API client initialized")
except ImportError:
raise RuntimeError(
"mistralai package not installed. "
"Run: pip install mistralai"
)
return _mistral_client
def is_available() -> bool:
"""Check if Mistral API is configured and available."""
return bool(MISTRAL_API_KEY)
async def transcribe_audio_bytes(
audio_bytes: bytes,
filename: str,
language: Optional[str] = None,
timestamp_granularity: Optional[Literal["word", "segment"]] = None,
diarization: bool = False,
context_bias: Optional[list[str]] = None,
) -> VoxtralApiResult:
"""
Transcribe audio using Mistral's Voxtral API.
Args:
audio_bytes: Raw audio bytes
filename: Original filename (for extension detection)
language: Language code (de, en, fr, etc.) - auto-detect if None
timestamp_granularity: "word" or "segment" for timestamps
diarization: Enable speaker diarization
context_bias: List of domain-specific terms to improve accuracy (max 100)
Returns:
VoxtralApiResult with transcription and optional metadata
"""
client = get_mistral_client()
logger.info(f"Transcribing via Mistral API: {filename} ({len(audio_bytes)} bytes)")
try:
# Build request parameters
request_params = {
"model": DEFAULT_MODEL,
"file": {
"content": audio_bytes,
"file_name": filename,
},
}
# Language and timestamps are mutually exclusive in current API
if language and not timestamp_granularity:
request_params["language"] = language
if timestamp_granularity:
request_params["timestamp_granularities"] = [timestamp_granularity]
if diarization:
request_params["diarization"] = True
if context_bias:
# API accepts comma-separated string, max 100 terms
bias_terms = context_bias[:100]
request_params["context_bias"] = ",".join(bias_terms)
# Make API call
response = client.audio.transcriptions.complete(**request_params)
# Parse response
result = VoxtralApiResult(
text=response.text,
language=getattr(response, "language", language),
model=f"voxtral-api-{DEFAULT_MODEL}",
duration_seconds=getattr(response, "duration", None),
)
# Parse word timestamps if present
if hasattr(response, "words") and response.words:
result.words = [
WordTimestamp(
word=w.word,
start=w.start,
end=w.end,
)
for w in response.words
]
# Parse segment timestamps if present
if hasattr(response, "segments") and response.segments:
result.segments = [
SegmentTimestamp(
text=s.text,
start=s.start,
end=s.end,
speaker=getattr(s, "speaker", None),
)
for s in response.segments
]
# Parse speakers if diarization enabled
if hasattr(response, "speakers") and response.speakers:
result.speakers = [
Speaker(
id=sp.id,
start=sp.start,
end=sp.end,
)
for sp in response.speakers
]
logger.info(f"Mistral API transcription complete: {len(result.text)} characters")
return result
except Exception as e:
logger.error(f"Mistral API transcription failed: {e}")
raise
# Supported languages by Voxtral API (13 languages)
SUPPORTED_LANGUAGES = [
"en", # English
"zh", # Chinese
"hi", # Hindi
"es", # Spanish
"ar", # Arabic
"fr", # French
"pt", # Portuguese
"ru", # Russian
"de", # German
"ja", # Japanese
"ko", # Korean
"it", # Italian
"nl", # Dutch
]
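
A sketch of direct module use with word timestamps and context biasing (audio file and bias terms are placeholders); per the comment in transcribe_audio_bytes, requesting timestamps drops the explicit language parameter:

# Sketch: Mistral API fallback with word timestamps and context bias.
import asyncio
from app.voxtral_api_service import transcribe_audio_bytes

async def main() -> None:
    with open("standup.wav", "rb") as f:  # placeholder audio file
        audio = f.read()
    result = await transcribe_audio_bytes(
        audio,
        "standup.wav",
        timestamp_granularity="word",  # language is then auto-detected
        context_bias=["ManaCore", "Voxtral", "vLLM"],  # placeholder terms
    )
    for w in result.words[:5]:
        print(f"{w.start:6.2f}-{w.end:6.2f}  {w.word}")

asyncio.run(main())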


@@ -1,12 +1,15 @@
"""
Voxtral STT Service using Hugging Face Transformers
Mistral AI's Speech-to-Text model (Apache 2.0 License)
Uses VoxtralForConditionalGeneration with apply_transcription_request
as per official HuggingFace documentation.
"""
import os
import tempfile
import logging
import base64
import time
from pathlib import Path
from typing import Optional
from dataclasses import dataclass
@@ -16,68 +19,80 @@ logger = logging.getLogger(__name__)
# Lazy load to avoid import errors
_voxtral_model = None
_voxtral_processor = None
_model_name = None
# Default model
DEFAULT_MODEL = "mistralai/Voxtral-Mini-3B-2507"
@dataclass
class VoxtralTranscriptionResult:
text: str
language: Optional[str] = None
model: str = "voxtral-mini"
model: str = "voxtral-mini-3b"
latency_ms: Optional[float] = None
def get_voxtral_model(model_name: str = "mistralai/Voxtral-Mini-3B-2507"):
def get_voxtral_model(model_name: str = DEFAULT_MODEL):
"""
Get or create Voxtral model instance.
Note: Voxtral Mini (3B) is recommended for Mac Mini M4.
Voxtral Small (24B) requires more VRAM.
Uses VoxtralForConditionalGeneration (the correct class for Voxtral).
"""
global _voxtral_model, _voxtral_processor
global _voxtral_model, _voxtral_processor, _model_name
# Reload if different model requested
if _voxtral_model is not None and _model_name != model_name:
logger.info(f"Switching model from {_model_name} to {model_name}")
_voxtral_model = None
_voxtral_processor = None
if _voxtral_model is None:
logger.info(f"Loading Voxtral model: {model_name}")
try:
import torch
from transformers import AutoModel, AutoProcessor
from transformers import VoxtralForConditionalGeneration, AutoProcessor
# Determine device
# Determine device and dtype
if torch.backends.mps.is_available():
device = "mps"
# MPS works better with float16
torch_dtype = torch.float16
elif torch.cuda.is_available():
device = "cuda"
torch_dtype = torch.float16
torch_dtype = torch.bfloat16
else:
device = "cpu"
torch_dtype = torch.float32
logger.info(f"Using device: {device}")
logger.info(f"Using device: {device}, dtype: {torch_dtype}")
# Load processor
_voxtral_processor = AutoProcessor.from_pretrained(
model_name,
trust_remote_code=True,
)
_voxtral_processor = AutoProcessor.from_pretrained(model_name)
# Load model - Voxtral uses AutoModel, not AutoModelForSpeechSeq2Seq
_voxtral_model = AutoModel.from_pretrained(
model_name,
torch_dtype=torch_dtype,
device_map="auto" if device != "mps" else None,
trust_remote_code=True,
)
# Move to MPS if available (device_map doesn't support MPS)
# Load model with VoxtralForConditionalGeneration
if device == "mps":
# MPS doesn't support device_map, load to CPU first then move
_voxtral_model = VoxtralForConditionalGeneration.from_pretrained(
model_name,
torch_dtype=torch_dtype,
)
_voxtral_model = _voxtral_model.to(device)
else:
_voxtral_model = VoxtralForConditionalGeneration.from_pretrained(
model_name,
torch_dtype=torch_dtype,
device_map=device,
)
_model_name = model_name
logger.info(f"Voxtral model loaded successfully on {device}")
except ImportError as e:
logger.error(f"Failed to import transformers: {e}")
raise RuntimeError(
"transformers not installed. "
"Run: pip install transformers torch"
"transformers >= 4.54.0 required. "
"Run: pip install --upgrade transformers"
)
except Exception as e:
logger.error(f"Failed to load Voxtral model: {e}")
@@ -89,17 +104,16 @@ def get_voxtral_model(model_name: str = "mistralai/Voxtral-Mini-3B-2507"):
def transcribe_audio(
audio_path: str,
language: Optional[str] = "de",
model_name: str = "mistralai/Voxtral-Mini-3B-2507",
model_name: str = DEFAULT_MODEL,
) -> VoxtralTranscriptionResult:
"""
Transcribe audio file using Voxtral.
Voxtral is a multimodal audio understanding model that can be prompted
for transcription tasks.
Uses the official apply_transcription_request method.
Args:
audio_path: Path to audio file
language: Target language for transcription
language: Language code (de, en, fr, etc.)
model_name: Hugging Face model ID
Returns:
@@ -108,84 +122,49 @@ def transcribe_audio(
import torch
model, processor = get_voxtral_model(model_name)
device = next(model.parameters()).device
dtype = next(model.parameters()).dtype
logger.info(f"Transcribing with Voxtral: {audio_path}")
start_time = time.time()
try:
# Load audio file as bytes and encode to base64
with open(audio_path, "rb") as f:
audio_bytes = f.read()
audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")
# Determine audio format from extension
ext = Path(audio_path).suffix.lower()
mime_types = {
".wav": "audio/wav",
".mp3": "audio/mpeg",
".m4a": "audio/m4a",
".flac": "audio/flac",
".ogg": "audio/ogg",
".webm": "audio/webm",
}
mime_type = mime_types.get(ext, "audio/wav")
# Language mapping for prompts
lang_names = {
"de": "German",
"en": "English",
"fr": "French",
"es": "Spanish",
"pt": "Portuguese",
"it": "Italian",
"nl": "Dutch",
"hi": "Hindi",
}
lang_name = lang_names.get(language, "German")
# Create transcription prompt with base64 audio
messages = [
{
"role": "user",
"content": [
{"type": "audio_url", "audio_url": {"url": f"data:{mime_type};base64,{audio_base64}"}},
{"type": "text", "text": f"Transcribe this audio in {lang_name}. Only output the transcription, nothing else."},
],
}
]
# Apply chat template and process inputs
inputs = processor.apply_chat_template(
messages,
tokenize=True,
return_tensors="pt",
return_dict=True,
# Use apply_transcription_request (official method)
# This handles audio loading and preprocessing internally
inputs = processor.apply_transcription_request(
language=language or "en",
audio=audio_path,
model_id=model_name,
)
# Move to same device as model
device = next(model.parameters()).device
inputs = {k: v.to(device) if hasattr(v, 'to') else v for k, v in inputs.items()}
# Move inputs to device and dtype
inputs = inputs.to(device, dtype=dtype)
# Generate transcription
with torch.no_grad():
generated_ids = model.generate(
outputs = model.generate(
**inputs,
max_new_tokens=512,
max_new_tokens=500,
do_sample=False,
)
# Decode only the generated tokens (exclude input)
input_len = inputs["input_ids"].shape[-1]
text = processor.batch_decode(
generated_ids[:, input_len:],
# Decode - skip input tokens
input_len = inputs.input_ids.shape[1]
decoded = processor.batch_decode(
outputs[:, input_len:],
skip_special_tokens=True,
)[0]
)
logger.info(f"Voxtral transcription complete: {len(text)} characters")
text = decoded[0] if decoded else ""
latency_ms = (time.time() - start_time) * 1000
logger.info(f"Voxtral transcription complete: {len(text)} chars in {latency_ms:.0f}ms")
return VoxtralTranscriptionResult(
text=text.strip(),
language=language,
model="voxtral-mini",
model=model_name.split("/")[-1],
latency_ms=latency_ms,
)
except Exception as e:
@@ -197,7 +176,7 @@ async def transcribe_audio_bytes(
audio_bytes: bytes,
filename: str,
language: Optional[str] = "de",
model_name: str = "mistralai/Voxtral-Mini-3B-2507",
model_name: str = DEFAULT_MODEL,
) -> VoxtralTranscriptionResult:
"""
Transcribe audio from bytes (for API uploads).
@@ -222,14 +201,67 @@ async def transcribe_audio_bytes(
pass
# Supported languages by Voxtral
def unload_model():
"""Unload model to free memory."""
global _voxtral_model, _voxtral_processor, _model_name
if _voxtral_model is not None:
del _voxtral_model
del _voxtral_processor
_voxtral_model = None
_voxtral_processor = None
_model_name = None
import gc
gc.collect()
try:
import torch
if torch.backends.mps.is_available():
torch.mps.empty_cache()
elif torch.cuda.is_available():
torch.cuda.empty_cache()
except Exception:
pass
logger.info("Voxtral model unloaded")
def is_loaded() -> bool:
"""Check if model is currently loaded."""
return _voxtral_model is not None
def get_loaded_model_name() -> Optional[str]:
"""Get name of currently loaded model."""
return _model_name
# Supported languages (13 languages as per Mistral docs)
SUPPORTED_LANGUAGES = [
"en", # English
"de", # German
"fr", # French
"zh", # Chinese
"hi", # Hindi
"es", # Spanish
"ar", # Arabic
"fr", # French
"pt", # Portuguese
"ru", # Russian
"de", # German
"ja", # Japanese
"ko", # Korean
"it", # Italian
"nl", # Dutch
"hi", # Hindi
]
# Available models
AVAILABLE_MODELS = [
{
"id": "voxtral-mini-3b",
"name": "Voxtral-Mini-3B-2507",
"huggingface_id": "mistralai/Voxtral-Mini-3B-2507",
"params": "3B",
"vram": "~6GB",
"description": "Balanced quality and speed for local deployment",
},
]
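
For completeness, a sketch of the local Transformers path (synchronous variant; the audio path is a placeholder, and the first call downloads the model weights):

# Sketch: local Voxtral transcription via Transformers, then freeing memory.
from app.voxtral_service import is_loaded, transcribe_audio, unload_model

result = transcribe_audio("test.wav", language="de")  # placeholder file
print(result.model, result.text)

if is_loaded():
    unload_model()  # releases MPS/CUDA memory between batches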


@@ -0,0 +1,41 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Label</key>
<string>com.manacore.mana-stt</string>
<key>ProgramArguments</key>
<array>
<string>/bin/bash</string>
<string>-c</string>
<string>cd /Users/mana/projects/manacore-monorepo/services/mana-stt &amp;&amp; .venv/bin/uvicorn app.main:app --host 0.0.0.0 --port 3020</string>
</array>
<key>WorkingDirectory</key>
<string>/Users/mana/projects/manacore-monorepo/services/mana-stt</string>
<key>EnvironmentVariables</key>
<dict>
<key>PATH</key>
<string>/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin</string>
<key>PORT</key>
<string>3020</string>
</dict>
<key>RunAtLoad</key>
<true/>
<key>KeepAlive</key>
<true/>
<key>StandardOutPath</key>
<string>/Users/mana/logs/mana-stt.log</string>
<key>StandardErrorPath</key>
<string>/Users/mana/logs/mana-stt.error.log</string>
<key>ThrottleInterval</key>
<integer>10</integer>
</dict>
</plist>


@@ -0,0 +1,41 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Label</key>
<string>com.manacore.vllm-voxtral</string>
<key>ProgramArguments</key>
<array>
<string>/bin/bash</string>
<string>-c</string>
<string>cd /Users/mana/projects/manacore-monorepo/services/mana-stt &amp;&amp; ./scripts/start-vllm-voxtral.sh</string>
</array>
<key>WorkingDirectory</key>
<string>/Users/mana/projects/manacore-monorepo/services/mana-stt</string>
<key>EnvironmentVariables</key>
<dict>
<key>PATH</key>
<string>/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin</string>
<key>VLLM_PORT</key>
<string>8100</string>
</dict>
<key>RunAtLoad</key>
<true/>
<key>KeepAlive</key>
<true/>
<key>StandardOutPath</key>
<string>/Users/mana/logs/vllm-voxtral.log</string>
<key>StandardErrorPath</key>
<string>/Users/mana/logs/vllm-voxtral.error.log</string>
<key>ThrottleInterval</key>
<integer>30</integer>
</dict>
</plist>


@@ -0,0 +1,45 @@
#!/bin/bash
# Install mana-stt as a launchd service on macOS
# Run this script on the Mac Mini server
set -e
SERVICE_NAME="com.manacore.mana-stt"
PLIST_FILE="$SERVICE_NAME.plist"
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
LAUNCH_AGENTS_DIR="$HOME/Library/LaunchAgents"
LOG_DIR="$HOME/logs"
echo "Installing mana-stt launchd service..."
# Create logs directory
mkdir -p "$LOG_DIR"
# Stop existing service if running
if launchctl list | grep -q "$SERVICE_NAME"; then
echo "Stopping existing service..."
launchctl unload "$LAUNCH_AGENTS_DIR/$PLIST_FILE" 2>/dev/null || true
fi
# Copy plist to LaunchAgents
cp "$SCRIPT_DIR/$PLIST_FILE" "$LAUNCH_AGENTS_DIR/"
# Load the service
echo "Loading service..."
launchctl load "$LAUNCH_AGENTS_DIR/$PLIST_FILE"
# Check status
sleep 2
if launchctl list | grep -q "$SERVICE_NAME"; then
echo "Service installed and running!"
echo ""
echo "Useful commands:"
echo " View logs: tail -f $LOG_DIR/mana-stt.log"
echo " View errors: tail -f $LOG_DIR/mana-stt.error.log"
echo " Stop: launchctl unload $LAUNCH_AGENTS_DIR/$PLIST_FILE"
echo " Start: launchctl load $LAUNCH_AGENTS_DIR/$PLIST_FILE"
echo " Health check: curl http://localhost:3020/health"
else
echo "ERROR: Service failed to start. Check logs at $LOG_DIR/mana-stt.error.log"
exit 1
fi


@@ -0,0 +1,84 @@
#!/bin/bash
# Install mana-stt and vllm-voxtral as launchd services on macOS
# Run this script on the Mac Mini server
set -e
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
LAUNCH_AGENTS_DIR="$HOME/Library/LaunchAgents"
LOG_DIR="$HOME/logs"
echo "============================================"
echo "Installing ManaCore STT Services"
echo "============================================"
echo ""
# Create logs directory
mkdir -p "$LOG_DIR"
install_service() {
local service_name="$1"
local plist_file="$service_name.plist"
echo "Installing $service_name..."
# Stop existing service if running
if launchctl list | grep -q "$service_name"; then
echo " Stopping existing service..."
launchctl unload "$LAUNCH_AGENTS_DIR/$plist_file" 2>/dev/null || true
fi
# Copy plist to LaunchAgents
cp "$SCRIPT_DIR/$plist_file" "$LAUNCH_AGENTS_DIR/"
# Load the service
echo " Loading service..."
launchctl load "$LAUNCH_AGENTS_DIR/$plist_file"
sleep 2
if launchctl list | grep -q "$service_name"; then
echo "$service_name installed and running"
else
echo "$service_name failed to start"
return 1
fi
}
# Install vLLM first (STT depends on it)
install_service "com.manacore.vllm-voxtral"
# Wait for vLLM to initialize
echo ""
echo "Waiting for vLLM server to initialize..."
for i in {1..30}; do
if curl -s http://localhost:8100/health > /dev/null 2>&1; then
echo " ✓ vLLM server is ready"
break
fi
if [ $i -eq 30 ]; then
echo " ! vLLM server not responding yet (may still be loading model)"
fi
sleep 2
done
# Install STT service
echo ""
install_service "com.manacore.mana-stt"
echo ""
echo "============================================"
echo "Installation complete!"
echo "============================================"
echo ""
echo "Services:"
echo " vLLM Voxtral: http://localhost:8100"
echo " ManaCore STT: http://localhost:3020"
echo ""
echo "Useful commands:"
echo " View vLLM logs: tail -f $LOG_DIR/vllm-voxtral.log"
echo " View STT logs: tail -f $LOG_DIR/mana-stt.log"
echo " Health check: curl http://localhost:3020/health"
echo ""
echo "Stop all:"
echo " launchctl unload $LAUNCH_AGENTS_DIR/com.manacore.vllm-voxtral.plist"
echo " launchctl unload $LAUNCH_AGENTS_DIR/com.manacore.mana-stt.plist"


@@ -0,0 +1,83 @@
#!/bin/bash
# Setup vLLM for Voxtral on Mac Mini M4
#
# vLLM runs in CPU mode on macOS (no CUDA), but still provides
# the optimized inference pipeline for Voxtral models.
#
# Usage: ./scripts/setup-vllm.sh
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SERVICE_DIR="$(dirname "$SCRIPT_DIR")"
VENV_DIR="$SERVICE_DIR/.venv-vllm"
echo "============================================"
echo "vLLM Setup for Voxtral on Mac Mini M4"
echo "============================================"
echo ""
# Check Python version
PYTHON_VERSION=$(python3 --version 2>&1 | awk '{print $2}')
PYTHON_MAJOR=$(echo $PYTHON_VERSION | cut -d. -f1)
PYTHON_MINOR=$(echo $PYTHON_VERSION | cut -d. -f2)
if [[ "$PYTHON_MAJOR" -lt 3 ]] || [[ "$PYTHON_MAJOR" -eq 3 && "$PYTHON_MINOR" -lt 10 ]]; then
echo "Error: Python 3.10+ required (found $PYTHON_VERSION)"
exit 1
fi
echo "Python version: $PYTHON_VERSION"
# Create separate venv for vLLM (to avoid conflicts with whisper)
echo ""
echo "Creating virtual environment for vLLM..."
python3 -m venv "$VENV_DIR"
source "$VENV_DIR/bin/activate"
# Upgrade pip
pip install --upgrade pip --quiet
# Install vLLM with audio support
echo ""
echo "Installing vLLM with audio support..."
echo "This may take a few minutes..."
# Install uv for faster package installation
pip install uv --quiet
# Install vLLM with audio support (nightly for best Voxtral support)
uv pip install "vllm[audio]>=0.10.0" --extra-index-url https://wheels.vllm.ai/nightly 2>&1 || {
echo "Nightly install failed, trying stable..."
uv pip install "vllm[audio]>=0.10.0"
}
# Install mistral-common with audio
uv pip install "mistral-common[audio]>=1.8.1"
echo ""
echo "============================================"
echo "Installation complete!"
echo "============================================"
echo ""
echo "To start Voxtral Mini 3B server:"
echo " source $VENV_DIR/bin/activate"
echo " vllm serve mistralai/Voxtral-Mini-3B-2507 \\"
echo " --tokenizer_mode mistral \\"
echo " --config_format mistral \\"
echo " --load_format mistral \\"
echo " --host 0.0.0.0 \\"
echo " --port 8100"
echo ""
echo "To start Voxtral Realtime 4B server:"
echo " source $VENV_DIR/bin/activate"
echo " vllm serve mistralai/Voxtral-Mini-4B-Realtime-2602 \\"
echo " --host 0.0.0.0 \\"
echo " --port 8100"
echo ""
echo "API Endpoint: http://localhost:8100/v1/audio/transcriptions"
echo ""
echo "Test with:"
echo " curl http://localhost:8100/v1/audio/transcriptions \\"
echo " -F file=@test.mp3 \\"
echo " -F model=mistralai/Voxtral-Mini-3B-2507 \\"
echo " -F language=de"


@@ -0,0 +1,36 @@
#!/bin/bash
# Start vLLM server for Voxtral
#
# Usage: ./scripts/start-vllm-voxtral.sh [model]
# model: "3b" (default) or "4b" for Realtime
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SERVICE_DIR="$(dirname "$SCRIPT_DIR")"
VENV_DIR="$SERVICE_DIR/.venv-vllm"
MODEL="${1:-3b}"
PORT="${VLLM_PORT:-8100}"
# Activate venv
source "$VENV_DIR/bin/activate"
echo "Starting vLLM Voxtral server..."
echo "Port: $PORT"
if [[ "$MODEL" == "4b" || "$MODEL" == "realtime" ]]; then
echo "Model: Voxtral Mini 4B Realtime"
exec vllm serve mistralai/Voxtral-Mini-4B-Realtime-2602 \
--host 0.0.0.0 \
--port "$PORT" \
--max-model-len 8192
else
echo "Model: Voxtral Mini 3B"
exec vllm serve mistralai/Voxtral-Mini-3B-2507 \
--tokenizer_mode mistral \
--config_format mistral \
--load_format mistral \
--host 0.0.0.0 \
--port "$PORT" \
--max-model-len 32768
fi