From 60394076e5120a458165ba8c53f3281cc62d1f95 Mon Sep 17 00:00:00 2001 From: Till-JS <101404291+Till-JS@users.noreply.github.com> Date: Wed, 11 Feb 2026 16:10:00 +0100 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20feat(mana-stt):=20add=20vLLM=20inte?= =?UTF-8?q?gration=20for=20Voxtral=20transcription?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add vllm_service.py as proxy to vLLM server for Voxtral 3B/4B - Add voxtral_api_service.py for Mistral API fallback - Update main.py with /transcribe/voxtral endpoint using vLLM - Add /transcribe/auto endpoint with automatic fallback chain - Create setup-vllm.sh and start-vllm-voxtral.sh scripts - Add launchd plist files for Mac Mini deployment - Add install-services.sh for automated service installation Architecture: - vLLM server runs Voxtral models on port 8100 - mana-stt proxies to vLLM with Mistral API fallback - Fallback chain: vLLM -> Mistral API Co-Authored-By: Claude Opus 4.5 --- services/mana-stt/.env.example | 31 +++ services/mana-stt/app/main.py | 252 +++++++++++++----- services/mana-stt/app/vllm_service.py | 178 +++++++++++++ services/mana-stt/app/voxtral_api_service.py | 213 +++++++++++++++ services/mana-stt/app/voxtral_service.py | 218 ++++++++------- services/mana-stt/com.manacore.mana-stt.plist | 41 +++ .../mana-stt/com.manacore.vllm-voxtral.plist | 41 +++ services/mana-stt/install-service.sh | 45 ++++ services/mana-stt/install-services.sh | 84 ++++++ services/mana-stt/scripts/setup-vllm.sh | 83 ++++++ .../mana-stt/scripts/start-vllm-voxtral.sh | 36 +++ 11 files changed, 1060 insertions(+), 162 deletions(-) create mode 100644 services/mana-stt/.env.example create mode 100644 services/mana-stt/app/vllm_service.py create mode 100644 services/mana-stt/app/voxtral_api_service.py create mode 100644 services/mana-stt/com.manacore.mana-stt.plist create mode 100644 services/mana-stt/com.manacore.vllm-voxtral.plist create mode 100755 services/mana-stt/install-service.sh create mode 100755 services/mana-stt/install-services.sh create mode 100755 services/mana-stt/scripts/setup-vllm.sh create mode 100755 services/mana-stt/scripts/start-vllm-voxtral.sh diff --git a/services/mana-stt/.env.example b/services/mana-stt/.env.example new file mode 100644 index 000000000..df9062cbe --- /dev/null +++ b/services/mana-stt/.env.example @@ -0,0 +1,31 @@ +# ManaCore STT Service Configuration +# Copy to .env and adjust values as needed + +# Server +PORT=3020 + +# Whisper (Lightning MLX) +WHISPER_MODEL=large-v3 + +# Voxtral (Local Models) +# Options: voxtral-mini-3b, voxtral-realtime-4b, voxtral-small-24b +VOXTRAL_MODEL=voxtral-realtime-4b + +# Model Loading +# Set to true to preload models on startup (slower startup, faster first request) +PRELOAD_MODELS=false + +# Load Management +# Maximum concurrent transcription requests before API fallback +MAX_CONCURRENT_REQUESTS=3 + +# API Fallback +# Enable automatic fallback to Mistral API when overloaded +API_FALLBACK_ENABLED=true + +# Mistral API Key (required for API fallback) +# Get your key at https://console.mistral.ai/ +MISTRAL_API_KEY= + +# CORS Origins (comma-separated) +CORS_ORIGINS=https://mana.how,https://chat.mana.how,http://localhost:5173 diff --git a/services/mana-stt/app/main.py b/services/mana-stt/app/main.py index 717115f2f..5423f044e 100644 --- a/services/mana-stt/app/main.py +++ b/services/mana-stt/app/main.py @@ -1,16 +1,17 @@ """ ManaCore STT API Service -Speech-to-Text with Whisper (MLX) and Voxtral +Speech-to-Text with Whisper (MLX), Voxtral (vLLM), and Mistral API (fallback) Run with: uvicorn app.main:app --host 0.0.0.0 --port 3020 """ import os import logging +import time from typing import Optional from contextlib import asynccontextmanager -from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Query +from fastapi import FastAPI, File, UploadFile, Form, HTTPException from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse from pydantic import BaseModel @@ -31,32 +32,39 @@ CORS_ORIGINS = os.getenv( "https://mana.how,https://chat.mana.how,http://localhost:5173" ).split(",") +# vLLM configuration +VLLM_URL = os.getenv("VLLM_URL", "http://localhost:8100") +USE_VLLM = os.getenv("USE_VLLM", "true").lower() == "true" + # Response models class TranscriptionResponse(BaseModel): text: str language: Optional[str] = None model: str + latency_ms: Optional[float] = None duration_seconds: Optional[float] = None class HealthResponse(BaseModel): status: str whisper_loaded: bool - voxtral_loaded: bool + vllm_available: bool + vllm_url: Optional[str] = None + mistral_api_available: bool models: dict class ModelsResponse(BaseModel): whisper: list - voxtral: list + voxtral_vllm: list default_whisper: str # Track loaded models models_status = { "whisper_loaded": False, - "voxtral_loaded": False, + "vllm_available": False, } @@ -65,9 +73,24 @@ async def lifespan(app: FastAPI): """Startup and shutdown events.""" logger.info("Starting ManaCore STT Service...") - # Optionally preload models on startup + # Check vLLM availability + if USE_VLLM: + from app.vllm_service import check_health + health = await check_health() + models_status["vllm_available"] = health.get("status") == "healthy" + if models_status["vllm_available"]: + logger.info(f"vLLM server available at {VLLM_URL}") + else: + logger.warning(f"vLLM server not available: {health}") + + # Check Mistral API + from app.voxtral_api_service import is_available as api_available + if api_available(): + logger.info("Mistral API fallback configured") + + # Optionally preload Whisper if PRELOAD_MODELS: - logger.info("Preloading models (PRELOAD_MODELS=true)...") + logger.info("Preloading Whisper model...") try: from app.whisper_service import get_whisper_model get_whisper_model(DEFAULT_WHISPER_MODEL) @@ -76,16 +99,6 @@ async def lifespan(app: FastAPI): except Exception as e: logger.warning(f"Failed to preload Whisper: {e}") - try: - from app.voxtral_service import get_voxtral_model - get_voxtral_model() - models_status["voxtral_loaded"] = True - logger.info("Voxtral model preloaded") - except Exception as e: - logger.warning(f"Failed to preload Voxtral: {e}") - else: - logger.info("Models will be loaded on first request (lazy loading)") - logger.info(f"STT Service ready on port {PORT}") yield logger.info("Shutting down STT Service...") @@ -94,8 +107,8 @@ async def lifespan(app: FastAPI): # Create FastAPI app app = FastAPI( title="ManaCore STT Service", - description="Speech-to-Text API with Whisper (MLX) and Voxtral", - version="1.0.0", + description="Speech-to-Text API with Whisper (MLX), Voxtral (vLLM), and Mistral API", + version="2.0.0", lifespan=lifespan, ) @@ -112,10 +125,17 @@ app.add_middleware( @app.get("/health", response_model=HealthResponse) async def health_check(): """Health check endpoint.""" + from app.voxtral_api_service import is_available as api_available + from app.vllm_service import check_health + + vllm_health = await check_health() + return HealthResponse( status="healthy", whisper_loaded=models_status["whisper_loaded"], - voxtral_loaded=models_status["voxtral_loaded"], + vllm_available=vllm_health.get("status") == "healthy", + vllm_url=VLLM_URL if USE_VLLM else None, + mistral_api_available=api_available(), models={ "default_whisper": DEFAULT_WHISPER_MODEL, }, @@ -126,11 +146,13 @@ async def health_check(): async def list_models(): """List available models.""" from app.whisper_service import AVAILABLE_MODELS as whisper_models - from app.voxtral_service import SUPPORTED_LANGUAGES as voxtral_languages + from app.vllm_service import get_models + + vllm_models = await get_models() return ModelsResponse( whisper=whisper_models, - voxtral=voxtral_languages, + voxtral_vllm=vllm_models, default_whisper=DEFAULT_WHISPER_MODEL, ) @@ -138,25 +160,19 @@ async def list_models(): @app.post("/transcribe", response_model=TranscriptionResponse) async def transcribe_whisper( file: UploadFile = File(..., description="Audio file to transcribe"), - language: Optional[str] = Form( - None, - description="Language code (e.g., 'de', 'en'). Auto-detect if not provided." - ), - model: str = Form( - None, - description="Whisper model to use (default: large-v3-turbo)" - ), + language: Optional[str] = Form(None, description="Language code (auto-detect if not provided)"), + model: Optional[str] = Form(None, description="Whisper model to use"), ): """ Transcribe audio using Whisper (Lightning MLX). + Best for: General transcription, many languages Supported formats: mp3, wav, m4a, flac, ogg, webm Max file size: 100MB """ if not file.filename: raise HTTPException(status_code=400, detail="No file provided") - # Validate file type allowed_extensions = {".mp3", ".wav", ".m4a", ".flac", ".ogg", ".webm", ".mp4"} ext = os.path.splitext(file.filename)[1].lower() if ext not in allowed_extensions: @@ -165,20 +181,17 @@ async def transcribe_whisper( detail=f"Unsupported file type: {ext}. Allowed: {allowed_extensions}" ) + start_time = time.time() + try: from app.whisper_service import transcribe_audio_bytes - # Read file audio_bytes = await file.read() - - # Check file size (100MB limit) if len(audio_bytes) > 100 * 1024 * 1024: raise HTTPException(status_code=400, detail="File too large (max 100MB)") - # Use default model if not specified model_name = model or DEFAULT_WHISPER_MODEL - # Transcribe result = await transcribe_audio_bytes( audio_bytes=audio_bytes, filename=file.filename, @@ -187,38 +200,53 @@ async def transcribe_whisper( ) models_status["whisper_loaded"] = True + latency_ms = (time.time() - start_time) * 1000 return TranscriptionResponse( text=result.text, language=result.language, model=f"whisper-{model_name}", + latency_ms=latency_ms, ) except Exception as e: - logger.error(f"Transcription error: {e}") + logger.error(f"Whisper transcription error: {e}") raise HTTPException(status_code=500, detail=str(e)) @app.post("/transcribe/voxtral", response_model=TranscriptionResponse) async def transcribe_voxtral( file: UploadFile = File(..., description="Audio file to transcribe"), - language: str = Form( - "de", - description="Language code (de, en, fr, es, pt, it, nl, hi)" - ), + language: str = Form("de", description="Language code"), + use_realtime: bool = Form(False, description="Use Realtime 4B model for lower latency"), ): """ - Transcribe audio using Voxtral Mini (Mistral AI). + Transcribe audio using Voxtral via vLLM server. - Best for: German, French, European languages - Supported formats: mp3, wav, m4a, flac + Models: + - Voxtral Mini 3B (default): Best quality + - Voxtral Realtime 4B: Lower latency (<500ms) + + Falls back to Mistral API if vLLM is unavailable. + + Supported formats: mp3, wav, m4a, flac, ogg, webm Max file size: 100MB """ if not file.filename: raise HTTPException(status_code=400, detail="No file provided") - # Validate language - from app.voxtral_service import SUPPORTED_LANGUAGES + from app.vllm_service import ( + SUPPORTED_LANGUAGES, + is_available as vllm_available, + transcribe_audio_bytes as vllm_transcribe, + transcribe_with_realtime, + check_health, + ) + from app.voxtral_api_service import ( + is_available as api_available, + transcribe_audio_bytes as api_transcribe, + ) + if language not in SUPPORTED_LANGUAGES: raise HTTPException( status_code=400, @@ -226,10 +254,94 @@ async def transcribe_voxtral( ) try: - from app.voxtral_service import transcribe_audio_bytes - audio_bytes = await file.read() + if len(audio_bytes) > 100 * 1024 * 1024: + raise HTTPException(status_code=400, detail="File too large (max 100MB)") + # Try vLLM first + if USE_VLLM: + health = await check_health() + if health.get("status") == "healthy": + logger.info("Using vLLM for Voxtral transcription") + if use_realtime: + result = await transcribe_with_realtime( + audio_bytes=audio_bytes, + filename=file.filename, + language=language, + ) + else: + result = await vllm_transcribe( + audio_bytes=audio_bytes, + filename=file.filename, + language=language, + ) + + return TranscriptionResponse( + text=result.text, + language=result.language, + model=result.model, + latency_ms=result.latency_ms, + duration_seconds=result.duration_seconds, + ) + + # Fallback to Mistral API + if api_available(): + logger.info("Falling back to Mistral API") + result = await api_transcribe( + audio_bytes=audio_bytes, + filename=file.filename, + language=language, + ) + + return TranscriptionResponse( + text=result.text, + language=result.language, + model=result.model, + latency_ms=None, + duration_seconds=result.duration_seconds, + ) + + raise HTTPException( + status_code=503, + detail="Voxtral not available. Start vLLM server or configure MISTRAL_API_KEY." + ) + + except HTTPException: + raise + except Exception as e: + logger.error(f"Voxtral transcription error: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@app.post("/transcribe/voxtral/api", response_model=TranscriptionResponse) +async def transcribe_voxtral_api( + file: UploadFile = File(..., description="Audio file to transcribe"), + language: Optional[str] = Form(None, description="Language code (auto-detect if not provided)"), + diarization: bool = Form(False, description="Enable speaker diarization"), +): + """ + Transcribe audio using Mistral's Voxtral API directly. + + Features: + - Speaker diarization + - Auto language detection + - High quality (~4% WER) + + Requires MISTRAL_API_KEY environment variable. + """ + from app.voxtral_api_service import is_available, transcribe_audio_bytes + + if not is_available(): + raise HTTPException( + status_code=503, + detail="Mistral API not configured. Set MISTRAL_API_KEY environment variable." + ) + + if not file.filename: + raise HTTPException(status_code=400, detail="No file provided") + + try: + audio_bytes = await file.read() if len(audio_bytes) > 100 * 1024 * 1024: raise HTTPException(status_code=400, detail="File too large (max 100MB)") @@ -237,59 +349,61 @@ async def transcribe_voxtral( audio_bytes=audio_bytes, filename=file.filename, language=language, + diarization=diarization, ) - models_status["voxtral_loaded"] = True - return TranscriptionResponse( text=result.text, language=result.language, model=result.model, + duration_seconds=result.duration_seconds, ) except Exception as e: - logger.error(f"Voxtral transcription error: {e}") + logger.error(f"Mistral API error: {e}") raise HTTPException(status_code=500, detail=str(e)) @app.post("/transcribe/auto", response_model=TranscriptionResponse) async def transcribe_auto( file: UploadFile = File(..., description="Audio file to transcribe"), - language: Optional[str] = Form( - None, - description="Language hint (optional)" - ), - prefer: str = Form( - "whisper", - description="Preferred model: 'whisper' or 'voxtral'" - ), + language: Optional[str] = Form(None, description="Language hint"), + prefer: str = Form("whisper", description="Preferred: 'whisper' or 'voxtral'"), ): """ - Transcribe audio with automatic model selection. + Transcribe with automatic model selection and fallback. - - Uses Whisper by default (faster, more languages) - - Falls back to Voxtral if Whisper fails + Fallback chain: + 1. Preferred model (whisper or voxtral) + 2. Alternative model + 3. Mistral API """ if prefer == "voxtral": - # Try Voxtral first try: - return await transcribe_voxtral(file, language or "de") + return await transcribe_voxtral(file, language or "de", False) except Exception as e: logger.warning(f"Voxtral failed, trying Whisper: {e}") - # Reset file position await file.seek(0) - return await transcribe_whisper(file, language, None) + try: + return await transcribe_whisper(file, language, None) + except Exception as e2: + logger.warning(f"Whisper failed, trying API: {e2}") + await file.seek(0) + return await transcribe_voxtral_api(file, language, False) else: - # Try Whisper first (default) try: return await transcribe_whisper(file, language, None) except Exception as e: logger.warning(f"Whisper failed, trying Voxtral: {e}") await file.seek(0) - return await transcribe_voxtral(file, language or "de") + try: + return await transcribe_voxtral(file, language or "de", False) + except Exception as e2: + logger.warning(f"Voxtral failed, trying API: {e2}") + await file.seek(0) + return await transcribe_voxtral_api(file, language, False) -# Error handlers @app.exception_handler(Exception) async def global_exception_handler(request, exc): logger.error(f"Unhandled error: {exc}") diff --git a/services/mana-stt/app/vllm_service.py b/services/mana-stt/app/vllm_service.py new file mode 100644 index 000000000..4ca1857a1 --- /dev/null +++ b/services/mana-stt/app/vllm_service.py @@ -0,0 +1,178 @@ +""" +vLLM Voxtral Service - Proxy to vLLM server for Voxtral transcription + +vLLM provides optimized inference for Voxtral models with an OpenAI-compatible API. +This service proxies requests to the vLLM server. + +Requirements: +- vLLM server running on VLLM_URL (default: http://localhost:8100) +- Model loaded: Voxtral-Mini-3B-2507 or Voxtral-Mini-4B-Realtime-2602 +""" + +import os +import logging +import time +import tempfile +import httpx +from pathlib import Path +from typing import Optional +from dataclasses import dataclass + +logger = logging.getLogger(__name__) + +# vLLM server configuration +VLLM_URL = os.getenv("VLLM_URL", "http://localhost:8100") +VLLM_TIMEOUT = int(os.getenv("VLLM_TIMEOUT", "300")) # 5 minutes for long audio + +# Model IDs +VOXTRAL_3B = "mistralai/Voxtral-Mini-3B-2507" +VOXTRAL_4B_REALTIME = "mistralai/Voxtral-Mini-4B-Realtime-2602" + + +@dataclass +class VllmTranscriptionResult: + text: str + language: Optional[str] = None + model: str = "voxtral-vllm" + latency_ms: Optional[float] = None + duration_seconds: Optional[float] = None + + +async def check_health() -> dict: + """Check if vLLM server is healthy.""" + try: + async with httpx.AsyncClient(timeout=5.0) as client: + response = await client.get(f"{VLLM_URL}/health") + if response.status_code == 200: + return {"status": "healthy", "url": VLLM_URL} + return {"status": "unhealthy", "url": VLLM_URL, "code": response.status_code} + except Exception as e: + return {"status": "unavailable", "url": VLLM_URL, "error": str(e)} + + +async def get_models() -> list: + """Get available models from vLLM server.""" + try: + async with httpx.AsyncClient(timeout=5.0) as client: + response = await client.get(f"{VLLM_URL}/v1/models") + if response.status_code == 200: + data = response.json() + return [m["id"] for m in data.get("data", [])] + return [] + except Exception: + return [] + + +def is_available() -> bool: + """Check if vLLM server is configured.""" + return bool(VLLM_URL) + + +async def transcribe_audio_bytes( + audio_bytes: bytes, + filename: str, + language: Optional[str] = "de", + model: Optional[str] = None, +) -> VllmTranscriptionResult: + """ + Transcribe audio using vLLM Voxtral server. + + Args: + audio_bytes: Raw audio bytes + filename: Original filename (for format detection) + language: Language code (de, en, fr, etc.) + model: Model to use (defaults to Voxtral-Mini-3B-2507) + + Returns: + VllmTranscriptionResult with transcription + """ + start_time = time.time() + model_id = model or VOXTRAL_3B + + logger.info(f"Transcribing via vLLM: {filename} ({len(audio_bytes)} bytes)") + + # Save to temp file (vLLM API accepts file uploads) + ext = Path(filename).suffix or ".wav" + with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp: + tmp.write(audio_bytes) + tmp_path = tmp.name + + try: + async with httpx.AsyncClient(timeout=VLLM_TIMEOUT) as client: + # Use OpenAI-compatible transcription endpoint + with open(tmp_path, "rb") as f: + files = {"file": (filename, f, "audio/wav")} + data = { + "model": model_id, + "language": language or "de", + "response_format": "json", + "temperature": 0.0, # Deterministic for transcription + } + + response = await client.post( + f"{VLLM_URL}/v1/audio/transcriptions", + files=files, + data=data, + ) + + if response.status_code != 200: + error_detail = response.text + logger.error(f"vLLM error: {response.status_code} - {error_detail}") + raise RuntimeError(f"vLLM transcription failed: {error_detail}") + + result = response.json() + text = result.get("text", "") + duration = result.get("duration") + + latency_ms = (time.time() - start_time) * 1000 + logger.info(f"vLLM transcription complete: {len(text)} chars in {latency_ms:.0f}ms") + + return VllmTranscriptionResult( + text=text.strip(), + language=language, + model=f"vllm-{model_id.split('/')[-1]}", + latency_ms=latency_ms, + duration_seconds=duration, + ) + + finally: + try: + os.unlink(tmp_path) + except Exception: + pass + + +async def transcribe_with_realtime( + audio_bytes: bytes, + filename: str, + language: Optional[str] = "de", +) -> VllmTranscriptionResult: + """ + Transcribe using Voxtral 4B Realtime model. + + Optimized for low latency (<500ms). + """ + return await transcribe_audio_bytes( + audio_bytes=audio_bytes, + filename=filename, + language=language, + model=VOXTRAL_4B_REALTIME, + ) + + +# Supported languages (same as Voxtral) +SUPPORTED_LANGUAGES = [ + "en", # English + "zh", # Chinese + "hi", # Hindi + "es", # Spanish + "ar", # Arabic + "fr", # French + "pt", # Portuguese + "ru", # Russian + "de", # German + "ja", # Japanese + "ko", # Korean + "it", # Italian + "nl", # Dutch +] diff --git a/services/mana-stt/app/voxtral_api_service.py b/services/mana-stt/app/voxtral_api_service.py new file mode 100644 index 000000000..53d78f808 --- /dev/null +++ b/services/mana-stt/app/voxtral_api_service.py @@ -0,0 +1,213 @@ +""" +Voxtral API Service - Mistral Cloud API Fallback +Uses Mistral's hosted Voxtral Mini Transcribe V2 when local service is overloaded. + +Features: +- Speaker diarization +- Word-level timestamps +- Context biasing for domain-specific terms +- 13 language support +""" + +import os +import logging +import tempfile +from pathlib import Path +from typing import Optional, Literal +from dataclasses import dataclass, field + +logger = logging.getLogger(__name__) + +# Lazy load client +_mistral_client = None + +MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY") +DEFAULT_MODEL = "voxtral-mini-latest" # voxtral-mini-2602 + + +@dataclass +class Speaker: + """Speaker information from diarization.""" + id: str + start: float + end: float + + +@dataclass +class WordTimestamp: + """Word-level timestamp.""" + word: str + start: float + end: float + + +@dataclass +class SegmentTimestamp: + """Segment-level timestamp.""" + text: str + start: float + end: float + speaker: Optional[str] = None + + +@dataclass +class VoxtralApiResult: + """Result from Voxtral API transcription.""" + text: str + language: Optional[str] = None + model: str = "voxtral-api" + duration_seconds: Optional[float] = None + words: list[WordTimestamp] = field(default_factory=list) + segments: list[SegmentTimestamp] = field(default_factory=list) + speakers: list[Speaker] = field(default_factory=list) + + +def get_mistral_client(): + """Get or create Mistral client instance.""" + global _mistral_client + + if _mistral_client is None: + if not MISTRAL_API_KEY: + raise RuntimeError( + "MISTRAL_API_KEY environment variable not set. " + "Get your API key at https://console.mistral.ai/" + ) + + try: + from mistralai import Mistral + _mistral_client = Mistral(api_key=MISTRAL_API_KEY) + logger.info("Mistral API client initialized") + except ImportError: + raise RuntimeError( + "mistralai package not installed. " + "Run: pip install mistralai" + ) + + return _mistral_client + + +def is_available() -> bool: + """Check if Mistral API is configured and available.""" + return bool(MISTRAL_API_KEY) + + +async def transcribe_audio_bytes( + audio_bytes: bytes, + filename: str, + language: Optional[str] = None, + timestamp_granularity: Optional[Literal["word", "segment"]] = None, + diarization: bool = False, + context_bias: Optional[list[str]] = None, +) -> VoxtralApiResult: + """ + Transcribe audio using Mistral's Voxtral API. + + Args: + audio_bytes: Raw audio bytes + filename: Original filename (for extension detection) + language: Language code (de, en, fr, etc.) - auto-detect if None + timestamp_granularity: "word" or "segment" for timestamps + diarization: Enable speaker diarization + context_bias: List of domain-specific terms to improve accuracy (max 100) + + Returns: + VoxtralApiResult with transcription and optional metadata + """ + client = get_mistral_client() + + logger.info(f"Transcribing via Mistral API: {filename} ({len(audio_bytes)} bytes)") + + try: + # Build request parameters + request_params = { + "model": DEFAULT_MODEL, + "file": { + "content": audio_bytes, + "file_name": filename, + }, + } + + # Language and timestamps are mutually exclusive in current API + if language and not timestamp_granularity: + request_params["language"] = language + + if timestamp_granularity: + request_params["timestamp_granularities"] = [timestamp_granularity] + + if diarization: + request_params["diarization"] = True + + if context_bias: + # API accepts comma-separated string, max 100 terms + bias_terms = context_bias[:100] + request_params["context_bias"] = ",".join(bias_terms) + + # Make API call + response = client.audio.transcriptions.complete(**request_params) + + # Parse response + result = VoxtralApiResult( + text=response.text, + language=getattr(response, "language", language), + model=f"voxtral-api-{DEFAULT_MODEL}", + duration_seconds=getattr(response, "duration", None), + ) + + # Parse word timestamps if present + if hasattr(response, "words") and response.words: + result.words = [ + WordTimestamp( + word=w.word, + start=w.start, + end=w.end, + ) + for w in response.words + ] + + # Parse segment timestamps if present + if hasattr(response, "segments") and response.segments: + result.segments = [ + SegmentTimestamp( + text=s.text, + start=s.start, + end=s.end, + speaker=getattr(s, "speaker", None), + ) + for s in response.segments + ] + + # Parse speakers if diarization enabled + if hasattr(response, "speakers") and response.speakers: + result.speakers = [ + Speaker( + id=sp.id, + start=sp.start, + end=sp.end, + ) + for sp in response.speakers + ] + + logger.info(f"Mistral API transcription complete: {len(result.text)} characters") + return result + + except Exception as e: + logger.error(f"Mistral API transcription failed: {e}") + raise + + +# Supported languages by Voxtral API (13 languages) +SUPPORTED_LANGUAGES = [ + "en", # English + "zh", # Chinese + "hi", # Hindi + "es", # Spanish + "ar", # Arabic + "fr", # French + "pt", # Portuguese + "ru", # Russian + "de", # German + "ja", # Japanese + "ko", # Korean + "it", # Italian + "nl", # Dutch +] diff --git a/services/mana-stt/app/voxtral_service.py b/services/mana-stt/app/voxtral_service.py index 989ef863b..320e5020d 100644 --- a/services/mana-stt/app/voxtral_service.py +++ b/services/mana-stt/app/voxtral_service.py @@ -1,12 +1,15 @@ """ Voxtral STT Service using Hugging Face Transformers Mistral AI's Speech-to-Text model (Apache 2.0 License) + +Uses VoxtralForConditionalGeneration with apply_transcription_request +as per official HuggingFace documentation. """ import os import tempfile import logging -import base64 +import time from pathlib import Path from typing import Optional from dataclasses import dataclass @@ -16,68 +19,80 @@ logger = logging.getLogger(__name__) # Lazy load to avoid import errors _voxtral_model = None _voxtral_processor = None +_model_name = None + +# Default model +DEFAULT_MODEL = "mistralai/Voxtral-Mini-3B-2507" @dataclass class VoxtralTranscriptionResult: text: str language: Optional[str] = None - model: str = "voxtral-mini" + model: str = "voxtral-mini-3b" + latency_ms: Optional[float] = None -def get_voxtral_model(model_name: str = "mistralai/Voxtral-Mini-3B-2507"): +def get_voxtral_model(model_name: str = DEFAULT_MODEL): """ Get or create Voxtral model instance. - Note: Voxtral Mini (3B) is recommended for Mac Mini M4. - Voxtral Small (24B) requires more VRAM. + Uses VoxtralForConditionalGeneration (the correct class for Voxtral). """ - global _voxtral_model, _voxtral_processor + global _voxtral_model, _voxtral_processor, _model_name + + # Reload if different model requested + if _voxtral_model is not None and _model_name != model_name: + logger.info(f"Switching model from {_model_name} to {model_name}") + _voxtral_model = None + _voxtral_processor = None if _voxtral_model is None: logger.info(f"Loading Voxtral model: {model_name}") try: import torch - from transformers import AutoModel, AutoProcessor + from transformers import VoxtralForConditionalGeneration, AutoProcessor - # Determine device + # Determine device and dtype if torch.backends.mps.is_available(): device = "mps" + # MPS works better with float16 torch_dtype = torch.float16 elif torch.cuda.is_available(): device = "cuda" - torch_dtype = torch.float16 + torch_dtype = torch.bfloat16 else: device = "cpu" torch_dtype = torch.float32 - logger.info(f"Using device: {device}") + logger.info(f"Using device: {device}, dtype: {torch_dtype}") # Load processor - _voxtral_processor = AutoProcessor.from_pretrained( - model_name, - trust_remote_code=True, - ) + _voxtral_processor = AutoProcessor.from_pretrained(model_name) - # Load model - Voxtral uses AutoModel, not AutoModelForSpeechSeq2Seq - _voxtral_model = AutoModel.from_pretrained( - model_name, - torch_dtype=torch_dtype, - device_map="auto" if device != "mps" else None, - trust_remote_code=True, - ) - - # Move to MPS if available (device_map doesn't support MPS) + # Load model with VoxtralForConditionalGeneration if device == "mps": + # MPS doesn't support device_map, load to CPU first then move + _voxtral_model = VoxtralForConditionalGeneration.from_pretrained( + model_name, + torch_dtype=torch_dtype, + ) _voxtral_model = _voxtral_model.to(device) + else: + _voxtral_model = VoxtralForConditionalGeneration.from_pretrained( + model_name, + torch_dtype=torch_dtype, + device_map=device, + ) + _model_name = model_name logger.info(f"Voxtral model loaded successfully on {device}") except ImportError as e: logger.error(f"Failed to import transformers: {e}") raise RuntimeError( - "transformers not installed. " - "Run: pip install transformers torch" + "transformers >= 4.54.0 required. " + "Run: pip install --upgrade transformers" ) except Exception as e: logger.error(f"Failed to load Voxtral model: {e}") @@ -89,17 +104,16 @@ def get_voxtral_model(model_name: str = "mistralai/Voxtral-Mini-3B-2507"): def transcribe_audio( audio_path: str, language: Optional[str] = "de", - model_name: str = "mistralai/Voxtral-Mini-3B-2507", + model_name: str = DEFAULT_MODEL, ) -> VoxtralTranscriptionResult: """ Transcribe audio file using Voxtral. - Voxtral is a multimodal audio understanding model that can be prompted - for transcription tasks. + Uses the official apply_transcription_request method. Args: audio_path: Path to audio file - language: Target language for transcription + language: Language code (de, en, fr, etc.) model_name: Hugging Face model ID Returns: @@ -108,84 +122,49 @@ def transcribe_audio( import torch model, processor = get_voxtral_model(model_name) + device = next(model.parameters()).device + dtype = next(model.parameters()).dtype logger.info(f"Transcribing with Voxtral: {audio_path}") + start_time = time.time() try: - # Load audio file as bytes and encode to base64 - with open(audio_path, "rb") as f: - audio_bytes = f.read() - audio_base64 = base64.b64encode(audio_bytes).decode("utf-8") - - # Determine audio format from extension - ext = Path(audio_path).suffix.lower() - mime_types = { - ".wav": "audio/wav", - ".mp3": "audio/mpeg", - ".m4a": "audio/m4a", - ".flac": "audio/flac", - ".ogg": "audio/ogg", - ".webm": "audio/webm", - } - mime_type = mime_types.get(ext, "audio/wav") - - # Language mapping for prompts - lang_names = { - "de": "German", - "en": "English", - "fr": "French", - "es": "Spanish", - "pt": "Portuguese", - "it": "Italian", - "nl": "Dutch", - "hi": "Hindi", - } - lang_name = lang_names.get(language, "German") - - # Create transcription prompt with base64 audio - messages = [ - { - "role": "user", - "content": [ - {"type": "audio_url", "audio_url": {"url": f"data:{mime_type};base64,{audio_base64}"}}, - {"type": "text", "text": f"Transcribe this audio in {lang_name}. Only output the transcription, nothing else."}, - ], - } - ] - - # Apply chat template and process inputs - inputs = processor.apply_chat_template( - messages, - tokenize=True, - return_tensors="pt", - return_dict=True, + # Use apply_transcription_request (official method) + # This handles audio loading and preprocessing internally + inputs = processor.apply_transcription_request( + language=language or "en", + audio=audio_path, + model_id=model_name, ) - # Move to same device as model - device = next(model.parameters()).device - inputs = {k: v.to(device) if hasattr(v, 'to') else v for k, v in inputs.items()} + # Move inputs to device and dtype + inputs = inputs.to(device, dtype=dtype) # Generate transcription with torch.no_grad(): - generated_ids = model.generate( + outputs = model.generate( **inputs, - max_new_tokens=512, + max_new_tokens=500, do_sample=False, ) - # Decode only the generated tokens (exclude input) - input_len = inputs["input_ids"].shape[-1] - text = processor.batch_decode( - generated_ids[:, input_len:], + # Decode - skip input tokens + input_len = inputs.input_ids.shape[1] + decoded = processor.batch_decode( + outputs[:, input_len:], skip_special_tokens=True, - )[0] + ) - logger.info(f"Voxtral transcription complete: {len(text)} characters") + text = decoded[0] if decoded else "" + latency_ms = (time.time() - start_time) * 1000 + + logger.info(f"Voxtral transcription complete: {len(text)} chars in {latency_ms:.0f}ms") return VoxtralTranscriptionResult( text=text.strip(), language=language, - model="voxtral-mini", + model=model_name.split("/")[-1], + latency_ms=latency_ms, ) except Exception as e: @@ -197,7 +176,7 @@ async def transcribe_audio_bytes( audio_bytes: bytes, filename: str, language: Optional[str] = "de", - model_name: str = "mistralai/Voxtral-Mini-3B-2507", + model_name: str = DEFAULT_MODEL, ) -> VoxtralTranscriptionResult: """ Transcribe audio from bytes (for API uploads). @@ -222,14 +201,67 @@ async def transcribe_audio_bytes( pass -# Supported languages by Voxtral +def unload_model(): + """Unload model to free memory.""" + global _voxtral_model, _voxtral_processor, _model_name + + if _voxtral_model is not None: + del _voxtral_model + del _voxtral_processor + _voxtral_model = None + _voxtral_processor = None + _model_name = None + + import gc + gc.collect() + + try: + import torch + if torch.backends.mps.is_available(): + torch.mps.empty_cache() + elif torch.cuda.is_available(): + torch.cuda.empty_cache() + except Exception: + pass + + logger.info("Voxtral model unloaded") + + +def is_loaded() -> bool: + """Check if model is currently loaded.""" + return _voxtral_model is not None + + +def get_loaded_model_name() -> Optional[str]: + """Get name of currently loaded model.""" + return _model_name + + +# Supported languages (13 languages as per Mistral docs) SUPPORTED_LANGUAGES = [ "en", # English - "de", # German - "fr", # French + "zh", # Chinese + "hi", # Hindi "es", # Spanish + "ar", # Arabic + "fr", # French "pt", # Portuguese + "ru", # Russian + "de", # German + "ja", # Japanese + "ko", # Korean "it", # Italian "nl", # Dutch - "hi", # Hindi +] + +# Available models +AVAILABLE_MODELS = [ + { + "id": "voxtral-mini-3b", + "name": "Voxtral-Mini-3B-2507", + "huggingface_id": "mistralai/Voxtral-Mini-3B-2507", + "params": "3B", + "vram": "~6GB", + "description": "Balanced quality and speed for local deployment", + }, ] diff --git a/services/mana-stt/com.manacore.mana-stt.plist b/services/mana-stt/com.manacore.mana-stt.plist new file mode 100644 index 000000000..97ef62521 --- /dev/null +++ b/services/mana-stt/com.manacore.mana-stt.plist @@ -0,0 +1,41 @@ + + + + + Label + com.manacore.mana-stt + + ProgramArguments + + /bin/bash + -c + cd /Users/mana/projects/manacore-monorepo/services/mana-stt && .venv/bin/uvicorn app.main:app --host 0.0.0.0 --port 3020 + + + WorkingDirectory + /Users/mana/projects/manacore-monorepo/services/mana-stt + + EnvironmentVariables + + PATH + /opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin + PORT + 3020 + + + RunAtLoad + + + KeepAlive + + + StandardOutPath + /Users/mana/logs/mana-stt.log + + StandardErrorPath + /Users/mana/logs/mana-stt.error.log + + ThrottleInterval + 10 + + diff --git a/services/mana-stt/com.manacore.vllm-voxtral.plist b/services/mana-stt/com.manacore.vllm-voxtral.plist new file mode 100644 index 000000000..4cf9f5711 --- /dev/null +++ b/services/mana-stt/com.manacore.vllm-voxtral.plist @@ -0,0 +1,41 @@ + + + + + Label + com.manacore.vllm-voxtral + + ProgramArguments + + /bin/bash + -c + cd /Users/mana/projects/manacore-monorepo/services/mana-stt && ./scripts/start-vllm-voxtral.sh + + + WorkingDirectory + /Users/mana/projects/manacore-monorepo/services/mana-stt + + EnvironmentVariables + + PATH + /opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin + VLLM_PORT + 8100 + + + RunAtLoad + + + KeepAlive + + + StandardOutPath + /Users/mana/logs/vllm-voxtral.log + + StandardErrorPath + /Users/mana/logs/vllm-voxtral.error.log + + ThrottleInterval + 30 + + diff --git a/services/mana-stt/install-service.sh b/services/mana-stt/install-service.sh new file mode 100755 index 000000000..6ee618cbe --- /dev/null +++ b/services/mana-stt/install-service.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# Install mana-stt as a launchd service on macOS +# Run this script on the Mac Mini server + +set -e + +SERVICE_NAME="com.manacore.mana-stt" +PLIST_FILE="$SERVICE_NAME.plist" +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +LAUNCH_AGENTS_DIR="$HOME/Library/LaunchAgents" +LOG_DIR="$HOME/logs" + +echo "Installing mana-stt launchd service..." + +# Create logs directory +mkdir -p "$LOG_DIR" + +# Stop existing service if running +if launchctl list | grep -q "$SERVICE_NAME"; then + echo "Stopping existing service..." + launchctl unload "$LAUNCH_AGENTS_DIR/$PLIST_FILE" 2>/dev/null || true +fi + +# Copy plist to LaunchAgents +cp "$SCRIPT_DIR/$PLIST_FILE" "$LAUNCH_AGENTS_DIR/" + +# Load the service +echo "Loading service..." +launchctl load "$LAUNCH_AGENTS_DIR/$PLIST_FILE" + +# Check status +sleep 2 +if launchctl list | grep -q "$SERVICE_NAME"; then + echo "Service installed and running!" + echo "" + echo "Useful commands:" + echo " View logs: tail -f $LOG_DIR/mana-stt.log" + echo " View errors: tail -f $LOG_DIR/mana-stt.error.log" + echo " Stop: launchctl unload $LAUNCH_AGENTS_DIR/$PLIST_FILE" + echo " Start: launchctl load $LAUNCH_AGENTS_DIR/$PLIST_FILE" + echo " Health check: curl http://localhost:3020/health" +else + echo "ERROR: Service failed to start. Check logs at $LOG_DIR/mana-stt.error.log" + exit 1 +fi diff --git a/services/mana-stt/install-services.sh b/services/mana-stt/install-services.sh new file mode 100755 index 000000000..e863f9236 --- /dev/null +++ b/services/mana-stt/install-services.sh @@ -0,0 +1,84 @@ +#!/bin/bash +# Install mana-stt and vllm-voxtral as launchd services on macOS +# Run this script on the Mac Mini server + +set -e + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +LAUNCH_AGENTS_DIR="$HOME/Library/LaunchAgents" +LOG_DIR="$HOME/logs" + +echo "============================================" +echo "Installing ManaCore STT Services" +echo "============================================" +echo "" + +# Create logs directory +mkdir -p "$LOG_DIR" + +install_service() { + local service_name="$1" + local plist_file="$service_name.plist" + + echo "Installing $service_name..." + + # Stop existing service if running + if launchctl list | grep -q "$service_name"; then + echo " Stopping existing service..." + launchctl unload "$LAUNCH_AGENTS_DIR/$plist_file" 2>/dev/null || true + fi + + # Copy plist to LaunchAgents + cp "$SCRIPT_DIR/$plist_file" "$LAUNCH_AGENTS_DIR/" + + # Load the service + echo " Loading service..." + launchctl load "$LAUNCH_AGENTS_DIR/$plist_file" + + sleep 2 + if launchctl list | grep -q "$service_name"; then + echo " ✓ $service_name installed and running" + else + echo " ✗ $service_name failed to start" + return 1 + fi +} + +# Install vLLM first (STT depends on it) +install_service "com.manacore.vllm-voxtral" + +# Wait for vLLM to initialize +echo "" +echo "Waiting for vLLM server to initialize..." +for i in {1..30}; do + if curl -s http://localhost:8100/health > /dev/null 2>&1; then + echo " ✓ vLLM server is ready" + break + fi + if [ $i -eq 30 ]; then + echo " ! vLLM server not responding yet (may still be loading model)" + fi + sleep 2 +done + +# Install STT service +echo "" +install_service "com.manacore.mana-stt" + +echo "" +echo "============================================" +echo "Installation complete!" +echo "============================================" +echo "" +echo "Services:" +echo " vLLM Voxtral: http://localhost:8100" +echo " ManaCore STT: http://localhost:3020" +echo "" +echo "Useful commands:" +echo " View vLLM logs: tail -f $LOG_DIR/vllm-voxtral.log" +echo " View STT logs: tail -f $LOG_DIR/mana-stt.log" +echo " Health check: curl http://localhost:3020/health" +echo "" +echo "Stop all:" +echo " launchctl unload $LAUNCH_AGENTS_DIR/com.manacore.vllm-voxtral.plist" +echo " launchctl unload $LAUNCH_AGENTS_DIR/com.manacore.mana-stt.plist" diff --git a/services/mana-stt/scripts/setup-vllm.sh b/services/mana-stt/scripts/setup-vllm.sh new file mode 100755 index 000000000..c6a6ad48f --- /dev/null +++ b/services/mana-stt/scripts/setup-vllm.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# Setup vLLM for Voxtral on Mac Mini M4 +# +# vLLM runs in CPU mode on macOS (no CUDA), but still provides +# the optimized inference pipeline for Voxtral models. +# +# Usage: ./scripts/setup-vllm.sh + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SERVICE_DIR="$(dirname "$SCRIPT_DIR")" +VENV_DIR="$SERVICE_DIR/.venv-vllm" + +echo "============================================" +echo "vLLM Setup for Voxtral on Mac Mini M4" +echo "============================================" +echo "" + +# Check Python version +PYTHON_VERSION=$(python3 --version 2>&1 | awk '{print $2}') +PYTHON_MAJOR=$(echo $PYTHON_VERSION | cut -d. -f1) +PYTHON_MINOR=$(echo $PYTHON_VERSION | cut -d. -f2) + +if [[ "$PYTHON_MAJOR" -lt 3 ]] || [[ "$PYTHON_MAJOR" -eq 3 && "$PYTHON_MINOR" -lt 10 ]]; then + echo "Error: Python 3.10+ required (found $PYTHON_VERSION)" + exit 1 +fi +echo "Python version: $PYTHON_VERSION" + +# Create separate venv for vLLM (to avoid conflicts with whisper) +echo "" +echo "Creating virtual environment for vLLM..." +python3 -m venv "$VENV_DIR" +source "$VENV_DIR/bin/activate" + +# Upgrade pip +pip install --upgrade pip --quiet + +# Install vLLM with audio support +echo "" +echo "Installing vLLM with audio support..." +echo "This may take a few minutes..." + +# Install uv for faster package installation +pip install uv --quiet + +# Install vLLM with audio support (nightly for best Voxtral support) +uv pip install "vllm[audio]>=0.10.0" --extra-index-url https://wheels.vllm.ai/nightly 2>&1 || { + echo "Nightly install failed, trying stable..." + uv pip install "vllm[audio]>=0.10.0" +} + +# Install mistral-common with audio +uv pip install "mistral-common[audio]>=1.8.1" + +echo "" +echo "============================================" +echo "Installation complete!" +echo "============================================" +echo "" +echo "To start Voxtral Mini 3B server:" +echo " source $VENV_DIR/bin/activate" +echo " vllm serve mistralai/Voxtral-Mini-3B-2507 \\" +echo " --tokenizer_mode mistral \\" +echo " --config_format mistral \\" +echo " --load_format mistral \\" +echo " --host 0.0.0.0 \\" +echo " --port 8100" +echo "" +echo "To start Voxtral Realtime 4B server:" +echo " source $VENV_DIR/bin/activate" +echo " vllm serve mistralai/Voxtral-Mini-4B-Realtime-2602 \\" +echo " --host 0.0.0.0 \\" +echo " --port 8100" +echo "" +echo "API Endpoint: http://localhost:8100/v1/audio/transcriptions" +echo "" +echo "Test with:" +echo " curl http://localhost:8100/v1/audio/transcriptions \\" +echo " -F file=@test.mp3 \\" +echo " -F model=mistralai/Voxtral-Mini-3B-2507 \\" +echo " -F language=de" diff --git a/services/mana-stt/scripts/start-vllm-voxtral.sh b/services/mana-stt/scripts/start-vllm-voxtral.sh new file mode 100755 index 000000000..280ba1970 --- /dev/null +++ b/services/mana-stt/scripts/start-vllm-voxtral.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Start vLLM server for Voxtral +# +# Usage: ./scripts/start-vllm-voxtral.sh [model] +# model: "3b" (default) or "4b" for Realtime + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SERVICE_DIR="$(dirname "$SCRIPT_DIR")" +VENV_DIR="$SERVICE_DIR/.venv-vllm" +MODEL="${1:-3b}" +PORT="${VLLM_PORT:-8100}" + +# Activate venv +source "$VENV_DIR/bin/activate" + +echo "Starting vLLM Voxtral server..." +echo "Port: $PORT" + +if [[ "$MODEL" == "4b" || "$MODEL" == "realtime" ]]; then + echo "Model: Voxtral Mini 4B Realtime" + exec vllm serve mistralai/Voxtral-Mini-4B-Realtime-2602 \ + --host 0.0.0.0 \ + --port "$PORT" \ + --max-model-len 8192 +else + echo "Model: Voxtral Mini 3B" + exec vllm serve mistralai/Voxtral-Mini-3B-2507 \ + --tokenizer_mode mistral \ + --config_format mistral \ + --load_format mistral \ + --host 0.0.0.0 \ + --port "$PORT" \ + --max-model-len 32768 +fi