""" vLLM Voxtral Service - Proxy to vLLM server for Voxtral transcription vLLM provides optimized inference for Voxtral models with an OpenAI-compatible API. This service proxies requests to the vLLM server. Requirements: - vLLM server running on VLLM_URL (default: http://localhost:8100) - Model loaded: Voxtral-Mini-3B-2507 or Voxtral-Mini-4B-Realtime-2602 """ import os import logging import time import tempfile import httpx from pathlib import Path from typing import Optional from dataclasses import dataclass logger = logging.getLogger(__name__) # vLLM server configuration VLLM_URL = os.getenv("VLLM_URL", "http://localhost:8100") VLLM_TIMEOUT = int(os.getenv("VLLM_TIMEOUT", "300")) # 5 minutes for long audio # Model IDs VOXTRAL_3B = "mistralai/Voxtral-Mini-3B-2507" VOXTRAL_4B_REALTIME = "mistralai/Voxtral-Mini-4B-Realtime-2602" @dataclass class VllmTranscriptionResult: text: str language: Optional[str] = None model: str = "voxtral-vllm" latency_ms: Optional[float] = None duration_seconds: Optional[float] = None async def check_health() -> dict: """Check if vLLM server is healthy.""" try: async with httpx.AsyncClient(timeout=5.0) as client: response = await client.get(f"{VLLM_URL}/health") if response.status_code == 200: return {"status": "healthy", "url": VLLM_URL} return {"status": "unhealthy", "url": VLLM_URL, "code": response.status_code} except Exception as e: return {"status": "unavailable", "url": VLLM_URL, "error": str(e)} async def get_models() -> list: """Get available models from vLLM server.""" try: async with httpx.AsyncClient(timeout=5.0) as client: response = await client.get(f"{VLLM_URL}/v1/models") if response.status_code == 200: data = response.json() return [m["id"] for m in data.get("data", [])] return [] except Exception: return [] def is_available() -> bool: """Check if vLLM server is configured.""" return bool(VLLM_URL) async def transcribe_audio_bytes( audio_bytes: bytes, filename: str, language: Optional[str] = "de", model: Optional[str] = None, ) -> VllmTranscriptionResult: """ Transcribe audio using vLLM Voxtral server. Args: audio_bytes: Raw audio bytes filename: Original filename (for format detection) language: Language code (de, en, fr, etc.) model: Model to use (defaults to Voxtral-Mini-3B-2507) Returns: VllmTranscriptionResult with transcription """ start_time = time.time() model_id = model or VOXTRAL_3B logger.info(f"Transcribing via vLLM: {filename} ({len(audio_bytes)} bytes)") # Save to temp file (vLLM API accepts file uploads) ext = Path(filename).suffix or ".wav" with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp: tmp.write(audio_bytes) tmp_path = tmp.name try: async with httpx.AsyncClient(timeout=VLLM_TIMEOUT) as client: # Use OpenAI-compatible transcription endpoint with open(tmp_path, "rb") as f: files = {"file": (filename, f, "audio/wav")} data = { "model": model_id, "language": language or "de", "response_format": "json", "temperature": 0.0, # Deterministic for transcription } response = await client.post( f"{VLLM_URL}/v1/audio/transcriptions", files=files, data=data, ) if response.status_code != 200: error_detail = response.text logger.error(f"vLLM error: {response.status_code} - {error_detail}") raise RuntimeError(f"vLLM transcription failed: {error_detail}") result = response.json() text = result.get("text", "") duration = result.get("duration") latency_ms = (time.time() - start_time) * 1000 logger.info(f"vLLM transcription complete: {len(text)} chars in {latency_ms:.0f}ms") return VllmTranscriptionResult( text=text.strip(), language=language, model=f"vllm-{model_id.split('/')[-1]}", latency_ms=latency_ms, duration_seconds=duration, ) finally: try: os.unlink(tmp_path) except Exception: pass async def transcribe_with_realtime( audio_bytes: bytes, filename: str, language: Optional[str] = "de", ) -> VllmTranscriptionResult: """ Transcribe using Voxtral 4B Realtime model. Optimized for low latency (<500ms). """ return await transcribe_audio_bytes( audio_bytes=audio_bytes, filename=filename, language=language, model=VOXTRAL_4B_REALTIME, ) # Supported languages (same as Voxtral) SUPPORTED_LANGUAGES = [ "en", # English "zh", # Chinese "hi", # Hindi "es", # Spanish "ar", # Arabic "fr", # French "pt", # Portuguese "ru", # Russian "de", # German "ja", # Japanese "ko", # Korean "it", # Italian "nl", # Dutch ]