mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-14 21:41:09 +02:00
✨ feat(mana-stt): add vLLM integration for Voxtral transcription
- Add vllm_service.py as proxy to vLLM server for Voxtral 3B/4B
- Add voxtral_api_service.py for Mistral API fallback
- Update main.py with /transcribe/voxtral endpoint using vLLM
- Add /transcribe/auto endpoint with automatic fallback chain
- Create setup-vllm.sh and start-vllm-voxtral.sh scripts
- Add launchd plist files for Mac Mini deployment
- Add install-services.sh for automated service installation

Architecture:
- vLLM server runs Voxtral models on port 8100
- mana-stt proxies to vLLM with Mistral API fallback
- Fallback chain: vLLM -> Mistral API

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
a2e2a5b73c
commit
60394076e5
11 changed files with 1060 additions and 162 deletions
31
services/mana-stt/.env.example
Normal file
31
services/mana-stt/.env.example
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
# ManaCore STT Service Configuration
|
||||
# Copy to .env and adjust values as needed
|
||||
|
||||
# Server
|
||||
PORT=3020
|
||||
|
||||
# Whisper (Lightning MLX)
|
||||
WHISPER_MODEL=large-v3
|
||||
|
||||
# Voxtral (Local Models)
|
||||
# Options: voxtral-mini-3b, voxtral-realtime-4b, voxtral-small-24b
|
||||
VOXTRAL_MODEL=voxtral-realtime-4b
|
||||
|
||||
# Model Loading
|
||||
# Set to true to preload models on startup (slower startup, faster first request)
|
||||
PRELOAD_MODELS=false
|
||||
|
||||
# Load Management
|
||||
# Maximum concurrent transcription requests before API fallback
|
||||
MAX_CONCURRENT_REQUESTS=3
|
||||
|
||||
# API Fallback
|
||||
# Enable automatic fallback to Mistral API when overloaded
|
||||
API_FALLBACK_ENABLED=true
|
||||
|
||||
# Mistral API Key (required for API fallback)
|
||||
# Get your key at https://console.mistral.ai/
|
||||
MISTRAL_API_KEY=
|
||||
|
||||
# CORS Origins (comma-separated)
|
||||
CORS_ORIGINS=https://mana.how,https://chat.mana.how,http://localhost:5173
|
||||
|
|
@ -1,16 +1,17 @@
|
|||
"""
|
||||
ManaCore STT API Service
|
||||
Speech-to-Text with Whisper (MLX) and Voxtral
|
||||
Speech-to-Text with Whisper (MLX), Voxtral (vLLM), and Mistral API (fallback)
|
||||
|
||||
Run with: uvicorn app.main:app --host 0.0.0.0 --port 3020
|
||||
"""
|
||||
|
||||
import os
|
||||
import logging
|
||||
import time
|
||||
from typing import Optional
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Query
|
||||
from fastapi import FastAPI, File, UploadFile, Form, HTTPException
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import JSONResponse
|
||||
from pydantic import BaseModel
|
||||
|
|
@ -31,32 +32,39 @@ CORS_ORIGINS = os.getenv(
|
|||
"https://mana.how,https://chat.mana.how,http://localhost:5173"
|
||||
).split(",")
|
||||
|
||||
# vLLM configuration
|
||||
VLLM_URL = os.getenv("VLLM_URL", "http://localhost:8100")
|
||||
USE_VLLM = os.getenv("USE_VLLM", "true").lower() == "true"
|
||||
|
||||
|
||||
# Response models
|
||||
class TranscriptionResponse(BaseModel):
|
||||
text: str
|
||||
language: Optional[str] = None
|
||||
model: str
|
||||
latency_ms: Optional[float] = None
|
||||
duration_seconds: Optional[float] = None
|
||||
|
||||
|
||||
class HealthResponse(BaseModel):
|
||||
status: str
|
||||
whisper_loaded: bool
|
||||
voxtral_loaded: bool
|
||||
vllm_available: bool
|
||||
vllm_url: Optional[str] = None
|
||||
mistral_api_available: bool
|
||||
models: dict
|
||||
|
||||
|
||||
class ModelsResponse(BaseModel):
|
||||
whisper: list
|
||||
voxtral: list
|
||||
voxtral_vllm: list
|
||||
default_whisper: str
|
||||
|
||||
|
||||
# Track loaded models
|
||||
models_status = {
|
||||
"whisper_loaded": False,
|
||||
"voxtral_loaded": False,
|
||||
"vllm_available": False,
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -65,9 +73,24 @@ async def lifespan(app: FastAPI):
|
|||
"""Startup and shutdown events."""
|
||||
logger.info("Starting ManaCore STT Service...")
|
||||
|
||||
# Optionally preload models on startup
|
||||
# Check vLLM availability
|
||||
if USE_VLLM:
|
||||
from app.vllm_service import check_health
|
||||
health = await check_health()
|
||||
models_status["vllm_available"] = health.get("status") == "healthy"
|
||||
if models_status["vllm_available"]:
|
||||
logger.info(f"vLLM server available at {VLLM_URL}")
|
||||
else:
|
||||
logger.warning(f"vLLM server not available: {health}")
|
||||
|
||||
# Check Mistral API
|
||||
from app.voxtral_api_service import is_available as api_available
|
||||
if api_available():
|
||||
logger.info("Mistral API fallback configured")
|
||||
|
||||
# Optionally preload Whisper
|
||||
if PRELOAD_MODELS:
|
||||
logger.info("Preloading models (PRELOAD_MODELS=true)...")
|
||||
logger.info("Preloading Whisper model...")
|
||||
try:
|
||||
from app.whisper_service import get_whisper_model
|
||||
get_whisper_model(DEFAULT_WHISPER_MODEL)
|
||||
|
|
@ -76,16 +99,6 @@ async def lifespan(app: FastAPI):
|
|||
except Exception as e:
|
||||
logger.warning(f"Failed to preload Whisper: {e}")
|
||||
|
||||
try:
|
||||
from app.voxtral_service import get_voxtral_model
|
||||
get_voxtral_model()
|
||||
models_status["voxtral_loaded"] = True
|
||||
logger.info("Voxtral model preloaded")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to preload Voxtral: {e}")
|
||||
else:
|
||||
logger.info("Models will be loaded on first request (lazy loading)")
|
||||
|
||||
logger.info(f"STT Service ready on port {PORT}")
|
||||
yield
|
||||
logger.info("Shutting down STT Service...")
|
||||
|
|
@ -94,8 +107,8 @@ async def lifespan(app: FastAPI):
|
|||
# Create FastAPI app
|
||||
app = FastAPI(
|
||||
title="ManaCore STT Service",
|
||||
description="Speech-to-Text API with Whisper (MLX) and Voxtral",
|
||||
version="1.0.0",
|
||||
description="Speech-to-Text API with Whisper (MLX), Voxtral (vLLM), and Mistral API",
|
||||
version="2.0.0",
|
||||
lifespan=lifespan,
|
||||
)
|
||||
|
||||
|
|
@ -112,10 +125,17 @@ app.add_middleware(
|
|||
@app.get("/health", response_model=HealthResponse)
|
||||
async def health_check():
|
||||
"""Health check endpoint."""
|
||||
from app.voxtral_api_service import is_available as api_available
|
||||
from app.vllm_service import check_health
|
||||
|
||||
vllm_health = await check_health()
|
||||
|
||||
return HealthResponse(
|
||||
status="healthy",
|
||||
whisper_loaded=models_status["whisper_loaded"],
|
||||
voxtral_loaded=models_status["voxtral_loaded"],
|
||||
vllm_available=vllm_health.get("status") == "healthy",
|
||||
vllm_url=VLLM_URL if USE_VLLM else None,
|
||||
mistral_api_available=api_available(),
|
||||
models={
|
||||
"default_whisper": DEFAULT_WHISPER_MODEL,
|
||||
},
|
||||
|
|
@ -126,11 +146,13 @@ async def health_check():
|
|||
async def list_models():
|
||||
"""List available models."""
|
||||
from app.whisper_service import AVAILABLE_MODELS as whisper_models
|
||||
from app.voxtral_service import SUPPORTED_LANGUAGES as voxtral_languages
|
||||
from app.vllm_service import get_models
|
||||
|
||||
vllm_models = await get_models()
|
||||
|
||||
return ModelsResponse(
|
||||
whisper=whisper_models,
|
||||
voxtral=voxtral_languages,
|
||||
voxtral_vllm=vllm_models,
|
||||
default_whisper=DEFAULT_WHISPER_MODEL,
|
||||
)
|
||||
|
||||
|
|
@ -138,25 +160,19 @@ async def list_models():
|
|||
@app.post("/transcribe", response_model=TranscriptionResponse)
|
||||
async def transcribe_whisper(
|
||||
file: UploadFile = File(..., description="Audio file to transcribe"),
|
||||
language: Optional[str] = Form(
|
||||
None,
|
||||
description="Language code (e.g., 'de', 'en'). Auto-detect if not provided."
|
||||
),
|
||||
model: str = Form(
|
||||
None,
|
||||
description="Whisper model to use (default: large-v3-turbo)"
|
||||
),
|
||||
language: Optional[str] = Form(None, description="Language code (auto-detect if not provided)"),
|
||||
model: Optional[str] = Form(None, description="Whisper model to use"),
|
||||
):
|
||||
"""
|
||||
Transcribe audio using Whisper (Lightning MLX).
|
||||
|
||||
Best for: General transcription, many languages
|
||||
Supported formats: mp3, wav, m4a, flac, ogg, webm
|
||||
Max file size: 100MB
|
||||
"""
|
||||
if not file.filename:
|
||||
raise HTTPException(status_code=400, detail="No file provided")
|
||||
|
||||
# Validate file type
|
||||
allowed_extensions = {".mp3", ".wav", ".m4a", ".flac", ".ogg", ".webm", ".mp4"}
|
||||
ext = os.path.splitext(file.filename)[1].lower()
|
||||
if ext not in allowed_extensions:
|
||||
|
|
@ -165,20 +181,17 @@ async def transcribe_whisper(
|
|||
detail=f"Unsupported file type: {ext}. Allowed: {allowed_extensions}"
|
||||
)
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
try:
|
||||
from app.whisper_service import transcribe_audio_bytes
|
||||
|
||||
# Read file
|
||||
audio_bytes = await file.read()
|
||||
|
||||
# Check file size (100MB limit)
|
||||
if len(audio_bytes) > 100 * 1024 * 1024:
|
||||
raise HTTPException(status_code=400, detail="File too large (max 100MB)")
|
||||
|
||||
# Use default model if not specified
|
||||
model_name = model or DEFAULT_WHISPER_MODEL
|
||||
|
||||
# Transcribe
|
||||
result = await transcribe_audio_bytes(
|
||||
audio_bytes=audio_bytes,
|
||||
filename=file.filename,
|
||||
|
|
@ -187,38 +200,53 @@ async def transcribe_whisper(
|
|||
)
|
||||
|
||||
models_status["whisper_loaded"] = True
|
||||
latency_ms = (time.time() - start_time) * 1000
|
||||
|
||||
return TranscriptionResponse(
|
||||
text=result.text,
|
||||
language=result.language,
|
||||
model=f"whisper-{model_name}",
|
||||
latency_ms=latency_ms,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Transcription error: {e}")
|
||||
logger.error(f"Whisper transcription error: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@app.post("/transcribe/voxtral", response_model=TranscriptionResponse)
|
||||
async def transcribe_voxtral(
|
||||
file: UploadFile = File(..., description="Audio file to transcribe"),
|
||||
language: str = Form(
|
||||
"de",
|
||||
description="Language code (de, en, fr, es, pt, it, nl, hi)"
|
||||
),
|
||||
language: str = Form("de", description="Language code"),
|
||||
use_realtime: bool = Form(False, description="Use Realtime 4B model for lower latency"),
|
||||
):
|
||||
"""
|
||||
Transcribe audio using Voxtral Mini (Mistral AI).
|
||||
Transcribe audio using Voxtral via vLLM server.
|
||||
|
||||
Best for: German, French, European languages
|
||||
Supported formats: mp3, wav, m4a, flac
|
||||
Models:
|
||||
- Voxtral Mini 3B (default): Best quality
|
||||
- Voxtral Realtime 4B: Lower latency (<500ms)
|
||||
|
||||
Falls back to Mistral API if vLLM is unavailable.
|
||||
|
||||
Supported formats: mp3, wav, m4a, flac, ogg, webm
|
||||
Max file size: 100MB
|
||||
"""
|
||||
if not file.filename:
|
||||
raise HTTPException(status_code=400, detail="No file provided")
|
||||
|
||||
# Validate language
|
||||
from app.voxtral_service import SUPPORTED_LANGUAGES
|
||||
from app.vllm_service import (
|
||||
SUPPORTED_LANGUAGES,
|
||||
is_available as vllm_available,
|
||||
transcribe_audio_bytes as vllm_transcribe,
|
||||
transcribe_with_realtime,
|
||||
check_health,
|
||||
)
|
||||
from app.voxtral_api_service import (
|
||||
is_available as api_available,
|
||||
transcribe_audio_bytes as api_transcribe,
|
||||
)
|
||||
|
||||
if language not in SUPPORTED_LANGUAGES:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
|
|
@ -226,10 +254,94 @@ async def transcribe_voxtral(
|
|||
)
|
||||
|
||||
try:
|
||||
from app.voxtral_service import transcribe_audio_bytes
|
||||
|
||||
audio_bytes = await file.read()
|
||||
if len(audio_bytes) > 100 * 1024 * 1024:
|
||||
raise HTTPException(status_code=400, detail="File too large (max 100MB)")
|
||||
|
||||
# Try vLLM first
|
||||
if USE_VLLM:
|
||||
health = await check_health()
|
||||
if health.get("status") == "healthy":
|
||||
logger.info("Using vLLM for Voxtral transcription")
|
||||
if use_realtime:
|
||||
result = await transcribe_with_realtime(
|
||||
audio_bytes=audio_bytes,
|
||||
filename=file.filename,
|
||||
language=language,
|
||||
)
|
||||
else:
|
||||
result = await vllm_transcribe(
|
||||
audio_bytes=audio_bytes,
|
||||
filename=file.filename,
|
||||
language=language,
|
||||
)
|
||||
|
||||
return TranscriptionResponse(
|
||||
text=result.text,
|
||||
language=result.language,
|
||||
model=result.model,
|
||||
latency_ms=result.latency_ms,
|
||||
duration_seconds=result.duration_seconds,
|
||||
)
|
||||
|
||||
# Fallback to Mistral API
|
||||
if api_available():
|
||||
logger.info("Falling back to Mistral API")
|
||||
result = await api_transcribe(
|
||||
audio_bytes=audio_bytes,
|
||||
filename=file.filename,
|
||||
language=language,
|
||||
)
|
||||
|
||||
return TranscriptionResponse(
|
||||
text=result.text,
|
||||
language=result.language,
|
||||
model=result.model,
|
||||
latency_ms=None,
|
||||
duration_seconds=result.duration_seconds,
|
||||
)
|
||||
|
||||
raise HTTPException(
|
||||
status_code=503,
|
||||
detail="Voxtral not available. Start vLLM server or configure MISTRAL_API_KEY."
|
||||
)
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"Voxtral transcription error: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@app.post("/transcribe/voxtral/api", response_model=TranscriptionResponse)
|
||||
async def transcribe_voxtral_api(
|
||||
file: UploadFile = File(..., description="Audio file to transcribe"),
|
||||
language: Optional[str] = Form(None, description="Language code (auto-detect if not provided)"),
|
||||
diarization: bool = Form(False, description="Enable speaker diarization"),
|
||||
):
|
||||
"""
|
||||
Transcribe audio using Mistral's Voxtral API directly.
|
||||
|
||||
Features:
|
||||
- Speaker diarization
|
||||
- Auto language detection
|
||||
- High quality (~4% WER)
|
||||
|
||||
Requires MISTRAL_API_KEY environment variable.
|
||||
"""
|
||||
from app.voxtral_api_service import is_available, transcribe_audio_bytes
|
||||
|
||||
if not is_available():
|
||||
raise HTTPException(
|
||||
status_code=503,
|
||||
detail="Mistral API not configured. Set MISTRAL_API_KEY environment variable."
|
||||
)
|
||||
|
||||
if not file.filename:
|
||||
raise HTTPException(status_code=400, detail="No file provided")
|
||||
|
||||
try:
|
||||
audio_bytes = await file.read()
|
||||
if len(audio_bytes) > 100 * 1024 * 1024:
|
||||
raise HTTPException(status_code=400, detail="File too large (max 100MB)")
|
||||
|
||||
|
|
@ -237,59 +349,61 @@ async def transcribe_voxtral(
|
|||
audio_bytes=audio_bytes,
|
||||
filename=file.filename,
|
||||
language=language,
|
||||
diarization=diarization,
|
||||
)
|
||||
|
||||
models_status["voxtral_loaded"] = True
|
||||
|
||||
return TranscriptionResponse(
|
||||
text=result.text,
|
||||
language=result.language,
|
||||
model=result.model,
|
||||
duration_seconds=result.duration_seconds,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Voxtral transcription error: {e}")
|
||||
logger.error(f"Mistral API error: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@app.post("/transcribe/auto", response_model=TranscriptionResponse)
|
||||
async def transcribe_auto(
|
||||
file: UploadFile = File(..., description="Audio file to transcribe"),
|
||||
language: Optional[str] = Form(
|
||||
None,
|
||||
description="Language hint (optional)"
|
||||
),
|
||||
prefer: str = Form(
|
||||
"whisper",
|
||||
description="Preferred model: 'whisper' or 'voxtral'"
|
||||
),
|
||||
language: Optional[str] = Form(None, description="Language hint"),
|
||||
prefer: str = Form("whisper", description="Preferred: 'whisper' or 'voxtral'"),
|
||||
):
|
||||
"""
|
||||
Transcribe audio with automatic model selection.
|
||||
Transcribe with automatic model selection and fallback.
|
||||
|
||||
- Uses Whisper by default (faster, more languages)
|
||||
- Falls back to Voxtral if Whisper fails
|
||||
Fallback chain:
|
||||
1. Preferred model (whisper or voxtral)
|
||||
2. Alternative model
|
||||
3. Mistral API
|
||||
"""
|
||||
if prefer == "voxtral":
|
||||
# Try Voxtral first
|
||||
try:
|
||||
return await transcribe_voxtral(file, language or "de")
|
||||
return await transcribe_voxtral(file, language or "de", False)
|
||||
except Exception as e:
|
||||
logger.warning(f"Voxtral failed, trying Whisper: {e}")
|
||||
# Reset file position
|
||||
await file.seek(0)
|
||||
return await transcribe_whisper(file, language, None)
|
||||
try:
|
||||
return await transcribe_whisper(file, language, None)
|
||||
except Exception as e2:
|
||||
logger.warning(f"Whisper failed, trying API: {e2}")
|
||||
await file.seek(0)
|
||||
return await transcribe_voxtral_api(file, language, False)
|
||||
else:
|
||||
# Try Whisper first (default)
|
||||
try:
|
||||
return await transcribe_whisper(file, language, None)
|
||||
except Exception as e:
|
||||
logger.warning(f"Whisper failed, trying Voxtral: {e}")
|
||||
await file.seek(0)
|
||||
return await transcribe_voxtral(file, language or "de")
|
||||
try:
|
||||
return await transcribe_voxtral(file, language or "de", False)
|
||||
except Exception as e2:
|
||||
logger.warning(f"Voxtral failed, trying API: {e2}")
|
||||
await file.seek(0)
|
||||
return await transcribe_voxtral_api(file, language, False)
|
||||
|
||||
|
||||
# Error handlers
|
||||
@app.exception_handler(Exception)
|
||||
async def global_exception_handler(request, exc):
|
||||
logger.error(f"Unhandled error: {exc}")
|
||||
|
|
|
|||
178
services/mana-stt/app/vllm_service.py
Normal file
178
services/mana-stt/app/vllm_service.py
Normal file
|
|
@ -0,0 +1,178 @@
|
|||
"""
|
||||
vLLM Voxtral Service - Proxy to vLLM server for Voxtral transcription
|
||||
|
||||
vLLM provides optimized inference for Voxtral models with an OpenAI-compatible API.
|
||||
This service proxies requests to the vLLM server.
|
||||
|
||||
Requirements:
|
||||
- vLLM server running on VLLM_URL (default: http://localhost:8100)
|
||||
- Model loaded: Voxtral-Mini-3B-2507 or Voxtral-Mini-4B-Realtime-2602
|
||||
"""
|
||||
|
||||
import os
|
||||
import logging
|
||||
import time
|
||||
import tempfile
|
||||
import httpx
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from dataclasses import dataclass
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# vLLM server configuration
|
||||
VLLM_URL = os.getenv("VLLM_URL", "http://localhost:8100")
|
||||
VLLM_TIMEOUT = int(os.getenv("VLLM_TIMEOUT", "300")) # 5 minutes for long audio
|
||||
|
||||
# Model IDs
|
||||
VOXTRAL_3B = "mistralai/Voxtral-Mini-3B-2507"
|
||||
VOXTRAL_4B_REALTIME = "mistralai/Voxtral-Mini-4B-Realtime-2602"
|
||||
|
||||
|
||||
@dataclass
class VllmTranscriptionResult:
    """Result of a transcription proxied through the vLLM server."""
    # Transcribed text (whitespace-stripped by transcribe_audio_bytes).
    text: str
    # Language code the transcription was requested with (echoed, not detected).
    language: Optional[str] = None
    # Identifier of the producing model, e.g. "vllm-Voxtral-Mini-3B-2507".
    model: str = "voxtral-vllm"
    # Wall-clock request latency in milliseconds, measured client-side.
    latency_ms: Optional[float] = None
    # Audio duration reported by the server's JSON response, if present.
    duration_seconds: Optional[float] = None
|
||||
|
||||
|
||||
async def check_health() -> dict:
    """Probe the vLLM server's /health endpoint.

    Returns a dict whose "status" key is "healthy" on HTTP 200,
    "unhealthy" on any other status code, or "unavailable" when the
    server cannot be reached at all. The dict always carries "url".
    """
    try:
        async with httpx.AsyncClient(timeout=5.0) as client:
            resp = await client.get(f"{VLLM_URL}/health")
    except Exception as exc:
        # Connection refused, DNS failure, timeout, etc.
        return {"status": "unavailable", "url": VLLM_URL, "error": str(exc)}

    if resp.status_code == 200:
        return {"status": "healthy", "url": VLLM_URL}
    return {"status": "unhealthy", "url": VLLM_URL, "code": resp.status_code}
|
||||
|
||||
|
||||
async def get_models() -> list:
    """Return the model IDs exposed by the vLLM server.

    Queries the OpenAI-compatible /v1/models endpoint; returns an empty
    list on any failure (non-200 response or connection error).
    """
    try:
        async with httpx.AsyncClient(timeout=5.0) as client:
            resp = await client.get(f"{VLLM_URL}/v1/models")
        if resp.status_code != 200:
            return []
        payload = resp.json()
        return [entry["id"] for entry in payload.get("data", [])]
    except Exception:
        return []
|
||||
|
||||
|
||||
def is_available() -> bool:
    """Report whether a vLLM endpoint URL is configured (non-empty VLLM_URL)."""
    configured = bool(VLLM_URL)
    return configured
|
||||
|
||||
|
||||
async def transcribe_audio_bytes(
    audio_bytes: bytes,
    filename: str,
    language: Optional[str] = "de",
    model: Optional[str] = None,
) -> VllmTranscriptionResult:
    """
    Transcribe audio using the vLLM Voxtral server.

    Args:
        audio_bytes: Raw audio bytes
        filename: Original filename (sent as the multipart filename)
        language: Language code (de, en, fr, etc.)
        model: Model to use (defaults to Voxtral-Mini-3B-2507)

    Returns:
        VllmTranscriptionResult with transcription

    Raises:
        RuntimeError: if the vLLM server returns a non-200 response.
    """
    start_time = time.time()
    model_id = model or VOXTRAL_3B

    logger.info(f"Transcribing via vLLM: {filename} ({len(audio_bytes)} bytes)")

    # Upload the audio bytes directly: httpx accepts in-memory content for
    # multipart uploads, so no temp-file write/reopen/unlink round-trip is
    # needed (this also removes the leaked-file window on crash).
    # NOTE(review): content type is always "audio/wav" regardless of the
    # actual format — confirm the vLLM server tolerates this for mp3/m4a/etc.
    files = {"file": (filename, audio_bytes, "audio/wav")}
    data = {
        "model": model_id,
        "language": language or "de",
        "response_format": "json",
        "temperature": 0.0,  # Deterministic for transcription
    }

    async with httpx.AsyncClient(timeout=VLLM_TIMEOUT) as client:
        # OpenAI-compatible transcription endpoint
        response = await client.post(
            f"{VLLM_URL}/v1/audio/transcriptions",
            files=files,
            data=data,
        )

    if response.status_code != 200:
        error_detail = response.text
        logger.error(f"vLLM error: {response.status_code} - {error_detail}")
        raise RuntimeError(f"vLLM transcription failed: {error_detail}")

    result = response.json()
    text = result.get("text", "")
    duration = result.get("duration")

    latency_ms = (time.time() - start_time) * 1000
    logger.info(f"vLLM transcription complete: {len(text)} chars in {latency_ms:.0f}ms")

    return VllmTranscriptionResult(
        text=text.strip(),
        language=language,
        model=f"vllm-{model_id.split('/')[-1]}",
        latency_ms=latency_ms,
        duration_seconds=duration,
    )
|
||||
|
||||
|
||||
async def transcribe_with_realtime(
    audio_bytes: bytes,
    filename: str,
    language: Optional[str] = "de",
) -> VllmTranscriptionResult:
    """Transcribe using the Voxtral 4B Realtime model.

    Thin wrapper around transcribe_audio_bytes that pins the model to
    VOXTRAL_4B_REALTIME, which is optimized for low latency (<500ms).
    """
    request = {
        "audio_bytes": audio_bytes,
        "filename": filename,
        "language": language,
        "model": VOXTRAL_4B_REALTIME,
    }
    return await transcribe_audio_bytes(**request)
|
||||
|
||||
|
||||
# Supported languages (same set as the Voxtral/Mistral API service):
# English, Chinese, Hindi, Spanish, Arabic, French, Portuguese, Russian,
# German, Japanese, Korean, Italian, Dutch.
SUPPORTED_LANGUAGES = [
    "en", "zh", "hi", "es", "ar", "fr", "pt", "ru", "de", "ja", "ko", "it", "nl",
]
|
||||
213
services/mana-stt/app/voxtral_api_service.py
Normal file
213
services/mana-stt/app/voxtral_api_service.py
Normal file
|
|
@ -0,0 +1,213 @@
|
|||
"""
|
||||
Voxtral API Service - Mistral Cloud API Fallback
|
||||
Uses Mistral's hosted Voxtral Mini Transcribe V2 when local service is overloaded.
|
||||
|
||||
Features:
|
||||
- Speaker diarization
|
||||
- Word-level timestamps
|
||||
- Context biasing for domain-specific terms
|
||||
- 13 language support
|
||||
"""
|
||||
|
||||
import os
|
||||
import logging
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Optional, Literal
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Lazy load client
|
||||
_mistral_client = None
|
||||
|
||||
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
|
||||
DEFAULT_MODEL = "voxtral-mini-latest" # voxtral-mini-2602
|
||||
|
||||
|
||||
@dataclass
class Speaker:
    """Speaker information from diarization."""
    # Speaker identifier as returned by the API.
    id: str
    # Start offset of the speaker turn (presumably seconds — confirm against API docs).
    start: float
    # End offset of the speaker turn.
    end: float


@dataclass
class WordTimestamp:
    """Word-level timestamp."""
    # The transcribed word.
    word: str
    # Word start offset (same units as Speaker.start).
    start: float
    # Word end offset.
    end: float


@dataclass
class SegmentTimestamp:
    """Segment-level timestamp."""
    # Text of the segment.
    text: str
    # Segment start offset (same units as Speaker.start).
    start: float
    # Segment end offset.
    end: float
    # Speaker id for the segment when diarization is enabled, else None.
    speaker: Optional[str] = None


@dataclass
class VoxtralApiResult:
    """Result from Voxtral API transcription."""
    # Full transcription text.
    text: str
    # Language: value reported by the API, or the request's language hint.
    language: Optional[str] = None
    # Model identifier, e.g. "voxtral-api-voxtral-mini-latest".
    model: str = "voxtral-api"
    # Audio duration as reported by the API, if present.
    duration_seconds: Optional[float] = None
    # Word-level timestamps (populated when timestamp_granularity="word").
    words: list[WordTimestamp] = field(default_factory=list)
    # Segment-level timestamps (populated when timestamp_granularity="segment").
    segments: list[SegmentTimestamp] = field(default_factory=list)
    # Speaker turns (populated when diarization=True).
    speakers: list[Speaker] = field(default_factory=list)
|
||||
|
||||
|
||||
def get_mistral_client():
    """Return the process-wide Mistral client, creating it on first use.

    Raises:
        RuntimeError: if MISTRAL_API_KEY is unset or the mistralai
            package is not installed.
    """
    global _mistral_client

    # Fast path: client already built by an earlier call.
    if _mistral_client is not None:
        return _mistral_client

    if not MISTRAL_API_KEY:
        raise RuntimeError(
            "MISTRAL_API_KEY environment variable not set. "
            "Get your API key at https://console.mistral.ai/"
        )

    try:
        from mistralai import Mistral
    except ImportError:
        raise RuntimeError(
            "mistralai package not installed. "
            "Run: pip install mistralai"
        )

    _mistral_client = Mistral(api_key=MISTRAL_API_KEY)
    logger.info("Mistral API client initialized")
    return _mistral_client
|
||||
|
||||
|
||||
def is_available() -> bool:
    """Report whether the Mistral API fallback is usable (API key present)."""
    has_key = bool(MISTRAL_API_KEY)
    return has_key
|
||||
|
||||
|
||||
async def transcribe_audio_bytes(
    audio_bytes: bytes,
    filename: str,
    language: Optional[str] = None,
    timestamp_granularity: Optional[Literal["word", "segment"]] = None,
    diarization: bool = False,
    context_bias: Optional[list[str]] = None,
) -> VoxtralApiResult:
    """
    Transcribe audio using Mistral's Voxtral API.

    Args:
        audio_bytes: Raw audio bytes
        filename: Original filename (for extension detection)
        language: Language code (de, en, fr, etc.) - auto-detect if None
        timestamp_granularity: "word" or "segment" for timestamps
        diarization: Enable speaker diarization
        context_bias: List of domain-specific terms to improve accuracy (max 100)

    Returns:
        VoxtralApiResult with transcription and optional metadata

    Raises:
        RuntimeError: if the Mistral client is not configured (see
            get_mistral_client).
        Exception: any error from the Mistral SDK is logged and re-raised.
    """
    client = get_mistral_client()

    logger.info(f"Transcribing via Mistral API: {filename} ({len(audio_bytes)} bytes)")

    try:
        # Build request parameters
        request_params = {
            "model": DEFAULT_MODEL,
            "file": {
                "content": audio_bytes,
                "file_name": filename,
            },
        }

        # Language and timestamps are mutually exclusive in the current API,
        # so the language hint is dropped when timestamps are requested.
        if language and not timestamp_granularity:
            request_params["language"] = language

        if timestamp_granularity:
            request_params["timestamp_granularities"] = [timestamp_granularity]

        if diarization:
            request_params["diarization"] = True

        if context_bias:
            # API accepts comma-separated string, max 100 terms
            request_params["context_bias"] = ",".join(context_bias[:100])

        # NOTE(review): this SDK call is synchronous and blocks the event loop
        # for the duration of the request — consider asyncio.to_thread here.
        response = client.audio.transcriptions.complete(**request_params)

        result = VoxtralApiResult(
            text=response.text,
            language=getattr(response, "language", language),
            model=f"voxtral-api-{DEFAULT_MODEL}",
            duration_seconds=getattr(response, "duration", None),
        )

        # Optional metadata: each attribute is only present for the
        # corresponding request option, so probe defensively.
        if getattr(response, "words", None):
            result.words = [
                WordTimestamp(word=w.word, start=w.start, end=w.end)
                for w in response.words
            ]

        if getattr(response, "segments", None):
            result.segments = [
                SegmentTimestamp(
                    text=s.text,
                    start=s.start,
                    end=s.end,
                    speaker=getattr(s, "speaker", None),
                )
                for s in response.segments
            ]

        if getattr(response, "speakers", None):
            result.speakers = [
                Speaker(id=sp.id, start=sp.start, end=sp.end)
                for sp in response.speakers
            ]

        logger.info(f"Mistral API transcription complete: {len(result.text)} characters")
        return result

    except Exception:
        # logger.exception records the traceback, unlike the bare message.
        logger.exception("Mistral API transcription failed")
        raise
|
||||
|
||||
|
||||
# Languages supported by the Voxtral API (13): English, Chinese, Hindi,
# Spanish, Arabic, French, Portuguese, Russian, German, Japanese, Korean,
# Italian, Dutch.
SUPPORTED_LANGUAGES = [
    "en", "zh", "hi", "es", "ar", "fr", "pt", "ru", "de", "ja", "ko", "it", "nl",
]
|
||||
|
|
@ -1,12 +1,15 @@
|
|||
"""
|
||||
Voxtral STT Service using Hugging Face Transformers
|
||||
Mistral AI's Speech-to-Text model (Apache 2.0 License)
|
||||
|
||||
Uses VoxtralForConditionalGeneration with apply_transcription_request
|
||||
as per official HuggingFace documentation.
|
||||
"""
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
import logging
|
||||
import base64
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from dataclasses import dataclass
|
||||
|
|
@ -16,68 +19,80 @@ logger = logging.getLogger(__name__)
|
|||
# Lazy load to avoid import errors
|
||||
_voxtral_model = None
|
||||
_voxtral_processor = None
|
||||
_model_name = None
|
||||
|
||||
# Default model
|
||||
DEFAULT_MODEL = "mistralai/Voxtral-Mini-3B-2507"
|
||||
|
||||
|
||||
@dataclass
|
||||
class VoxtralTranscriptionResult:
|
||||
text: str
|
||||
language: Optional[str] = None
|
||||
model: str = "voxtral-mini"
|
||||
model: str = "voxtral-mini-3b"
|
||||
latency_ms: Optional[float] = None
|
||||
|
||||
|
||||
def get_voxtral_model(model_name: str = "mistralai/Voxtral-Mini-3B-2507"):
|
||||
def get_voxtral_model(model_name: str = DEFAULT_MODEL):
|
||||
"""
|
||||
Get or create Voxtral model instance.
|
||||
|
||||
Note: Voxtral Mini (3B) is recommended for Mac Mini M4.
|
||||
Voxtral Small (24B) requires more VRAM.
|
||||
Uses VoxtralForConditionalGeneration (the correct class for Voxtral).
|
||||
"""
|
||||
global _voxtral_model, _voxtral_processor
|
||||
global _voxtral_model, _voxtral_processor, _model_name
|
||||
|
||||
# Reload if different model requested
|
||||
if _voxtral_model is not None and _model_name != model_name:
|
||||
logger.info(f"Switching model from {_model_name} to {model_name}")
|
||||
_voxtral_model = None
|
||||
_voxtral_processor = None
|
||||
|
||||
if _voxtral_model is None:
|
||||
logger.info(f"Loading Voxtral model: {model_name}")
|
||||
try:
|
||||
import torch
|
||||
from transformers import AutoModel, AutoProcessor
|
||||
from transformers import VoxtralForConditionalGeneration, AutoProcessor
|
||||
|
||||
# Determine device
|
||||
# Determine device and dtype
|
||||
if torch.backends.mps.is_available():
|
||||
device = "mps"
|
||||
# MPS works better with float16
|
||||
torch_dtype = torch.float16
|
||||
elif torch.cuda.is_available():
|
||||
device = "cuda"
|
||||
torch_dtype = torch.float16
|
||||
torch_dtype = torch.bfloat16
|
||||
else:
|
||||
device = "cpu"
|
||||
torch_dtype = torch.float32
|
||||
|
||||
logger.info(f"Using device: {device}")
|
||||
logger.info(f"Using device: {device}, dtype: {torch_dtype}")
|
||||
|
||||
# Load processor
|
||||
_voxtral_processor = AutoProcessor.from_pretrained(
|
||||
model_name,
|
||||
trust_remote_code=True,
|
||||
)
|
||||
_voxtral_processor = AutoProcessor.from_pretrained(model_name)
|
||||
|
||||
# Load model - Voxtral uses AutoModel, not AutoModelForSpeechSeq2Seq
|
||||
_voxtral_model = AutoModel.from_pretrained(
|
||||
model_name,
|
||||
torch_dtype=torch_dtype,
|
||||
device_map="auto" if device != "mps" else None,
|
||||
trust_remote_code=True,
|
||||
)
|
||||
|
||||
# Move to MPS if available (device_map doesn't support MPS)
|
||||
# Load model with VoxtralForConditionalGeneration
|
||||
if device == "mps":
|
||||
# MPS doesn't support device_map, load to CPU first then move
|
||||
_voxtral_model = VoxtralForConditionalGeneration.from_pretrained(
|
||||
model_name,
|
||||
torch_dtype=torch_dtype,
|
||||
)
|
||||
_voxtral_model = _voxtral_model.to(device)
|
||||
else:
|
||||
_voxtral_model = VoxtralForConditionalGeneration.from_pretrained(
|
||||
model_name,
|
||||
torch_dtype=torch_dtype,
|
||||
device_map=device,
|
||||
)
|
||||
|
||||
_model_name = model_name
|
||||
logger.info(f"Voxtral model loaded successfully on {device}")
|
||||
|
||||
except ImportError as e:
|
||||
logger.error(f"Failed to import transformers: {e}")
|
||||
raise RuntimeError(
|
||||
"transformers not installed. "
|
||||
"Run: pip install transformers torch"
|
||||
"transformers >= 4.54.0 required. "
|
||||
"Run: pip install --upgrade transformers"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to load Voxtral model: {e}")
|
||||
|
|
@ -89,17 +104,16 @@ def get_voxtral_model(model_name: str = "mistralai/Voxtral-Mini-3B-2507"):
|
|||
def transcribe_audio(
|
||||
audio_path: str,
|
||||
language: Optional[str] = "de",
|
||||
model_name: str = "mistralai/Voxtral-Mini-3B-2507",
|
||||
model_name: str = DEFAULT_MODEL,
|
||||
) -> VoxtralTranscriptionResult:
|
||||
"""
|
||||
Transcribe audio file using Voxtral.
|
||||
|
||||
Voxtral is a multimodal audio understanding model that can be prompted
|
||||
for transcription tasks.
|
||||
Uses the official apply_transcription_request method.
|
||||
|
||||
Args:
|
||||
audio_path: Path to audio file
|
||||
language: Target language for transcription
|
||||
language: Language code (de, en, fr, etc.)
|
||||
model_name: Hugging Face model ID
|
||||
|
||||
Returns:
|
||||
|
|
@ -108,84 +122,49 @@ def transcribe_audio(
|
|||
import torch
|
||||
|
||||
model, processor = get_voxtral_model(model_name)
|
||||
device = next(model.parameters()).device
|
||||
dtype = next(model.parameters()).dtype
|
||||
|
||||
logger.info(f"Transcribing with Voxtral: {audio_path}")
|
||||
start_time = time.time()
|
||||
|
||||
try:
|
||||
# Load audio file as bytes and encode to base64
|
||||
with open(audio_path, "rb") as f:
|
||||
audio_bytes = f.read()
|
||||
audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")
|
||||
|
||||
# Determine audio format from extension
|
||||
ext = Path(audio_path).suffix.lower()
|
||||
mime_types = {
|
||||
".wav": "audio/wav",
|
||||
".mp3": "audio/mpeg",
|
||||
".m4a": "audio/m4a",
|
||||
".flac": "audio/flac",
|
||||
".ogg": "audio/ogg",
|
||||
".webm": "audio/webm",
|
||||
}
|
||||
mime_type = mime_types.get(ext, "audio/wav")
|
||||
|
||||
# Language mapping for prompts
|
||||
lang_names = {
|
||||
"de": "German",
|
||||
"en": "English",
|
||||
"fr": "French",
|
||||
"es": "Spanish",
|
||||
"pt": "Portuguese",
|
||||
"it": "Italian",
|
||||
"nl": "Dutch",
|
||||
"hi": "Hindi",
|
||||
}
|
||||
lang_name = lang_names.get(language, "German")
|
||||
|
||||
# Create transcription prompt with base64 audio
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "audio_url", "audio_url": {"url": f"data:{mime_type};base64,{audio_base64}"}},
|
||||
{"type": "text", "text": f"Transcribe this audio in {lang_name}. Only output the transcription, nothing else."},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
# Apply chat template and process inputs
|
||||
inputs = processor.apply_chat_template(
|
||||
messages,
|
||||
tokenize=True,
|
||||
return_tensors="pt",
|
||||
return_dict=True,
|
||||
# Use apply_transcription_request (official method)
|
||||
# This handles audio loading and preprocessing internally
|
||||
inputs = processor.apply_transcription_request(
|
||||
language=language or "en",
|
||||
audio=audio_path,
|
||||
model_id=model_name,
|
||||
)
|
||||
|
||||
# Move to same device as model
|
||||
device = next(model.parameters()).device
|
||||
inputs = {k: v.to(device) if hasattr(v, 'to') else v for k, v in inputs.items()}
|
||||
# Move inputs to device and dtype
|
||||
inputs = inputs.to(device, dtype=dtype)
|
||||
|
||||
# Generate transcription
|
||||
with torch.no_grad():
|
||||
generated_ids = model.generate(
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=512,
|
||||
max_new_tokens=500,
|
||||
do_sample=False,
|
||||
)
|
||||
|
||||
# Decode only the generated tokens (exclude input)
|
||||
input_len = inputs["input_ids"].shape[-1]
|
||||
text = processor.batch_decode(
|
||||
generated_ids[:, input_len:],
|
||||
# Decode - skip input tokens
|
||||
input_len = inputs.input_ids.shape[1]
|
||||
decoded = processor.batch_decode(
|
||||
outputs[:, input_len:],
|
||||
skip_special_tokens=True,
|
||||
)[0]
|
||||
)
|
||||
|
||||
logger.info(f"Voxtral transcription complete: {len(text)} characters")
|
||||
text = decoded[0] if decoded else ""
|
||||
latency_ms = (time.time() - start_time) * 1000
|
||||
|
||||
logger.info(f"Voxtral transcription complete: {len(text)} chars in {latency_ms:.0f}ms")
|
||||
|
||||
return VoxtralTranscriptionResult(
|
||||
text=text.strip(),
|
||||
language=language,
|
||||
model="voxtral-mini",
|
||||
model=model_name.split("/")[-1],
|
||||
latency_ms=latency_ms,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
|
|
@ -197,7 +176,7 @@ async def transcribe_audio_bytes(
|
|||
audio_bytes: bytes,
|
||||
filename: str,
|
||||
language: Optional[str] = "de",
|
||||
model_name: str = "mistralai/Voxtral-Mini-3B-2507",
|
||||
model_name: str = DEFAULT_MODEL,
|
||||
) -> VoxtralTranscriptionResult:
|
||||
"""
|
||||
Transcribe audio from bytes (for API uploads).
|
||||
|
|
@ -222,14 +201,67 @@ async def transcribe_audio_bytes(
|
|||
pass
|
||||
|
||||
|
||||
# Supported languages by Voxtral
|
||||
def unload_model():
|
||||
"""Unload model to free memory."""
|
||||
global _voxtral_model, _voxtral_processor, _model_name
|
||||
|
||||
if _voxtral_model is not None:
|
||||
del _voxtral_model
|
||||
del _voxtral_processor
|
||||
_voxtral_model = None
|
||||
_voxtral_processor = None
|
||||
_model_name = None
|
||||
|
||||
import gc
|
||||
gc.collect()
|
||||
|
||||
try:
|
||||
import torch
|
||||
if torch.backends.mps.is_available():
|
||||
torch.mps.empty_cache()
|
||||
elif torch.cuda.is_available():
|
||||
torch.cuda.empty_cache()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
logger.info("Voxtral model unloaded")
|
||||
|
||||
|
||||
def is_loaded() -> bool:
|
||||
"""Check if model is currently loaded."""
|
||||
return _voxtral_model is not None
|
||||
|
||||
|
||||
def get_loaded_model_name() -> Optional[str]:
|
||||
"""Get name of currently loaded model."""
|
||||
return _model_name
|
||||
|
||||
|
||||
# Supported languages (13 languages as per Mistral docs)
|
||||
SUPPORTED_LANGUAGES = [
|
||||
"en", # English
|
||||
"de", # German
|
||||
"fr", # French
|
||||
"zh", # Chinese
|
||||
"hi", # Hindi
|
||||
"es", # Spanish
|
||||
"ar", # Arabic
|
||||
"fr", # French
|
||||
"pt", # Portuguese
|
||||
"ru", # Russian
|
||||
"de", # German
|
||||
"ja", # Japanese
|
||||
"ko", # Korean
|
||||
"it", # Italian
|
||||
"nl", # Dutch
|
||||
"hi", # Hindi
|
||||
]
|
||||
|
||||
# Available models
|
||||
AVAILABLE_MODELS = [
|
||||
{
|
||||
"id": "voxtral-mini-3b",
|
||||
"name": "Voxtral-Mini-3B-2507",
|
||||
"huggingface_id": "mistralai/Voxtral-Mini-3B-2507",
|
||||
"params": "3B",
|
||||
"vram": "~6GB",
|
||||
"description": "Balanced quality and speed for local deployment",
|
||||
},
|
||||
]
|
||||
|
|
|
|||
41
services/mana-stt/com.manacore.mana-stt.plist
Normal file
41
services/mana-stt/com.manacore.mana-stt.plist
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
||||
<plist version="1.0">
|
||||
<dict>
|
||||
<key>Label</key>
|
||||
<string>com.manacore.mana-stt</string>
|
||||
|
||||
<key>ProgramArguments</key>
|
||||
<array>
|
||||
<string>/bin/bash</string>
|
||||
<string>-c</string>
|
||||
<string>cd /Users/mana/projects/manacore-monorepo/services/mana-stt && .venv/bin/uvicorn app.main:app --host 0.0.0.0 --port 3020</string>
|
||||
</array>
|
||||
|
||||
<key>WorkingDirectory</key>
|
||||
<string>/Users/mana/projects/manacore-monorepo/services/mana-stt</string>
|
||||
|
||||
<key>EnvironmentVariables</key>
|
||||
<dict>
|
||||
<key>PATH</key>
|
||||
<string>/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin</string>
|
||||
<key>PORT</key>
|
||||
<string>3020</string>
|
||||
</dict>
|
||||
|
||||
<key>RunAtLoad</key>
|
||||
<true/>
|
||||
|
||||
<key>KeepAlive</key>
|
||||
<true/>
|
||||
|
||||
<key>StandardOutPath</key>
|
||||
<string>/Users/mana/logs/mana-stt.log</string>
|
||||
|
||||
<key>StandardErrorPath</key>
|
||||
<string>/Users/mana/logs/mana-stt.error.log</string>
|
||||
|
||||
<key>ThrottleInterval</key>
|
||||
<integer>10</integer>
|
||||
</dict>
|
||||
</plist>
|
||||
41
services/mana-stt/com.manacore.vllm-voxtral.plist
Normal file
41
services/mana-stt/com.manacore.vllm-voxtral.plist
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
||||
<plist version="1.0">
|
||||
<dict>
|
||||
<key>Label</key>
|
||||
<string>com.manacore.vllm-voxtral</string>
|
||||
|
||||
<key>ProgramArguments</key>
|
||||
<array>
|
||||
<string>/bin/bash</string>
|
||||
<string>-c</string>
|
||||
<string>cd /Users/mana/projects/manacore-monorepo/services/mana-stt && ./scripts/start-vllm-voxtral.sh</string>
|
||||
</array>
|
||||
|
||||
<key>WorkingDirectory</key>
|
||||
<string>/Users/mana/projects/manacore-monorepo/services/mana-stt</string>
|
||||
|
||||
<key>EnvironmentVariables</key>
|
||||
<dict>
|
||||
<key>PATH</key>
|
||||
<string>/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin</string>
|
||||
<key>VLLM_PORT</key>
|
||||
<string>8100</string>
|
||||
</dict>
|
||||
|
||||
<key>RunAtLoad</key>
|
||||
<true/>
|
||||
|
||||
<key>KeepAlive</key>
|
||||
<true/>
|
||||
|
||||
<key>StandardOutPath</key>
|
||||
<string>/Users/mana/logs/vllm-voxtral.log</string>
|
||||
|
||||
<key>StandardErrorPath</key>
|
||||
<string>/Users/mana/logs/vllm-voxtral.error.log</string>
|
||||
|
||||
<key>ThrottleInterval</key>
|
||||
<integer>30</integer>
|
||||
</dict>
|
||||
</plist>
|
||||
45
services/mana-stt/install-service.sh
Executable file
45
services/mana-stt/install-service.sh
Executable file
|
|
@ -0,0 +1,45 @@
|
|||
#!/bin/bash
|
||||
# Install mana-stt as a launchd service on macOS
|
||||
# Run this script on the Mac Mini server
|
||||
|
||||
set -e
|
||||
|
||||
SERVICE_NAME="com.manacore.mana-stt"
|
||||
PLIST_FILE="$SERVICE_NAME.plist"
|
||||
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||
LAUNCH_AGENTS_DIR="$HOME/Library/LaunchAgents"
|
||||
LOG_DIR="$HOME/logs"
|
||||
|
||||
echo "Installing mana-stt launchd service..."
|
||||
|
||||
# Create logs directory
|
||||
mkdir -p "$LOG_DIR"
|
||||
|
||||
# Stop existing service if running
|
||||
if launchctl list | grep -q "$SERVICE_NAME"; then
|
||||
echo "Stopping existing service..."
|
||||
launchctl unload "$LAUNCH_AGENTS_DIR/$PLIST_FILE" 2>/dev/null || true
|
||||
fi
|
||||
|
||||
# Copy plist to LaunchAgents
|
||||
cp "$SCRIPT_DIR/$PLIST_FILE" "$LAUNCH_AGENTS_DIR/"
|
||||
|
||||
# Load the service
|
||||
echo "Loading service..."
|
||||
launchctl load "$LAUNCH_AGENTS_DIR/$PLIST_FILE"
|
||||
|
||||
# Check status
|
||||
sleep 2
|
||||
if launchctl list | grep -q "$SERVICE_NAME"; then
|
||||
echo "Service installed and running!"
|
||||
echo ""
|
||||
echo "Useful commands:"
|
||||
echo " View logs: tail -f $LOG_DIR/mana-stt.log"
|
||||
echo " View errors: tail -f $LOG_DIR/mana-stt.error.log"
|
||||
echo " Stop: launchctl unload $LAUNCH_AGENTS_DIR/$PLIST_FILE"
|
||||
echo " Start: launchctl load $LAUNCH_AGENTS_DIR/$PLIST_FILE"
|
||||
echo " Health check: curl http://localhost:3020/health"
|
||||
else
|
||||
echo "ERROR: Service failed to start. Check logs at $LOG_DIR/mana-stt.error.log"
|
||||
exit 1
|
||||
fi
|
||||
84
services/mana-stt/install-services.sh
Executable file
84
services/mana-stt/install-services.sh
Executable file
|
|
@ -0,0 +1,84 @@
|
|||
#!/bin/bash
|
||||
# Install mana-stt and vllm-voxtral as launchd services on macOS
|
||||
# Run this script on the Mac Mini server
|
||||
|
||||
set -e
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||
LAUNCH_AGENTS_DIR="$HOME/Library/LaunchAgents"
|
||||
LOG_DIR="$HOME/logs"
|
||||
|
||||
echo "============================================"
|
||||
echo "Installing ManaCore STT Services"
|
||||
echo "============================================"
|
||||
echo ""
|
||||
|
||||
# Create logs directory
|
||||
mkdir -p "$LOG_DIR"
|
||||
|
||||
install_service() {
|
||||
local service_name="$1"
|
||||
local plist_file="$service_name.plist"
|
||||
|
||||
echo "Installing $service_name..."
|
||||
|
||||
# Stop existing service if running
|
||||
if launchctl list | grep -q "$service_name"; then
|
||||
echo " Stopping existing service..."
|
||||
launchctl unload "$LAUNCH_AGENTS_DIR/$plist_file" 2>/dev/null || true
|
||||
fi
|
||||
|
||||
# Copy plist to LaunchAgents
|
||||
cp "$SCRIPT_DIR/$plist_file" "$LAUNCH_AGENTS_DIR/"
|
||||
|
||||
# Load the service
|
||||
echo " Loading service..."
|
||||
launchctl load "$LAUNCH_AGENTS_DIR/$plist_file"
|
||||
|
||||
sleep 2
|
||||
if launchctl list | grep -q "$service_name"; then
|
||||
echo " ✓ $service_name installed and running"
|
||||
else
|
||||
echo " ✗ $service_name failed to start"
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Install vLLM first (STT depends on it)
|
||||
install_service "com.manacore.vllm-voxtral"
|
||||
|
||||
# Wait for vLLM to initialize
|
||||
echo ""
|
||||
echo "Waiting for vLLM server to initialize..."
|
||||
for i in {1..30}; do
|
||||
if curl -s http://localhost:8100/health > /dev/null 2>&1; then
|
||||
echo " ✓ vLLM server is ready"
|
||||
break
|
||||
fi
|
||||
if [ $i -eq 30 ]; then
|
||||
echo " ! vLLM server not responding yet (may still be loading model)"
|
||||
fi
|
||||
sleep 2
|
||||
done
|
||||
|
||||
# Install STT service
|
||||
echo ""
|
||||
install_service "com.manacore.mana-stt"
|
||||
|
||||
echo ""
|
||||
echo "============================================"
|
||||
echo "Installation complete!"
|
||||
echo "============================================"
|
||||
echo ""
|
||||
echo "Services:"
|
||||
echo " vLLM Voxtral: http://localhost:8100"
|
||||
echo " ManaCore STT: http://localhost:3020"
|
||||
echo ""
|
||||
echo "Useful commands:"
|
||||
echo " View vLLM logs: tail -f $LOG_DIR/vllm-voxtral.log"
|
||||
echo " View STT logs: tail -f $LOG_DIR/mana-stt.log"
|
||||
echo " Health check: curl http://localhost:3020/health"
|
||||
echo ""
|
||||
echo "Stop all:"
|
||||
echo " launchctl unload $LAUNCH_AGENTS_DIR/com.manacore.vllm-voxtral.plist"
|
||||
echo " launchctl unload $LAUNCH_AGENTS_DIR/com.manacore.mana-stt.plist"
|
||||
83
services/mana-stt/scripts/setup-vllm.sh
Executable file
83
services/mana-stt/scripts/setup-vllm.sh
Executable file
|
|
@ -0,0 +1,83 @@
|
|||
#!/bin/bash
|
||||
# Setup vLLM for Voxtral on Mac Mini M4
|
||||
#
|
||||
# vLLM runs in CPU mode on macOS (no CUDA), but still provides
|
||||
# the optimized inference pipeline for Voxtral models.
|
||||
#
|
||||
# Usage: ./scripts/setup-vllm.sh
|
||||
|
||||
set -e
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
SERVICE_DIR="$(dirname "$SCRIPT_DIR")"
|
||||
VENV_DIR="$SERVICE_DIR/.venv-vllm"
|
||||
|
||||
echo "============================================"
|
||||
echo "vLLM Setup for Voxtral on Mac Mini M4"
|
||||
echo "============================================"
|
||||
echo ""
|
||||
|
||||
# Check Python version
|
||||
PYTHON_VERSION=$(python3 --version 2>&1 | awk '{print $2}')
|
||||
PYTHON_MAJOR=$(echo $PYTHON_VERSION | cut -d. -f1)
|
||||
PYTHON_MINOR=$(echo $PYTHON_VERSION | cut -d. -f2)
|
||||
|
||||
if [[ "$PYTHON_MAJOR" -lt 3 ]] || [[ "$PYTHON_MAJOR" -eq 3 && "$PYTHON_MINOR" -lt 10 ]]; then
|
||||
echo "Error: Python 3.10+ required (found $PYTHON_VERSION)"
|
||||
exit 1
|
||||
fi
|
||||
echo "Python version: $PYTHON_VERSION"
|
||||
|
||||
# Create separate venv for vLLM (to avoid conflicts with whisper)
|
||||
echo ""
|
||||
echo "Creating virtual environment for vLLM..."
|
||||
python3 -m venv "$VENV_DIR"
|
||||
source "$VENV_DIR/bin/activate"
|
||||
|
||||
# Upgrade pip
|
||||
pip install --upgrade pip --quiet
|
||||
|
||||
# Install vLLM with audio support
|
||||
echo ""
|
||||
echo "Installing vLLM with audio support..."
|
||||
echo "This may take a few minutes..."
|
||||
|
||||
# Install uv for faster package installation
|
||||
pip install uv --quiet
|
||||
|
||||
# Install vLLM with audio support (nightly for best Voxtral support)
|
||||
uv pip install "vllm[audio]>=0.10.0" --extra-index-url https://wheels.vllm.ai/nightly 2>&1 || {
|
||||
echo "Nightly install failed, trying stable..."
|
||||
uv pip install "vllm[audio]>=0.10.0"
|
||||
}
|
||||
|
||||
# Install mistral-common with audio
|
||||
uv pip install "mistral-common[audio]>=1.8.1"
|
||||
|
||||
echo ""
|
||||
echo "============================================"
|
||||
echo "Installation complete!"
|
||||
echo "============================================"
|
||||
echo ""
|
||||
echo "To start Voxtral Mini 3B server:"
|
||||
echo " source $VENV_DIR/bin/activate"
|
||||
echo " vllm serve mistralai/Voxtral-Mini-3B-2507 \\"
|
||||
echo " --tokenizer_mode mistral \\"
|
||||
echo " --config_format mistral \\"
|
||||
echo " --load_format mistral \\"
|
||||
echo " --host 0.0.0.0 \\"
|
||||
echo " --port 8100"
|
||||
echo ""
|
||||
echo "To start Voxtral Realtime 4B server:"
|
||||
echo " source $VENV_DIR/bin/activate"
|
||||
echo " vllm serve mistralai/Voxtral-Mini-4B-Realtime-2602 \\"
|
||||
echo " --host 0.0.0.0 \\"
|
||||
echo " --port 8100"
|
||||
echo ""
|
||||
echo "API Endpoint: http://localhost:8100/v1/audio/transcriptions"
|
||||
echo ""
|
||||
echo "Test with:"
|
||||
echo " curl http://localhost:8100/v1/audio/transcriptions \\"
|
||||
echo " -F file=@test.mp3 \\"
|
||||
echo " -F model=mistralai/Voxtral-Mini-3B-2507 \\"
|
||||
echo " -F language=de"
|
||||
36
services/mana-stt/scripts/start-vllm-voxtral.sh
Executable file
36
services/mana-stt/scripts/start-vllm-voxtral.sh
Executable file
|
|
@ -0,0 +1,36 @@
|
|||
#!/bin/bash
|
||||
# Start vLLM server for Voxtral
|
||||
#
|
||||
# Usage: ./scripts/start-vllm-voxtral.sh [model]
|
||||
# model: "3b" (default) or "4b" for Realtime
|
||||
|
||||
set -e
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
SERVICE_DIR="$(dirname "$SCRIPT_DIR")"
|
||||
VENV_DIR="$SERVICE_DIR/.venv-vllm"
|
||||
MODEL="${1:-3b}"
|
||||
PORT="${VLLM_PORT:-8100}"
|
||||
|
||||
# Activate venv
|
||||
source "$VENV_DIR/bin/activate"
|
||||
|
||||
echo "Starting vLLM Voxtral server..."
|
||||
echo "Port: $PORT"
|
||||
|
||||
if [[ "$MODEL" == "4b" || "$MODEL" == "realtime" ]]; then
|
||||
echo "Model: Voxtral Mini 4B Realtime"
|
||||
exec vllm serve mistralai/Voxtral-Mini-4B-Realtime-2602 \
|
||||
--host 0.0.0.0 \
|
||||
--port "$PORT" \
|
||||
--max-model-len 8192
|
||||
else
|
||||
echo "Model: Voxtral Mini 3B"
|
||||
exec vllm serve mistralai/Voxtral-Mini-3B-2507 \
|
||||
--tokenizer_mode mistral \
|
||||
--config_format mistral \
|
||||
--load_format mistral \
|
||||
--host 0.0.0.0 \
|
||||
--port "$PORT" \
|
||||
--max-model-len 32768
|
||||
fi
|
||||
Loading…
Add table
Add a link
Reference in a new issue