managarten/services/mana-stt/app/main.py

309 lines
8.7 KiB
Python

"""
ManaCore STT API Service
Speech-to-Text with Whisper (MLX) and Voxtral
Run with: uvicorn app.main:app --host 0.0.0.0 --port 3020
"""
import os
import logging
from typing import Optional
from contextlib import asynccontextmanager
from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Query
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from pydantic import BaseModel
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
# Environment
PORT = int(os.getenv("PORT", "3020"))
DEFAULT_WHISPER_MODEL = os.getenv("WHISPER_MODEL", "large-v3")
PRELOAD_MODELS = os.getenv("PRELOAD_MODELS", "false").lower() == "true"
CORS_ORIGINS = os.getenv(
"CORS_ORIGINS",
"https://mana.how,https://chat.mana.how,http://localhost:5173"
).split(",")
# Response models
class TranscriptionResponse(BaseModel):
text: str
language: Optional[str] = None
model: str
duration_seconds: Optional[float] = None
class HealthResponse(BaseModel):
status: str
whisper_loaded: bool
voxtral_loaded: bool
models: dict
class ModelsResponse(BaseModel):
whisper: list
voxtral: list
default_whisper: str
# Track loaded models
models_status = {
"whisper_loaded": False,
"voxtral_loaded": False,
}
@asynccontextmanager
async def lifespan(app: FastAPI):
"""Startup and shutdown events."""
logger.info("Starting ManaCore STT Service...")
# Optionally preload models on startup
if PRELOAD_MODELS:
logger.info("Preloading models (PRELOAD_MODELS=true)...")
try:
from app.whisper_service import get_whisper_model
get_whisper_model(DEFAULT_WHISPER_MODEL)
models_status["whisper_loaded"] = True
logger.info("Whisper model preloaded")
except Exception as e:
logger.warning(f"Failed to preload Whisper: {e}")
try:
from app.voxtral_service import get_voxtral_model
get_voxtral_model()
models_status["voxtral_loaded"] = True
logger.info("Voxtral model preloaded")
except Exception as e:
logger.warning(f"Failed to preload Voxtral: {e}")
else:
logger.info("Models will be loaded on first request (lazy loading)")
logger.info(f"STT Service ready on port {PORT}")
yield
logger.info("Shutting down STT Service...")
# Create FastAPI app
app = FastAPI(
title="ManaCore STT Service",
description="Speech-to-Text API with Whisper (MLX) and Voxtral",
version="1.0.0",
lifespan=lifespan,
)
# CORS middleware
app.add_middleware(
CORSMiddleware,
allow_origins=CORS_ORIGINS,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
@app.get("/health", response_model=HealthResponse)
async def health_check():
"""Health check endpoint."""
return HealthResponse(
status="healthy",
whisper_loaded=models_status["whisper_loaded"],
voxtral_loaded=models_status["voxtral_loaded"],
models={
"default_whisper": DEFAULT_WHISPER_MODEL,
},
)
@app.get("/models", response_model=ModelsResponse)
async def list_models():
"""List available models."""
from app.whisper_service import AVAILABLE_MODELS as whisper_models
from app.voxtral_service import SUPPORTED_LANGUAGES as voxtral_languages
return ModelsResponse(
whisper=whisper_models,
voxtral=voxtral_languages,
default_whisper=DEFAULT_WHISPER_MODEL,
)
@app.post("/transcribe", response_model=TranscriptionResponse)
async def transcribe_whisper(
file: UploadFile = File(..., description="Audio file to transcribe"),
language: Optional[str] = Form(
None,
description="Language code (e.g., 'de', 'en'). Auto-detect if not provided."
),
model: str = Form(
None,
description="Whisper model to use (default: large-v3-turbo)"
),
):
"""
Transcribe audio using Whisper (Lightning MLX).
Supported formats: mp3, wav, m4a, flac, ogg, webm
Max file size: 100MB
"""
if not file.filename:
raise HTTPException(status_code=400, detail="No file provided")
# Validate file type
allowed_extensions = {".mp3", ".wav", ".m4a", ".flac", ".ogg", ".webm", ".mp4"}
ext = os.path.splitext(file.filename)[1].lower()
if ext not in allowed_extensions:
raise HTTPException(
status_code=400,
detail=f"Unsupported file type: {ext}. Allowed: {allowed_extensions}"
)
try:
from app.whisper_service import transcribe_audio_bytes
# Read file
audio_bytes = await file.read()
# Check file size (100MB limit)
if len(audio_bytes) > 100 * 1024 * 1024:
raise HTTPException(status_code=400, detail="File too large (max 100MB)")
# Use default model if not specified
model_name = model or DEFAULT_WHISPER_MODEL
# Transcribe
result = await transcribe_audio_bytes(
audio_bytes=audio_bytes,
filename=file.filename,
language=language,
model_name=model_name,
)
models_status["whisper_loaded"] = True
return TranscriptionResponse(
text=result.text,
language=result.language,
model=f"whisper-{model_name}",
)
except Exception as e:
logger.error(f"Transcription error: {e}")
raise HTTPException(status_code=500, detail=str(e))
@app.post("/transcribe/voxtral", response_model=TranscriptionResponse)
async def transcribe_voxtral(
file: UploadFile = File(..., description="Audio file to transcribe"),
language: str = Form(
"de",
description="Language code (de, en, fr, es, pt, it, nl, hi)"
),
):
"""
Transcribe audio using Voxtral Mini (Mistral AI).
Best for: German, French, European languages
Supported formats: mp3, wav, m4a, flac
Max file size: 100MB
"""
if not file.filename:
raise HTTPException(status_code=400, detail="No file provided")
# Validate language
from app.voxtral_service import SUPPORTED_LANGUAGES
if language not in SUPPORTED_LANGUAGES:
raise HTTPException(
status_code=400,
detail=f"Unsupported language: {language}. Supported: {SUPPORTED_LANGUAGES}"
)
try:
from app.voxtral_service import transcribe_audio_bytes
audio_bytes = await file.read()
if len(audio_bytes) > 100 * 1024 * 1024:
raise HTTPException(status_code=400, detail="File too large (max 100MB)")
result = await transcribe_audio_bytes(
audio_bytes=audio_bytes,
filename=file.filename,
language=language,
)
models_status["voxtral_loaded"] = True
return TranscriptionResponse(
text=result.text,
language=result.language,
model=result.model,
)
except Exception as e:
logger.error(f"Voxtral transcription error: {e}")
raise HTTPException(status_code=500, detail=str(e))
@app.post("/transcribe/auto", response_model=TranscriptionResponse)
async def transcribe_auto(
file: UploadFile = File(..., description="Audio file to transcribe"),
language: Optional[str] = Form(
None,
description="Language hint (optional)"
),
prefer: str = Form(
"whisper",
description="Preferred model: 'whisper' or 'voxtral'"
),
):
"""
Transcribe audio with automatic model selection.
- Uses Whisper by default (faster, more languages)
- Falls back to Voxtral if Whisper fails
"""
if prefer == "voxtral":
# Try Voxtral first
try:
return await transcribe_voxtral(file, language or "de")
except Exception as e:
logger.warning(f"Voxtral failed, trying Whisper: {e}")
# Reset file position
await file.seek(0)
return await transcribe_whisper(file, language, None)
else:
# Try Whisper first (default)
try:
return await transcribe_whisper(file, language, None)
except Exception as e:
logger.warning(f"Whisper failed, trying Voxtral: {e}")
await file.seek(0)
return await transcribe_voxtral(file, language or "de")
# Error handlers
@app.exception_handler(Exception)
async def global_exception_handler(request, exc):
logger.error(f"Unhandled error: {exc}")
return JSONResponse(
status_code=500,
content={"detail": "Internal server error", "error": str(exc)},
)
if __name__ == "__main__":
import uvicorn
uvicorn.run(
"app.main:app",
host="0.0.0.0",
port=PORT,
reload=False,
)