mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-16 07:59:39 +02:00
mana-stt: add WhisperX service with CUDA GPU support, speaker diarization, and auto-fallback chain. mana-notify: add locale fallback and default templates for task reminders. CD: update deployment pipeline and docker-compose configuration. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
684 lines
22 KiB
Python
"""
|
|
ManaCore STT API Service
|
|
Speech-to-Text with Whisper (MLX), WhisperX (CUDA), Voxtral (vLLM), and Mistral API (fallback)
|
|
|
|
Run with: uvicorn app.main:app --host 0.0.0.0 --port 3020
|
|
"""
|
|
|
|
import os
|
|
import logging
|
|
import time
|
|
from typing import Optional
|
|
from contextlib import asynccontextmanager
|
|
|
|
from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Depends, Response
|
|
from fastapi.middleware.cors import CORSMiddleware
|
|
from fastapi.responses import JSONResponse
|
|
from pydantic import BaseModel
|
|
|
|
from app.auth import verify_api_key, AuthResult, get_api_key_stats, REQUIRE_AUTH
|
|
|
|
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# Environment
PORT = int(os.getenv("PORT", "3020"))
DEFAULT_WHISPER_MODEL = os.getenv("WHISPER_MODEL", "large-v3")
PRELOAD_MODELS = os.getenv("PRELOAD_MODELS", "false").lower() == "true"
CORS_ORIGINS = os.getenv(
    "CORS_ORIGINS",
    "https://mana.how,https://chat.mana.how,http://localhost:5173"
).split(",")

# vLLM configuration (disabled by default - has issues on macOS CPU)
VLLM_URL = os.getenv("VLLM_URL", "http://localhost:8100")
USE_VLLM = os.getenv("USE_VLLM", "false").lower() == "true"

# WhisperX configuration (CUDA GPU server)
USE_WHISPERX = os.getenv("USE_WHISPERX", "false").lower() == "true"

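# Example deployment profiles (illustrative values, not shipped defaults):
# a CUDA GPU box running the full stack might export
#
#   USE_WHISPERX=true
#   USE_VLLM=true
#   VLLM_URL=http://localhost:8100
#   WHISPER_MODEL=large-v3
#   PRELOAD_MODELS=true
#   HF_TOKEN=...            # pyannote access, needed for diarization
#
# while a CPU-only dev box can leave everything at its defaults and lean on
# the MLX Whisper path plus the Mistral API fallback (MISTRAL_API_KEY).
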
# Response models
class TranscriptionResponse(BaseModel):
    text: str
    language: Optional[str] = None
    model: str
    latency_ms: Optional[float] = None
    duration_seconds: Optional[float] = None


class WordTimestampResponse(BaseModel):
    word: str
    start: float
    end: float
    score: float = 0.0
    speaker: Optional[str] = None


class SegmentResponse(BaseModel):
    start: float
    end: float
    text: str
    speaker: Optional[str] = None
    words: list[WordTimestampResponse] = []


class UtteranceResponse(BaseModel):
    speaker: int
    text: str
    offset: int  # milliseconds
    duration: int  # milliseconds


class RichTranscriptionResponse(BaseModel):
    """Extended response with segments, utterances, and speaker diarization."""
    text: str
    language: Optional[str] = None
    model: str
    latency_ms: Optional[float] = None
    duration_seconds: Optional[float] = None
    segments: list[SegmentResponse] = []
    utterances: list[UtteranceResponse] = []
    speakers: dict[str, str] = {}
    speaker_map: dict[str, int] = {}
    languages: list[str] = []
    primary_language: Optional[str] = None
    words: list[WordTimestampResponse] = []


class HealthResponse(BaseModel):
    status: str
    whisper_loaded: bool
    whisperx_available: bool
    vllm_available: bool
    vllm_url: Optional[str] = None
    mistral_api_available: bool
    auth_required: bool
    models: dict


class ModelsResponse(BaseModel):
    whisper: list
    whisperx: list
    voxtral_vllm: list
    default_whisper: str

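# Response shape sketch for /transcribe/whisperx (values invented for
# illustration; field names follow the models above):
#
#   {
#     "text": "hello world",
#     "language": "en",
#     "model": "whisperx-large-v3",
#     "segments": [{"start": 0.0, "end": 1.2, "text": "hello world",
#                   "speaker": "SPEAKER_00", "words": [...]}],
#     "utterances": [{"speaker": 0, "text": "hello world",
#                     "offset": 0, "duration": 1200}],
#     "speaker_map": {"SPEAKER_00": 0},
#     "languages": ["en"], "primary_language": "en"
#   }
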
# Track loaded models
models_status = {
    "whisper_loaded": False,
    "whisperx_available": False,
    "vllm_available": False,
}


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Startup and shutdown events."""
    logger.info("Starting ManaCore STT Service...")

    # Check vLLM availability
    if USE_VLLM:
        from app.vllm_service import check_health
        health = await check_health()
        models_status["vllm_available"] = health.get("status") == "healthy"
        if models_status["vllm_available"]:
            logger.info(f"vLLM server available at {VLLM_URL}")
        else:
            logger.warning(f"vLLM server not available: {health}")

    # Check WhisperX availability
    if USE_WHISPERX:
        try:
            from app.whisperx_service import is_available as whisperx_available
            models_status["whisperx_available"] = whisperx_available()
            if models_status["whisperx_available"]:
                logger.info("WhisperX (CUDA) available")
            else:
                logger.warning("WhisperX not available (whisperx package not installed)")
        except Exception as e:
            logger.warning(f"WhisperX check failed: {e}")

    # Check Mistral API
    from app.voxtral_api_service import is_available as api_available
    if api_available():
        logger.info("Mistral API fallback configured")

    # Optionally preload Whisper
    if PRELOAD_MODELS:
        logger.info("Preloading Whisper model...")
        try:
            from app.whisper_service import get_whisper_model
            get_whisper_model(DEFAULT_WHISPER_MODEL)
            models_status["whisper_loaded"] = True
            logger.info("Whisper model preloaded")
        except Exception as e:
            logger.warning(f"Failed to preload Whisper: {e}")

    logger.info(f"STT Service ready on port {PORT}")
    yield
    logger.info("Shutting down STT Service...")

# Create FastAPI app
app = FastAPI(
    title="ManaCore STT Service",
    description="Speech-to-Text API with Whisper (MLX), WhisperX (CUDA), Voxtral (vLLM), and Mistral API",
    version="2.0.0",
    lifespan=lifespan,
)

# CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=CORS_ORIGINS,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

@app.get("/health", response_model=HealthResponse)
|
|
async def health_check():
|
|
"""Health check endpoint."""
|
|
from app.voxtral_api_service import is_available as api_available
|
|
from app.vllm_service import check_health
|
|
|
|
vllm_health = await check_health()
|
|
|
|
return HealthResponse(
|
|
status="healthy",
|
|
whisper_loaded=models_status["whisper_loaded"],
|
|
whisperx_available=models_status["whisperx_available"],
|
|
vllm_available=vllm_health.get("status") == "healthy",
|
|
vllm_url=VLLM_URL if USE_VLLM else None,
|
|
mistral_api_available=api_available(),
|
|
auth_required=REQUIRE_AUTH,
|
|
models={
|
|
"default_whisper": DEFAULT_WHISPER_MODEL,
|
|
},
|
|
)
|
|
|
|
|
|
@app.get("/models", response_model=ModelsResponse)
|
|
async def list_models(auth: AuthResult = Depends(verify_api_key)):
|
|
"""List available models."""
|
|
from app.whisper_service import AVAILABLE_MODELS as whisper_models
|
|
from app.vllm_service import get_models
|
|
|
|
vllm_models = await get_models()
|
|
|
|
whisperx_models = []
|
|
if USE_WHISPERX:
|
|
try:
|
|
from app.whisperx_service import AVAILABLE_MODELS as wx_models
|
|
whisperx_models = wx_models
|
|
except ImportError:
|
|
pass
|
|
|
|
return ModelsResponse(
|
|
whisper=whisper_models,
|
|
whisperx=whisperx_models,
|
|
voxtral_vllm=vllm_models,
|
|
default_whisper=DEFAULT_WHISPER_MODEL,
|
|
)
|
|
|
|
|
|
@app.post("/transcribe", response_model=TranscriptionResponse)
|
|
async def transcribe_whisper(
|
|
response: Response,
|
|
file: UploadFile = File(..., description="Audio file to transcribe"),
|
|
language: Optional[str] = Form(None, description="Language code (auto-detect if not provided)"),
|
|
model: Optional[str] = Form(None, description="Whisper model to use"),
|
|
auth: AuthResult = Depends(verify_api_key),
|
|
):
|
|
"""
|
|
Transcribe audio using Whisper (Lightning MLX).
|
|
|
|
Best for: General transcription, many languages
|
|
Supported formats: mp3, wav, m4a, flac, ogg, webm
|
|
Max file size: 100MB
|
|
"""
|
|
# Add rate limit headers
|
|
if auth.rate_limit_remaining is not None:
|
|
response.headers["X-RateLimit-Remaining"] = str(auth.rate_limit_remaining)
|
|
|
|
if not file.filename:
|
|
raise HTTPException(status_code=400, detail="No file provided")
|
|
|
|
allowed_extensions = {".mp3", ".wav", ".m4a", ".flac", ".ogg", ".webm", ".mp4"}
|
|
ext = os.path.splitext(file.filename)[1].lower()
|
|
if ext not in allowed_extensions:
|
|
raise HTTPException(
|
|
status_code=400,
|
|
detail=f"Unsupported file type: {ext}. Allowed: {allowed_extensions}"
|
|
)
|
|
|
|
start_time = time.time()
|
|
|
|
try:
|
|
from app.whisper_service import transcribe_audio_bytes
|
|
|
|
audio_bytes = await file.read()
|
|
if len(audio_bytes) > 100 * 1024 * 1024:
|
|
raise HTTPException(status_code=400, detail="File too large (max 100MB)")
|
|
|
|
model_name = model or DEFAULT_WHISPER_MODEL
|
|
|
|
result = await transcribe_audio_bytes(
|
|
audio_bytes=audio_bytes,
|
|
filename=file.filename,
|
|
language=language,
|
|
model_name=model_name,
|
|
)
|
|
|
|
models_status["whisper_loaded"] = True
|
|
latency_ms = (time.time() - start_time) * 1000
|
|
|
|
return TranscriptionResponse(
|
|
text=result.text,
|
|
language=result.language,
|
|
model=f"whisper-{model_name}",
|
|
latency_ms=latency_ms,
|
|
)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Whisper transcription error: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
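# Client sketch for POST /transcribe (hypothetical host and header name;
# see app.auth for the actual key scheme):
#
#   import httpx
#
#   with open("meeting.m4a", "rb") as f:
#       r = httpx.post(
#           "http://localhost:3020/transcribe",
#           headers={"Authorization": "Bearer <api-key>"},
#           files={"file": ("meeting.m4a", f, "audio/mp4")},
#           data={"language": "de"},
#       )
#   r.raise_for_status()
#   print(r.json()["text"])
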
@app.post("/transcribe/voxtral", response_model=TranscriptionResponse)
|
|
async def transcribe_voxtral(
|
|
response: Response,
|
|
file: UploadFile = File(..., description="Audio file to transcribe"),
|
|
language: str = Form("de", description="Language code"),
|
|
use_realtime: bool = Form(False, description="Use Realtime 4B model for lower latency"),
|
|
auth: AuthResult = Depends(verify_api_key),
|
|
):
|
|
"""
|
|
Transcribe audio using Voxtral via vLLM server.
|
|
|
|
Models:
|
|
- Voxtral Mini 3B (default): Best quality
|
|
- Voxtral Realtime 4B: Lower latency (<500ms)
|
|
|
|
Falls back to Mistral API if vLLM is unavailable.
|
|
|
|
Supported formats: mp3, wav, m4a, flac, ogg, webm
|
|
Max file size: 100MB
|
|
"""
|
|
# Add rate limit headers
|
|
if auth.rate_limit_remaining is not None:
|
|
response.headers["X-RateLimit-Remaining"] = str(auth.rate_limit_remaining)
|
|
|
|
if not file.filename:
|
|
raise HTTPException(status_code=400, detail="No file provided")
|
|
|
|
from app.vllm_service import (
|
|
SUPPORTED_LANGUAGES,
|
|
is_available as vllm_available,
|
|
transcribe_audio_bytes as vllm_transcribe,
|
|
transcribe_with_realtime,
|
|
check_health,
|
|
)
|
|
from app.voxtral_api_service import (
|
|
is_available as api_available,
|
|
transcribe_audio_bytes as api_transcribe,
|
|
)
|
|
|
|
if language not in SUPPORTED_LANGUAGES:
|
|
raise HTTPException(
|
|
status_code=400,
|
|
detail=f"Unsupported language: {language}. Supported: {SUPPORTED_LANGUAGES}"
|
|
)
|
|
|
|
try:
|
|
audio_bytes = await file.read()
|
|
if len(audio_bytes) > 100 * 1024 * 1024:
|
|
raise HTTPException(status_code=400, detail="File too large (max 100MB)")
|
|
|
|
# Try vLLM first
|
|
if USE_VLLM:
|
|
health = await check_health()
|
|
if health.get("status") == "healthy":
|
|
logger.info("Using vLLM for Voxtral transcription")
|
|
if use_realtime:
|
|
result = await transcribe_with_realtime(
|
|
audio_bytes=audio_bytes,
|
|
filename=file.filename,
|
|
language=language,
|
|
)
|
|
else:
|
|
result = await vllm_transcribe(
|
|
audio_bytes=audio_bytes,
|
|
filename=file.filename,
|
|
language=language,
|
|
)
|
|
|
|
return TranscriptionResponse(
|
|
text=result.text,
|
|
language=result.language,
|
|
model=result.model,
|
|
latency_ms=result.latency_ms,
|
|
duration_seconds=result.duration_seconds,
|
|
)
|
|
|
|
# Fallback to Mistral API
|
|
if api_available():
|
|
logger.info("Falling back to Mistral API")
|
|
result = await api_transcribe(
|
|
audio_bytes=audio_bytes,
|
|
filename=file.filename,
|
|
language=language,
|
|
)
|
|
|
|
return TranscriptionResponse(
|
|
text=result.text,
|
|
language=result.language,
|
|
model=result.model,
|
|
latency_ms=None,
|
|
duration_seconds=result.duration_seconds,
|
|
)
|
|
|
|
raise HTTPException(
|
|
status_code=503,
|
|
detail="Voxtral not available. Start vLLM server or configure MISTRAL_API_KEY."
|
|
)
|
|
|
|
except HTTPException:
|
|
raise
|
|
except Exception as e:
|
|
logger.error(f"Voxtral transcription error: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
@app.post("/transcribe/voxtral/api", response_model=TranscriptionResponse)
|
|
async def transcribe_voxtral_api(
|
|
response: Response,
|
|
file: UploadFile = File(..., description="Audio file to transcribe"),
|
|
language: Optional[str] = Form(None, description="Language code (auto-detect if not provided)"),
|
|
diarization: bool = Form(False, description="Enable speaker diarization"),
|
|
auth: AuthResult = Depends(verify_api_key),
|
|
):
|
|
"""
|
|
Transcribe audio using Mistral's Voxtral API directly.
|
|
|
|
Features:
|
|
- Speaker diarization
|
|
- Auto language detection
|
|
- High quality (~4% WER)
|
|
|
|
Requires MISTRAL_API_KEY environment variable.
|
|
"""
|
|
# Add rate limit headers
|
|
if auth.rate_limit_remaining is not None:
|
|
response.headers["X-RateLimit-Remaining"] = str(auth.rate_limit_remaining)
|
|
|
|
from app.voxtral_api_service import is_available, transcribe_audio_bytes
|
|
|
|
if not is_available():
|
|
raise HTTPException(
|
|
status_code=503,
|
|
detail="Mistral API not configured. Set MISTRAL_API_KEY environment variable."
|
|
)
|
|
|
|
if not file.filename:
|
|
raise HTTPException(status_code=400, detail="No file provided")
|
|
|
|
try:
|
|
audio_bytes = await file.read()
|
|
if len(audio_bytes) > 100 * 1024 * 1024:
|
|
raise HTTPException(status_code=400, detail="File too large (max 100MB)")
|
|
|
|
result = await transcribe_audio_bytes(
|
|
audio_bytes=audio_bytes,
|
|
filename=file.filename,
|
|
language=language,
|
|
diarization=diarization,
|
|
)
|
|
|
|
return TranscriptionResponse(
|
|
text=result.text,
|
|
language=result.language,
|
|
model=result.model,
|
|
duration_seconds=result.duration_seconds,
|
|
)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Mistral API error: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
@app.post("/transcribe/whisperx", response_model=RichTranscriptionResponse)
|
|
async def transcribe_whisperx(
|
|
response: Response,
|
|
file: UploadFile = File(..., description="Audio file to transcribe"),
|
|
language: Optional[str] = Form(None, description="Language code (auto-detect if not provided)"),
|
|
model: Optional[str] = Form(None, description="Whisper model to use"),
|
|
diarization: bool = Form(True, description="Enable speaker diarization"),
|
|
alignment: bool = Form(True, description="Enable word-level alignment"),
|
|
min_speakers: Optional[int] = Form(None, description="Minimum expected speakers"),
|
|
max_speakers: Optional[int] = Form(None, description="Maximum expected speakers"),
|
|
auth: AuthResult = Depends(verify_api_key),
|
|
):
|
|
"""
|
|
Transcribe audio using WhisperX (CUDA GPU).
|
|
|
|
Returns rich transcription with:
|
|
- Word-level timestamps (via forced alignment)
|
|
- Speaker diarization (via pyannote.audio)
|
|
- Memoro-compatible utterances with speaker IDs
|
|
|
|
Requires NVIDIA GPU with CUDA and USE_WHISPERX=true.
|
|
Diarization requires HF_TOKEN with pyannote model access.
|
|
|
|
Supported formats: mp3, wav, m4a, flac, ogg, webm, mp4
|
|
Max file size: 100MB
|
|
"""
|
|
if auth.rate_limit_remaining is not None:
|
|
response.headers["X-RateLimit-Remaining"] = str(auth.rate_limit_remaining)
|
|
|
|
if not USE_WHISPERX:
|
|
raise HTTPException(
|
|
status_code=503,
|
|
detail="WhisperX not enabled. Set USE_WHISPERX=true on a CUDA-capable server."
|
|
)
|
|
|
|
if not file.filename:
|
|
raise HTTPException(status_code=400, detail="No file provided")
|
|
|
|
allowed_extensions = {".mp3", ".wav", ".m4a", ".flac", ".ogg", ".webm", ".mp4"}
|
|
ext = os.path.splitext(file.filename)[1].lower()
|
|
if ext not in allowed_extensions:
|
|
raise HTTPException(
|
|
status_code=400,
|
|
detail=f"Unsupported file type: {ext}. Allowed: {allowed_extensions}"
|
|
)
|
|
|
|
start_time = time.time()
|
|
|
|
try:
|
|
from app.whisperx_service import transcribe_audio_bytes
|
|
|
|
audio_bytes = await file.read()
|
|
if len(audio_bytes) > 100 * 1024 * 1024:
|
|
raise HTTPException(status_code=400, detail="File too large (max 100MB)")
|
|
|
|
model_name = model or DEFAULT_WHISPER_MODEL
|
|
|
|
result = await transcribe_audio_bytes(
|
|
audio_bytes=audio_bytes,
|
|
filename=file.filename,
|
|
language=language,
|
|
model_name=model_name,
|
|
enable_diarization=diarization,
|
|
enable_alignment=alignment,
|
|
min_speakers=min_speakers,
|
|
max_speakers=max_speakers,
|
|
)
|
|
|
|
latency_ms = (time.time() - start_time) * 1000
|
|
|
|
return RichTranscriptionResponse(
|
|
text=result.text,
|
|
language=result.language,
|
|
model=f"whisperx-{model_name}",
|
|
latency_ms=latency_ms,
|
|
duration_seconds=result.duration_seconds,
|
|
segments=[
|
|
SegmentResponse(
|
|
start=s.start,
|
|
end=s.end,
|
|
text=s.text,
|
|
speaker=s.speaker,
|
|
words=[
|
|
WordTimestampResponse(
|
|
word=w.word,
|
|
start=w.start,
|
|
end=w.end,
|
|
score=w.score,
|
|
speaker=w.speaker,
|
|
)
|
|
for w in s.words
|
|
],
|
|
)
|
|
for s in result.segments
|
|
],
|
|
utterances=[
|
|
UtteranceResponse(
|
|
speaker=u.speaker,
|
|
text=u.text,
|
|
offset=u.offset,
|
|
duration=u.duration,
|
|
)
|
|
for u in result.utterances
|
|
],
|
|
speakers=result.speakers,
|
|
speaker_map={k: v for k, v in result.speaker_map.items()},
|
|
languages=result.languages,
|
|
primary_language=result.primary_language,
|
|
words=[
|
|
WordTimestampResponse(
|
|
word=w.word,
|
|
start=w.start,
|
|
end=w.end,
|
|
score=w.score,
|
|
speaker=w.speaker,
|
|
)
|
|
for w in result.words
|
|
],
|
|
)
|
|
|
|
except HTTPException:
|
|
raise
|
|
except ImportError:
|
|
raise HTTPException(
|
|
status_code=503,
|
|
detail="WhisperX not installed. Install with: pip install -r requirements-cuda.txt"
|
|
)
|
|
except Exception as e:
|
|
logger.error(f"WhisperX transcription error: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
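# Diarization request sketch (illustrative; form fields match the endpoint
# parameters above, the auth header scheme is an assumption):
#
#   r = httpx.post(
#       "http://localhost:3020/transcribe/whisperx",
#       headers={"Authorization": "Bearer <api-key>"},
#       files={"file": ("standup.wav", audio_bytes, "audio/wav")},
#       data={"diarization": "true", "min_speakers": "2", "max_speakers": "4"},
#   )
#   for u in r.json()["utterances"]:
#       print(f"speaker {u['speaker']}: {u['text']}")
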
@app.post("/transcribe/auto", response_model=TranscriptionResponse)
|
|
async def transcribe_auto(
|
|
response: Response,
|
|
file: UploadFile = File(..., description="Audio file to transcribe"),
|
|
language: Optional[str] = Form(None, description="Language hint"),
|
|
prefer: str = Form("whisper", description="Preferred: 'whisper', 'whisperx', or 'voxtral'"),
|
|
auth: AuthResult = Depends(verify_api_key),
|
|
):
|
|
"""
|
|
Transcribe with automatic model selection and fallback.
|
|
|
|
Fallback chain:
|
|
- whisper: Whisper → WhisperX → Voxtral → Mistral API
|
|
- whisperx: WhisperX → Whisper → Voxtral → Mistral API
|
|
- voxtral: Voxtral → WhisperX → Whisper → Mistral API
|
|
"""
|
|
# Add rate limit headers
|
|
if auth.rate_limit_remaining is not None:
|
|
response.headers["X-RateLimit-Remaining"] = str(auth.rate_limit_remaining)
|
|
|
|
async def try_whisperx_simple():
|
|
"""Try WhisperX and return as simple TranscriptionResponse."""
|
|
if not USE_WHISPERX:
|
|
raise RuntimeError("WhisperX not enabled")
|
|
from app.whisperx_service import transcribe_audio_bytes as wx_transcribe
|
|
audio_bytes = await file.read()
|
|
result = await wx_transcribe(
|
|
audio_bytes=audio_bytes,
|
|
filename=file.filename or "audio.wav",
|
|
language=language,
|
|
enable_diarization=False,
|
|
enable_alignment=False,
|
|
)
|
|
return TranscriptionResponse(
|
|
text=result.text,
|
|
language=result.language,
|
|
model=f"whisperx-{DEFAULT_WHISPER_MODEL}",
|
|
latency_ms=None,
|
|
duration_seconds=result.duration_seconds,
|
|
)
|
|
|
|
# Build fallback chain based on preference
|
|
if prefer == "whisperx":
|
|
chain = [
|
|
("WhisperX", try_whisperx_simple),
|
|
("Whisper", lambda: transcribe_whisper(response, file, language, None, auth)),
|
|
("Voxtral", lambda: transcribe_voxtral(response, file, language or "de", False, auth)),
|
|
("Mistral API", lambda: transcribe_voxtral_api(response, file, language, False, auth)),
|
|
]
|
|
elif prefer == "voxtral":
|
|
chain = [
|
|
("Voxtral", lambda: transcribe_voxtral(response, file, language or "de", False, auth)),
|
|
("WhisperX", try_whisperx_simple),
|
|
("Whisper", lambda: transcribe_whisper(response, file, language, None, auth)),
|
|
("Mistral API", lambda: transcribe_voxtral_api(response, file, language, False, auth)),
|
|
]
|
|
else:
|
|
chain = [
|
|
("Whisper", lambda: transcribe_whisper(response, file, language, None, auth)),
|
|
("WhisperX", try_whisperx_simple),
|
|
("Voxtral", lambda: transcribe_voxtral(response, file, language or "de", False, auth)),
|
|
("Mistral API", lambda: transcribe_voxtral_api(response, file, language, False, auth)),
|
|
]
|
|
|
|
last_error = None
|
|
for name, fn in chain:
|
|
try:
|
|
result = await fn()
|
|
return result
|
|
except Exception as e:
|
|
last_error = e
|
|
logger.warning(f"{name} failed: {e}")
|
|
await file.seek(0)
|
|
|
|
raise HTTPException(
|
|
status_code=503,
|
|
detail=f"All transcription backends failed. Last error: {last_error}"
|
|
)
|
|
|
|
|
|
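# Auto-fallback sketch (illustrative): prefer WhisperX but accept whichever
# backend succeeds first; the returned "model" field shows who handled it.
#
#   r = httpx.post(
#       "http://localhost:3020/transcribe/auto",
#       headers={"Authorization": "Bearer <api-key>"},
#       files={"file": ("note.ogg", audio_bytes, "audio/ogg")},
#       data={"prefer": "whisperx"},
#   )
#   print(r.json()["model"])
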
@app.exception_handler(Exception)
async def global_exception_handler(request, exc):
    logger.error(f"Unhandled error: {exc}")
    return JSONResponse(
        status_code=500,
        content={"detail": "Internal server error", "error": str(exc)},
    )

if __name__ == "__main__":
|
|
import uvicorn
|
|
uvicorn.run(
|
|
"app.main:app",
|
|
host="0.0.0.0",
|
|
port=PORT,
|
|
reload=False,
|
|
)
|
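
# Smoke test once the server is up (sketch; the port follows the PORT
# default above):
#
#   import httpx
#   print(httpx.get("http://localhost:3020/health").json())
#
# The payload reports whisper_loaded, whisperx_available, vllm_available,
# mistral_api_available, and auth_required.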