mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-14 18:01:09 +02:00
The Windows GPU server has been the actual production home for these
services for some time, and the running code there has drifted ahead of
the repo. This sync pulls the live versions back into the repo so the
Windows box is no longer the only place those changes exist.
Pulled from C:\mana\services\* on mana-server-gpu (192.168.178.11):
mana-llm:
- src/main.py, src/config.py — small fixes (auth wiring, config tweaks)
- src/api_auth.py — NEW (cross-service GPU_API_KEY validator)
- service.pyw — Windows runner used by the ManaLLM scheduled task
(sets up logging redirect, loads .env, calls uvicorn)
mana-stt:
- app/main.py — substantial cleanup (684→392 lines), drops the
whisperx-as-separate-backend branching now that whisper_service.py
rolls whisperx in directly
- app/whisper_service.py — full CUDA + whisperx rewrite (158→358 lines)
- app/auth.py + external_auth.py — significantly expanded auth
- app/vram_manager.py — NEW (shared VRAM accounting helper)
- service.pyw — Windows runner with CUDA pre-init, FFmpeg PATH
injection, .env loading
- removed: app/whisper_service_cuda.py (folded into whisper_service.py)
- removed: app/whisperx_service.py (folded into whisper_service.py)
mana-tts:
- app/auth.py, external_auth.py — same auth expansion as stt
- app/f5_service.py, kokoro_service.py — Windows tweaks
- app/vram_manager.py — NEW (same shared helper as stt)
- service.pyw — Windows runner
mana-video-gen:
- service.pyw — Windows runner (no other changes; the .py code on the
GPU box is byte-identical to what's already in the repo)
The service.pyw files contain absolute Windows paths
(C:\mana\services\<svc>) and a hardcoded FFmpeg PATH for the tills user
profile. Kept as-is intentionally — they exist to be deployed to that
one machine and any abstraction layer would just hide what's actually
happening. Anyone redeploying to a different layout will need to edit
the path strings, which is a known and obvious change.
Mac-Mini infrastructure for these services (launchd plists, install
scripts, scripts/mac-mini/setup-{stt,tts}.sh, the Mac-flux2c image-gen
implementation) is still on disk and will be removed in a follow-up
commit, along with replacing mana-image-gen with the Windows
diffusers+CUDA implementation. This commit is just the live-code sync.
392 lines
12 KiB
Python
392 lines
12 KiB
Python
"""
|
|
ManaCore STT API Service (WhisperX Edition)
|
|
Speech-to-Text with WhisperX: transcription, word timestamps, speaker diarization.
|
|
|
|
Run with: uvicorn app.main:app --host 0.0.0.0 --port 3020
|
|
"""
|
|
|
|
import os
|
|
import logging
|
|
import time
|
|
from typing import Optional
|
|
from contextlib import asynccontextmanager
|
|
|
|
from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Depends, Response
|
|
from fastapi.middleware.cors import CORSMiddleware
|
|
from fastapi.responses import JSONResponse
|
|
from pydantic import BaseModel
|
|
|
|
from app.auth import verify_api_key, AuthResult, get_api_key_stats, REQUIRE_AUTH
|
|
|
|
# Configure logging
# Root logger: INFO and above, each record stamped with time, logger name and level.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
# Module-level logger used by every handler in this file.
logger = logging.getLogger(__name__)
|
|
|
|
# Environment
# Every setting is read once, at import time.
PORT = int(os.getenv("PORT", "3020"))
DEFAULT_WHISPER_MODEL = os.getenv("WHISPER_MODEL", "large-v3")
# NOTE(review): PRELOAD_MODELS is parsed here but nothing else in this module
# reads it; the startup hook preloads WhisperX unconditionally.
PRELOAD_MODELS = os.getenv("PRELOAD_MODELS", "false").lower() == "true"
# Comma-separated CORS allow-list. Entries are stripped and empty entries are
# dropped, so values such as "https://a, https://b" or a trailing comma do not
# produce origins (" https://b", "") that can never match a request.
CORS_ORIGINS = [
    origin.strip()
    for origin in os.getenv(
        "CORS_ORIGINS",
        "https://mana.how,https://chat.mana.how,http://localhost:5173"
    ).split(",")
    if origin.strip()
]

# vLLM configuration
VLLM_URL = os.getenv("VLLM_URL", "http://localhost:8100")
USE_VLLM = os.getenv("USE_VLLM", "false").lower() == "true"
|
|
|
|
|
|
# Response models
|
|
class WordInfo(BaseModel):
    # One transcribed word as returned in TranscriptionResponse.words.
    # Documented with comments rather than a docstring so the generated
    # OpenAPI schema stays unchanged.
    word: str
    # Word boundaries; presumably seconds from the start of the audio — TODO confirm.
    start: float
    end: float
    # Optional per-word score copied from the transcription result.
    score: Optional[float] = None
    # Optional speaker label copied from the transcription result.
    speaker: Optional[str] = None
|
|
|
|
|
|
class SegmentInfo(BaseModel):
    # One transcript segment as returned in TranscriptionResponse.segments.
    # The /transcribe handler builds these with SegmentInfo(**segment_dict).
    # Segment boundaries; presumably seconds from the start of the audio — TODO confirm.
    start: float
    end: float
    # Text of this segment.
    text: str
    # Optional speaker label for the segment.
    speaker: Optional[str] = None
|
|
|
|
|
|
class TranscriptionResponse(BaseModel):
    # Response body shared by all /transcribe* endpoints.
    # Full transcript text.
    text: str
    # Language reported by the backend, if any.
    language: Optional[str] = None
    # Identifier of the model that produced the transcript
    # (the WhisperX endpoint reports "whisperx-<model name>").
    model: str
    # Handler- or backend-reported latency in milliseconds.
    latency_ms: Optional[float] = None
    # Audio duration as reported by the backend.
    duration_seconds: Optional[float] = None
    # The three fields below are only populated by the WhisperX endpoint.
    words: Optional[list[WordInfo]] = None
    segments: Optional[list[SegmentInfo]] = None
    speakers: Optional[list[str]] = None
|
|
|
|
|
|
class HealthResponse(BaseModel):
    # Response body of GET /health.
    status: str
    # True once a WhisperX model has been loaded (at startup or by a request).
    whisper_loaded: bool
    # Whether the WhisperX engine is in use.
    whisperx: bool
    # Result of the vLLM health probe.
    vllm_available: bool
    # vLLM base URL; None when vLLM is not enabled.
    vllm_url: Optional[str] = None
    # Whether the Mistral API fallback reports itself as available.
    mistral_api_available: bool
    # Mirrors REQUIRE_AUTH from app.auth.
    auth_required: bool
    # Free-form description of the default model, engine and features.
    models: dict
|
|
|
|
|
|
class ModelsResponse(BaseModel):
    # Response body of GET /models.
    # Whisper models exposed by app.whisper_service.AVAILABLE_MODELS.
    whisper: list
    # Models reported by app.vllm_service.get_models().
    voxtral_vllm: list
    # Model used when a request does not name one.
    default_whisper: str
|
|
|
|
|
|
# Track loaded models
# Process-wide flags, mutated in place by the startup hook and the
# /transcribe handler.
models_status = dict(whisper_loaded=False, vllm_available=False)
|
|
|
|
|
|
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Run startup work before serving and log a message on shutdown.

    Startup probes vLLM (only when USE_VLLM is set), logs whether the Mistral
    API fallback is configured, and loads the default WhisperX model. A failed
    preload is logged as a warning and does not prevent the app from starting.
    """
    logger.info("Starting ManaCore STT Service (WhisperX Edition)...")

    # vLLM is only probed when it has been enabled via the environment.
    if USE_VLLM:
        from app.vllm_service import check_health

        vllm_state = await check_health()
        models_status["vllm_available"] = vllm_state.get("status") == "healthy"

    # Report whether the Mistral API fallback is usable.
    from app.voxtral_api_service import is_available as mistral_configured

    if mistral_configured():
        logger.info("Mistral API fallback configured")

    # Load the default model now so the first request does not pay for it.
    logger.info("Preloading WhisperX model...")
    try:
        from app.whisper_service import get_whisper_model

        get_whisper_model(DEFAULT_WHISPER_MODEL)
    except Exception as preload_error:
        logger.warning(f"Failed to preload WhisperX: {preload_error}")
    else:
        models_status["whisper_loaded"] = True
        logger.info("WhisperX model preloaded successfully")

    logger.info(f"STT Service ready on port {PORT}")
    yield  # the application serves requests while suspended here
    logger.info("Shutting down STT Service...")
|
|
|
|
|
|
# Create FastAPI app
# The lifespan hook performs the startup probes and the model preload.
app = FastAPI(
    lifespan=lifespan,
    title="ManaCore STT Service",
    version="3.0.0",
    description="Speech-to-Text API with WhisperX (word timestamps + speaker diarization)",
)

# CORS middleware
# Only origins listed in CORS_ORIGINS are allowed; credentials, all methods
# and all headers are permitted for those origins.
app.add_middleware(
    CORSMiddleware,
    allow_origins=CORS_ORIGINS,
    allow_methods=["*"],
    allow_headers=["*"],
    allow_credentials=True,
)
|
|
|
|
|
|
@app.get("/health", response_model=HealthResponse)
|
|
async def health_check():
|
|
"""Health check endpoint."""
|
|
from app.voxtral_api_service import is_available as api_available
|
|
from app.vllm_service import check_health
|
|
|
|
vllm_health = await check_health()
|
|
|
|
return HealthResponse(
|
|
status="healthy",
|
|
whisper_loaded=models_status["whisper_loaded"],
|
|
whisperx=True,
|
|
vllm_available=vllm_health.get("status") == "healthy",
|
|
vllm_url=VLLM_URL if USE_VLLM else None,
|
|
mistral_api_available=api_available(),
|
|
auth_required=REQUIRE_AUTH,
|
|
models={
|
|
"default_whisper": DEFAULT_WHISPER_MODEL,
|
|
"engine": "whisperx",
|
|
"features": ["transcription", "word_timestamps", "speaker_diarization"],
|
|
},
|
|
)
|
|
|
|
|
|
@app.get("/models", response_model=ModelsResponse)
|
|
async def list_models(auth: AuthResult = Depends(verify_api_key)):
|
|
"""List available models."""
|
|
from app.whisper_service import AVAILABLE_MODELS as whisper_models
|
|
from app.vllm_service import get_models
|
|
|
|
vllm_models = await get_models()
|
|
|
|
return ModelsResponse(
|
|
whisper=whisper_models,
|
|
voxtral_vllm=vllm_models,
|
|
default_whisper=DEFAULT_WHISPER_MODEL,
|
|
)
|
|
|
|
|
|
@app.post("/transcribe", response_model=TranscriptionResponse)
|
|
async def transcribe_whisper(
|
|
response: Response,
|
|
file: UploadFile = File(..., description="Audio file to transcribe"),
|
|
language: Optional[str] = Form(None, description="Language code (auto-detect if not provided)"),
|
|
model: Optional[str] = Form(None, description="Whisper model to use"),
|
|
align: bool = Form(True, description="Enable word-level timestamp alignment"),
|
|
diarize: bool = Form(False, description="Enable speaker diarization"),
|
|
min_speakers: Optional[int] = Form(None, description="Min expected speakers (helps diarization)"),
|
|
max_speakers: Optional[int] = Form(None, description="Max expected speakers"),
|
|
auth: AuthResult = Depends(verify_api_key),
|
|
):
|
|
"""
|
|
Transcribe audio using WhisperX.
|
|
|
|
Features:
|
|
- Word-level timestamps (align=true, default)
|
|
- Speaker diarization (diarize=true, opt-in)
|
|
|
|
Supported formats: mp3, wav, m4a, flac, ogg, webm, mp4
|
|
Max file size: 100MB
|
|
"""
|
|
if auth.rate_limit_remaining is not None:
|
|
response.headers["X-RateLimit-Remaining"] = str(auth.rate_limit_remaining)
|
|
|
|
if not file.filename:
|
|
raise HTTPException(status_code=400, detail="No file provided")
|
|
|
|
allowed_extensions = {".mp3", ".wav", ".m4a", ".flac", ".ogg", ".webm", ".mp4"}
|
|
ext = os.path.splitext(file.filename)[1].lower()
|
|
if ext not in allowed_extensions:
|
|
raise HTTPException(
|
|
status_code=400,
|
|
detail=f"Unsupported file type: {ext}. Allowed: {allowed_extensions}"
|
|
)
|
|
|
|
start_time = time.time()
|
|
|
|
try:
|
|
from app.whisper_service import transcribe_audio_bytes
|
|
|
|
audio_bytes = await file.read()
|
|
if len(audio_bytes) > 100 * 1024 * 1024:
|
|
raise HTTPException(status_code=400, detail="File too large (max 100MB)")
|
|
|
|
model_name = model or DEFAULT_WHISPER_MODEL
|
|
|
|
result = await transcribe_audio_bytes(
|
|
audio_bytes=audio_bytes,
|
|
filename=file.filename,
|
|
language=language,
|
|
model_name=model_name,
|
|
align=align,
|
|
diarize=diarize,
|
|
min_speakers=min_speakers,
|
|
max_speakers=max_speakers,
|
|
)
|
|
|
|
models_status["whisper_loaded"] = True
|
|
latency_ms = (time.time() - start_time) * 1000
|
|
|
|
# Build response
|
|
resp = TranscriptionResponse(
|
|
text=result.text,
|
|
language=result.language,
|
|
model=f"whisperx-{model_name}",
|
|
latency_ms=latency_ms,
|
|
duration_seconds=result.duration,
|
|
)
|
|
|
|
# Add word timestamps if available
|
|
if result.words:
|
|
resp.words = [
|
|
WordInfo(
|
|
word=w.word,
|
|
start=w.start,
|
|
end=w.end,
|
|
score=w.score,
|
|
speaker=w.speaker,
|
|
)
|
|
for w in result.words
|
|
]
|
|
|
|
# Add segments
|
|
if result.segments:
|
|
resp.segments = [
|
|
SegmentInfo(**s) for s in result.segments
|
|
]
|
|
|
|
# Add speakers
|
|
if result.speakers:
|
|
resp.speakers = result.speakers
|
|
|
|
return resp
|
|
|
|
except Exception as e:
|
|
logger.error(f"WhisperX transcription error: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
@app.post("/transcribe/voxtral", response_model=TranscriptionResponse)
|
|
async def transcribe_voxtral(
|
|
response: Response,
|
|
file: UploadFile = File(..., description="Audio file to transcribe"),
|
|
language: str = Form("de", description="Language code"),
|
|
use_realtime: bool = Form(False, description="Use Realtime 4B model"),
|
|
auth: AuthResult = Depends(verify_api_key),
|
|
):
|
|
"""Transcribe audio using Voxtral via vLLM or Mistral API."""
|
|
if auth.rate_limit_remaining is not None:
|
|
response.headers["X-RateLimit-Remaining"] = str(auth.rate_limit_remaining)
|
|
|
|
if not file.filename:
|
|
raise HTTPException(status_code=400, detail="No file provided")
|
|
|
|
from app.vllm_service import (
|
|
SUPPORTED_LANGUAGES,
|
|
is_available as vllm_available,
|
|
transcribe_audio_bytes as vllm_transcribe,
|
|
transcribe_with_realtime,
|
|
check_health,
|
|
)
|
|
from app.voxtral_api_service import (
|
|
is_available as api_available,
|
|
transcribe_audio_bytes as api_transcribe,
|
|
)
|
|
|
|
if language not in SUPPORTED_LANGUAGES:
|
|
raise HTTPException(
|
|
status_code=400,
|
|
detail=f"Unsupported language: {language}. Supported: {SUPPORTED_LANGUAGES}"
|
|
)
|
|
|
|
try:
|
|
audio_bytes = await file.read()
|
|
if len(audio_bytes) > 100 * 1024 * 1024:
|
|
raise HTTPException(status_code=400, detail="File too large (max 100MB)")
|
|
|
|
# Try vLLM first
|
|
if USE_VLLM:
|
|
health = await check_health()
|
|
if health.get("status") == "healthy":
|
|
if use_realtime:
|
|
result = await transcribe_with_realtime(
|
|
audio_bytes=audio_bytes, filename=file.filename, language=language,
|
|
)
|
|
else:
|
|
result = await vllm_transcribe(
|
|
audio_bytes=audio_bytes, filename=file.filename, language=language,
|
|
)
|
|
return TranscriptionResponse(
|
|
text=result.text, language=result.language, model=result.model,
|
|
latency_ms=result.latency_ms, duration_seconds=result.duration_seconds,
|
|
)
|
|
|
|
# Fallback to Mistral API
|
|
if api_available():
|
|
result = await api_transcribe(
|
|
audio_bytes=audio_bytes, filename=file.filename, language=language,
|
|
)
|
|
return TranscriptionResponse(
|
|
text=result.text, language=result.language, model=result.model,
|
|
duration_seconds=result.duration_seconds,
|
|
)
|
|
|
|
raise HTTPException(status_code=503, detail="Voxtral not available.")
|
|
|
|
except HTTPException:
|
|
raise
|
|
except Exception as e:
|
|
logger.error(f"Voxtral transcription error: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
@app.post("/transcribe/auto", response_model=TranscriptionResponse)
|
|
async def transcribe_auto(
|
|
response: Response,
|
|
file: UploadFile = File(..., description="Audio file to transcribe"),
|
|
language: Optional[str] = Form(None, description="Language hint"),
|
|
prefer: str = Form("whisper", description="Preferred: 'whisper' or 'voxtral'"),
|
|
auth: AuthResult = Depends(verify_api_key),
|
|
):
|
|
"""Auto-select best model with fallback chain."""
|
|
if auth.rate_limit_remaining is not None:
|
|
response.headers["X-RateLimit-Remaining"] = str(auth.rate_limit_remaining)
|
|
|
|
if prefer == "voxtral":
|
|
try:
|
|
return await transcribe_voxtral(response, file, language or "de", False, auth)
|
|
except Exception:
|
|
await file.seek(0)
|
|
return await transcribe_whisper(response, file, language, None, True, False, None, None, auth)
|
|
else:
|
|
try:
|
|
return await transcribe_whisper(response, file, language, None, True, False, None, None, auth)
|
|
except Exception:
|
|
await file.seek(0)
|
|
return await transcribe_voxtral(response, file, language or "de", False, auth)
|
|
|
|
|
|
@app.exception_handler(Exception)
async def global_exception_handler(request, exc):
    """Last-resort handler: log the unhandled exception and answer with a JSON 500.

    Args:
        request: The incoming request (unused).
        exc: The exception that no other handler dealt with.

    Returns:
        A JSONResponse with status 500 and "detail" / "error" keys.
    """
    # exc_info attaches the traceback; the message alone rarely locates the fault.
    logger.error("Unhandled error: %s", exc, exc_info=exc)
    # NOTE(review): str(exc) is sent to the client and may expose internal
    # details. Kept because existing clients may read the "error" field —
    # confirm before removing it.
    return JSONResponse(
        status_code=500,
        content={"detail": "Internal server error", "error": str(exc)},
    )
|
|
|
|
|
|
if __name__ == "__main__":
|
|
import uvicorn
|
|
uvicorn.run("app.main:app", host="0.0.0.0", port=PORT, reload=False)
|