update(infra): mana-stt WhisperX + diarization, mana-notify templates, CD pipeline updates

mana-stt: add WhisperX service with CUDA GPU support, speaker diarization, and auto-fallback chain.
mana-notify: add locale fallback and default templates for task reminders.
CD: add memoro-server, memoro-audio-server, and memoro-web to the deployment pipeline; docker-compose: wire the memoro services to mana-llm, mana-stt, and GlitchTip.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Till JS 2026-04-01 14:56:26 +02:00
parent a03de84e79
commit da3a140f21
9 changed files with 1280 additions and 463 deletions


@@ -47,6 +47,9 @@ on:
- mukke-web
- storage-backend
- storage-web
- memoro-server
- memoro-audio-server
- memoro-web
concurrency:
group: cd-macmini
@@ -92,6 +95,9 @@ jobs:
mukke-web: ${{ steps.changes.outputs.mukke-web }}
storage-backend: ${{ steps.changes.outputs.storage-backend }}
storage-web: ${{ steps.changes.outputs.storage-web }}
memoro-server: ${{ steps.changes.outputs.memoro-server }}
memoro-audio-server: ${{ steps.changes.outputs.memoro-audio-server }}
memoro-web: ${{ steps.changes.outputs.memoro-web }}
mana-matrix-bot: ${{ steps.changes.outputs.mana-matrix-bot }}
any-changes: ${{ steps.changes.outputs.any-changes }}
steps:
@@ -156,12 +162,15 @@ jobs:
check_changes "mukke-web" "apps/mukke/apps/web/" "apps/mukke/packages/"
check_changes "storage-backend" "apps/storage/apps/backend/" "apps/storage/packages/"
check_changes "storage-web" "apps/storage/apps/web/" "apps/storage/packages/"
check_changes "memoro-server" "apps/memoro/apps/server/" "apps/memoro/packages/"
check_changes "memoro-audio-server" "apps/memoro/apps/audio-server/"
check_changes "memoro-web" "apps/memoro/apps/web/" "apps/memoro/packages/"
check_changes "mana-matrix-bot" "services/mana-matrix-bot/"
check_changes "mana-landing-builder" "services/mana-landing-builder/" "packages/shared-types/" "packages/shared-landing-ui/"
# Check if anything needs deploying
ANY="false"
for svc in matrix-web mana-auth mana-sync mana-media mana-notify mana-api-gateway mana-crawler mana-credits mana-search chat-backend chat-web todo-backend todo-web calendar-backend calendar-web clock-web contacts-backend contacts-web mukke-backend mukke-web storage-backend storage-web mana-matrix-bot mana-landing-builder; do
for svc in matrix-web mana-auth mana-sync mana-media mana-notify mana-api-gateway mana-crawler mana-credits mana-search chat-backend chat-web todo-backend todo-web calendar-backend calendar-web clock-web contacts-backend contacts-web mukke-backend mukke-web storage-backend storage-web memoro-server memoro-audio-server memoro-web mana-matrix-bot mana-landing-builder; do
val=$(grep "^$svc=" $GITHUB_OUTPUT | tail -1 | cut -d= -f2)
if [ "$val" == "true" ]; then
ANY="true"
@@ -245,6 +254,9 @@ jobs:
if [ "${{ needs.detect-changes.outputs.mukke-web }}" == "true" ]; then SERVICES="$SERVICES mukke-web"; fi
if [ "${{ needs.detect-changes.outputs.storage-backend }}" == "true" ]; then SERVICES="$SERVICES storage-backend"; fi
if [ "${{ needs.detect-changes.outputs.storage-web }}" == "true" ]; then SERVICES="$SERVICES storage-web"; fi
if [ "${{ needs.detect-changes.outputs.memoro-server }}" == "true" ]; then SERVICES="$SERVICES memoro-server"; fi
if [ "${{ needs.detect-changes.outputs.memoro-audio-server }}" == "true" ]; then SERVICES="$SERVICES memoro-audio-server"; fi
if [ "${{ needs.detect-changes.outputs.memoro-web }}" == "true" ]; then SERVICES="$SERVICES memoro-web"; fi
if [ "${{ needs.detect-changes.outputs.mana-matrix-bot }}" == "true" ]; then SERVICES="$SERVICES mana-matrix-bot"; fi
if [ "${{ needs.detect-changes.outputs.mana-landing-builder }}" == "true" ]; then SERVICES="$SERVICES mana-landing-builder"; fi
fi
@@ -347,6 +359,9 @@ jobs:
mukke-web) echo "http://localhost:5180/health" ;;
storage-backend) echo "http://localhost:3035/api/v1/health" ;;
storage-web) echo "http://localhost:5015/health" ;;
memoro-server) echo "http://localhost:3015/health" ;;
memoro-audio-server) echo "http://localhost:3016/health" ;;
memoro-web) echo "http://localhost:5038/health" ;;
*) echo "" ;;
esac
}
@@ -356,7 +371,7 @@ jobs:
SERVICES="${{ steps.services.outputs.services }}"
if [ "$DEPLOY_ALL" == "true" ]; then
SERVICES="mana-auth matrix-web chat-backend chat-web todo-backend todo-web calendar-backend calendar-web clock-web contacts-backend contacts-web mukke-backend mukke-web storage-backend storage-web"
SERVICES="mana-auth matrix-web chat-backend chat-web todo-backend todo-web calendar-backend calendar-web clock-web contacts-backend contacts-web mukke-backend mukke-web storage-backend storage-web memoro-server memoro-audio-server memoro-web"
fi
HEALTH_RESULTS=""
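For orientation, a sketch of the health verification the pipeline runs against the newly added memoro endpoints. The URLs come from the case block above; the retry count and delay are invented, and the real step is implemented in bash inside the workflow, not in Python.

import time
import httpx

MEMORO_HEALTH = {
    "memoro-server": "http://localhost:3015/health",
    "memoro-audio-server": "http://localhost:3016/health",
    "memoro-web": "http://localhost:5038/health",
}

def wait_healthy(url: str, attempts: int = 10, delay: float = 5.0) -> bool:
    # Poll until the endpoint answers 200 or the attempts are exhausted.
    for _ in range(attempts):
        try:
            if httpx.get(url, timeout=5.0).status_code == 200:
                return True
        except httpx.HTTPError:
            pass
        time.sleep(delay)
    return False

for svc, url in MEMORO_HEALTH.items():
    print(svc, "healthy" if wait_healthy(url) else "UNHEALTHY")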


@@ -1480,6 +1480,8 @@ services:
MEMORO_SUPABASE_SERVICE_KEY: ${MEMORO_SUPABASE_SERVICE_KEY}
SERVICE_KEY: ${MEMORO_SERVICE_KEY}
AUDIO_SERVER_URL: http://memoro-audio-server:3016
MANA_LLM_URL: http://mana-llm:3025
MANA_LLM_MODEL: ${MANA_LLM_MODEL:-ollama/gemma3:12b}
GEMINI_API_KEY: ${GEMINI_API_KEY}
AZURE_OPENAI_KEY: ${AZURE_OPENAI_KEY}
AZURE_OPENAI_ENDPOINT: ${AZURE_OPENAI_ENDPOINT}
@@ -1519,6 +1521,8 @@ services:
AZURE_STORAGE_ACCOUNT_NAME: ${AZURE_STORAGE_ACCOUNT_NAME}
AZURE_STORAGE_ACCOUNT_KEY: ${AZURE_STORAGE_ACCOUNT_KEY}
AZURE_STORAGE_CONTAINER: ${AZURE_STORAGE_CONTAINER:-memoro-batch-audio}
MANA_STT_URL: http://host.docker.internal:3020
MANA_STT_API_KEY: ${MANA_STT_API_KEY:-}
ports:
- "3016:3016"
healthcheck:
@@ -1558,6 +1562,7 @@ services:
PUBLIC_SUPABASE_URL: ${MEMORO_SUPABASE_URL}
PUBLIC_SUPABASE_ANON_KEY: ${MEMORO_SUPABASE_ANON_KEY}
PUBLIC_MANA_SYNC_URL: ws://mana-sync:3050
PUBLIC_GLITCHTIP_DSN: ${MEMORO_GLITCHTIP_DSN:-}
ports:
- "5038:5038"
healthcheck:


@@ -215,6 +215,7 @@
"docker:clean": "docker compose -f docker-compose.dev.yml --env-file .env.development --profile all down -v",
"deploy:landing:uload": "pnpm --filter @uload/landing build && npx wrangler pages deploy apps/uload/apps/landing/dist --project-name=uload-landing",
"deploy:landing:todo": "pnpm --filter @todo/landing build && npx wrangler pages deploy apps/todo/apps/landing/dist --project-name=todo-landing",
"deploy:landing:contacts": "pnpm --filter @contacts/landing build && npx wrangler pages deploy apps/contacts/apps/landing/dist --project-name=contacts-landing",
"deploy:landing:calendar": "pnpm --filter @calendar/landing build && npx wrangler pages deploy apps/calendar/apps/landing/dist --project-name=calendars-landing",
"deploy:landing:chat": "pnpm --filter @chat/landing build && npx wrangler pages deploy apps/chat/apps/landing/dist --project-name=chat-landing",
"deploy:landing:picture": "pnpm --filter @picture/landing build && npx wrangler pages deploy apps/picture/apps/landing/dist --project-name=picture-landing",
@@ -226,7 +227,7 @@
"deploy:landing:mail": "pnpm --filter @mail/landing build && npx wrangler pages deploy apps/mail/apps/landing/dist --project-name=mail-landing",
"deploy:landing:moodlit": "pnpm --filter @moodlit/landing build && npx wrangler pages deploy apps/moodlit/apps/landing/dist --project-name=moodlit-landing",
"deploy:landing:it": "pnpm --filter @mana/it-landing build && npx wrangler pages deploy services/it-landing/dist --project-name=it-landing",
"deploy:landing:all": "pnpm deploy:landing:calendar && pnpm deploy:landing:chat && pnpm deploy:landing:picture && pnpm deploy:landing:manacore && pnpm deploy:landing:cards && pnpm deploy:landing:zitare && pnpm deploy:landing:presi && pnpm deploy:landing:clock && pnpm deploy:landing:mail && pnpm deploy:landing:nutriphi",
"deploy:landing:all": "pnpm deploy:landing:calendar && pnpm deploy:landing:chat && pnpm deploy:landing:picture && pnpm deploy:landing:manacore && pnpm deploy:landing:cards && pnpm deploy:landing:zitare && pnpm deploy:landing:presi && pnpm deploy:landing:clock && pnpm deploy:landing:mail && pnpm deploy:landing:nutriphi && pnpm deploy:landing:contacts && pnpm deploy:landing:todo",
"dev:docs": "pnpm --filter @manacore/docs dev",
"build:docs": "pnpm --filter @manacore/docs build",
"deploy:docs": "pnpm --filter @manacore/docs build && npx wrangler pages deploy apps/docs/dist --project-name=manacore-docs",

pnpm-lock.yaml (generated, 954 changes): file diff suppressed because it is too large.


@@ -120,6 +120,13 @@ func (e *Engine) SeedDefaults(ctx context.Context) {
body: `<!DOCTYPE html><html><body><h1>{{.eventTitle}}</h1><p>Wann: {{.eventTime}}</p>{{if .eventLocation}}<p>Wo: {{.eventLocation}}</p>{{end}}<p><a href="{{.eventUrl}}">Termin anzeigen</a></p></body></html>`,
vars: `{"eventTitle": "Titel", "eventTime": "Zeit", "eventLocation": "Ort (optional)", "eventUrl": "Link"}`,
},
{
slug: "task-reminder",
channel: "email",
subject: "Erinnerung: {{.taskTitle}}",
body: `<!DOCTYPE html><html><body><h1>{{.taskTitle}}</h1>{{if .dueDate}}<p>Fällig: {{.dueDate}}</p>{{end}}<p><a href="{{.taskUrl}}">Aufgabe anzeigen</a></p></body></html>`,
vars: `{"taskTitle": "Aufgabentitel", "dueDate": "Fälligkeitsdatum (optional)", "taskUrl": "Link zur Aufgabe"}`,
},
}
for _, d := range defaults {
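Illustration only: the variables a caller would supply for the new task-reminder template and what the subject renders to; the task data and URL are hypothetical, the keys match the vars JSON above.

# Hypothetical payload for the "task-reminder" email template.
payload = {
    "taskTitle": "Steuererklärung einreichen",
    "dueDate": "2026-04-15",
    "taskUrl": "https://todo.example.com/tasks/123",
}
# Subject renders to: "Erinnerung: Steuererklärung einreichen"
# The body shows the due date only because dueDate is set ({{if .dueDate}}).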


@@ -11,6 +11,24 @@ WHISPER_MODEL=large-v3
# Options: voxtral-mini-3b, voxtral-realtime-4b, voxtral-small-24b
VOXTRAL_MODEL=voxtral-realtime-4b
# WhisperX (CUDA GPU Server)
# Enable WhisperX for rich transcription (diarization, word alignment)
# Requires NVIDIA GPU + requirements-cuda.txt
USE_WHISPERX=false
# WhisperX batch size (higher = faster but more VRAM, 16 works well for RTX 3090)
WHISPERX_BATCH_SIZE=16
# Device and compute type for CUDA
# WHISPER_DEVICE=cuda
# WHISPER_COMPUTE_TYPE=float16
# HuggingFace token for pyannote speaker diarization models
# Required for diarization. Accept terms at:
# https://huggingface.co/pyannote/speaker-diarization-3.1
# https://huggingface.co/pyannote/segmentation-3.0
HF_TOKEN=
# Model Loading
# Set to true to preload models on startup (slower startup, faster first request)
PRELOAD_MODELS=false


@@ -1,6 +1,6 @@
"""
ManaCore STT API Service
Speech-to-Text with Whisper (MLX), Voxtral (vLLM), and Mistral API (fallback)
Speech-to-Text with Whisper (MLX), WhisperX (CUDA), Voxtral (vLLM), and Mistral API (fallback)
Run with: uvicorn app.main:app --host 0.0.0.0 --port 3020
"""
@@ -38,6 +38,9 @@ CORS_ORIGINS = os.getenv(
VLLM_URL = os.getenv("VLLM_URL", "http://localhost:8100")
USE_VLLM = os.getenv("USE_VLLM", "false").lower() == "true"
# WhisperX configuration (CUDA GPU server)
USE_WHISPERX = os.getenv("USE_WHISPERX", "false").lower() == "true"
# Response models
class TranscriptionResponse(BaseModel):
@@ -48,9 +51,49 @@ class TranscriptionResponse(BaseModel):
duration_seconds: Optional[float] = None
class WordTimestampResponse(BaseModel):
word: str
start: float
end: float
score: float = 0.0
speaker: Optional[str] = None
class SegmentResponse(BaseModel):
start: float
end: float
text: str
speaker: Optional[str] = None
words: list[WordTimestampResponse] = []
class UtteranceResponse(BaseModel):
speaker: int
text: str
offset: int # milliseconds
duration: int # milliseconds
class RichTranscriptionResponse(BaseModel):
"""Extended response with segments, utterances, and speaker diarization."""
text: str
language: Optional[str] = None
model: str
latency_ms: Optional[float] = None
duration_seconds: Optional[float] = None
segments: list[SegmentResponse] = []
utterances: list[UtteranceResponse] = []
speakers: dict[str, str] = {}
speaker_map: dict[str, int] = {}
languages: list[str] = []
primary_language: Optional[str] = None
words: list[WordTimestampResponse] = []
class HealthResponse(BaseModel):
status: str
whisper_loaded: bool
whisperx_available: bool
vllm_available: bool
vllm_url: Optional[str] = None
mistral_api_available: bool
@@ -60,6 +103,7 @@ class HealthResponse(BaseModel):
class ModelsResponse(BaseModel):
whisper: list
whisperx: list
voxtral_vllm: list
default_whisper: str
@@ -67,6 +111,7 @@ class ModelsResponse(BaseModel):
# Track loaded models
models_status = {
"whisper_loaded": False,
"whisperx_available": False,
"vllm_available": False,
}
@@ -86,6 +131,18 @@ async def lifespan(app: FastAPI):
else:
logger.warning(f"vLLM server not available: {health}")
# Check WhisperX availability
if USE_WHISPERX:
try:
from app.whisperx_service import is_available as whisperx_available
models_status["whisperx_available"] = whisperx_available()
if models_status["whisperx_available"]:
logger.info("WhisperX (CUDA) available")
else:
logger.warning("WhisperX not available (whisperx package not installed)")
except Exception as e:
logger.warning(f"WhisperX check failed: {e}")
# Check Mistral API
from app.voxtral_api_service import is_available as api_available
if api_available():
@@ -136,6 +193,7 @@ async def health_check():
return HealthResponse(
status="healthy",
whisper_loaded=models_status["whisper_loaded"],
whisperx_available=models_status["whisperx_available"],
vllm_available=vllm_health.get("status") == "healthy",
vllm_url=VLLM_URL if USE_VLLM else None,
mistral_api_available=api_available(),
@@ -154,8 +212,17 @@ async def list_models(auth: AuthResult = Depends(verify_api_key)):
vllm_models = await get_models()
whisperx_models = []
if USE_WHISPERX:
try:
from app.whisperx_service import AVAILABLE_MODELS as wx_models
whisperx_models = wx_models
except ImportError:
pass
return ModelsResponse(
whisper=whisper_models,
whisperx=whisperx_models,
voxtral_vllm=vllm_models,
default_whisper=DEFAULT_WHISPER_MODEL,
)
@@ -386,50 +453,216 @@ async def transcribe_voxtral_api(
raise HTTPException(status_code=500, detail=str(e))
@app.post("/transcribe/whisperx", response_model=RichTranscriptionResponse)
async def transcribe_whisperx(
response: Response,
file: UploadFile = File(..., description="Audio file to transcribe"),
language: Optional[str] = Form(None, description="Language code (auto-detect if not provided)"),
model: Optional[str] = Form(None, description="Whisper model to use"),
diarization: bool = Form(True, description="Enable speaker diarization"),
alignment: bool = Form(True, description="Enable word-level alignment"),
min_speakers: Optional[int] = Form(None, description="Minimum expected speakers"),
max_speakers: Optional[int] = Form(None, description="Maximum expected speakers"),
auth: AuthResult = Depends(verify_api_key),
):
"""
Transcribe audio using WhisperX (CUDA GPU).
Returns rich transcription with:
- Word-level timestamps (via forced alignment)
- Speaker diarization (via pyannote.audio)
- Memoro-compatible utterances with speaker IDs
Requires NVIDIA GPU with CUDA and USE_WHISPERX=true.
Diarization requires HF_TOKEN with pyannote model access.
Supported formats: mp3, wav, m4a, flac, ogg, webm, mp4
Max file size: 100MB
"""
if auth.rate_limit_remaining is not None:
response.headers["X-RateLimit-Remaining"] = str(auth.rate_limit_remaining)
if not USE_WHISPERX:
raise HTTPException(
status_code=503,
detail="WhisperX not enabled. Set USE_WHISPERX=true on a CUDA-capable server."
)
if not file.filename:
raise HTTPException(status_code=400, detail="No file provided")
allowed_extensions = {".mp3", ".wav", ".m4a", ".flac", ".ogg", ".webm", ".mp4"}
ext = os.path.splitext(file.filename)[1].lower()
if ext not in allowed_extensions:
raise HTTPException(
status_code=400,
detail=f"Unsupported file type: {ext}. Allowed: {allowed_extensions}"
)
start_time = time.time()
try:
from app.whisperx_service import transcribe_audio_bytes
audio_bytes = await file.read()
if len(audio_bytes) > 100 * 1024 * 1024:
raise HTTPException(status_code=400, detail="File too large (max 100MB)")
model_name = model or DEFAULT_WHISPER_MODEL
result = await transcribe_audio_bytes(
audio_bytes=audio_bytes,
filename=file.filename,
language=language,
model_name=model_name,
enable_diarization=diarization,
enable_alignment=alignment,
min_speakers=min_speakers,
max_speakers=max_speakers,
)
latency_ms = (time.time() - start_time) * 1000
return RichTranscriptionResponse(
text=result.text,
language=result.language,
model=f"whisperx-{model_name}",
latency_ms=latency_ms,
duration_seconds=result.duration_seconds,
segments=[
SegmentResponse(
start=s.start,
end=s.end,
text=s.text,
speaker=s.speaker,
words=[
WordTimestampResponse(
word=w.word,
start=w.start,
end=w.end,
score=w.score,
speaker=w.speaker,
)
for w in s.words
],
)
for s in result.segments
],
utterances=[
UtteranceResponse(
speaker=u.speaker,
text=u.text,
offset=u.offset,
duration=u.duration,
)
for u in result.utterances
],
speakers=result.speakers,
speaker_map={k: v for k, v in result.speaker_map.items()},
languages=result.languages,
primary_language=result.primary_language,
words=[
WordTimestampResponse(
word=w.word,
start=w.start,
end=w.end,
score=w.score,
speaker=w.speaker,
)
for w in result.words
],
)
except HTTPException:
raise
except ImportError:
raise HTTPException(
status_code=503,
detail="WhisperX not installed. Install with: pip install -r requirements-cuda.txt"
)
except Exception as e:
logger.error(f"WhisperX transcription error: {e}")
raise HTTPException(status_code=500, detail=str(e))
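A client-side sketch for the new /transcribe/whisperx endpoint, assuming the service listens on port 3020 as documented in the module docstring; the API-key header name is an assumption, since the real name depends on verify_api_key and is not shown in this diff.

# Hedged example: upload a recording and read the rich diarized response.
import httpx

def transcribe_meeting(path: str) -> dict:
    with open(path, "rb") as f:
        resp = httpx.post(
            "http://localhost:3020/transcribe/whisperx",
            files={"file": (path, f, "audio/mpeg")},
            data={"language": "de", "diarization": "true", "min_speakers": "2"},
            headers={"X-API-Key": "..."},  # assumption: header name not shown here
            timeout=600.0,
        )
    resp.raise_for_status()
    return resp.json()  # text, segments, utterances, speakers, speaker_map, words

result = transcribe_meeting("standup.mp3")
for u in result["utterances"]:
    print(f"speaker {u['speaker']} @ {u['offset']}ms: {u['text']}")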
@app.post("/transcribe/auto", response_model=TranscriptionResponse)
async def transcribe_auto(
response: Response,
file: UploadFile = File(..., description="Audio file to transcribe"),
language: Optional[str] = Form(None, description="Language hint"),
prefer: str = Form("whisper", description="Preferred: 'whisper' or 'voxtral'"),
prefer: str = Form("whisper", description="Preferred: 'whisper', 'whisperx', or 'voxtral'"),
auth: AuthResult = Depends(verify_api_key),
):
"""
Transcribe with automatic model selection and fallback.
Fallback chain:
1. Preferred model (whisper or voxtral)
2. Alternative model
3. Mistral API
- whisper: Whisper → WhisperX → Voxtral → Mistral API
- whisperx: WhisperX → Whisper → Voxtral → Mistral API
- voxtral: Voxtral → WhisperX → Whisper → Mistral API
"""
# Add rate limit headers
if auth.rate_limit_remaining is not None:
response.headers["X-RateLimit-Remaining"] = str(auth.rate_limit_remaining)
if prefer == "voxtral":
try:
return await transcribe_voxtral(file, language or "de", False)
except Exception as e:
logger.warning(f"Voxtral failed, trying Whisper: {e}")
await file.seek(0)
try:
return await transcribe_whisper(file, language, None)
except Exception as e2:
logger.warning(f"Whisper failed, trying API: {e2}")
await file.seek(0)
return await transcribe_voxtral_api(file, language, False)
async def try_whisperx_simple():
"""Try WhisperX and return as simple TranscriptionResponse."""
if not USE_WHISPERX:
raise RuntimeError("WhisperX not enabled")
from app.whisperx_service import transcribe_audio_bytes as wx_transcribe
audio_bytes = await file.read()
result = await wx_transcribe(
audio_bytes=audio_bytes,
filename=file.filename or "audio.wav",
language=language,
enable_diarization=False,
enable_alignment=False,
)
return TranscriptionResponse(
text=result.text,
language=result.language,
model=f"whisperx-{DEFAULT_WHISPER_MODEL}",
latency_ms=None,
duration_seconds=result.duration_seconds,
)
# Build fallback chain based on preference
if prefer == "whisperx":
chain = [
("WhisperX", try_whisperx_simple),
("Whisper", lambda: transcribe_whisper(response, file, language, None, auth)),
("Voxtral", lambda: transcribe_voxtral(response, file, language or "de", False, auth)),
("Mistral API", lambda: transcribe_voxtral_api(response, file, language, False, auth)),
]
elif prefer == "voxtral":
chain = [
("Voxtral", lambda: transcribe_voxtral(response, file, language or "de", False, auth)),
("WhisperX", try_whisperx_simple),
("Whisper", lambda: transcribe_whisper(response, file, language, None, auth)),
("Mistral API", lambda: transcribe_voxtral_api(response, file, language, False, auth)),
]
else:
chain = [
("Whisper", lambda: transcribe_whisper(response, file, language, None, auth)),
("WhisperX", try_whisperx_simple),
("Voxtral", lambda: transcribe_voxtral(response, file, language or "de", False, auth)),
("Mistral API", lambda: transcribe_voxtral_api(response, file, language, False, auth)),
]
last_error = None
for name, fn in chain:
try:
return await transcribe_whisper(file, language, None)
result = await fn()
return result
except Exception as e:
logger.warning(f"Whisper failed, trying Voxtral: {e}")
last_error = e
logger.warning(f"{name} failed: {e}")
await file.seek(0)
try:
return await transcribe_voxtral(file, language or "de", False)
except Exception as e2:
logger.warning(f"Voxtral failed, trying API: {e2}")
await file.seek(0)
return await transcribe_voxtral_api(file, language, False)
raise HTTPException(
status_code=503,
detail=f"All transcription backends failed. Last error: {last_error}"
)
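And a short sketch of the auto endpoint with the new prefer option; the returned model field reveals which backend in the chain actually served the request. Same assumptions as the example above.

import httpx

with open("note.m4a", "rb") as f:
    resp = httpx.post(
        "http://localhost:3020/transcribe/auto",
        files={"file": ("note.m4a", f, "audio/mp4")},
        data={"prefer": "whisperx", "language": "de"},
        headers={"X-API-Key": "..."},  # assumption, see above
        timeout=300.0,
    )
resp.raise_for_status()
print(resp.json()["model"])  # e.g. "whisperx-large-v3", or a fallback model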
@app.exception_handler(Exception)


@@ -0,0 +1,419 @@
"""
WhisperX STT Service using faster-whisper + pyannote (CUDA)
Optimized for NVIDIA GPUs (RTX 3090 etc.)
Features:
- Word-level timestamps via forced alignment
- Speaker diarization via pyannote.audio
- Segment-level timestamps with speaker labels
- VAD filtering for silence removal
Requires HuggingFace token for pyannote models:
export HF_TOKEN=hf_xxx
# Accept terms at: https://huggingface.co/pyannote/speaker-diarization-3.1
"""
import os
import tempfile
import logging
import time
from pathlib import Path
from typing import Optional
from dataclasses import dataclass, field
logger = logging.getLogger(__name__)
# Lazy-loaded singletons
_whisper_model = None
_align_model = None
_align_metadata = None
_diarize_pipeline = None
HF_TOKEN = os.getenv("HF_TOKEN")
DEFAULT_MODEL = os.getenv("WHISPER_MODEL", "large-v3")
DEVICE = os.getenv("WHISPER_DEVICE", "cuda")
COMPUTE_TYPE = os.getenv("WHISPER_COMPUTE_TYPE", "float16")
BATCH_SIZE = int(os.getenv("WHISPERX_BATCH_SIZE", "16"))
@dataclass
class WordInfo:
"""Word with timestamp."""
word: str
start: float
end: float
score: float = 0.0
speaker: Optional[str] = None
@dataclass
class SegmentInfo:
"""Segment with speaker and word-level detail."""
start: float
end: float
text: str
speaker: Optional[str] = None
words: list[WordInfo] = field(default_factory=list)
@dataclass
class Utterance:
"""Speaker utterance in Memoro-compatible format."""
speaker: int
text: str
offset: int # milliseconds
duration: int # milliseconds
@dataclass
class WhisperXResult:
"""Rich transcription result with alignment and diarization."""
text: str
language: Optional[str] = None
duration_seconds: Optional[float] = None
segments: list[SegmentInfo] = field(default_factory=list)
utterances: list[Utterance] = field(default_factory=list)
speakers: dict[str, str] = field(default_factory=dict)
speaker_map: dict[str, int] = field(default_factory=dict)
languages: list[str] = field(default_factory=list)
primary_language: Optional[str] = None
words: list[WordInfo] = field(default_factory=list)
def get_whisper_model(model_name: str = None):
"""Load faster-whisper model (singleton)."""
global _whisper_model
model_name = model_name or DEFAULT_MODEL
if _whisper_model is None:
logger.info(f"Loading WhisperX model: {model_name} on {DEVICE} ({COMPUTE_TYPE})")
try:
import whisperx
_whisper_model = whisperx.load_model(
model_name,
device=DEVICE,
compute_type=COMPUTE_TYPE,
)
logger.info(f"WhisperX model loaded: {model_name}")
except ImportError:
raise RuntimeError(
"whisperx not installed. "
"Run: pip install whisperx"
)
return _whisper_model
def get_align_model(language_code: str):
"""Load alignment model for a specific language (cached per language)."""
global _align_model, _align_metadata
try:
import whisperx
_align_model, _align_metadata = whisperx.load_align_model(
language_code=language_code,
device=DEVICE,
)
logger.info(f"Alignment model loaded for language: {language_code}")
return _align_model, _align_metadata
except Exception as e:
logger.warning(f"Failed to load alignment model for {language_code}: {e}")
return None, None
def get_diarize_pipeline():
"""Load pyannote speaker diarization pipeline (singleton)."""
global _diarize_pipeline
if _diarize_pipeline is None:
if not HF_TOKEN:
logger.warning("HF_TOKEN not set — diarization disabled")
return None
try:
import whisperx
_diarize_pipeline = whisperx.DiarizationPipeline(
use_auth_token=HF_TOKEN,
device=DEVICE,
)
logger.info("Diarization pipeline loaded")
except Exception as e:
logger.warning(f"Failed to load diarization pipeline: {e}")
return None
return _diarize_pipeline
def _build_utterances(segments: list[SegmentInfo]) -> tuple[list[Utterance], dict[str, str], dict[str, int]]:
"""
Build Memoro-compatible utterances from diarized segments.
Groups consecutive segments by the same speaker.
"""
if not segments:
return [], {}, {}
# Collect unique speakers
speaker_labels = sorted(set(
s.speaker for s in segments if s.speaker is not None
))
speaker_map: dict[str, int] = {}
speakers: dict[str, str] = {}
for idx, label in enumerate(speaker_labels):
speaker_map[label] = idx
speakers[str(idx)] = label
# Merge consecutive segments with the same speaker
utterances: list[Utterance] = []
current_speaker = None
current_text_parts: list[str] = []
current_start = 0.0
current_end = 0.0
for seg in segments:
sp = seg.speaker or "SPEAKER_00"
if sp != current_speaker:
# Flush previous
if current_speaker is not None and current_text_parts:
utterances.append(Utterance(
speaker=speaker_map.get(current_speaker, 0),
text=" ".join(current_text_parts).strip(),
offset=int(current_start * 1000),
duration=int((current_end - current_start) * 1000),
))
current_speaker = sp
current_text_parts = [seg.text]
current_start = seg.start
current_end = seg.end
else:
current_text_parts.append(seg.text)
current_end = seg.end
# Flush last
if current_speaker is not None and current_text_parts:
utterances.append(Utterance(
speaker=speaker_map.get(current_speaker, 0),
text=" ".join(current_text_parts).strip(),
offset=int(current_start * 1000),
duration=int((current_end - current_start) * 1000),
))
return utterances, speakers, speaker_map
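A worked example of the grouping rule, with invented timings, to make the Memoro mapping concrete: consecutive segments from the same speaker merge into one utterance, and offsets and durations are converted to milliseconds.

# Invented input; shows how _build_utterances merges same-speaker segments.
segs = [
    SegmentInfo(start=0.0, end=2.0, text="Hallo zusammen.", speaker="SPEAKER_00"),
    SegmentInfo(start=2.0, end=4.5, text="Willkommen zum Meeting.", speaker="SPEAKER_00"),
    SegmentInfo(start=4.5, end=6.0, text="Danke dir.", speaker="SPEAKER_01"),
]
utts, speakers, speaker_map = _build_utterances(segs)
# utts[0]: speaker=0, text="Hallo zusammen. Willkommen zum Meeting.", offset=0, duration=4500
# utts[1]: speaker=1, text="Danke dir.", offset=4500, duration=1500
# speakers == {"0": "SPEAKER_00", "1": "SPEAKER_01"}; speaker_map == {"SPEAKER_00": 0, "SPEAKER_01": 1}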
def transcribe_audio(
audio_path: str,
language: Optional[str] = None,
model_name: Optional[str] = None,
enable_diarization: bool = True,
enable_alignment: bool = True,
min_speakers: Optional[int] = None,
max_speakers: Optional[int] = None,
) -> WhisperXResult:
"""
Transcribe audio with WhisperX: alignment + diarization.
Args:
audio_path: Path to audio file
language: Language code (e.g., 'de', 'en'). Auto-detect if None.
model_name: Whisper model to use
enable_diarization: Run speaker diarization
enable_alignment: Run forced word alignment
min_speakers: Minimum expected speakers (hint for diarization)
max_speakers: Maximum expected speakers (hint for diarization)
Returns:
WhisperXResult with full transcription, segments, utterances, speakers
"""
import whisperx
start_time = time.time()
# 1. Load audio
audio = whisperx.load_audio(audio_path)
audio_duration = len(audio) / 16000 # whisperx resamples to 16kHz
# 2. Transcribe with faster-whisper
model = get_whisper_model(model_name)
transcribe_result = model.transcribe(
audio,
batch_size=BATCH_SIZE,
language=language,
)
detected_language = transcribe_result.get("language", language or "en")
raw_segments = transcribe_result.get("segments", [])
logger.info(
f"Transcription: {len(raw_segments)} segments, "
f"language={detected_language}, "
f"duration={audio_duration:.1f}s"
)
# 3. Forced alignment (word-level timestamps)
if enable_alignment and raw_segments:
align_model, align_metadata = get_align_model(detected_language)
if align_model is not None:
try:
transcribe_result = whisperx.align(
raw_segments,
align_model,
align_metadata,
audio,
DEVICE,
return_char_alignments=False,
)
raw_segments = transcribe_result.get("segments", raw_segments)
logger.info("Word alignment complete")
except Exception as e:
logger.warning(f"Alignment failed, using segment-level timestamps: {e}")
# 4. Speaker diarization
if enable_diarization:
diarize_pipeline = get_diarize_pipeline()
if diarize_pipeline is not None:
try:
diarize_kwargs = {}
if min_speakers is not None:
diarize_kwargs["min_speakers"] = min_speakers
if max_speakers is not None:
diarize_kwargs["max_speakers"] = max_speakers
diarize_segments = diarize_pipeline(
audio_path,
**diarize_kwargs,
)
transcribe_result = whisperx.assign_word_speakers(
diarize_segments, transcribe_result
)
raw_segments = transcribe_result.get("segments", raw_segments)
logger.info("Diarization complete")
except Exception as e:
logger.warning(f"Diarization failed: {e}")
# 5. Build structured result
segments: list[SegmentInfo] = []
all_words: list[WordInfo] = []
full_text_parts: list[str] = []
for seg in raw_segments:
seg_words: list[WordInfo] = []
for w in seg.get("words", []):
wi = WordInfo(
word=w.get("word", ""),
start=w.get("start", 0.0),
end=w.get("end", 0.0),
score=w.get("score", 0.0),
speaker=w.get("speaker"),
)
seg_words.append(wi)
all_words.append(wi)
segment = SegmentInfo(
start=seg.get("start", 0.0),
end=seg.get("end", 0.0),
text=seg.get("text", "").strip(),
speaker=seg.get("speaker"),
words=seg_words,
)
segments.append(segment)
full_text_parts.append(segment.text)
full_text = " ".join(full_text_parts)
# 6. Build utterances (Memoro-compatible)
utterances, speakers, speaker_map = _build_utterances(segments)
latency = time.time() - start_time
logger.info(f"WhisperX complete in {latency:.1f}s: {len(full_text)} chars, {len(speakers)} speakers")
return WhisperXResult(
text=full_text,
language=detected_language,
duration_seconds=audio_duration,
segments=segments,
utterances=utterances,
speakers=speakers,
speaker_map=speaker_map,
languages=[detected_language] if detected_language else [],
primary_language=detected_language,
words=all_words,
)
async def transcribe_audio_bytes(
audio_bytes: bytes,
filename: str,
language: Optional[str] = None,
model_name: Optional[str] = None,
enable_diarization: bool = True,
enable_alignment: bool = True,
min_speakers: Optional[int] = None,
max_speakers: Optional[int] = None,
) -> WhisperXResult:
"""
Transcribe audio from bytes (for API uploads).
Args:
audio_bytes: Raw audio file bytes
filename: Original filename (for extension detection)
language: Optional language code
model_name: Whisper model to use
enable_diarization: Run speaker diarization
enable_alignment: Run forced word alignment
min_speakers: Min expected speakers
max_speakers: Max expected speakers
Returns:
WhisperXResult with full transcription data
"""
ext = Path(filename).suffix or ".wav"
with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
tmp.write(audio_bytes)
tmp_path = tmp.name
try:
return transcribe_audio(
audio_path=tmp_path,
language=language,
model_name=model_name,
enable_diarization=enable_diarization,
enable_alignment=enable_alignment,
min_speakers=min_speakers,
max_speakers=max_speakers,
)
finally:
try:
os.unlink(tmp_path)
except Exception:
pass
def is_available() -> bool:
"""Check if WhisperX dependencies are installed."""
try:
import whisperx
return True
except ImportError:
return False
AVAILABLE_MODELS = [
"tiny",
"tiny.en",
"base",
"base.en",
"small",
"small.en",
"medium",
"medium.en",
"large-v1",
"large-v2",
"large-v3",
"large-v3-turbo",
"distil-large-v2",
"distil-large-v3",
]
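A local smoke-test sketch for the module itself, bypassing the HTTP layer. It assumes a CUDA host with the requirements-cuda.txt stack installed, HF_TOKEN exported for diarization, and a meeting.wav file on disk.

import asyncio
from app.whisperx_service import transcribe_audio_bytes

async def main() -> None:
    with open("meeting.wav", "rb") as f:
        result = await transcribe_audio_bytes(
            audio_bytes=f.read(),
            filename="meeting.wav",
            language="de",
            enable_diarization=True,
            enable_alignment=True,
        )
    print(result.primary_language, f"{result.duration_seconds:.1f}s")
    for u in result.utterances:
        print(f"[speaker {u.speaker} @ {u.offset}ms] {u.text}")

asyncio.run(main())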


@@ -0,0 +1,35 @@
# ManaCore STT Service Dependencies
# For GPU Server (NVIDIA RTX 3090 / CUDA)
# Web Framework
fastapi==0.115.6
uvicorn[standard]==0.34.0
python-multipart==0.0.20
# Audio Processing
pydub==0.25.1
soundfile==0.13.1
# WhisperX (CUDA) — includes faster-whisper + alignment
whisperx @ git+https://github.com/m-bain/whisperX.git
# faster-whisper with CTranslate2 (CUDA backend)
faster-whisper>=1.1.0
# Speaker Diarization (pyannote.audio)
# Requires HF_TOKEN with accepted terms:
# https://huggingface.co/pyannote/speaker-diarization-3.1
# https://huggingface.co/pyannote/segmentation-3.0
pyannote.audio>=3.3.0
# PyTorch CUDA — install separately for your CUDA version:
# pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu121
torch>=2.5.0
torchaudio>=2.5.0
# Utilities
numpy>=1.26.0
tqdm>=4.67.0
# External Auth (mana-core-auth integration)
httpx>=0.27.0
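A hedged pre-flight check to run on the GPU host before setting USE_WHISPERX=true, confirming that the stack from this file resolved against a CUDA build of PyTorch.

# Sanity check: CUDA visibility, whisperx import, and HF token for diarization.
import os
import torch

assert torch.cuda.is_available(), "CUDA not visible to PyTorch; check the torch install"
print("GPU:", torch.cuda.get_device_name(0))

import whisperx  # raises ImportError if the git install failed

if not os.getenv("HF_TOKEN"):
    print("warning: HF_TOKEN not set, speaker diarization will be disabled")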