update(infra): mana-stt WhisperX + diarization, mana-notify templates, CD pipeline updates

mana-stt: add WhisperX service with CUDA GPU support, speaker diarization, and auto-fallback chain.
mana-notify: add locale fallback and default templates for task reminders.
CD: add memoro-server, memoro-audio-server, and memoro-web to the deployment pipeline; docker-compose: wire the memoro services to mana-llm, mana-stt, and GlitchTip.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Till JS 2026-04-01 14:56:26 +02:00
parent a03de84e79
commit da3a140f21
9 changed files with 1280 additions and 463 deletions


@@ -47,6 +47,9 @@ on:
- mukke-web
- storage-backend
- storage-web
- memoro-server
- memoro-audio-server
- memoro-web
concurrency:
group: cd-macmini
@@ -92,6 +95,9 @@ jobs:
mukke-web: ${{ steps.changes.outputs.mukke-web }}
storage-backend: ${{ steps.changes.outputs.storage-backend }}
storage-web: ${{ steps.changes.outputs.storage-web }}
memoro-server: ${{ steps.changes.outputs.memoro-server }}
memoro-audio-server: ${{ steps.changes.outputs.memoro-audio-server }}
memoro-web: ${{ steps.changes.outputs.memoro-web }}
mana-matrix-bot: ${{ steps.changes.outputs.mana-matrix-bot }}
any-changes: ${{ steps.changes.outputs.any-changes }}
steps:
@@ -156,12 +162,15 @@ jobs:
check_changes "mukke-web" "apps/mukke/apps/web/" "apps/mukke/packages/"
check_changes "storage-backend" "apps/storage/apps/backend/" "apps/storage/packages/"
check_changes "storage-web" "apps/storage/apps/web/" "apps/storage/packages/"
check_changes "memoro-server" "apps/memoro/apps/server/" "apps/memoro/packages/"
check_changes "memoro-audio-server" "apps/memoro/apps/audio-server/"
check_changes "memoro-web" "apps/memoro/apps/web/" "apps/memoro/packages/"
check_changes "mana-matrix-bot" "services/mana-matrix-bot/"
check_changes "mana-landing-builder" "services/mana-landing-builder/" "packages/shared-types/" "packages/shared-landing-ui/"
# Check if anything needs deploying
ANY="false"
for svc in matrix-web mana-auth mana-sync mana-media mana-notify mana-api-gateway mana-crawler mana-credits mana-search chat-backend chat-web todo-backend todo-web calendar-backend calendar-web clock-web contacts-backend contacts-web mukke-backend mukke-web storage-backend storage-web mana-matrix-bot mana-landing-builder; do
for svc in matrix-web mana-auth mana-sync mana-media mana-notify mana-api-gateway mana-crawler mana-credits mana-search chat-backend chat-web todo-backend todo-web calendar-backend calendar-web clock-web contacts-backend contacts-web mukke-backend mukke-web storage-backend storage-web memoro-server memoro-audio-server memoro-web mana-matrix-bot mana-landing-builder; do
val=$(grep "^$svc=" $GITHUB_OUTPUT | tail -1 | cut -d= -f2)
if [ "$val" == "true" ]; then
ANY="true"
@@ -245,6 +254,9 @@ jobs:
if [ "${{ needs.detect-changes.outputs.mukke-web }}" == "true" ]; then SERVICES="$SERVICES mukke-web"; fi
if [ "${{ needs.detect-changes.outputs.storage-backend }}" == "true" ]; then SERVICES="$SERVICES storage-backend"; fi
if [ "${{ needs.detect-changes.outputs.storage-web }}" == "true" ]; then SERVICES="$SERVICES storage-web"; fi
if [ "${{ needs.detect-changes.outputs.memoro-server }}" == "true" ]; then SERVICES="$SERVICES memoro-server"; fi
if [ "${{ needs.detect-changes.outputs.memoro-audio-server }}" == "true" ]; then SERVICES="$SERVICES memoro-audio-server"; fi
if [ "${{ needs.detect-changes.outputs.memoro-web }}" == "true" ]; then SERVICES="$SERVICES memoro-web"; fi
if [ "${{ needs.detect-changes.outputs.mana-matrix-bot }}" == "true" ]; then SERVICES="$SERVICES mana-matrix-bot"; fi
if [ "${{ needs.detect-changes.outputs.mana-landing-builder }}" == "true" ]; then SERVICES="$SERVICES mana-landing-builder"; fi
fi
@@ -347,6 +359,9 @@ jobs:
mukke-web) echo "http://localhost:5180/health" ;;
storage-backend) echo "http://localhost:3035/api/v1/health" ;;
storage-web) echo "http://localhost:5015/health" ;;
memoro-server) echo "http://localhost:3015/health" ;;
memoro-audio-server) echo "http://localhost:3016/health" ;;
memoro-web) echo "http://localhost:5038/health" ;;
*) echo "" ;;
esac
}
@@ -356,7 +371,7 @@ jobs:
SERVICES="${{ steps.services.outputs.services }}"
if [ "$DEPLOY_ALL" == "true" ]; then
SERVICES="mana-auth matrix-web chat-backend chat-web todo-backend todo-web calendar-backend calendar-web clock-web contacts-backend contacts-web mukke-backend mukke-web storage-backend storage-web"
SERVICES="mana-auth matrix-web chat-backend chat-web todo-backend todo-web calendar-backend calendar-web clock-web contacts-backend contacts-web mukke-backend mukke-web storage-backend storage-web memoro-server memoro-audio-server memoro-web"
fi
HEALTH_RESULTS=""
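For orientation, a sketch of the health verification the pipeline runs against the newly added memoro endpoints. The URLs come from the case block above; the retry count and delay are invented, and the real step is implemented in bash inside the workflow, not in Python.

import time
import httpx

MEMORO_HEALTH = {
    "memoro-server": "http://localhost:3015/health",
    "memoro-audio-server": "http://localhost:3016/health",
    "memoro-web": "http://localhost:5038/health",
}

def wait_healthy(url: str, attempts: int = 10, delay: float = 5.0) -> bool:
    # Poll until the endpoint answers 200 or the attempts are exhausted.
    for _ in range(attempts):
        try:
            if httpx.get(url, timeout=5.0).status_code == 200:
                return True
        except httpx.HTTPError:
            pass
        time.sleep(delay)
    return False

for svc, url in MEMORO_HEALTH.items():
    print(svc, "healthy" if wait_healthy(url) else "UNHEALTHY")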


@@ -1480,6 +1480,8 @@ services:
MEMORO_SUPABASE_SERVICE_KEY: ${MEMORO_SUPABASE_SERVICE_KEY}
SERVICE_KEY: ${MEMORO_SERVICE_KEY}
AUDIO_SERVER_URL: http://memoro-audio-server:3016
MANA_LLM_URL: http://mana-llm:3025
MANA_LLM_MODEL: ${MANA_LLM_MODEL:-ollama/gemma3:12b}
GEMINI_API_KEY: ${GEMINI_API_KEY}
AZURE_OPENAI_KEY: ${AZURE_OPENAI_KEY}
AZURE_OPENAI_ENDPOINT: ${AZURE_OPENAI_ENDPOINT}
@@ -1519,6 +1521,8 @@ services:
AZURE_STORAGE_ACCOUNT_NAME: ${AZURE_STORAGE_ACCOUNT_NAME}
AZURE_STORAGE_ACCOUNT_KEY: ${AZURE_STORAGE_ACCOUNT_KEY}
AZURE_STORAGE_CONTAINER: ${AZURE_STORAGE_CONTAINER:-memoro-batch-audio}
MANA_STT_URL: http://host.docker.internal:3020
MANA_STT_API_KEY: ${MANA_STT_API_KEY:-}
ports:
- "3016:3016"
healthcheck:
@@ -1558,6 +1562,7 @@ services:
PUBLIC_SUPABASE_URL: ${MEMORO_SUPABASE_URL}
PUBLIC_SUPABASE_ANON_KEY: ${MEMORO_SUPABASE_ANON_KEY}
PUBLIC_MANA_SYNC_URL: ws://mana-sync:3050
PUBLIC_GLITCHTIP_DSN: ${MEMORO_GLITCHTIP_DSN:-}
ports:
- "5038:5038"
healthcheck:


@@ -215,6 +215,7 @@
"docker:clean": "docker compose -f docker-compose.dev.yml --env-file .env.development --profile all down -v",
"deploy:landing:uload": "pnpm --filter @uload/landing build && npx wrangler pages deploy apps/uload/apps/landing/dist --project-name=uload-landing",
"deploy:landing:todo": "pnpm --filter @todo/landing build && npx wrangler pages deploy apps/todo/apps/landing/dist --project-name=todo-landing",
"deploy:landing:contacts": "pnpm --filter @contacts/landing build && npx wrangler pages deploy apps/contacts/apps/landing/dist --project-name=contacts-landing",
"deploy:landing:calendar": "pnpm --filter @calendar/landing build && npx wrangler pages deploy apps/calendar/apps/landing/dist --project-name=calendars-landing",
"deploy:landing:chat": "pnpm --filter @chat/landing build && npx wrangler pages deploy apps/chat/apps/landing/dist --project-name=chat-landing",
"deploy:landing:picture": "pnpm --filter @picture/landing build && npx wrangler pages deploy apps/picture/apps/landing/dist --project-name=picture-landing",
@@ -226,7 +227,7 @@
"deploy:landing:mail": "pnpm --filter @mail/landing build && npx wrangler pages deploy apps/mail/apps/landing/dist --project-name=mail-landing",
"deploy:landing:moodlit": "pnpm --filter @moodlit/landing build && npx wrangler pages deploy apps/moodlit/apps/landing/dist --project-name=moodlit-landing",
"deploy:landing:it": "pnpm --filter @mana/it-landing build && npx wrangler pages deploy services/it-landing/dist --project-name=it-landing",
"deploy:landing:all": "pnpm deploy:landing:calendar && pnpm deploy:landing:chat && pnpm deploy:landing:picture && pnpm deploy:landing:manacore && pnpm deploy:landing:cards && pnpm deploy:landing:zitare && pnpm deploy:landing:presi && pnpm deploy:landing:clock && pnpm deploy:landing:mail && pnpm deploy:landing:nutriphi",
"deploy:landing:all": "pnpm deploy:landing:calendar && pnpm deploy:landing:chat && pnpm deploy:landing:picture && pnpm deploy:landing:manacore && pnpm deploy:landing:cards && pnpm deploy:landing:zitare && pnpm deploy:landing:presi && pnpm deploy:landing:clock && pnpm deploy:landing:mail && pnpm deploy:landing:nutriphi && pnpm deploy:landing:contacts && pnpm deploy:landing:todo",
"dev:docs": "pnpm --filter @manacore/docs dev",
"build:docs": "pnpm --filter @manacore/docs build",
"deploy:docs": "pnpm --filter @manacore/docs build && npx wrangler pages deploy apps/docs/dist --project-name=manacore-docs",

pnpm-lock.yaml (generated, 954 changes): file diff suppressed because it is too large.


@@ -120,6 +120,13 @@ func (e *Engine) SeedDefaults(ctx context.Context) {
body: `<!DOCTYPE html><html><body><h1>{{.eventTitle}}</h1><p>Wann: {{.eventTime}}</p>{{if .eventLocation}}<p>Wo: {{.eventLocation}}</p>{{end}}<p><a href="{{.eventUrl}}">Termin anzeigen</a></p></body></html>`,
vars: `{"eventTitle": "Titel", "eventTime": "Zeit", "eventLocation": "Ort (optional)", "eventUrl": "Link"}`,
},
{
slug: "task-reminder",
channel: "email",
subject: "Erinnerung: {{.taskTitle}}",
body: `<!DOCTYPE html><html><body><h1>{{.taskTitle}}</h1>{{if .dueDate}}<p>Fällig: {{.dueDate}}</p>{{end}}<p><a href="{{.taskUrl}}">Aufgabe anzeigen</a></p></body></html>`,
vars: `{"taskTitle": "Aufgabentitel", "dueDate": "Fälligkeitsdatum (optional)", "taskUrl": "Link zur Aufgabe"}`,
},
}
for _, d := range defaults {
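Illustration only: the variables a caller would supply for the new task-reminder template and what the subject renders to; the task data and URL are hypothetical, the keys match the vars JSON above.

# Hypothetical payload for the "task-reminder" email template.
payload = {
    "taskTitle": "Steuererklärung einreichen",
    "dueDate": "2026-04-15",
    "taskUrl": "https://todo.example.com/tasks/123",
}
# Subject renders to: "Erinnerung: Steuererklärung einreichen"
# The body shows the due date only because dueDate is set ({{if .dueDate}}).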


@@ -11,6 +11,24 @@ WHISPER_MODEL=large-v3
# Options: voxtral-mini-3b, voxtral-realtime-4b, voxtral-small-24b
VOXTRAL_MODEL=voxtral-realtime-4b
# WhisperX (CUDA GPU Server)
# Enable WhisperX for rich transcription (diarization, word alignment)
# Requires NVIDIA GPU + requirements-cuda.txt
USE_WHISPERX=false
# WhisperX batch size (higher = faster but more VRAM, 16 works well for RTX 3090)
WHISPERX_BATCH_SIZE=16
# Device and compute type for CUDA
# WHISPER_DEVICE=cuda
# WHISPER_COMPUTE_TYPE=float16
# HuggingFace token for pyannote speaker diarization models
# Required for diarization. Accept terms at:
# https://huggingface.co/pyannote/speaker-diarization-3.1
# https://huggingface.co/pyannote/segmentation-3.0
HF_TOKEN=
# Model Loading
# Set to true to preload models on startup (slower startup, faster first request)
PRELOAD_MODELS=false


@@ -1,6 +1,6 @@
"""
ManaCore STT API Service
Speech-to-Text with Whisper (MLX), Voxtral (vLLM), and Mistral API (fallback)
Speech-to-Text with Whisper (MLX), WhisperX (CUDA), Voxtral (vLLM), and Mistral API (fallback)
Run with: uvicorn app.main:app --host 0.0.0.0 --port 3020
"""
@@ -38,6 +38,9 @@ CORS_ORIGINS = os.getenv(
VLLM_URL = os.getenv("VLLM_URL", "http://localhost:8100")
USE_VLLM = os.getenv("USE_VLLM", "false").lower() == "true"
# WhisperX configuration (CUDA GPU server)
USE_WHISPERX = os.getenv("USE_WHISPERX", "false").lower() == "true"
# Response models
class TranscriptionResponse(BaseModel):
@@ -48,9 +51,49 @@ class TranscriptionResponse(BaseModel):
duration_seconds: Optional[float] = None
class WordTimestampResponse(BaseModel):
word: str
start: float
end: float
score: float = 0.0
speaker: Optional[str] = None
class SegmentResponse(BaseModel):
start: float
end: float
text: str
speaker: Optional[str] = None
words: list[WordTimestampResponse] = []
class UtteranceResponse(BaseModel):
speaker: int
text: str
offset: int # milliseconds
duration: int # milliseconds
class RichTranscriptionResponse(BaseModel):
"""Extended response with segments, utterances, and speaker diarization."""
text: str
language: Optional[str] = None
model: str
latency_ms: Optional[float] = None
duration_seconds: Optional[float] = None
segments: list[SegmentResponse] = []
utterances: list[UtteranceResponse] = []
speakers: dict[str, str] = {}
speaker_map: dict[str, int] = {}
languages: list[str] = []
primary_language: Optional[str] = None
words: list[WordTimestampResponse] = []
class HealthResponse(BaseModel):
status: str
whisper_loaded: bool
whisperx_available: bool
vllm_available: bool
vllm_url: Optional[str] = None
mistral_api_available: bool
@@ -60,6 +103,7 @@ class HealthResponse(BaseModel):
class ModelsResponse(BaseModel):
whisper: list
whisperx: list
voxtral_vllm: list
default_whisper: str
@@ -67,6 +111,7 @@ class ModelsResponse(BaseModel):
# Track loaded models
models_status = {
"whisper_loaded": False,
"whisperx_available": False,
"vllm_available": False,
}
@@ -86,6 +131,18 @@ async def lifespan(app: FastAPI):
else:
logger.warning(f"vLLM server not available: {health}")
# Check WhisperX availability
if USE_WHISPERX:
try:
from app.whisperx_service import is_available as whisperx_available
models_status["whisperx_available"] = whisperx_available()
if models_status["whisperx_available"]:
logger.info("WhisperX (CUDA) available")
else:
logger.warning("WhisperX not available (whisperx package not installed)")
except Exception as e:
logger.warning(f"WhisperX check failed: {e}")
# Check Mistral API
from app.voxtral_api_service import is_available as api_available
if api_available():
@@ -136,6 +193,7 @@ async def health_check():
return HealthResponse(
status="healthy",
whisper_loaded=models_status["whisper_loaded"],
whisperx_available=models_status["whisperx_available"],
vllm_available=vllm_health.get("status") == "healthy",
vllm_url=VLLM_URL if USE_VLLM else None,
mistral_api_available=api_available(),
@@ -154,8 +212,17 @@ async def list_models(auth: AuthResult = Depends(verify_api_key)):
vllm_models = await get_models()
whisperx_models = []
if USE_WHISPERX:
try:
from app.whisperx_service import AVAILABLE_MODELS as wx_models
whisperx_models = wx_models
except ImportError:
pass
return ModelsResponse(
whisper=whisper_models,
whisperx=whisperx_models,
voxtral_vllm=vllm_models,
default_whisper=DEFAULT_WHISPER_MODEL,
)
@@ -386,50 +453,216 @@ async def transcribe_voxtral_api(
raise HTTPException(status_code=500, detail=str(e))
@app.post("/transcribe/whisperx", response_model=RichTranscriptionResponse)
async def transcribe_whisperx(
response: Response,
file: UploadFile = File(..., description="Audio file to transcribe"),
language: Optional[str] = Form(None, description="Language code (auto-detect if not provided)"),
model: Optional[str] = Form(None, description="Whisper model to use"),
diarization: bool = Form(True, description="Enable speaker diarization"),
alignment: bool = Form(True, description="Enable word-level alignment"),
min_speakers: Optional[int] = Form(None, description="Minimum expected speakers"),
max_speakers: Optional[int] = Form(None, description="Maximum expected speakers"),
auth: AuthResult = Depends(verify_api_key),
):
"""
Transcribe audio using WhisperX (CUDA GPU).
Returns rich transcription with:
- Word-level timestamps (via forced alignment)
- Speaker diarization (via pyannote.audio)
- Memoro-compatible utterances with speaker IDs
Requires NVIDIA GPU with CUDA and USE_WHISPERX=true.
Diarization requires HF_TOKEN with pyannote model access.
Supported formats: mp3, wav, m4a, flac, ogg, webm, mp4
Max file size: 100MB
"""
if auth.rate_limit_remaining is not None:
response.headers["X-RateLimit-Remaining"] = str(auth.rate_limit_remaining)
if not USE_WHISPERX:
raise HTTPException(
status_code=503,
detail="WhisperX not enabled. Set USE_WHISPERX=true on a CUDA-capable server."
)
if not file.filename:
raise HTTPException(status_code=400, detail="No file provided")
allowed_extensions = {".mp3", ".wav", ".m4a", ".flac", ".ogg", ".webm", ".mp4"}
ext = os.path.splitext(file.filename)[1].lower()
if ext not in allowed_extensions:
raise HTTPException(
status_code=400,
detail=f"Unsupported file type: {ext}. Allowed: {allowed_extensions}"
)
start_time = time.time()
try:
from app.whisperx_service import transcribe_audio_bytes
audio_bytes = await file.read()
if len(audio_bytes) > 100 * 1024 * 1024:
raise HTTPException(status_code=400, detail="File too large (max 100MB)")
model_name = model or DEFAULT_WHISPER_MODEL
result = await transcribe_audio_bytes(
audio_bytes=audio_bytes,
filename=file.filename,
language=language,
model_name=model_name,
enable_diarization=diarization,
enable_alignment=alignment,
min_speakers=min_speakers,
max_speakers=max_speakers,
)
latency_ms = (time.time() - start_time) * 1000
return RichTranscriptionResponse(
text=result.text,
language=result.language,
model=f"whisperx-{model_name}",
latency_ms=latency_ms,
duration_seconds=result.duration_seconds,
segments=[
SegmentResponse(
start=s.start,
end=s.end,
text=s.text,
speaker=s.speaker,
words=[
WordTimestampResponse(
word=w.word,
start=w.start,
end=w.end,
score=w.score,
speaker=w.speaker,
)
for w in s.words
],
)
for s in result.segments
],
utterances=[
UtteranceResponse(
speaker=u.speaker,
text=u.text,
offset=u.offset,
duration=u.duration,
)
for u in result.utterances
],
speakers=result.speakers,
speaker_map={k: v for k, v in result.speaker_map.items()},
languages=result.languages,
primary_language=result.primary_language,
words=[
WordTimestampResponse(
word=w.word,
start=w.start,
end=w.end,
score=w.score,
speaker=w.speaker,
)
for w in result.words
],
)
except HTTPException:
raise
except ImportError:
raise HTTPException(
status_code=503,
detail="WhisperX not installed. Install with: pip install -r requirements-cuda.txt"
)
except Exception as e:
logger.error(f"WhisperX transcription error: {e}")
raise HTTPException(status_code=500, detail=str(e))
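A client-side sketch for the new /transcribe/whisperx endpoint, assuming the service listens on port 3020 as documented in the module docstring; the API-key header name is an assumption, since the real name depends on verify_api_key and is not shown in this diff.

# Hedged example: upload a recording and read the rich diarized response.
import httpx

def transcribe_meeting(path: str) -> dict:
    with open(path, "rb") as f:
        resp = httpx.post(
            "http://localhost:3020/transcribe/whisperx",
            files={"file": (path, f, "audio/mpeg")},
            data={"language": "de", "diarization": "true", "min_speakers": "2"},
            headers={"X-API-Key": "..."},  # assumption: header name not shown here
            timeout=600.0,
        )
    resp.raise_for_status()
    return resp.json()  # text, segments, utterances, speakers, speaker_map, words

result = transcribe_meeting("standup.mp3")
for u in result["utterances"]:
    print(f"speaker {u['speaker']} @ {u['offset']}ms: {u['text']}")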
@app.post("/transcribe/auto", response_model=TranscriptionResponse)
async def transcribe_auto(
response: Response,
file: UploadFile = File(..., description="Audio file to transcribe"),
language: Optional[str] = Form(None, description="Language hint"),
prefer: str = Form("whisper", description="Preferred: 'whisper' or 'voxtral'"),
prefer: str = Form("whisper", description="Preferred: 'whisper', 'whisperx', or 'voxtral'"),
auth: AuthResult = Depends(verify_api_key),
):
"""
Transcribe with automatic model selection and fallback.
Fallback chain:
1. Preferred model (whisper or voxtral)
2. Alternative model
3. Mistral API
- whisper: Whisper → WhisperX → Voxtral → Mistral API
- whisperx: WhisperX → Whisper → Voxtral → Mistral API
- voxtral: Voxtral → WhisperX → Whisper → Mistral API
"""
# Add rate limit headers
if auth.rate_limit_remaining is not None:
response.headers["X-RateLimit-Remaining"] = str(auth.rate_limit_remaining)
if prefer == "voxtral":
try:
return await transcribe_voxtral(file, language or "de", False)
except Exception as e:
logger.warning(f"Voxtral failed, trying Whisper: {e}")
await file.seek(0)
try:
return await transcribe_whisper(file, language, None)
except Exception as e2:
logger.warning(f"Whisper failed, trying API: {e2}")
await file.seek(0)
return await transcribe_voxtral_api(file, language, False)
async def try_whisperx_simple():
"""Try WhisperX and return as simple TranscriptionResponse."""
if not USE_WHISPERX:
raise RuntimeError("WhisperX not enabled")
from app.whisperx_service import transcribe_audio_bytes as wx_transcribe
audio_bytes = await file.read()
result = await wx_transcribe(
audio_bytes=audio_bytes,
filename=file.filename or "audio.wav",
language=language,
enable_diarization=False,
enable_alignment=False,
)
return TranscriptionResponse(
text=result.text,
language=result.language,
model=f"whisperx-{DEFAULT_WHISPER_MODEL}",
latency_ms=None,
duration_seconds=result.duration_seconds,
)
# Build fallback chain based on preference
if prefer == "whisperx":
chain = [
("WhisperX", try_whisperx_simple),
("Whisper", lambda: transcribe_whisper(response, file, language, None, auth)),
("Voxtral", lambda: transcribe_voxtral(response, file, language or "de", False, auth)),
("Mistral API", lambda: transcribe_voxtral_api(response, file, language, False, auth)),
]
elif prefer == "voxtral":
chain = [
("Voxtral", lambda: transcribe_voxtral(response, file, language or "de", False, auth)),
("WhisperX", try_whisperx_simple),
("Whisper", lambda: transcribe_whisper(response, file, language, None, auth)),
("Mistral API", lambda: transcribe_voxtral_api(response, file, language, False, auth)),
]
else:
chain = [
("Whisper", lambda: transcribe_whisper(response, file, language, None, auth)),
("WhisperX", try_whisperx_simple),
("Voxtral", lambda: transcribe_voxtral(response, file, language or "de", False, auth)),
("Mistral API", lambda: transcribe_voxtral_api(response, file, language, False, auth)),
]
last_error = None
for name, fn in chain:
try:
return await transcribe_whisper(file, language, None)
result = await fn()
return result
except Exception as e:
logger.warning(f"Whisper failed, trying Voxtral: {e}")
last_error = e
logger.warning(f"{name} failed: {e}")
await file.seek(0)
try:
return await transcribe_voxtral(file, language or "de", False)
except Exception as e2:
logger.warning(f"Voxtral failed, trying API: {e2}")
await file.seek(0)
return await transcribe_voxtral_api(file, language, False)
raise HTTPException(
status_code=503,
detail=f"All transcription backends failed. Last error: {last_error}"
)
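And a short sketch of the auto endpoint with the new prefer option; the returned model field reveals which backend in the chain actually served the request. Same assumptions as the example above.

import httpx

with open("note.m4a", "rb") as f:
    resp = httpx.post(
        "http://localhost:3020/transcribe/auto",
        files={"file": ("note.m4a", f, "audio/mp4")},
        data={"prefer": "whisperx", "language": "de"},
        headers={"X-API-Key": "..."},  # assumption, see above
        timeout=300.0,
    )
resp.raise_for_status()
print(resp.json()["model"])  # e.g. "whisperx-large-v3", or a fallback model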
@app.exception_handler(Exception)


@@ -0,0 +1,419 @@
"""
WhisperX STT Service using faster-whisper + pyannote (CUDA)
Optimized for NVIDIA GPUs (RTX 3090 etc.)
Features:
- Word-level timestamps via forced alignment
- Speaker diarization via pyannote.audio
- Segment-level timestamps with speaker labels
- VAD filtering for silence removal
Requires HuggingFace token for pyannote models:
export HF_TOKEN=hf_xxx
# Accept terms at: https://huggingface.co/pyannote/speaker-diarization-3.1
"""
import os
import tempfile
import logging
import time
from pathlib import Path
from typing import Optional
from dataclasses import dataclass, field
logger = logging.getLogger(__name__)
# Lazy-loaded singletons
_whisper_model = None
_align_model = None
_align_metadata = None
_diarize_pipeline = None
HF_TOKEN = os.getenv("HF_TOKEN")
DEFAULT_MODEL = os.getenv("WHISPER_MODEL", "large-v3")
DEVICE = os.getenv("WHISPER_DEVICE", "cuda")
COMPUTE_TYPE = os.getenv("WHISPER_COMPUTE_TYPE", "float16")
BATCH_SIZE = int(os.getenv("WHISPERX_BATCH_SIZE", "16"))
@dataclass
class WordInfo:
"""Word with timestamp."""
word: str
start: float
end: float
score: float = 0.0
speaker: Optional[str] = None
@dataclass
class SegmentInfo:
"""Segment with speaker and word-level detail."""
start: float
end: float
text: str
speaker: Optional[str] = None
words: list[WordInfo] = field(default_factory=list)
@dataclass
class Utterance:
"""Speaker utterance in Memoro-compatible format."""
speaker: int
text: str
offset: int # milliseconds
duration: int # milliseconds
@dataclass
class WhisperXResult:
"""Rich transcription result with alignment and diarization."""
text: str
language: Optional[str] = None
duration_seconds: Optional[float] = None
segments: list[SegmentInfo] = field(default_factory=list)
utterances: list[Utterance] = field(default_factory=list)
speakers: dict[str, str] = field(default_factory=dict)
speaker_map: dict[str, int] = field(default_factory=dict)
languages: list[str] = field(default_factory=list)
primary_language: Optional[str] = None
words: list[WordInfo] = field(default_factory=list)
def get_whisper_model(model_name: str = None):
"""Load faster-whisper model (singleton)."""
global _whisper_model
model_name = model_name or DEFAULT_MODEL
if _whisper_model is None:
logger.info(f"Loading WhisperX model: {model_name} on {DEVICE} ({COMPUTE_TYPE})")
try:
import whisperx
_whisper_model = whisperx.load_model(
model_name,
device=DEVICE,
compute_type=COMPUTE_TYPE,
)
logger.info(f"WhisperX model loaded: {model_name}")
except ImportError:
raise RuntimeError(
"whisperx not installed. "
"Run: pip install whisperx"
)
return _whisper_model
def get_align_model(language_code: str):
"""Load alignment model for a specific language (cached per language)."""
global _align_model, _align_metadata
try:
import whisperx
_align_model, _align_metadata = whisperx.load_align_model(
language_code=language_code,
device=DEVICE,
)
logger.info(f"Alignment model loaded for language: {language_code}")
return _align_model, _align_metadata
except Exception as e:
logger.warning(f"Failed to load alignment model for {language_code}: {e}")
return None, None
def get_diarize_pipeline():
"""Load pyannote speaker diarization pipeline (singleton)."""
global _diarize_pipeline
if _diarize_pipeline is None:
if not HF_TOKEN:
logger.warning("HF_TOKEN not set — diarization disabled")
return None
try:
import whisperx
_diarize_pipeline = whisperx.DiarizationPipeline(
use_auth_token=HF_TOKEN,
device=DEVICE,
)
logger.info("Diarization pipeline loaded")
except Exception as e:
logger.warning(f"Failed to load diarization pipeline: {e}")
return None
return _diarize_pipeline
def _build_utterances(segments: list[SegmentInfo]) -> tuple[list[Utterance], dict[str, str], dict[str, int]]:
"""
Build Memoro-compatible utterances from diarized segments.
Groups consecutive segments by the same speaker.
"""
if not segments:
return [], {}, {}
# Collect unique speakers
speaker_labels = sorted(set(
s.speaker for s in segments if s.speaker is not None
))
speaker_map: dict[str, int] = {}
speakers: dict[str, str] = {}
for idx, label in enumerate(speaker_labels):
speaker_map[label] = idx
speakers[str(idx)] = label
# Merge consecutive segments with the same speaker
utterances: list[Utterance] = []
current_speaker = None
current_text_parts: list[str] = []
current_start = 0.0
current_end = 0.0
for seg in segments:
sp = seg.speaker or "SPEAKER_00"
if sp != current_speaker:
# Flush previous
if current_speaker is not None and current_text_parts:
utterances.append(Utterance(
speaker=speaker_map.get(current_speaker, 0),
text=" ".join(current_text_parts).strip(),
offset=int(current_start * 1000),
duration=int((current_end - current_start) * 1000),
))
current_speaker = sp
current_text_parts = [seg.text]
current_start = seg.start
current_end = seg.end
else:
current_text_parts.append(seg.text)
current_end = seg.end
# Flush last
if current_speaker is not None and current_text_parts:
utterances.append(Utterance(
speaker=speaker_map.get(current_speaker, 0),
text=" ".join(current_text_parts).strip(),
offset=int(current_start * 1000),
duration=int((current_end - current_start) * 1000),
))
return utterances, speakers, speaker_map
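A worked example of the grouping rule, with invented timings, to make the Memoro mapping concrete: consecutive segments from the same speaker merge into one utterance, and offsets and durations are converted to milliseconds.

# Invented input; shows how _build_utterances merges same-speaker segments.
segs = [
    SegmentInfo(start=0.0, end=2.0, text="Hallo zusammen.", speaker="SPEAKER_00"),
    SegmentInfo(start=2.0, end=4.5, text="Willkommen zum Meeting.", speaker="SPEAKER_00"),
    SegmentInfo(start=4.5, end=6.0, text="Danke dir.", speaker="SPEAKER_01"),
]
utts, speakers, speaker_map = _build_utterances(segs)
# utts[0]: speaker=0, text="Hallo zusammen. Willkommen zum Meeting.", offset=0, duration=4500
# utts[1]: speaker=1, text="Danke dir.", offset=4500, duration=1500
# speakers == {"0": "SPEAKER_00", "1": "SPEAKER_01"}; speaker_map == {"SPEAKER_00": 0, "SPEAKER_01": 1}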
def transcribe_audio(
audio_path: str,
language: Optional[str] = None,
model_name: Optional[str] = None,
enable_diarization: bool = True,
enable_alignment: bool = True,
min_speakers: Optional[int] = None,
max_speakers: Optional[int] = None,
) -> WhisperXResult:
"""
Transcribe audio with WhisperX: alignment + diarization.
Args:
audio_path: Path to audio file
language: Language code (e.g., 'de', 'en'). Auto-detect if None.
model_name: Whisper model to use
enable_diarization: Run speaker diarization
enable_alignment: Run forced word alignment
min_speakers: Minimum expected speakers (hint for diarization)
max_speakers: Maximum expected speakers (hint for diarization)
Returns:
WhisperXResult with full transcription, segments, utterances, speakers
"""
import whisperx
start_time = time.time()
# 1. Load audio
audio = whisperx.load_audio(audio_path)
audio_duration = len(audio) / 16000 # whisperx resamples to 16kHz
# 2. Transcribe with faster-whisper
model = get_whisper_model(model_name)
transcribe_result = model.transcribe(
audio,
batch_size=BATCH_SIZE,
language=language,
)
detected_language = transcribe_result.get("language", language or "en")
raw_segments = transcribe_result.get("segments", [])
logger.info(
f"Transcription: {len(raw_segments)} segments, "
f"language={detected_language}, "
f"duration={audio_duration:.1f}s"
)
# 3. Forced alignment (word-level timestamps)
if enable_alignment and raw_segments:
align_model, align_metadata = get_align_model(detected_language)
if align_model is not None:
try:
transcribe_result = whisperx.align(
raw_segments,
align_model,
align_metadata,
audio,
DEVICE,
return_char_alignments=False,
)
raw_segments = transcribe_result.get("segments", raw_segments)
logger.info("Word alignment complete")
except Exception as e:
logger.warning(f"Alignment failed, using segment-level timestamps: {e}")
# 4. Speaker diarization
if enable_diarization:
diarize_pipeline = get_diarize_pipeline()
if diarize_pipeline is not None:
try:
diarize_kwargs = {}
if min_speakers is not None:
diarize_kwargs["min_speakers"] = min_speakers
if max_speakers is not None:
diarize_kwargs["max_speakers"] = max_speakers
diarize_segments = diarize_pipeline(
audio_path,
**diarize_kwargs,
)
transcribe_result = whisperx.assign_word_speakers(
diarize_segments, transcribe_result
)
raw_segments = transcribe_result.get("segments", raw_segments)
logger.info("Diarization complete")
except Exception as e:
logger.warning(f"Diarization failed: {e}")
# 5. Build structured result
segments: list[SegmentInfo] = []
all_words: list[WordInfo] = []
full_text_parts: list[str] = []
for seg in raw_segments:
seg_words: list[WordInfo] = []
for w in seg.get("words", []):
wi = WordInfo(
word=w.get("word", ""),
start=w.get("start", 0.0),
end=w.get("end", 0.0),
score=w.get("score", 0.0),
speaker=w.get("speaker"),
)
seg_words.append(wi)
all_words.append(wi)
segment = SegmentInfo(
start=seg.get("start", 0.0),
end=seg.get("end", 0.0),
text=seg.get("text", "").strip(),
speaker=seg.get("speaker"),
words=seg_words,
)
segments.append(segment)
full_text_parts.append(segment.text)
full_text = " ".join(full_text_parts)
# 6. Build utterances (Memoro-compatible)
utterances, speakers, speaker_map = _build_utterances(segments)
latency = time.time() - start_time
logger.info(f"WhisperX complete in {latency:.1f}s: {len(full_text)} chars, {len(speakers)} speakers")
return WhisperXResult(
text=full_text,
language=detected_language,
duration_seconds=audio_duration,
segments=segments,
utterances=utterances,
speakers=speakers,
speaker_map=speaker_map,
languages=[detected_language] if detected_language else [],
primary_language=detected_language,
words=all_words,
)
async def transcribe_audio_bytes(
audio_bytes: bytes,
filename: str,
language: Optional[str] = None,
model_name: Optional[str] = None,
enable_diarization: bool = True,
enable_alignment: bool = True,
min_speakers: Optional[int] = None,
max_speakers: Optional[int] = None,
) -> WhisperXResult:
"""
Transcribe audio from bytes (for API uploads).
Args:
audio_bytes: Raw audio file bytes
filename: Original filename (for extension detection)
language: Optional language code
model_name: Whisper model to use
enable_diarization: Run speaker diarization
enable_alignment: Run forced word alignment
min_speakers: Min expected speakers
max_speakers: Max expected speakers
Returns:
WhisperXResult with full transcription data
"""
ext = Path(filename).suffix or ".wav"
with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
tmp.write(audio_bytes)
tmp_path = tmp.name
try:
return transcribe_audio(
audio_path=tmp_path,
language=language,
model_name=model_name,
enable_diarization=enable_diarization,
enable_alignment=enable_alignment,
min_speakers=min_speakers,
max_speakers=max_speakers,
)
finally:
try:
os.unlink(tmp_path)
except Exception:
pass
def is_available() -> bool:
"""Check if WhisperX dependencies are installed."""
try:
import whisperx
return True
except ImportError:
return False
AVAILABLE_MODELS = [
"tiny",
"tiny.en",
"base",
"base.en",
"small",
"small.en",
"medium",
"medium.en",
"large-v1",
"large-v2",
"large-v3",
"large-v3-turbo",
"distil-large-v2",
"distil-large-v3",
]
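A local smoke-test sketch for the module itself, bypassing the HTTP layer. It assumes a CUDA host with the requirements-cuda.txt stack installed, HF_TOKEN exported for diarization, and a meeting.wav file on disk.

import asyncio
from app.whisperx_service import transcribe_audio_bytes

async def main() -> None:
    with open("meeting.wav", "rb") as f:
        result = await transcribe_audio_bytes(
            audio_bytes=f.read(),
            filename="meeting.wav",
            language="de",
            enable_diarization=True,
            enable_alignment=True,
        )
    print(result.primary_language, f"{result.duration_seconds:.1f}s")
    for u in result.utterances:
        print(f"[speaker {u.speaker} @ {u.offset}ms] {u.text}")

asyncio.run(main())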


@@ -0,0 +1,35 @@
# ManaCore STT Service Dependencies
# For GPU Server (NVIDIA RTX 3090 / CUDA)
# Web Framework
fastapi==0.115.6
uvicorn[standard]==0.34.0
python-multipart==0.0.20
# Audio Processing
pydub==0.25.1
soundfile==0.13.1
# WhisperX (CUDA) — includes faster-whisper + alignment
whisperx @ git+https://github.com/m-bain/whisperX.git
# faster-whisper with CTranslate2 (CUDA backend)
faster-whisper>=1.1.0
# Speaker Diarization (pyannote.audio)
# Requires HF_TOKEN with accepted terms:
# https://huggingface.co/pyannote/speaker-diarization-3.1
# https://huggingface.co/pyannote/segmentation-3.0
pyannote.audio>=3.3.0
# PyTorch CUDA — install separately for your CUDA version:
# pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu121
torch>=2.5.0
torchaudio>=2.5.0
# Utilities
numpy>=1.26.0
tqdm>=4.67.0
# External Auth (mana-core-auth integration)
httpx>=0.27.0
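A hedged pre-flight check to run on the GPU host before setting USE_WHISPERX=true, confirming that the stack from this file resolved against a CUDA build of PyTorch.

# Sanity check: CUDA visibility, whisperx import, and HF token for diarization.
import os
import torch

assert torch.cuda.is_available(), "CUDA not visible to PyTorch; check the torch install"
print("GPU:", torch.cuda.get_device_name(0))

import whisperx  # raises ImportError if the git install failed

if not os.getenv("HF_TOKEN"):
    print("warning: HF_TOKEN not set, speaker diarization will be disabled")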