managarten/services/mana-stt/app/whisper_service_cuda.py
Till JS 16e0d99c5a feat(gpu-server): complete GPU server setup with AI services, monitoring, and public access
- Set up 5 AI services on Windows GPU server (RTX 3090):
  - mana-llm (Port 3025): OpenAI-compatible LLM gateway via Ollama
  - mana-stt (Port 3020): WhisperX with word timestamps + speaker diarization
  - mana-tts (Port 3022): Kokoro (EN) + Edge TTS (DE) + Piper (local DE)
  - mana-image-gen (Port 3023): FLUX.2 klein 4B image generation
  - Ollama (Port 11434): gemma3:4b/12b, qwen2.5-coder:14b, nomic-embed-text

- Add @manacore/shared-gpu TypeScript client package with SttClient, TtsClient, ImageClient
- Add CUDA-compatible whisper_service using faster-whisper for Windows
- Configure public access via Cloudflare Tunnel (gpu-llm/stt/tts/img.mana.how)
- Add Loki log aggregator (Docker on Mac Mini) + log shipper on GPU server
- Add GPU scrape targets to Prometheus/VictoriaMetrics config
- Add Grafana Loki datasource for GPU service logs
- Add health check with auto-restart, log rotation, and log shipping
- Document complete setup: Always-On config, troubleshooting, architecture

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-27 21:35:30 +01:00

175 lines
4.5 KiB
Python

"""
Whisper STT Service using faster-whisper (CUDA)
Optimized for NVIDIA GPUs (RTX 3090 etc.)
Drop-in replacement for whisper_service.py (MLX version).
Uses faster-whisper with CTranslate2 for GPU-accelerated inference.
"""
import asyncio
import logging
import os
import tempfile
from dataclasses import dataclass
from pathlib import Path
from typing import Optional
logger = logging.getLogger(__name__)

# Module-level singleton cache, populated lazily by get_whisper_model() so
# that importing this module never triggers a model download, a GPU
# allocation, or an ImportError when faster-whisper is not installed.
_whisper_model = None
@dataclass
class TranscriptionResult:
    """Result of a single transcription run."""

    # Full transcript (leading/trailing whitespace stripped by the caller).
    text: str
    # Detected language code (or the caller-supplied one), e.g. "de" / "en".
    language: Optional[str] = None
    # Audio duration as reported by faster-whisper's info.duration, if any.
    duration: Optional[float] = None
    # Per-segment dicts with "start", "end", and "text" keys.
    segments: Optional[list] = None
def get_whisper_model(model_name: str = "large-v3", **kwargs):
    """Get or create the Whisper model instance (singleton per model name).

    Args:
        model_name: faster-whisper model identifier (e.g. "large-v3").
        **kwargs: Extra keyword arguments forwarded verbatim to
            faster_whisper.WhisperModel (e.g. download_root, num_workers).

    Returns:
        The loaded faster_whisper.WhisperModel instance.

    Raises:
        RuntimeError: If faster-whisper is not installed.
    """
    global _whisper_model
    # Bug fix: the previous version returned whatever model was loaded first,
    # silently ignoring a request for a different model_name. Track the name
    # of the loaded model (as a function attribute, so no extra module-level
    # global is required) and reload when it changes.
    loaded_name = getattr(get_whisper_model, "_loaded_name", None)
    if _whisper_model is None or loaded_name != model_name:
        logger.info(f"Loading Whisper model: {model_name}")
        try:
            from faster_whisper import WhisperModel

            # float16 on CUDA is the sweet spot for an RTX 3090; both knobs
            # are overridable via env for CPU fallback / int8 quantization.
            compute_type = os.getenv("WHISPER_COMPUTE_TYPE", "float16")
            device = os.getenv("WHISPER_DEVICE", "cuda")
            _whisper_model = WhisperModel(
                model_name,
                device=device,
                compute_type=compute_type,
                **kwargs,  # previously accepted but never forwarded
            )
            get_whisper_model._loaded_name = model_name
            logger.info(f"Whisper model loaded: {model_name} on {device} ({compute_type})")
        except ImportError as e:
            logger.error(f"Failed to import faster_whisper: {e}")
            raise RuntimeError(
                "faster-whisper not installed. "
                "Run: pip install faster-whisper"
            ) from e
        except Exception as e:
            logger.error(f"Failed to load Whisper model: {e}")
            raise
    return _whisper_model
def transcribe_audio(
    audio_path: str,
    language: Optional[str] = None,
    model_name: str = "large-v3",
) -> TranscriptionResult:
    """
    Transcribe an audio file using faster-whisper (CUDA).

    Args:
        audio_path: Path to the audio file (mp3, wav, m4a, etc.)
        language: Optional language code (e.g. 'de', 'en'); auto-detected
            when None.
        model_name: Whisper model to use.

    Returns:
        TranscriptionResult carrying the joined text plus metadata.
    """
    model = get_whisper_model(model_name)
    logger.info(f"Transcribing: {audio_path}")
    try:
        segments, info = model.transcribe(
            audio_path,
            language=language,
            beam_size=5,
            vad_filter=True,  # drop stretches of silence before decoding
        )
        # model.transcribe yields segments lazily; materialize the iterator
        # once so both the segment list and the joined text come from a
        # single pass.
        collected = list(segments)
        segment_dicts = [
            {"start": seg.start, "end": seg.end, "text": seg.text}
            for seg in collected
        ]
        text = " ".join(seg.text for seg in collected)
        detected_language = info.language if info else language
        logger.info(f"Transcription complete: {len(text)} characters, language={detected_language}")
        return TranscriptionResult(
            text=text.strip(),
            language=detected_language,
            duration=info.duration if info else None,
            segments=segment_dicts,
        )
    except Exception as e:
        logger.error(f"Transcription failed: {e}")
        raise
async def transcribe_audio_bytes(
    audio_bytes: bytes,
    filename: str,
    language: Optional[str] = None,
    model_name: str = "large-v3",
) -> TranscriptionResult:
    """
    Transcribe audio from bytes (for API uploads).

    Bug fix: the previous version ran the blocking transcription directly in
    the coroutine, stalling the event loop (and thus every concurrent
    request) for the full duration of the transcription. The blocking call
    now runs in a worker thread via asyncio.to_thread.

    Args:
        audio_bytes: Raw audio file bytes.
        filename: Original filename (used for extension detection).
        language: Optional language code; auto-detected when None.
        model_name: Whisper model to use.

    Returns:
        TranscriptionResult
    """
    # Preserve the original extension so the decoder can sniff the container.
    ext = Path(filename).suffix or ".wav"
    # delete=False: the file must be re-openable by the decoder after this
    # `with` block closes it (required on Windows, where an open
    # NamedTemporaryFile cannot be opened a second time).
    with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
        tmp.write(audio_bytes)
        tmp_path = tmp.name
    try:
        # Off-load the CPU/GPU-bound, blocking work from the event loop.
        return await asyncio.to_thread(
            transcribe_audio,
            audio_path=tmp_path,
            language=language,
            model_name=model_name,
        )
    finally:
        # Best-effort cleanup of the temp file; a failed unlink must not
        # mask the transcription result or error.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
# Model identifiers accepted by get_whisper_model() / the API.
# The ".en" variants are English-only checkpoints; "distil-*" and
# "*-turbo" trade some accuracy for speed — presumably all are available
# as CTranslate2 conversions on the Hugging Face hub (verify before
# exposing new entries).
AVAILABLE_MODELS = [
    "tiny",
    "tiny.en",
    "base",
    "base.en",
    "small",
    "small.en",
    "medium",
    "medium.en",
    "large-v1",
    "large-v2",
    "large-v3",
    "large-v3-turbo",
    "distil-large-v2",
    "distil-large-v3",
]