mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-14 20:41:09 +02:00
- Set up 5 AI services on Windows GPU server (RTX 3090):
  - mana-llm (Port 3025): OpenAI-compatible LLM gateway via Ollama
  - mana-stt (Port 3020): WhisperX with word timestamps + speaker diarization
  - mana-tts (Port 3022): Kokoro (EN) + Edge TTS (DE) + Piper (local DE)
  - mana-image-gen (Port 3023): FLUX.2 klein 4B image generation
  - Ollama (Port 11434): gemma3:4b/12b, qwen2.5-coder:14b, nomic-embed-text
- Add @manacore/shared-gpu TypeScript client package with SttClient, TtsClient, ImageClient
- Add CUDA-compatible whisper_service using faster-whisper for Windows
- Configure public access via Cloudflare Tunnel (gpu-llm/stt/tts/img.mana.how)
- Add Loki log aggregator (Docker on Mac Mini) + log shipper on GPU server
- Add GPU scrape targets to Prometheus/VictoriaMetrics config
- Add Grafana Loki datasource for GPU service logs
- Add health check with auto-restart, log rotation, and log shipping
- Document complete setup: Always-On config, troubleshooting, architecture

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
175 lines
4.5 KiB
Python
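For orientation, a minimal sketch of how a client could call the mana-stt service over the public tunnel. The /transcribe route and the form field names are assumptions for illustration; only the host (gpu-stt.mana.how) and port (3020) appear in the commit message:

    import requests

    # Hypothetical route and field names, not confirmed by this commit
    STT_URL = "https://gpu-stt.mana.how/transcribe"

    with open("meeting.m4a", "rb") as f:
        resp = requests.post(
            STT_URL,
            files={"file": ("meeting.m4a", f, "audio/mp4")},
            data={"language": "de"},
        )
    resp.raise_for_status()
    print(resp.json())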
"""
|
|
Whisper STT Service using faster-whisper (CUDA)
|
|
Optimized for NVIDIA GPUs (RTX 3090 etc.)
|
|
|
|
Drop-in replacement for whisper_service.py (MLX version).
|
|
Uses faster-whisper with CTranslate2 for GPU-accelerated inference.
|
|
"""
|
|
|
|
import os
|
|
import tempfile
|
|
import logging
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
from dataclasses import dataclass
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Lazy load to avoid import errors if not installed
_whisper_model = None


@dataclass
class TranscriptionResult:
    text: str
    language: Optional[str] = None
    duration: Optional[float] = None
    segments: Optional[list] = None


def get_whisper_model(model_name: str = "large-v3", **kwargs):
    """Get or create Whisper model instance (singleton pattern)."""
    global _whisper_model

    if _whisper_model is None:
        logger.info(f"Loading Whisper model: {model_name}")
        try:
            from faster_whisper import WhisperModel

            # Use CUDA with float16 for RTX 3090
            compute_type = os.getenv("WHISPER_COMPUTE_TYPE", "float16")
            device = os.getenv("WHISPER_DEVICE", "cuda")

            _whisper_model = WhisperModel(
                model_name,
                device=device,
                compute_type=compute_type,
            )
            logger.info(f"Whisper model loaded: {model_name} on {device} ({compute_type})")
        except ImportError as e:
            logger.error(f"Failed to import faster_whisper: {e}")
            raise RuntimeError(
                "faster-whisper not installed. "
                "Run: pip install faster-whisper"
            ) from e
        except Exception as e:
            logger.error(f"Failed to load Whisper model: {e}")
            raise

    # NOTE: the cached instance is reused on subsequent calls, even if a
    # different model_name is passed.
    return _whisper_model

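
# Illustrative usage note: get_whisper_model() reads WHISPER_DEVICE and
# WHISPER_COMPUTE_TYPE from the environment, so a machine without CUDA
# could fall back to CPU like so (the entry-point name is hypothetical):
#
#     WHISPER_DEVICE=cpu WHISPER_COMPUTE_TYPE=int8 python whisper_service.py
#
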
def transcribe_audio(
    audio_path: str,
    language: Optional[str] = None,
    model_name: str = "large-v3",
) -> TranscriptionResult:
    """
    Transcribe audio file using faster-whisper (CUDA).

    Args:
        audio_path: Path to audio file (mp3, wav, m4a, etc.)
        language: Optional language code (e.g., 'de', 'en'). Auto-detect if None.
        model_name: Whisper model to use

    Returns:
        TranscriptionResult with text and metadata
    """
    model = get_whisper_model(model_name)

    logger.info(f"Transcribing: {audio_path}")

    try:
        segments, info = model.transcribe(
            audio_path,
            language=language,
            beam_size=5,
            vad_filter=True,  # Filter out silence
        )

        # Collect all segments (faster-whisper returns a lazy generator, so
        # the actual transcription work happens during this loop)
        all_segments = []
        full_text_parts = []
        for segment in segments:
            full_text_parts.append(segment.text)
            all_segments.append({
                "start": segment.start,
                "end": segment.end,
                "text": segment.text,
            })

        text = " ".join(full_text_parts)
        detected_language = info.language if info else language

        logger.info(f"Transcription complete: {len(text)} characters, language={detected_language}")

        return TranscriptionResult(
            text=text.strip(),
            language=detected_language,
            duration=info.duration if info else None,
            segments=all_segments,
        )

    except Exception as e:
        logger.error(f"Transcription failed: {e}")
        raise

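
# Example (illustrative): transcribing a local recording with German forced
# instead of auto-detection:
#
#     result = transcribe_audio("recording.m4a", language="de")
#     print(result.text)
#     print(f"{result.duration:.1f}s, language={result.language}")
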
async def transcribe_audio_bytes(
    audio_bytes: bytes,
    filename: str,
    language: Optional[str] = None,
    model_name: str = "large-v3",
) -> TranscriptionResult:
    """
    Transcribe audio from bytes (for API uploads).

    Note: declared async so it can be awaited from async handlers, but the
    underlying transcription itself runs synchronously.

    Args:
        audio_bytes: Raw audio file bytes
        filename: Original filename (for extension detection)
        language: Optional language code
        model_name: Whisper model to use

    Returns:
        TranscriptionResult
    """
    # Get file extension
    ext = Path(filename).suffix or ".wav"

    # Write to temp file
    with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
        tmp.write(audio_bytes)
        tmp_path = tmp.name

    try:
        result = transcribe_audio(
            audio_path=tmp_path,
            language=language,
            model_name=model_name,
        )
        return result
    finally:
        # Clean up temp file
        try:
            os.unlink(tmp_path)
        except Exception:
            pass

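
# Sketch (illustrative): how transcribe_audio_bytes() could back an upload
# endpoint. The web framework actually used by mana-stt is not shown in this
# file; FastAPI is assumed here purely for illustration.
#
#     from fastapi import FastAPI, UploadFile
#
#     app = FastAPI()
#
#     @app.post("/transcribe")
#     async def transcribe(file: UploadFile, language: str | None = None):
#         audio = await file.read()
#         result = await transcribe_audio_bytes(audio, file.filename, language)
#         return result
#
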
# Available models for faster-whisper
AVAILABLE_MODELS = [
    "tiny",
    "tiny.en",
    "base",
    "base.en",
    "small",
    "small.en",
    "medium",
    "medium.en",
    "large-v1",
    "large-v2",
    "large-v3",
    "large-v3-turbo",
    "distil-large-v2",
    "distil-large-v3",
]
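

if __name__ == "__main__":
    # Minimal smoke test (illustrative addition, not part of the original
    # module): transcribe a file passed on the command line, e.g.
    # `python whisper_service.py sample.wav` (script name is hypothetical).
    import sys

    logging.basicConfig(level=logging.INFO)
    if len(sys.argv) < 2:
        print(f"usage: {sys.argv[0]} <audio-file>")
        raise SystemExit(1)

    res = transcribe_audio(sys.argv[1])
    print(f"[{res.language}] {res.text}")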