From bf0fa04e7e40cca1e9088ef27783ff283005a185 Mon Sep 17 00:00:00 2001 From: Till-JS <101404291+Till-JS@users.noreply.github.com> Date: Tue, 27 Jan 2026 01:33:10 +0100 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20feat(stt):=20add=20speech-to-text?= =?UTF-8?q?=20service=20for=20Mac=20Mini?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add mana-stt service with Whisper and Voxtral support for local transcription. Includes setup script and launchd integration for automatic startup on Mac Mini server. Co-Authored-By: Claude Opus 4.5 --- scripts/mac-mini/README.md | 47 ++++ scripts/mac-mini/setup-stt.sh | 153 +++++++++++ scripts/mac-mini/status.sh | 22 ++ services/mana-stt/README.md | 165 ++++++++++++ services/mana-stt/app/__init__.py | 1 + services/mana-stt/app/main.py | 309 +++++++++++++++++++++++ services/mana-stt/app/voxtral_service.py | 198 +++++++++++++++ services/mana-stt/app/whisper_service.py | 163 ++++++++++++ services/mana-stt/requirements.txt | 25 ++ services/mana-stt/setup.sh | 123 +++++++++ 10 files changed, 1206 insertions(+) create mode 100755 scripts/mac-mini/setup-stt.sh create mode 100644 services/mana-stt/README.md create mode 100644 services/mana-stt/app/__init__.py create mode 100644 services/mana-stt/app/main.py create mode 100644 services/mana-stt/app/voxtral_service.py create mode 100644 services/mana-stt/app/whisper_service.py create mode 100644 services/mana-stt/requirements.txt create mode 100755 services/mana-stt/setup.sh diff --git a/scripts/mac-mini/README.md b/scripts/mac-mini/README.md index 6f07e6fab..79839a030 100644 --- a/scripts/mac-mini/README.md +++ b/scripts/mac-mini/README.md @@ -23,6 +23,7 @@ cd ~/projects/manacore-monorepo | Script | Purpose | |--------|---------| | `setup-autostart.sh` | Configure automatic startup on boot (run once) | +| `setup-stt.sh` | Setup STT service (Whisper + Voxtral) | | `startup.sh` | Main startup script (called by launchd) | | `health-check.sh` | Check all 
services health | | `status.sh` | Show full system status | @@ -143,6 +144,7 @@ Three services are configured to run automatically: | Cloudflared | `com.cloudflare.cloudflared` | Tunnel to Cloudflare | | Docker Startup | `com.manacore.docker-startup` | Start containers on boot | | Health Check | `com.manacore.health-check` | Check every 5 minutes | +| STT Service | `com.manacore.stt` | Speech-to-Text (Whisper + Voxtral) | ### Manual Service Control @@ -238,4 +240,49 @@ Once running, services are available at: | Calendar API | https://calendar-api.mana.how | | Clock | https://clock.mana.how | | Clock API | https://clock-api.mana.how | +| STT API | http://localhost:3020 (internal only) | | SSH | ssh mac-mini (via cloudflared) | + +## Native Services (non-Docker) + +### Ollama (LLM) + +Ollama runs natively on Mac Mini for LLM inference: + +```bash +# Check status +curl http://localhost:11434/api/tags + +# List models +ollama list + +# Pull a model +ollama pull gemma3:4b +``` + +### STT Service (Speech-to-Text) + +The STT service provides Whisper and Voxtral transcription: + +```bash +# Setup (first time) +./scripts/mac-mini/setup-stt.sh + +# Check status +curl http://localhost:3020/health + +# Transcribe audio +curl -X POST http://localhost:3020/transcribe \ + -F "file=@audio.mp3" \ + -F "language=de" + +# View logs +tail -f /tmp/manacore-stt.log +``` + +**Available endpoints:** +- `POST /transcribe` - Whisper transcription (recommended) +- `POST /transcribe/voxtral` - Voxtral transcription +- `POST /transcribe/auto` - Auto-select model +- `GET /health` - Health check +- `GET /models` - List available models diff --git a/scripts/mac-mini/setup-stt.sh b/scripts/mac-mini/setup-stt.sh new file mode 100755 index 000000000..e3b3b672e --- /dev/null +++ b/scripts/mac-mini/setup-stt.sh @@ -0,0 +1,153 @@ +#!/bin/bash +# Setup STT Service on Mac Mini +# Creates launchd service for auto-start + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_DIR="$(cd 
"$SCRIPT_DIR/../.." && pwd)" +STT_DIR="$REPO_DIR/services/mana-stt" +PLIST_NAME="com.manacore.stt" +PLIST_PATH="$HOME/Library/LaunchAgents/$PLIST_NAME.plist" + +echo "==============================================" +echo " ManaCore STT Service Setup (Mac Mini)" +echo "==============================================" +echo "" + +# Check if STT service directory exists +if [ ! -d "$STT_DIR" ]; then + echo "Error: STT service directory not found at $STT_DIR" + exit 1 +fi + +# Run the main setup script first +echo "1. Running STT service setup..." +cd "$STT_DIR" +if [ ! -d ".venv" ]; then + echo " Installing dependencies..." + ./setup.sh +else + echo " Virtual environment already exists" + echo " Skipping dependency installation" +fi + +# Create launchd plist +echo "" +echo "2. Creating launchd service..." + +cat > "$PLIST_PATH" << EOF +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd"> +<plist version="1.0"> +<dict> + <key>Label</key> + <string>$PLIST_NAME</string> + + <key>ProgramArguments</key> + <array> + <string>$STT_DIR/.venv/bin/uvicorn</string> + <string>app.main:app</string> + <string>--host</string> + <string>0.0.0.0</string> + <string>--port</string> + <string>3020</string> + </array> + + <key>WorkingDirectory</key> + <string>$STT_DIR</string> + + <key>EnvironmentVariables</key> + <dict> + <key>PATH</key> + <string>$STT_DIR/.venv/bin:/usr/local/bin:/usr/bin:/bin</string> + <key>PORT</key> + <string>3020</string> + <key>WHISPER_MODEL</key> + <string>large-v3-turbo</string> + <key>PRELOAD_MODELS</key> + <string>false</string> + <key>CORS_ORIGINS</key> + <string>https://mana.how,https://chat.mana.how,https://todo.mana.how</string> + </dict> + + <key>RunAtLoad</key> + <true/> + + <key>KeepAlive</key> + <dict> + <key>SuccessfulExit</key> + <false/> + <key>Crashed</key> + <true/> + </dict> + + <key>ThrottleInterval</key> + <integer>10</integer> + + <key>StandardOutPath</key> + <string>/tmp/manacore-stt.log</string> + + <key>StandardErrorPath</key> + <string>/tmp/manacore-stt.error.log</string> +</dict> +</plist> +EOF + +echo " Created: $PLIST_PATH" + +# Unload if already loaded +echo "" +echo "3. Loading launchd service..." +launchctl unload "$PLIST_PATH" 2>/dev/null || true +launchctl load "$PLIST_PATH" + +# Wait for service to start +sleep 2 + +# Check if service is running +echo "" +echo "4. Checking service status..." 
+if launchctl list | grep -q "$PLIST_NAME"; then + echo " Service is running" + + # Check health endpoint + sleep 3 + if curl -s http://localhost:3020/health > /dev/null 2>&1; then + echo " Health check passed" + HEALTH=$(curl -s http://localhost:3020/health) + echo " $HEALTH" + else + echo " Warning: Health check failed (service may still be starting)" + echo " Check logs: tail -f /tmp/manacore-stt.log" + fi +else + echo " Warning: Service may not be running" + echo " Check logs: tail -f /tmp/manacore-stt.error.log" +fi + +echo "" +echo "==============================================" +echo " STT Service Setup Complete!" +echo "==============================================" +echo "" +echo "Service URL: http://localhost:3020" +echo "" +echo "Useful commands:" +echo " # View logs" +echo " tail -f /tmp/manacore-stt.log" +echo "" +echo " # Restart service" +echo " launchctl kickstart -k gui/\$(id -u)/$PLIST_NAME" +echo "" +echo " # Stop service" +echo " launchctl unload $PLIST_PATH" +echo "" +echo " # Start service" +echo " launchctl load $PLIST_PATH" +echo "" +echo " # Test transcription" +echo " curl -X POST http://localhost:3020/transcribe \\" +echo " -F 'file=@audio.mp3' \\" +echo " -F 'language=de'" +echo "" diff --git a/scripts/mac-mini/status.sh b/scripts/mac-mini/status.sh index 79ad16c12..181466275 100755 --- a/scripts/mac-mini/status.sh +++ b/scripts/mac-mini/status.sh @@ -46,6 +46,7 @@ check_launchd() { check_launchd "com.cloudflare.cloudflared" "Cloudflared Tunnel" check_launchd "com.manacore.docker-startup" "Docker Startup" check_launchd "com.manacore.health-check" "Health Check (5min)" +check_launchd "com.manacore.stt" "STT Service (Whisper/Voxtral)" # ============================================ # Docker Status @@ -83,6 +84,27 @@ if docker info >/dev/null 2>&1; then done fi +# ============================================ +# Native Services (non-Docker) +# ============================================ +echo "" +echo -e "${BOLD}Native Services:${NC}" + +# 
Ollama +if curl -s --max-time 2 http://localhost:11434/api/tags >/dev/null 2>&1; then + OLLAMA_MODELS=$(curl -s http://localhost:11434/api/tags | grep -o '"name":"[^"]*"' | wc -l | tr -d ' ') + echo -e " ${GREEN}[Running]${NC} Ollama (${OLLAMA_MODELS} models)" +else + echo -e " ${YELLOW}[Stopped]${NC} Ollama" +fi + +# STT Service +if curl -s --max-time 2 http://localhost:3020/health >/dev/null 2>&1; then + echo -e " ${GREEN}[Running]${NC} STT Service (port 3020)" +else + echo -e " ${YELLOW}[Stopped]${NC} STT Service" +fi + # ============================================ # Network/Tunnel Status # ============================================ diff --git a/services/mana-stt/README.md b/services/mana-stt/README.md new file mode 100644 index 000000000..8ea9642f5 --- /dev/null +++ b/services/mana-stt/README.md @@ -0,0 +1,165 @@ +# ManaCore STT Service + +Speech-to-Text API service with **Whisper (Lightning MLX)** and **Voxtral Mini**. + +Optimized for Mac Mini M4 (Apple Silicon). + +## Features + +- **Whisper Large V3 Turbo** - Best quality, 99+ languages, German WER 6-9% +- **Voxtral Mini (3B)** - Mistral AI, Apache 2.0, 8 languages including German +- **Apple Silicon Optimized** - Uses MLX for 10x faster inference +- **REST API** - Simple HTTP endpoints for integration + +## Quick Start + +### Installation + +```bash +cd services/mana-stt +./setup.sh +``` + +### Run Locally + +```bash +source .venv/bin/activate +uvicorn app.main:app --host 0.0.0.0 --port 3020 +``` + +### Setup as System Service (Mac Mini) + +```bash +./scripts/mac-mini/setup-stt.sh +``` + +## API Endpoints + +| Endpoint | Method | Description | +|----------|--------|-------------| +| `/health` | GET | Health check | +| `/models` | GET | List available models | +| `/transcribe` | POST | Whisper transcription | +| `/transcribe/voxtral` | POST | Voxtral transcription | +| `/transcribe/auto` | POST | Auto-select best model | + +## Usage Examples + +### Transcribe with Whisper (Recommended) + +```bash +curl 
-X POST http://localhost:3020/transcribe \ + -F "file=@recording.mp3" \ + -F "language=de" +``` + +Response: +```json +{ + "text": "Das ist ein Beispieltext...", + "language": "de", + "model": "whisper-large-v3-turbo" +} +``` + +### Transcribe with Voxtral + +```bash +curl -X POST http://localhost:3020/transcribe/voxtral \ + -F "file=@recording.mp3" \ + -F "language=de" +``` + +### Auto-Select Model + +```bash +curl -X POST http://localhost:3020/transcribe/auto \ + -F "file=@recording.mp3" \ + -F "prefer=whisper" +``` + +## Configuration + +Environment variables: + +| Variable | Default | Description | +|----------|---------|-------------| +| `PORT` | `3020` | API server port | +| `WHISPER_MODEL` | `large-v3-turbo` | Default Whisper model | +| `PRELOAD_MODELS` | `false` | Load models on startup | +| `CORS_ORIGINS` | `https://mana.how,...` | Allowed CORS origins | + +## Supported Audio Formats + +- MP3, WAV, M4A, FLAC, OGG, WebM, MP4 +- Max file size: 100MB +- Any sample rate (automatically resampled to 16kHz) + +## Model Comparison + +| Model | German WER | Speed | VRAM | License | +|-------|------------|-------|------|---------| +| Whisper Large V3 Turbo | 6-9% | Fast | ~6 GB | MIT | +| Voxtral Mini (3B) | 8-12% | Medium | ~4 GB | Apache 2.0 | + +## Logs + +```bash +# Service logs +tail -f /tmp/manacore-stt.log + +# Error logs +tail -f /tmp/manacore-stt.error.log +``` + +## Troubleshooting + +### Model Download Slow + +First run downloads ~1.6 GB for Whisper and ~6 GB for Voxtral. Be patient. 
+ +### Out of Memory + +Reduce batch size or use smaller model: +```bash +export WHISPER_MODEL=medium +``` + +### MPS Not Available + +Ensure PyTorch is installed with MPS support: +```bash +pip install torch torchvision torchaudio +python -c "import torch; print(torch.backends.mps.is_available())" +``` + +## Integration + +### From Chat Backend (NestJS) + +```typescript +const formData = new FormData(); +formData.append('file', audioBuffer, 'recording.webm'); +formData.append('language', 'de'); + +const response = await fetch('http://localhost:3020/transcribe', { + method: 'POST', + body: formData, +}); + +const { text } = await response.json(); +``` + +### From SvelteKit Web + +```typescript +const formData = new FormData(); +formData.append('file', audioBlob, 'recording.webm'); + +const response = await fetch('https://stt-api.mana.how/transcribe', { + method: 'POST', + body: formData, +}); + +const { text } = await response.json(); +``` diff --git a/services/mana-stt/app/__init__.py b/services/mana-stt/app/__init__.py new file mode 100644 index 000000000..4edcfe762 --- /dev/null +++ b/services/mana-stt/app/__init__.py @@ -0,0 +1 @@ +# ManaCore STT Service diff --git a/services/mana-stt/app/main.py b/services/mana-stt/app/main.py new file mode 100644 index 000000000..a327a304e --- /dev/null +++ b/services/mana-stt/app/main.py @@ -0,0 +1,309 @@ +""" +ManaCore STT API Service +Speech-to-Text with Whisper (MLX) and Voxtral + +Run with: uvicorn app.main:app --host 0.0.0.0 --port 3020 +""" + +import os +import logging +from typing import Optional +from contextlib import asynccontextmanager + +from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Query +from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import JSONResponse +from pydantic import BaseModel + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", +) +logger = logging.getLogger(__name__) + +# 
Environment +PORT = int(os.getenv("PORT", "3020")) +DEFAULT_WHISPER_MODEL = os.getenv("WHISPER_MODEL", "large-v3-turbo") +PRELOAD_MODELS = os.getenv("PRELOAD_MODELS", "false").lower() == "true" +CORS_ORIGINS = os.getenv( + "CORS_ORIGINS", + "https://mana.how,https://chat.mana.how,http://localhost:5173" +).split(",") + + +# Response models +class TranscriptionResponse(BaseModel): + text: str + language: Optional[str] = None + model: str + duration_seconds: Optional[float] = None + + +class HealthResponse(BaseModel): + status: str + whisper_loaded: bool + voxtral_loaded: bool + models: dict + + +class ModelsResponse(BaseModel): + whisper: list + voxtral: list + default_whisper: str + + +# Track loaded models +models_status = { + "whisper_loaded": False, + "voxtral_loaded": False, +} + + +@asynccontextmanager +async def lifespan(app: FastAPI): + """Startup and shutdown events.""" + logger.info("Starting ManaCore STT Service...") + + # Optionally preload models on startup + if PRELOAD_MODELS: + logger.info("Preloading models (PRELOAD_MODELS=true)...") + try: + from app.whisper_service import get_whisper_model + get_whisper_model(DEFAULT_WHISPER_MODEL) + models_status["whisper_loaded"] = True + logger.info("Whisper model preloaded") + except Exception as e: + logger.warning(f"Failed to preload Whisper: {e}") + + try: + from app.voxtral_service import get_voxtral_model + get_voxtral_model() + models_status["voxtral_loaded"] = True + logger.info("Voxtral model preloaded") + except Exception as e: + logger.warning(f"Failed to preload Voxtral: {e}") + else: + logger.info("Models will be loaded on first request (lazy loading)") + + logger.info(f"STT Service ready on port {PORT}") + yield + logger.info("Shutting down STT Service...") + + +# Create FastAPI app +app = FastAPI( + title="ManaCore STT Service", + description="Speech-to-Text API with Whisper (MLX) and Voxtral", + version="1.0.0", + lifespan=lifespan, +) + +# CORS middleware +app.add_middleware( + CORSMiddleware, + 
allow_origins=CORS_ORIGINS, + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + + +@app.get("/health", response_model=HealthResponse) +async def health_check(): + """Health check endpoint.""" + return HealthResponse( + status="healthy", + whisper_loaded=models_status["whisper_loaded"], + voxtral_loaded=models_status["voxtral_loaded"], + models={ + "default_whisper": DEFAULT_WHISPER_MODEL, + }, + ) + + +@app.get("/models", response_model=ModelsResponse) +async def list_models(): + """List available models.""" + from app.whisper_service import AVAILABLE_MODELS as whisper_models + from app.voxtral_service import SUPPORTED_LANGUAGES as voxtral_languages + + return ModelsResponse( + whisper=whisper_models, + voxtral=voxtral_languages, + default_whisper=DEFAULT_WHISPER_MODEL, + ) + + +@app.post("/transcribe", response_model=TranscriptionResponse) +async def transcribe_whisper( + file: UploadFile = File(..., description="Audio file to transcribe"), + language: Optional[str] = Form( + None, + description="Language code (e.g., 'de', 'en'). Auto-detect if not provided." + ), + model: str = Form( + None, + description="Whisper model to use (default: large-v3-turbo)" + ), +): + """ + Transcribe audio using Whisper (Lightning MLX). + + Supported formats: mp3, wav, m4a, flac, ogg, webm + Max file size: 100MB + """ + if not file.filename: + raise HTTPException(status_code=400, detail="No file provided") + + # Validate file type + allowed_extensions = {".mp3", ".wav", ".m4a", ".flac", ".ogg", ".webm", ".mp4"} + ext = os.path.splitext(file.filename)[1].lower() + if ext not in allowed_extensions: + raise HTTPException( + status_code=400, + detail=f"Unsupported file type: {ext}. 
Allowed: {allowed_extensions}" ) + + try: + from app.whisper_service import transcribe_audio_bytes + + # Read file + audio_bytes = await file.read() + + # Check file size (100MB limit) + if len(audio_bytes) > 100 * 1024 * 1024: + raise HTTPException(status_code=400, detail="File too large (max 100MB)") + + # Use default model if not specified + model_name = model or DEFAULT_WHISPER_MODEL + + # Transcribe + result = await transcribe_audio_bytes( + audio_bytes=audio_bytes, + filename=file.filename, + language=language, + model_name=model_name, + ) + + models_status["whisper_loaded"] = True + + return TranscriptionResponse( + text=result.text, + language=result.language, + model=f"whisper-{model_name}", + ) + + except HTTPException: + # Propagate deliberate client errors (e.g. the 400 size-limit above) unchanged + raise + except Exception as e: + logger.error(f"Transcription error: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@app.post("/transcribe/voxtral", response_model=TranscriptionResponse) +async def transcribe_voxtral( + file: UploadFile = File(..., description="Audio file to transcribe"), + language: str = Form( + "de", + description="Language code (de, en, fr, es, pt, it, nl, hi)" + ), +): + """ + Transcribe audio using Voxtral Mini (Mistral AI). + + Best for: German, French, European languages + Supported formats: mp3, wav, m4a, flac + Max file size: 100MB + """ + if not file.filename: + raise HTTPException(status_code=400, detail="No file provided") + + # Validate language + from app.voxtral_service import SUPPORTED_LANGUAGES + if language not in SUPPORTED_LANGUAGES: + raise HTTPException( + status_code=400, + detail=f"Unsupported language: {language}. 
Supported: {SUPPORTED_LANGUAGES}" ) + + try: + from app.voxtral_service import transcribe_audio_bytes + + audio_bytes = await file.read() + + if len(audio_bytes) > 100 * 1024 * 1024: + raise HTTPException(status_code=400, detail="File too large (max 100MB)") + + result = await transcribe_audio_bytes( + audio_bytes=audio_bytes, + filename=file.filename, + language=language, + ) + + models_status["voxtral_loaded"] = True + + return TranscriptionResponse( + text=result.text, + language=result.language, + model=result.model, + ) + + except HTTPException: + # Propagate deliberate client errors (e.g. the 400 size-limit above) unchanged + raise + except Exception as e: + logger.error(f"Voxtral transcription error: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@app.post("/transcribe/auto", response_model=TranscriptionResponse) +async def transcribe_auto( + file: UploadFile = File(..., description="Audio file to transcribe"), + language: Optional[str] = Form( + None, + description="Language hint (optional)" + ), + prefer: str = Form( + "whisper", + description="Preferred model: 'whisper' or 'voxtral'" + ), +): + """ + Transcribe audio with automatic model selection. 
+ + - Uses Whisper by default (faster, more languages) + - Falls back to Voxtral if Whisper fails + """ + if prefer == "voxtral": + # Try Voxtral first + try: + return await transcribe_voxtral(file, language or "de") + except Exception as e: + logger.warning(f"Voxtral failed, trying Whisper: {e}") + # Reset file position + await file.seek(0) + return await transcribe_whisper(file, language, None) + else: + # Try Whisper first (default) + try: + return await transcribe_whisper(file, language, None) + except Exception as e: + logger.warning(f"Whisper failed, trying Voxtral: {e}") + await file.seek(0) + return await transcribe_voxtral(file, language or "de") + + +# Error handlers +@app.exception_handler(Exception) +async def global_exception_handler(request, exc): + logger.error(f"Unhandled error: {exc}") + return JSONResponse( + status_code=500, + content={"detail": "Internal server error", "error": str(exc)}, + ) + + +if __name__ == "__main__": + import uvicorn + uvicorn.run( + "app.main:app", + host="0.0.0.0", + port=PORT, + reload=False, + ) diff --git a/services/mana-stt/app/voxtral_service.py b/services/mana-stt/app/voxtral_service.py new file mode 100644 index 000000000..88cf8fd5d --- /dev/null +++ b/services/mana-stt/app/voxtral_service.py @@ -0,0 +1,198 @@ +""" +Voxtral STT Service using Hugging Face Transformers +Mistral AI's Speech-to-Text model (Apache 2.0 License) +""" + +import os +import tempfile +import logging +import base64 +from pathlib import Path +from typing import Optional +from dataclasses import dataclass + +logger = logging.getLogger(__name__) + +# Lazy load to avoid import errors +_voxtral_model = None +_voxtral_processor = None + + +@dataclass +class VoxtralTranscriptionResult: + text: str + language: Optional[str] = None + model: str = "voxtral-mini" + + +def get_voxtral_model(model_name: str = "mistralai/Voxtral-Mini-3B-2507"): + """ + Get or create Voxtral model instance. + + Note: Voxtral Mini (3B) is recommended for Mac Mini M4. 
+ Voxtral Small (24B) requires more VRAM. + """ + global _voxtral_model, _voxtral_processor + + if _voxtral_model is None: + logger.info(f"Loading Voxtral model: {model_name}") + try: + import torch + from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor + + # Determine device + if torch.backends.mps.is_available(): + device = "mps" + torch_dtype = torch.float16 + elif torch.cuda.is_available(): + device = "cuda" + torch_dtype = torch.float16 + else: + device = "cpu" + torch_dtype = torch.float32 + + logger.info(f"Using device: {device}") + + # Load processor + _voxtral_processor = AutoProcessor.from_pretrained( + model_name, + trust_remote_code=True, + ) + + # Load model + _voxtral_model = AutoModelForSpeechSeq2Seq.from_pretrained( + model_name, + torch_dtype=torch_dtype, + device_map="auto", + trust_remote_code=True, + ) + + logger.info(f"Voxtral model loaded successfully on {device}") + + except ImportError as e: + logger.error(f"Failed to import transformers: {e}") + raise RuntimeError( + "transformers not installed. " + "Run: pip install transformers torch" + ) + except Exception as e: + logger.error(f"Failed to load Voxtral model: {e}") + raise + + return _voxtral_model, _voxtral_processor + + +def transcribe_audio( + audio_path: str, + language: Optional[str] = "de", + model_name: str = "mistralai/Voxtral-Mini-3B-2507", +) -> VoxtralTranscriptionResult: + """ + Transcribe audio file using Voxtral. 
+ + Args: + audio_path: Path to audio file + language: Target language for transcription + model_name: Hugging Face model ID + + Returns: + VoxtralTranscriptionResult with transcribed text + """ + import torch + import soundfile as sf + + model, processor = get_voxtral_model(model_name) + + logger.info(f"Transcribing with Voxtral: {audio_path}") + + try: + # Load audio + audio_array, sample_rate = sf.read(audio_path) + + # Resample to 16kHz if needed + if sample_rate != 16000: + import numpy as np + from scipy import signal + + num_samples = int(len(audio_array) * 16000 / sample_rate) + audio_array = signal.resample(audio_array, num_samples) + sample_rate = 16000 + + # Process audio + inputs = processor( + audio_array, + sampling_rate=sample_rate, + return_tensors="pt", + ) + + # Move to same device as model + device = next(model.parameters()).device + inputs = {k: v.to(device) for k, v in inputs.items()} + + # Generate transcription + with torch.no_grad(): + generated_ids = model.generate( + **inputs, + max_new_tokens=448, + language=language, + ) + + # Decode + text = processor.batch_decode( + generated_ids, + skip_special_tokens=True, + )[0] + + logger.info(f"Voxtral transcription complete: {len(text)} characters") + + return VoxtralTranscriptionResult( + text=text.strip(), + language=language, + model="voxtral-mini", + ) + + except Exception as e: + logger.error(f"Voxtral transcription failed: {e}") + raise + + +async def transcribe_audio_bytes( + audio_bytes: bytes, + filename: str, + language: Optional[str] = "de", + model_name: str = "mistralai/Voxtral-Mini-3B-2507", +) -> VoxtralTranscriptionResult: + """ + Transcribe audio from bytes (for API uploads). 
+ """ + ext = Path(filename).suffix or ".wav" + + with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp: + tmp.write(audio_bytes) + tmp_path = tmp.name + + try: + result = transcribe_audio( + audio_path=tmp_path, + language=language, + model_name=model_name, + ) + return result + finally: + try: + os.unlink(tmp_path) + except Exception: + pass + + +# Supported languages by Voxtral +SUPPORTED_LANGUAGES = [ + "en", # English + "de", # German + "fr", # French + "es", # Spanish + "pt", # Portuguese + "it", # Italian + "nl", # Dutch + "hi", # Hindi +] diff --git a/services/mana-stt/app/whisper_service.py b/services/mana-stt/app/whisper_service.py new file mode 100644 index 000000000..a41556c01 --- /dev/null +++ b/services/mana-stt/app/whisper_service.py @@ -0,0 +1,163 @@ +""" +Whisper STT Service using Lightning Whisper MLX +Optimized for Apple Silicon (M1/M2/M3/M4) +""" + +import os +import tempfile +import logging +from pathlib import Path +from typing import Optional +from dataclasses import dataclass + +logger = logging.getLogger(__name__) + +# Lazy load to avoid import errors if not installed +_whisper_model = None + + +@dataclass +class TranscriptionResult: + text: str + language: Optional[str] = None + duration: Optional[float] = None + segments: Optional[list] = None + + +def get_whisper_model(model_name: str = "large-v3-turbo", batch_size: int = 12): + """Get or create Whisper model instance (singleton pattern).""" + global _whisper_model + + if _whisper_model is None: + logger.info(f"Loading Whisper model: {model_name}") + try: + from lightning_whisper_mlx import LightningWhisperMLX + _whisper_model = LightningWhisperMLX( + model=model_name, + batch_size=batch_size, + quant=None # Use full precision for best quality + ) + logger.info(f"Whisper model loaded successfully: {model_name}") + except ImportError as e: + logger.error(f"Failed to import lightning_whisper_mlx: {e}") + raise RuntimeError( + "lightning-whisper-mlx not installed. 
" + "Run: pip install lightning-whisper-mlx" + ) + except Exception as e: + logger.error(f"Failed to load Whisper model: {e}") + raise + + return _whisper_model + + +def transcribe_audio( + audio_path: str, + language: Optional[str] = None, + model_name: str = "large-v3-turbo", +) -> TranscriptionResult: + """ + Transcribe audio file using Lightning Whisper MLX. + + Args: + audio_path: Path to audio file (mp3, wav, m4a, etc.) + language: Optional language code (e.g., 'de', 'en'). Auto-detect if None. + model_name: Whisper model to use + + Returns: + TranscriptionResult with text and metadata + """ + model = get_whisper_model(model_name) + + logger.info(f"Transcribing: {audio_path}") + + try: + # Lightning Whisper MLX returns dict with 'text' key + result = model.transcribe( + audio_path=audio_path, + language=language, + ) + + # Handle different return formats + if isinstance(result, dict): + text = result.get("text", "") + segments = result.get("segments", []) + detected_language = result.get("language", language) + else: + text = str(result) + segments = [] + detected_language = language + + logger.info(f"Transcription complete: {len(text)} characters") + + return TranscriptionResult( + text=text.strip(), + language=detected_language, + segments=segments, + ) + + except Exception as e: + logger.error(f"Transcription failed: {e}") + raise + + +async def transcribe_audio_bytes( + audio_bytes: bytes, + filename: str, + language: Optional[str] = None, + model_name: str = "large-v3-turbo", +) -> TranscriptionResult: + """ + Transcribe audio from bytes (for API uploads). 
+ + Args: + audio_bytes: Raw audio file bytes + filename: Original filename (for extension detection) + language: Optional language code + model_name: Whisper model to use + + Returns: + TranscriptionResult + """ + # Get file extension + ext = Path(filename).suffix or ".wav" + + # Write to temp file + with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp: + tmp.write(audio_bytes) + tmp_path = tmp.name + + try: + result = transcribe_audio( + audio_path=tmp_path, + language=language, + model_name=model_name, + ) + return result + finally: + # Clean up temp file + try: + os.unlink(tmp_path) + except Exception: + pass + + +# Available models for reference +AVAILABLE_MODELS = [ + "tiny", + "tiny.en", + "base", + "base.en", + "small", + "small.en", + "medium", + "medium.en", + "large", + "large-v2", + "large-v3", + "large-v3-turbo", # Recommended for Mac Mini + "distil-small.en", + "distil-medium.en", + "distil-large-v2", + "distil-large-v3", +] diff --git a/services/mana-stt/requirements.txt b/services/mana-stt/requirements.txt new file mode 100644 index 000000000..47da624dd --- /dev/null +++ b/services/mana-stt/requirements.txt @@ -0,0 +1,25 @@ +# ManaCore STT Service Dependencies +# For Mac Mini M4 (Apple Silicon) + +# Web Framework +fastapi==0.115.6 +uvicorn[standard]==0.34.0 +python-multipart==0.0.20 + +# Audio Processing +pydub==0.25.1 +soundfile==0.13.1 + +# Whisper (Apple Silicon optimized) +lightning-whisper-mlx==0.0.10 +mlx>=0.21.0 + +# Voxtral (Hugging Face Transformers) +transformers>=4.47.0 +torch>=2.5.0 +accelerate>=1.2.0 +sentencepiece>=0.2.0 + +# Utilities +numpy>=1.26.0 +tqdm>=4.67.0 diff --git a/services/mana-stt/setup.sh b/services/mana-stt/setup.sh new file mode 100755 index 000000000..1df05dd9e --- /dev/null +++ b/services/mana-stt/setup.sh @@ -0,0 +1,123 @@ +#!/bin/bash +# ManaCore STT Service Setup Script +# For Mac Mini M4 (Apple Silicon) + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 
+VENV_DIR="$SCRIPT_DIR/.venv" +PYTHON_VERSION="3.11" + +echo "==============================================" +echo " ManaCore STT Service Setup" +echo " Whisper (Lightning MLX) + Voxtral" +echo "==============================================" +echo "" + +# Check if running on macOS +if [[ "$(uname)" != "Darwin" ]]; then + echo "Warning: This script is optimized for macOS (Apple Silicon)" +fi + +# Check for Apple Silicon +if [[ "$(uname -m)" != "arm64" ]]; then + echo "Warning: Not running on Apple Silicon. MLX optimizations won't work." +fi + +# Check Python version +echo "1. Checking Python installation..." +if command -v python3.11 &> /dev/null; then + PYTHON_CMD="python3.11" +elif command -v python3 &> /dev/null; then + PYTHON_CMD="python3" + PY_VERSION=$($PYTHON_CMD --version 2>&1 | cut -d' ' -f2 | cut -d'.' -f1,2) + echo " Found Python $PY_VERSION" +else + echo "Error: Python 3 not found. Please install Python 3.11+" + echo " brew install python@3.11" + exit 1 +fi + +# Create virtual environment +echo "" +echo "2. Creating virtual environment..." +if [ -d "$VENV_DIR" ]; then + echo " Virtual environment already exists at $VENV_DIR" + read -p " Recreate? (y/N) " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + rm -rf "$VENV_DIR" + $PYTHON_CMD -m venv "$VENV_DIR" + echo " Virtual environment recreated" + fi +else + $PYTHON_CMD -m venv "$VENV_DIR" + echo " Virtual environment created at $VENV_DIR" +fi + +# Activate virtual environment +source "$VENV_DIR/bin/activate" + +# Upgrade pip +echo "" +echo "3. Upgrading pip..." +pip install --upgrade pip wheel setuptools + +# Install dependencies +echo "" +echo "4. Installing dependencies..." +echo " This may take several minutes (downloading large models)..." 
+ +# Install PyTorch with MPS support first +pip install torch torchvision torchaudio + +# Install MLX for Apple Silicon +pip install mlx + +# Install other dependencies +pip install -r "$SCRIPT_DIR/requirements.txt" + +# Install scipy for audio resampling (needed by Voxtral) +pip install scipy + +echo "" +echo "5. Verifying installation..." + +# Test imports +python -c "import torch; print(f' PyTorch {torch.__version__} - MPS available: {torch.backends.mps.is_available()}')" +python -c "import mlx; print(f' MLX installed')" 2>/dev/null || echo " MLX not available (CPU fallback)" +python -c "import fastapi; print(f' FastAPI {fastapi.__version__}')" + +echo "" +echo "6. Downloading Whisper model (large-v3-turbo)..." +echo " This will download ~1.6 GB on first run..." +# Pre-download the model +python -c " +from lightning_whisper_mlx import LightningWhisperMLX +print(' Initializing Whisper model...') +whisper = LightningWhisperMLX(model='large-v3-turbo', batch_size=12) +print(' Whisper model ready!') +" || echo " Note: Model will be downloaded on first transcription request" + +echo "" +echo "==============================================" +echo " Setup Complete!" +echo "==============================================" +echo "" +echo "To start the STT service:" +echo "" +echo " cd $SCRIPT_DIR" +echo " source .venv/bin/activate" +echo " uvicorn app.main:app --host 0.0.0.0 --port 3020" +echo "" +echo "Or use the systemd/launchd service (recommended for production):" +echo "" +echo " ./scripts/mac-mini/setup-stt.sh" +echo "" +echo "API Endpoints:" +echo " POST /transcribe - Whisper transcription" +echo " POST /transcribe/voxtral - Voxtral transcription" +echo " POST /transcribe/auto - Auto-select best model" +echo " GET /health - Health check" +echo " GET /models - List available models" +echo ""