feat(stt): add speech-to-text service for Mac Mini

Add mana-stt service with Whisper and Voxtral support for local
transcription. Includes setup script and launchd integration for
automatic startup on Mac Mini server.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Till-JS 2026-01-27 01:33:10 +01:00
parent aeabd21d4a
commit bf0fa04e7e
10 changed files with 1206 additions and 0 deletions

View file

@ -23,6 +23,7 @@ cd ~/projects/manacore-monorepo
| Script | Purpose |
|--------|---------|
| `setup-autostart.sh` | Configure automatic startup on boot (run once) |
| `setup-stt.sh` | Setup STT service (Whisper + Voxtral) |
| `startup.sh` | Main startup script (called by launchd) |
| `health-check.sh` | Check all services health |
| `status.sh` | Show full system status |
@ -143,6 +144,7 @@ Three services are configured to run automatically:
| Cloudflared | `com.cloudflare.cloudflared` | Tunnel to Cloudflare |
| Docker Startup | `com.manacore.docker-startup` | Start containers on boot |
| Health Check | `com.manacore.health-check` | Check every 5 minutes |
| STT Service | `com.manacore.stt` | Speech-to-Text (Whisper + Voxtral) |
### Manual Service Control
@ -238,4 +240,49 @@ Once running, services are available at:
| Calendar API | https://calendar-api.mana.how |
| Clock | https://clock.mana.how |
| Clock API | https://clock-api.mana.how |
| STT API | http://localhost:3020 (internal only) |
| SSH | ssh mac-mini (via cloudflared) |
## Native Services (non-Docker)
### Ollama (LLM)
Ollama runs natively on Mac Mini for LLM inference:
```bash
# Check status
curl http://localhost:11434/api/tags
# List models
ollama list
# Pull a model
ollama pull gemma3:4b
```
### STT Service (Speech-to-Text)
The STT service provides Whisper and Voxtral transcription:
```bash
# Setup (first time)
./scripts/mac-mini/setup-stt.sh
# Check status
curl http://localhost:3020/health
# Transcribe audio
curl -X POST http://localhost:3020/transcribe \
-F "file=@audio.mp3" \
-F "language=de"
# View logs
tail -f /tmp/manacore-stt.log
```
**Available endpoints:**
- `POST /transcribe` - Whisper transcription (recommended)
- `POST /transcribe/voxtral` - Voxtral transcription
- `POST /transcribe/auto` - Auto-select model
- `GET /health` - Health check
- `GET /models` - List available models

153
scripts/mac-mini/setup-stt.sh Executable file
View file

@ -0,0 +1,153 @@
#!/bin/bash
# Setup STT Service on Mac Mini
# Creates a launchd LaunchAgent so the STT API starts automatically on login.
#
# Fix vs. previous revision: ~/Library/LaunchAgents is created before the
# plist is written (the directory does not exist on a fresh user account,
# which made the redirection fail under `set -e`).
set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
STT_DIR="$REPO_DIR/services/mana-stt"
PLIST_NAME="com.manacore.stt"
PLIST_PATH="$HOME/Library/LaunchAgents/$PLIST_NAME.plist"

echo "=============================================="
echo " ManaCore STT Service Setup (Mac Mini)"
echo "=============================================="
echo ""

# The service sources must exist before anything can be installed.
if [ ! -d "$STT_DIR" ]; then
  echo "Error: STT service directory not found at $STT_DIR"
  exit 1
fi

# Run the main setup script first (creates .venv and installs dependencies).
echo "1. Running STT service setup..."
cd "$STT_DIR"
if [ ! -d ".venv" ]; then
  echo "   Installing dependencies..."
  ./setup.sh
else
  echo "   Virtual environment already exists"
  echo "   Skipping dependency installation"
fi

# Create the launchd plist.
echo ""
echo "2. Creating launchd service..."
# ~/Library/LaunchAgents may not exist yet on a fresh account.
mkdir -p "$(dirname "$PLIST_PATH")"
cat > "$PLIST_PATH" << EOF
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
    <key>Label</key>
    <string>$PLIST_NAME</string>
    <key>ProgramArguments</key>
    <array>
        <string>$STT_DIR/.venv/bin/uvicorn</string>
        <string>app.main:app</string>
        <string>--host</string>
        <string>0.0.0.0</string>
        <string>--port</string>
        <string>3020</string>
    </array>
    <key>WorkingDirectory</key>
    <string>$STT_DIR</string>
    <key>EnvironmentVariables</key>
    <dict>
        <key>PATH</key>
        <string>$STT_DIR/.venv/bin:/usr/local/bin:/usr/bin:/bin</string>
        <key>PORT</key>
        <string>3020</string>
        <key>WHISPER_MODEL</key>
        <string>large-v3-turbo</string>
        <key>PRELOAD_MODELS</key>
        <string>false</string>
        <key>CORS_ORIGINS</key>
        <string>https://mana.how,https://chat.mana.how,https://todo.mana.how</string>
    </dict>
    <key>RunAtLoad</key>
    <true/>
    <key>KeepAlive</key>
    <dict>
        <key>SuccessfulExit</key>
        <false/>
        <key>Crashed</key>
        <true/>
    </dict>
    <key>ThrottleInterval</key>
    <integer>10</integer>
    <key>StandardOutPath</key>
    <string>/tmp/manacore-stt.log</string>
    <key>StandardErrorPath</key>
    <string>/tmp/manacore-stt.error.log</string>
</dict>
</plist>
EOF
echo "   Created: $PLIST_PATH"

# (Re)load the agent: unload first in case it is already registered.
echo ""
echo "3. Loading launchd service..."
launchctl unload "$PLIST_PATH" 2>/dev/null || true
launchctl load "$PLIST_PATH"

# Give uvicorn a moment to bind the port.
sleep 2

echo ""
echo "4. Checking service status..."
# -F: the label is a literal string, not a pattern.
if launchctl list | grep -qF "$PLIST_NAME"; then
  echo "   Service is running"
  # The API needs a few seconds to import before /health responds.
  sleep 3
  if curl -s http://localhost:3020/health > /dev/null 2>&1; then
    echo "   Health check passed"
    HEALTH=$(curl -s http://localhost:3020/health)
    echo "   $HEALTH"
  else
    echo "   Warning: Health check failed (service may still be starting)"
    echo "   Check logs: tail -f /tmp/manacore-stt.log"
  fi
else
  echo "   Warning: Service may not be running"
  echo "   Check logs: tail -f /tmp/manacore-stt.error.log"
fi

echo ""
echo "=============================================="
echo " STT Service Setup Complete!"
echo "=============================================="
echo ""
echo "Service URL: http://localhost:3020"
echo ""
echo "Useful commands:"
echo "  # View logs"
echo "  tail -f /tmp/manacore-stt.log"
echo ""
echo "  # Restart service"
echo "  launchctl kickstart -k gui/\$(id -u)/$PLIST_NAME"
echo ""
echo "  # Stop service"
echo "  launchctl unload $PLIST_PATH"
echo ""
echo "  # Start service"
echo "  launchctl load $PLIST_PATH"
echo ""
echo "  # Test transcription"
echo "  curl -X POST http://localhost:3020/transcribe \\"
echo "    -F 'file=@audio.mp3' \\"
echo "    -F 'language=de'"
echo ""

View file

@ -46,6 +46,7 @@ check_launchd() {
check_launchd "com.cloudflare.cloudflared" "Cloudflared Tunnel"
check_launchd "com.manacore.docker-startup" "Docker Startup"
check_launchd "com.manacore.health-check" "Health Check (5min)"
check_launchd "com.manacore.stt" "STT Service (Whisper/Voxtral)"
# ============================================
# Docker Status
@ -83,6 +84,27 @@ if docker info >/dev/null 2>&1; then
done
fi
# ============================================
# Native Services (non-Docker)
# ============================================
echo ""
echo -e "${BOLD}Native Services:${NC}"
# Ollama
if curl -s --max-time 2 http://localhost:11434/api/tags >/dev/null 2>&1; then
OLLAMA_MODELS=$(curl -s http://localhost:11434/api/tags | grep -o '"name":"[^"]*"' | wc -l | tr -d ' ')
echo -e " ${GREEN}[Running]${NC} Ollama (${OLLAMA_MODELS} models)"
else
echo -e " ${YELLOW}[Stopped]${NC} Ollama"
fi
# STT Service
if curl -s --max-time 2 http://localhost:3020/health >/dev/null 2>&1; then
echo -e " ${GREEN}[Running]${NC} STT Service (port 3020)"
else
echo -e " ${YELLOW}[Stopped]${NC} STT Service"
fi
# ============================================
# Network/Tunnel Status
# ============================================

165
services/mana-stt/README.md Normal file
View file

@ -0,0 +1,165 @@
# ManaCore STT Service
Speech-to-Text API service with **Whisper (Lightning MLX)** and **Voxtral Mini**.
Optimized for Mac Mini M4 (Apple Silicon).
## Features
- **Whisper Large V3 Turbo** - Best quality, 99+ languages, German WER 6-9%
- **Voxtral Mini (3B)** - Mistral AI, Apache 2.0, 8 languages including German
- **Apple Silicon Optimized** - Uses MLX for 10x faster inference
- **REST API** - Simple HTTP endpoints for integration
## Quick Start
### Installation
```bash
cd services/mana-stt
./setup.sh
```
### Run Locally
```bash
source .venv/bin/activate
uvicorn app.main:app --host 0.0.0.0 --port 3020
```
### Setup as System Service (Mac Mini)
```bash
./scripts/mac-mini/setup-stt.sh
```
## API Endpoints
| Endpoint | Method | Description |
|----------|--------|-------------|
| `/health` | GET | Health check |
| `/models` | GET | List available models |
| `/transcribe` | POST | Whisper transcription |
| `/transcribe/voxtral` | POST | Voxtral transcription |
| `/transcribe/auto` | POST | Auto-select best model |
## Usage Examples
### Transcribe with Whisper (Recommended)
```bash
curl -X POST http://localhost:3020/transcribe \
-F "file=@recording.mp3" \
-F "language=de"
```
Response:
```json
{
"text": "Das ist ein Beispieltext...",
"language": "de",
"model": "whisper-large-v3-turbo"
}
```
### Transcribe with Voxtral
```bash
curl -X POST http://localhost:3020/transcribe/voxtral \
-F "file=@recording.mp3" \
-F "language=de"
```
### Auto-Select Model
```bash
curl -X POST http://localhost:3020/transcribe/auto \
-F "file=@recording.mp3" \
-F "prefer=whisper"
```
## Configuration
Environment variables:
| Variable | Default | Description |
|----------|---------|-------------|
| `PORT` | `3020` | API server port |
| `WHISPER_MODEL` | `large-v3-turbo` | Default Whisper model |
| `PRELOAD_MODELS` | `false` | Load models on startup |
| `CORS_ORIGINS` | `https://mana.how,...` | Allowed CORS origins |
## Supported Audio Formats
- MP3, WAV, M4A, FLAC, OGG, WebM, MP4
- Max file size: 100MB
- Any sample rate (automatically resampled to 16kHz)
## Model Comparison
| Model | German WER | Speed | VRAM | License |
|-------|------------|-------|------|---------|
| Whisper Large V3 Turbo | 6-9% | Fast | ~6 GB | MIT |
| Voxtral Mini (3B) | 8-12% | Medium | ~4 GB | Apache 2.0 |
## Logs
```bash
# Service logs
tail -f /tmp/manacore-stt.log
# Error logs
tail -f /tmp/manacore-stt.error.log
```
## Troubleshooting
### Model Download Slow
First run downloads ~1.6 GB for Whisper and ~6 GB for Voxtral. Be patient.
### Out of Memory
Reduce batch size or use smaller model:
```bash
export WHISPER_MODEL=medium
```
### MPS Not Available
Ensure PyTorch is installed with MPS support:
```bash
pip install torch torchvision torchaudio
python -c "import torch; print(torch.backends.mps.is_available())"
```
## Integration
### From Chat Backend (NestJS)
```typescript
const formData = new FormData();
formData.append('file', audioBuffer, 'recording.webm');
formData.append('language', 'de');
const response = await fetch('http://localhost:3020/transcribe', {
method: 'POST',
body: formData,
});
const { text } = await response.json();
```
### From SvelteKit Web

> Note: this example assumes the STT service has been exposed as `stt-api.mana.how` via the Cloudflare tunnel. By default it listens internal-only on `localhost:3020` (see the services table).

```typescript
const formData = new FormData();
formData.append('file', audioBlob, 'recording.webm');
const response = await fetch('https://stt-api.mana.how/transcribe', {
method: 'POST',
body: formData,
});
const { text } = await response.json();
```

View file

@ -0,0 +1 @@
# ManaCore STT Service

View file

@ -0,0 +1,309 @@
"""
ManaCore STT API Service
Speech-to-Text with Whisper (MLX) and Voxtral
Run with: uvicorn app.main:app --host 0.0.0.0 --port 3020
"""
import os
import logging
from typing import Optional
from contextlib import asynccontextmanager
from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Query
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from pydantic import BaseModel
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
# Environment
PORT = int(os.getenv("PORT", "3020"))
DEFAULT_WHISPER_MODEL = os.getenv("WHISPER_MODEL", "large-v3-turbo")
PRELOAD_MODELS = os.getenv("PRELOAD_MODELS", "false").lower() == "true"
CORS_ORIGINS = os.getenv(
"CORS_ORIGINS",
"https://mana.how,https://chat.mana.how,http://localhost:5173"
).split(",")
# Response models
class TranscriptionResponse(BaseModel):
text: str
language: Optional[str] = None
model: str
duration_seconds: Optional[float] = None
class HealthResponse(BaseModel):
status: str
whisper_loaded: bool
voxtral_loaded: bool
models: dict
class ModelsResponse(BaseModel):
whisper: list
voxtral: list
default_whisper: str
# Track loaded models
models_status = {
"whisper_loaded": False,
"voxtral_loaded": False,
}
@asynccontextmanager
async def lifespan(app: FastAPI):
"""Startup and shutdown events."""
logger.info("Starting ManaCore STT Service...")
# Optionally preload models on startup
if PRELOAD_MODELS:
logger.info("Preloading models (PRELOAD_MODELS=true)...")
try:
from app.whisper_service import get_whisper_model
get_whisper_model(DEFAULT_WHISPER_MODEL)
models_status["whisper_loaded"] = True
logger.info("Whisper model preloaded")
except Exception as e:
logger.warning(f"Failed to preload Whisper: {e}")
try:
from app.voxtral_service import get_voxtral_model
get_voxtral_model()
models_status["voxtral_loaded"] = True
logger.info("Voxtral model preloaded")
except Exception as e:
logger.warning(f"Failed to preload Voxtral: {e}")
else:
logger.info("Models will be loaded on first request (lazy loading)")
logger.info(f"STT Service ready on port {PORT}")
yield
logger.info("Shutting down STT Service...")
# Create FastAPI app
app = FastAPI(
title="ManaCore STT Service",
description="Speech-to-Text API with Whisper (MLX) and Voxtral",
version="1.0.0",
lifespan=lifespan,
)
# CORS middleware
app.add_middleware(
CORSMiddleware,
allow_origins=CORS_ORIGINS,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
@app.get("/health", response_model=HealthResponse)
async def health_check():
"""Health check endpoint."""
return HealthResponse(
status="healthy",
whisper_loaded=models_status["whisper_loaded"],
voxtral_loaded=models_status["voxtral_loaded"],
models={
"default_whisper": DEFAULT_WHISPER_MODEL,
},
)
@app.get("/models", response_model=ModelsResponse)
async def list_models():
"""List available models."""
from app.whisper_service import AVAILABLE_MODELS as whisper_models
from app.voxtral_service import SUPPORTED_LANGUAGES as voxtral_languages
return ModelsResponse(
whisper=whisper_models,
voxtral=voxtral_languages,
default_whisper=DEFAULT_WHISPER_MODEL,
)
@app.post("/transcribe", response_model=TranscriptionResponse)
async def transcribe_whisper(
file: UploadFile = File(..., description="Audio file to transcribe"),
language: Optional[str] = Form(
None,
description="Language code (e.g., 'de', 'en'). Auto-detect if not provided."
),
model: str = Form(
None,
description="Whisper model to use (default: large-v3-turbo)"
),
):
"""
Transcribe audio using Whisper (Lightning MLX).
Supported formats: mp3, wav, m4a, flac, ogg, webm
Max file size: 100MB
"""
if not file.filename:
raise HTTPException(status_code=400, detail="No file provided")
# Validate file type
allowed_extensions = {".mp3", ".wav", ".m4a", ".flac", ".ogg", ".webm", ".mp4"}
ext = os.path.splitext(file.filename)[1].lower()
if ext not in allowed_extensions:
raise HTTPException(
status_code=400,
detail=f"Unsupported file type: {ext}. Allowed: {allowed_extensions}"
)
try:
from app.whisper_service import transcribe_audio_bytes
# Read file
audio_bytes = await file.read()
# Check file size (100MB limit)
if len(audio_bytes) > 100 * 1024 * 1024:
raise HTTPException(status_code=400, detail="File too large (max 100MB)")
# Use default model if not specified
model_name = model or DEFAULT_WHISPER_MODEL
# Transcribe
result = await transcribe_audio_bytes(
audio_bytes=audio_bytes,
filename=file.filename,
language=language,
model_name=model_name,
)
models_status["whisper_loaded"] = True
return TranscriptionResponse(
text=result.text,
language=result.language,
model=f"whisper-{model_name}",
)
except Exception as e:
logger.error(f"Transcription error: {e}")
raise HTTPException(status_code=500, detail=str(e))
@app.post("/transcribe/voxtral", response_model=TranscriptionResponse)
async def transcribe_voxtral(
file: UploadFile = File(..., description="Audio file to transcribe"),
language: str = Form(
"de",
description="Language code (de, en, fr, es, pt, it, nl, hi)"
),
):
"""
Transcribe audio using Voxtral Mini (Mistral AI).
Best for: German, French, European languages
Supported formats: mp3, wav, m4a, flac
Max file size: 100MB
"""
if not file.filename:
raise HTTPException(status_code=400, detail="No file provided")
# Validate language
from app.voxtral_service import SUPPORTED_LANGUAGES
if language not in SUPPORTED_LANGUAGES:
raise HTTPException(
status_code=400,
detail=f"Unsupported language: {language}. Supported: {SUPPORTED_LANGUAGES}"
)
try:
from app.voxtral_service import transcribe_audio_bytes
audio_bytes = await file.read()
if len(audio_bytes) > 100 * 1024 * 1024:
raise HTTPException(status_code=400, detail="File too large (max 100MB)")
result = await transcribe_audio_bytes(
audio_bytes=audio_bytes,
filename=file.filename,
language=language,
)
models_status["voxtral_loaded"] = True
return TranscriptionResponse(
text=result.text,
language=result.language,
model=result.model,
)
except Exception as e:
logger.error(f"Voxtral transcription error: {e}")
raise HTTPException(status_code=500, detail=str(e))
@app.post("/transcribe/auto", response_model=TranscriptionResponse)
async def transcribe_auto(
file: UploadFile = File(..., description="Audio file to transcribe"),
language: Optional[str] = Form(
None,
description="Language hint (optional)"
),
prefer: str = Form(
"whisper",
description="Preferred model: 'whisper' or 'voxtral'"
),
):
"""
Transcribe audio with automatic model selection.
- Uses Whisper by default (faster, more languages)
- Falls back to Voxtral if Whisper fails
"""
if prefer == "voxtral":
# Try Voxtral first
try:
return await transcribe_voxtral(file, language or "de")
except Exception as e:
logger.warning(f"Voxtral failed, trying Whisper: {e}")
# Reset file position
await file.seek(0)
return await transcribe_whisper(file, language, None)
else:
# Try Whisper first (default)
try:
return await transcribe_whisper(file, language, None)
except Exception as e:
logger.warning(f"Whisper failed, trying Voxtral: {e}")
await file.seek(0)
return await transcribe_voxtral(file, language or "de")
# Error handlers
@app.exception_handler(Exception)
async def global_exception_handler(request, exc):
    """Last-resort handler: log the failure and return a generic 500 body."""
    logger.error(f"Unhandled error: {exc}")
    return JSONResponse(
        status_code=500,
        content={"detail": "Internal server error", "error": str(exc)},
    )


if __name__ == "__main__":
    # Allow `python -m app.main` for local development.
    import uvicorn

    uvicorn.run("app.main:app", host="0.0.0.0", port=PORT, reload=False)

View file

@ -0,0 +1,198 @@
"""
Voxtral STT Service using Hugging Face Transformers
Mistral AI's Speech-to-Text model (Apache 2.0 License)
"""
import os
import tempfile
import logging
import base64
from pathlib import Path
from typing import Optional
from dataclasses import dataclass
logger = logging.getLogger(__name__)
# Lazy load to avoid import errors
_voxtral_model = None
_voxtral_processor = None
@dataclass
class VoxtralTranscriptionResult:
text: str
language: Optional[str] = None
model: str = "voxtral-mini"
def get_voxtral_model(model_name: str = "mistralai/Voxtral-Mini-3B-2507"):
    """
    Lazily load and cache the Voxtral model + processor (singleton).

    Note: Voxtral Mini (3B) is recommended for Mac Mini M4;
    Voxtral Small (24B) requires more VRAM.

    Returns:
        (model, processor) tuple.
    Raises:
        RuntimeError: when transformers/torch are not installed.
    """
    global _voxtral_model, _voxtral_processor
    if _voxtral_model is not None:
        return _voxtral_model, _voxtral_processor

    logger.info(f"Loading Voxtral model: {model_name}")
    try:
        import torch
        from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor

        # Pick the best available accelerator; fp16 off CPU, fp32 on CPU.
        if torch.backends.mps.is_available():
            device, torch_dtype = "mps", torch.float16
        elif torch.cuda.is_available():
            device, torch_dtype = "cuda", torch.float16
        else:
            device, torch_dtype = "cpu", torch.float32
        logger.info(f"Using device: {device}")

        _voxtral_processor = AutoProcessor.from_pretrained(
            model_name,
            trust_remote_code=True,
        )
        _voxtral_model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_name,
            torch_dtype=torch_dtype,
            device_map="auto",
            trust_remote_code=True,
        )
        logger.info(f"Voxtral model loaded successfully on {device}")
    except ImportError as e:
        logger.error(f"Failed to import transformers: {e}")
        raise RuntimeError(
            "transformers not installed. "
            "Run: pip install transformers torch"
        )
    except Exception as e:
        logger.error(f"Failed to load Voxtral model: {e}")
        raise

    return _voxtral_model, _voxtral_processor
def transcribe_audio(
    audio_path: str,
    language: Optional[str] = "de",
    model_name: str = "mistralai/Voxtral-Mini-3B-2507",
) -> VoxtralTranscriptionResult:
    """
    Transcribe an audio file using Voxtral.

    Args:
        audio_path: Path to audio file
        language: Target language for transcription
        model_name: Hugging Face model ID

    Returns:
        VoxtralTranscriptionResult with transcribed text
    """
    import torch
    import soundfile as sf

    model, processor = get_voxtral_model(model_name)
    logger.info(f"Transcribing with Voxtral: {audio_path}")
    try:
        # Load audio as a float array plus its native sample rate.
        audio_array, sample_rate = sf.read(audio_path)

        # Fix: soundfile returns a (frames, channels) 2-D array for stereo
        # files; downmix to mono since the processor expects a 1-D waveform.
        if audio_array.ndim > 1:
            audio_array = audio_array.mean(axis=1)

        # Resample to 16 kHz if needed.
        if sample_rate != 16000:
            from scipy import signal  # removed unused `import numpy as np`
            num_samples = int(len(audio_array) * 16000 / sample_rate)
            audio_array = signal.resample(audio_array, num_samples)
            sample_rate = 16000

        # Tokenize/feature-extract the waveform.
        inputs = processor(
            audio_array,
            sampling_rate=sample_rate,
            return_tensors="pt",
        )
        # Move inputs to the same device the model landed on (mps/cuda/cpu).
        device = next(model.parameters()).device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Generate transcription.
        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=448,
                language=language,
            )

        text = processor.batch_decode(
            generated_ids,
            skip_special_tokens=True,
        )[0]
        logger.info(f"Voxtral transcription complete: {len(text)} characters")
        return VoxtralTranscriptionResult(
            text=text.strip(),
            language=language,
            model="voxtral-mini",
        )
    except Exception as e:
        logger.error(f"Voxtral transcription failed: {e}")
        raise
async def transcribe_audio_bytes(
    audio_bytes: bytes,
    filename: str,
    language: Optional[str] = "de",
    model_name: str = "mistralai/Voxtral-Mini-3B-2507",
) -> VoxtralTranscriptionResult:
    """
    Transcribe audio from bytes (for API uploads).

    Writes the payload to a temp file (keeping the original extension so the
    decoder can sniff the format), transcribes it, then removes the file.
    """
    # Fall back to .wav when the upload name carries no extension.
    suffix = Path(filename).suffix or ".wav"
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as handle:
        handle.write(audio_bytes)
        temp_path = handle.name
    try:
        return transcribe_audio(
            audio_path=temp_path,
            language=language,
            model_name=model_name,
        )
    finally:
        # Best-effort cleanup; a leftover temp file is not worth failing over.
        try:
            os.unlink(temp_path)
        except Exception:
            pass
# Languages Voxtral Mini can transcribe (ISO 639-1 codes):
# English, German, French, Spanish, Portuguese, Italian, Dutch, Hindi.
SUPPORTED_LANGUAGES = ["en", "de", "fr", "es", "pt", "it", "nl", "hi"]

View file

@ -0,0 +1,163 @@
"""
Whisper STT Service using Lightning Whisper MLX
Optimized for Apple Silicon (M1/M2/M3/M4)
"""
import os
import tempfile
import logging
from pathlib import Path
from typing import Optional
from dataclasses import dataclass
logger = logging.getLogger(__name__)
# Lazy load to avoid import errors if not installed
_whisper_model = None
@dataclass
class TranscriptionResult:
text: str
language: Optional[str] = None
duration: Optional[float] = None
segments: Optional[list] = None
def get_whisper_model(model_name: str = "large-v3-turbo", batch_size: int = 12):
    """
    Get or create the Whisper model instance (module-level singleton).

    Raises:
        RuntimeError: when lightning-whisper-mlx is not installed.
    """
    global _whisper_model
    if _whisper_model is not None:
        return _whisper_model

    logger.info(f"Loading Whisper model: {model_name}")
    try:
        from lightning_whisper_mlx import LightningWhisperMLX

        _whisper_model = LightningWhisperMLX(
            model=model_name,
            batch_size=batch_size,
            quant=None,  # full precision for best quality
        )
        logger.info(f"Whisper model loaded successfully: {model_name}")
    except ImportError as e:
        logger.error(f"Failed to import lightning_whisper_mlx: {e}")
        raise RuntimeError(
            "lightning-whisper-mlx not installed. "
            "Run: pip install lightning-whisper-mlx"
        )
    except Exception as e:
        logger.error(f"Failed to load Whisper model: {e}")
        raise

    return _whisper_model
def transcribe_audio(
    audio_path: str,
    language: Optional[str] = None,
    model_name: str = "large-v3-turbo",
) -> TranscriptionResult:
    """
    Transcribe audio file using Lightning Whisper MLX.

    Args:
        audio_path: Path to audio file (mp3, wav, m4a, etc.)
        language: Optional language code (e.g., 'de', 'en'). Auto-detect if None.
        model_name: Whisper model to use

    Returns:
        TranscriptionResult with text and metadata
    """
    model = get_whisper_model(model_name)
    logger.info(f"Transcribing: {audio_path}")
    try:
        # Lightning Whisper MLX returns a dict with a 'text' key.
        result = model.transcribe(
            audio_path=audio_path,
            language=language,
        )
        duration = None
        # Handle different return formats defensively.
        if isinstance(result, dict):
            text = result.get("text", "")
            segments = result.get("segments", [])
            detected_language = result.get("language", language)
            # Fix: the `duration` field of TranscriptionResult was declared
            # but never populated. Best-effort fill — NOTE(review): the
            # 'duration' key is assumed, .get() is safe if absent.
            duration = result.get("duration")
        else:
            text = str(result)
            segments = []
            detected_language = language
        logger.info(f"Transcription complete: {len(text)} characters")
        return TranscriptionResult(
            text=text.strip(),
            language=detected_language,
            duration=duration,
            segments=segments,
        )
    except Exception as e:
        logger.error(f"Transcription failed: {e}")
        raise
async def transcribe_audio_bytes(
    audio_bytes: bytes,
    filename: str,
    language: Optional[str] = None,
    model_name: str = "large-v3-turbo",
) -> TranscriptionResult:
    """
    Transcribe audio from bytes (for API uploads).

    Args:
        audio_bytes: Raw audio file bytes
        filename: Original filename (used only for extension detection)
        language: Optional language code
        model_name: Whisper model to use

    Returns:
        TranscriptionResult
    """
    # Keep the original extension so the decoder can sniff the format;
    # default to .wav for extension-less uploads.
    suffix = Path(filename).suffix or ".wav"
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as handle:
        handle.write(audio_bytes)
        temp_path = handle.name
    try:
        return transcribe_audio(
            audio_path=temp_path,
            language=language,
            model_name=model_name,
        )
    finally:
        # Best-effort temp-file cleanup.
        try:
            os.unlink(temp_path)
        except Exception:
            pass
# Whisper checkpoints that lightning-whisper-mlx can load, for reference.
# "large-v3-turbo" is the recommended choice on the Mac Mini.
AVAILABLE_MODELS = [
    "tiny", "tiny.en",
    "base", "base.en",
    "small", "small.en",
    "medium", "medium.en",
    "large", "large-v2", "large-v3", "large-v3-turbo",
    "distil-small.en", "distil-medium.en", "distil-large-v2", "distil-large-v3",
]

View file

@ -0,0 +1,25 @@
# ManaCore STT Service Dependencies
# For Mac Mini M4 (Apple Silicon)
# Web Framework
fastapi==0.115.6
uvicorn[standard]==0.34.0
python-multipart==0.0.20
# Audio Processing
pydub==0.25.1
soundfile==0.13.1
# Whisper (Apple Silicon optimized)
lightning-whisper-mlx==0.0.10
mlx>=0.21.0
# Voxtral (Hugging Face Transformers)
transformers>=4.47.0
torch>=2.5.0
accelerate>=1.2.0
sentencepiece>=0.2.0
# Utilities
numpy>=1.26.0
tqdm>=4.67.0

123
services/mana-stt/setup.sh Executable file
View file

@ -0,0 +1,123 @@
#!/bin/bash
# ManaCore STT Service Setup Script
# For Mac Mini M4 (Apple Silicon)
#
# Fix vs. previous revision: PYTHON_VERSION was declared but never used and
# the fallback python3 version was detected but never compared against the
# requirement — both are now enforced (with a warning, not a hard failure).
set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
VENV_DIR="$SCRIPT_DIR/.venv"
PYTHON_VERSION="3.11"

echo "=============================================="
echo " ManaCore STT Service Setup"
echo " Whisper (Lightning MLX) + Voxtral"
echo "=============================================="
echo ""

# These are warnings only: the service still runs on other platforms,
# just without the MLX fast path.
if [[ "$(uname)" != "Darwin" ]]; then
  echo "Warning: This script is optimized for macOS (Apple Silicon)"
fi
if [[ "$(uname -m)" != "arm64" ]]; then
  echo "Warning: Not running on Apple Silicon. MLX optimizations won't work."
fi

# Prefer the pinned interpreter; fall back to whatever python3 is available.
echo "1. Checking Python installation..."
if command -v "python$PYTHON_VERSION" &> /dev/null; then
  PYTHON_CMD="python$PYTHON_VERSION"
elif command -v python3 &> /dev/null; then
  PYTHON_CMD="python3"
  PY_VERSION=$($PYTHON_CMD --version 2>&1 | cut -d' ' -f2 | cut -d'.' -f1,2)
  echo "   Found Python $PY_VERSION"
  # Warn when the fallback interpreter is older than the pinned version
  # (sort -V picks the smaller of the two version strings).
  if [[ "$(printf '%s\n' "$PYTHON_VERSION" "$PY_VERSION" | sort -V | head -n1)" != "$PYTHON_VERSION" ]]; then
    echo "   Warning: Python $PYTHON_VERSION+ recommended (found $PY_VERSION)"
  fi
else
  echo "Error: Python 3 not found. Please install Python 3.11+"
  echo "  brew install python@3.11"
  exit 1
fi

# Create (or optionally recreate) the virtual environment.
echo ""
echo "2. Creating virtual environment..."
if [ -d "$VENV_DIR" ]; then
  echo "   Virtual environment already exists at $VENV_DIR"
  read -p "   Recreate? (y/N) " -n 1 -r
  echo
  if [[ $REPLY =~ ^[Yy]$ ]]; then
    rm -rf "$VENV_DIR"
    "$PYTHON_CMD" -m venv "$VENV_DIR"
    echo "   Virtual environment recreated"
  fi
else
  "$PYTHON_CMD" -m venv "$VENV_DIR"
  echo "   Virtual environment created at $VENV_DIR"
fi

# All subsequent pip/python calls run inside the venv.
source "$VENV_DIR/bin/activate"

echo ""
echo "3. Upgrading pip..."
pip install --upgrade pip wheel setuptools

echo ""
echo "4. Installing dependencies..."
echo "   This may take several minutes (downloading large models)..."
# PyTorch first so the MPS-enabled wheel is resolved before requirements.txt.
pip install torch torchvision torchaudio
# MLX for the Apple Silicon fast path.
pip install mlx
pip install -r "$SCRIPT_DIR/requirements.txt"
# scipy is needed by the Voxtral path for audio resampling.
pip install scipy

echo ""
echo "5. Verifying installation..."
python -c "import torch; print(f'   PyTorch {torch.__version__} - MPS available: {torch.backends.mps.is_available()}')"
python -c "import mlx; print(f'   MLX installed')" 2>/dev/null || echo "   MLX not available (CPU fallback)"
python -c "import fastapi; print(f'   FastAPI {fastapi.__version__}')"

echo ""
echo "6. Downloading Whisper model (large-v3-turbo)..."
echo "   This will download ~1.6 GB on first run..."
# Best effort: failure here only defers the download to the first request.
python -c "
from lightning_whisper_mlx import LightningWhisperMLX
print('   Initializing Whisper model...')
whisper = LightningWhisperMLX(model='large-v3-turbo', batch_size=12)
print('   Whisper model ready!')
" || echo "   Note: Model will be downloaded on first transcription request"

echo ""
echo "=============================================="
echo " Setup Complete!"
echo "=============================================="
echo ""
echo "To start the STT service:"
echo ""
echo "  cd $SCRIPT_DIR"
echo "  source .venv/bin/activate"
echo "  uvicorn app.main:app --host 0.0.0.0 --port 3020"
echo ""
echo "Or use the launchd service (recommended for production):"
echo ""
echo "  ./scripts/mac-mini/setup-stt.sh"
echo ""
echo "API Endpoints:"
echo "  POST /transcribe          - Whisper transcription"
echo "  POST /transcribe/voxtral  - Voxtral transcription"
echo "  POST /transcribe/auto     - Auto-select best model"
echo "  GET  /health              - Health check"
echo "  GET  /models              - List available models"
echo ""