mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-14 23:41:08 +02:00
✨ feat(stt): add speech-to-text service for Mac Mini
Add mana-stt service with Whisper and Voxtral support for local transcription. Includes setup script and launchd integration for automatic startup on Mac Mini server. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
aeabd21d4a
commit
bf0fa04e7e
10 changed files with 1206 additions and 0 deletions
|
|
@ -23,6 +23,7 @@ cd ~/projects/manacore-monorepo
|
|||
| Script | Purpose |
|
||||
|--------|---------|
|
||||
| `setup-autostart.sh` | Configure automatic startup on boot (run once) |
|
||||
| `setup-stt.sh` | Setup STT service (Whisper + Voxtral) |
|
||||
| `startup.sh` | Main startup script (called by launchd) |
|
||||
| `health-check.sh` | Check all services health |
|
||||
| `status.sh` | Show full system status |
|
||||
|
|
@ -143,6 +144,7 @@ Three services are configured to run automatically:
|
|||
| Cloudflared | `com.cloudflare.cloudflared` | Tunnel to Cloudflare |
|
||||
| Docker Startup | `com.manacore.docker-startup` | Start containers on boot |
|
||||
| Health Check | `com.manacore.health-check` | Check every 5 minutes |
|
||||
| STT Service | `com.manacore.stt` | Speech-to-Text (Whisper + Voxtral) |
|
||||
|
||||
### Manual Service Control
|
||||
|
||||
|
|
@ -238,4 +240,49 @@ Once running, services are available at:
|
|||
| Calendar API | https://calendar-api.mana.how |
|
||||
| Clock | https://clock.mana.how |
|
||||
| Clock API | https://clock-api.mana.how |
|
||||
| STT API | http://localhost:3020 (internal only) |
|
||||
| SSH | ssh mac-mini (via cloudflared) |
|
||||
|
||||
## Native Services (non-Docker)
|
||||
|
||||
### Ollama (LLM)
|
||||
|
||||
Ollama runs natively on Mac Mini for LLM inference:
|
||||
|
||||
```bash
|
||||
# Check status
|
||||
curl http://localhost:11434/api/tags
|
||||
|
||||
# List models
|
||||
ollama list
|
||||
|
||||
# Pull a model
|
||||
ollama pull gemma3:4b
|
||||
```
|
||||
|
||||
### STT Service (Speech-to-Text)
|
||||
|
||||
The STT service provides Whisper and Voxtral transcription:
|
||||
|
||||
```bash
|
||||
# Setup (first time)
|
||||
./scripts/mac-mini/setup-stt.sh
|
||||
|
||||
# Check status
|
||||
curl http://localhost:3020/health
|
||||
|
||||
# Transcribe audio
|
||||
curl -X POST http://localhost:3020/transcribe \
|
||||
-F "file=@audio.mp3" \
|
||||
-F "language=de"
|
||||
|
||||
# View logs
|
||||
tail -f /tmp/manacore-stt.log
|
||||
```
|
||||
|
||||
**Available endpoints:**
|
||||
- `POST /transcribe` - Whisper transcription (recommended)
|
||||
- `POST /transcribe/voxtral` - Voxtral transcription
|
||||
- `POST /transcribe/auto` - Auto-select model
|
||||
- `GET /health` - Health check
|
||||
- `GET /models` - List available models
|
||||
|
|
|
|||
153
scripts/mac-mini/setup-stt.sh
Executable file
153
scripts/mac-mini/setup-stt.sh
Executable file
|
|
@ -0,0 +1,153 @@
|
|||
#!/bin/bash
# Setup STT Service on Mac Mini.
# Installs the mana-stt Python environment (if needed), writes a launchd
# user-agent plist, and (re)loads it so the service starts automatically.
# Run as the login user on the Mac Mini.

set -euo pipefail

readonly SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
readonly REPO_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
readonly STT_DIR="$REPO_DIR/services/mana-stt"
readonly PLIST_NAME="com.manacore.stt"
readonly PLIST_PATH="$HOME/Library/LaunchAgents/$PLIST_NAME.plist"
readonly STT_PORT=3020

echo "=============================================="
echo " ManaCore STT Service Setup (Mac Mini)"
echo "=============================================="
echo ""

# Check if STT service directory exists
if [ ! -d "$STT_DIR" ]; then
  echo "Error: STT service directory not found at $STT_DIR" >&2
  exit 1
fi

# Run the main setup script first (creates .venv and installs deps).
echo "1. Running STT service setup..."
cd "$STT_DIR"
if [ ! -d ".venv" ]; then
  echo "   Installing dependencies..."
  ./setup.sh
else
  echo "   Virtual environment already exists"
  echo "   Skipping dependency installation"
fi

# Create launchd plist
echo ""
echo "2. Creating launchd service..."

# Fix: ~/Library/LaunchAgents does not exist on a fresh user account;
# the redirect below would fail without it.
mkdir -p "$(dirname "$PLIST_PATH")"

cat > "$PLIST_PATH" << EOF
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
    <key>Label</key>
    <string>$PLIST_NAME</string>

    <key>ProgramArguments</key>
    <array>
        <string>$STT_DIR/.venv/bin/uvicorn</string>
        <string>app.main:app</string>
        <string>--host</string>
        <string>0.0.0.0</string>
        <string>--port</string>
        <string>3020</string>
    </array>

    <key>WorkingDirectory</key>
    <string>$STT_DIR</string>

    <key>EnvironmentVariables</key>
    <dict>
        <key>PATH</key>
        <string>$STT_DIR/.venv/bin:/usr/local/bin:/usr/bin:/bin</string>
        <key>PORT</key>
        <string>3020</string>
        <key>WHISPER_MODEL</key>
        <string>large-v3-turbo</string>
        <key>PRELOAD_MODELS</key>
        <string>false</string>
        <key>CORS_ORIGINS</key>
        <string>https://mana.how,https://chat.mana.how,https://todo.mana.how</string>
    </dict>

    <key>RunAtLoad</key>
    <true/>

    <key>KeepAlive</key>
    <dict>
        <key>SuccessfulExit</key>
        <false/>
        <key>Crashed</key>
        <true/>
    </dict>

    <key>ThrottleInterval</key>
    <integer>10</integer>

    <key>StandardOutPath</key>
    <string>/tmp/manacore-stt.log</string>

    <key>StandardErrorPath</key>
    <string>/tmp/manacore-stt.error.log</string>
</dict>
</plist>
EOF

echo "   Created: $PLIST_PATH"

# Unload first so re-running this script picks up plist changes.
echo ""
echo "3. Loading launchd service..."
launchctl unload "$PLIST_PATH" 2>/dev/null || true
launchctl load "$PLIST_PATH"

# Give launchd a moment to spawn the process.
sleep 2

# Check if service is running
echo ""
echo "4. Checking service status..."
if launchctl list | grep -q "$PLIST_NAME"; then
  echo "   Service is running"

  # Check health endpoint (single curl; the original fetched it twice).
  sleep 3
  if HEALTH=$(curl -s "http://localhost:$STT_PORT/health" 2>/dev/null); then
    echo "   Health check passed"
    echo "   $HEALTH"
  else
    echo "   Warning: Health check failed (service may still be starting)"
    echo "   Check logs: tail -f /tmp/manacore-stt.log"
  fi
else
  echo "   Warning: Service may not be running"
  echo "   Check logs: tail -f /tmp/manacore-stt.error.log"
fi

echo ""
echo "=============================================="
echo " STT Service Setup Complete!"
echo "=============================================="
echo ""
echo "Service URL: http://localhost:$STT_PORT"
echo ""
echo "Useful commands:"
echo "  # View logs"
echo "  tail -f /tmp/manacore-stt.log"
echo ""
echo "  # Restart service"
echo "  launchctl kickstart -k gui/\$(id -u)/$PLIST_NAME"
echo ""
echo "  # Stop service"
echo "  launchctl unload $PLIST_PATH"
echo ""
echo "  # Start service"
echo "  launchctl load $PLIST_PATH"
echo ""
echo "  # Test transcription"
echo "  curl -X POST http://localhost:$STT_PORT/transcribe \\"
echo "    -F 'file=@audio.mp3' \\"
echo "    -F 'language=de'"
echo ""
|
||||
|
|
@ -46,6 +46,7 @@ check_launchd() {
|
|||
check_launchd "com.cloudflare.cloudflared" "Cloudflared Tunnel"
|
||||
check_launchd "com.manacore.docker-startup" "Docker Startup"
|
||||
check_launchd "com.manacore.health-check" "Health Check (5min)"
|
||||
check_launchd "com.manacore.stt" "STT Service (Whisper/Voxtral)"
|
||||
|
||||
# ============================================
|
||||
# Docker Status
|
||||
|
|
@ -83,6 +84,27 @@ if docker info >/dev/null 2>&1; then
|
|||
done
|
||||
fi
|
||||
|
||||
# ============================================
|
||||
# Native Services (non-Docker)
|
||||
# ============================================
|
||||
echo ""
|
||||
echo -e "${BOLD}Native Services:${NC}"
|
||||
|
||||
# Ollama
|
||||
if curl -s --max-time 2 http://localhost:11434/api/tags >/dev/null 2>&1; then
|
||||
OLLAMA_MODELS=$(curl -s http://localhost:11434/api/tags | grep -o '"name":"[^"]*"' | wc -l | tr -d ' ')
|
||||
echo -e " ${GREEN}[Running]${NC} Ollama (${OLLAMA_MODELS} models)"
|
||||
else
|
||||
echo -e " ${YELLOW}[Stopped]${NC} Ollama"
|
||||
fi
|
||||
|
||||
# STT Service
|
||||
if curl -s --max-time 2 http://localhost:3020/health >/dev/null 2>&1; then
|
||||
echo -e " ${GREEN}[Running]${NC} STT Service (port 3020)"
|
||||
else
|
||||
echo -e " ${YELLOW}[Stopped]${NC} STT Service"
|
||||
fi
|
||||
|
||||
# ============================================
|
||||
# Network/Tunnel Status
|
||||
# ============================================
|
||||
|
|
|
|||
165
services/mana-stt/README.md
Normal file
165
services/mana-stt/README.md
Normal file
|
|
@ -0,0 +1,165 @@
|
|||
# ManaCore STT Service
|
||||
|
||||
Speech-to-Text API service with **Whisper (Lightning MLX)** and **Voxtral Mini**.
|
||||
|
||||
Optimized for Mac Mini M4 (Apple Silicon).
|
||||
|
||||
## Features
|
||||
|
||||
- **Whisper Large V3 Turbo** - Best quality, 99+ languages, German WER 6-9%
|
||||
- **Voxtral Mini (3B)** - Mistral AI, Apache 2.0, 8 languages including German
|
||||
- **Apple Silicon Optimized** - Uses MLX for 10x faster inference
|
||||
- **REST API** - Simple HTTP endpoints for integration
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Installation
|
||||
|
||||
```bash
|
||||
cd services/mana-stt
|
||||
./setup.sh
|
||||
```
|
||||
|
||||
### Run Locally
|
||||
|
||||
```bash
|
||||
source .venv/bin/activate
|
||||
uvicorn app.main:app --host 0.0.0.0 --port 3020
|
||||
```
|
||||
|
||||
### Setup as System Service (Mac Mini)
|
||||
|
||||
```bash
|
||||
./scripts/mac-mini/setup-stt.sh
|
||||
```
|
||||
|
||||
## API Endpoints
|
||||
|
||||
| Endpoint | Method | Description |
|
||||
|----------|--------|-------------|
|
||||
| `/health` | GET | Health check |
|
||||
| `/models` | GET | List available models |
|
||||
| `/transcribe` | POST | Whisper transcription |
|
||||
| `/transcribe/voxtral` | POST | Voxtral transcription |
|
||||
| `/transcribe/auto` | POST | Auto-select best model |
|
||||
|
||||
## Usage Examples
|
||||
|
||||
### Transcribe with Whisper (Recommended)
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:3020/transcribe \
|
||||
-F "file=@recording.mp3" \
|
||||
-F "language=de"
|
||||
```
|
||||
|
||||
Response:
|
||||
```json
|
||||
{
|
||||
"text": "Das ist ein Beispieltext...",
|
||||
"language": "de",
|
||||
"model": "whisper-large-v3-turbo"
|
||||
}
|
||||
```
|
||||
|
||||
### Transcribe with Voxtral
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:3020/transcribe/voxtral \
|
||||
-F "file=@recording.mp3" \
|
||||
-F "language=de"
|
||||
```
|
||||
|
||||
### Auto-Select Model
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:3020/transcribe/auto \
|
||||
-F "file=@recording.mp3" \
|
||||
-F "prefer=whisper"
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
Environment variables:
|
||||
|
||||
| Variable | Default | Description |
|
||||
|----------|---------|-------------|
|
||||
| `PORT` | `3020` | API server port |
|
||||
| `WHISPER_MODEL` | `large-v3-turbo` | Default Whisper model |
|
||||
| `PRELOAD_MODELS` | `false` | Load models on startup |
|
||||
| `CORS_ORIGINS` | `https://mana.how,...` | Allowed CORS origins |
|
||||
|
||||
## Supported Audio Formats
|
||||
|
||||
- MP3, WAV, M4A, FLAC, OGG, WebM, MP4
|
||||
- Max file size: 100MB
|
||||
- Any sample rate (automatically resampled to 16kHz)
|
||||
|
||||
## Model Comparison
|
||||
|
||||
| Model | German WER | Speed | VRAM | License |
|
||||
|-------|------------|-------|------|---------|
|
||||
| Whisper Large V3 Turbo | 6-9% | Fast | ~6 GB | MIT |
|
||||
| Voxtral Mini (3B) | 8-12% | Medium | ~4 GB | Apache 2.0 |
|
||||
|
||||
## Logs
|
||||
|
||||
```bash
|
||||
# Service logs
|
||||
tail -f /tmp/manacore-stt.log
|
||||
|
||||
# Error logs
|
||||
tail -f /tmp/manacore-stt.error.log
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Model Download Slow
|
||||
|
||||
First run downloads ~1.6 GB for Whisper and ~6 GB for Voxtral. Be patient.
|
||||
|
||||
### Out of Memory
|
||||
|
||||
Reduce batch size or use smaller model:
|
||||
```bash
|
||||
export WHISPER_MODEL=medium
|
||||
```
|
||||
|
||||
### MPS Not Available
|
||||
|
||||
Ensure PyTorch is installed with MPS support:
|
||||
```bash
|
||||
pip install torch torchvision torchaudio
|
||||
python -c "import torch; print(torch.backends.mps.is_available())"
|
||||
```
|
||||
|
||||
## Integration
|
||||
|
||||
### From Chat Backend (NestJS)
|
||||
|
||||
```typescript
|
||||
const formData = new FormData();
|
||||
formData.append('file', audioBuffer, 'recording.webm');
|
||||
formData.append('language', 'de');
|
||||
|
||||
const response = await fetch('http://localhost:3020/transcribe', {
|
||||
method: 'POST',
|
||||
body: formData,
|
||||
});
|
||||
|
||||
const { text } = await response.json();
|
||||
```
|
||||
|
||||
### From SvelteKit Web
|
||||
|
||||
```typescript
|
||||
const formData = new FormData();
|
||||
formData.append('file', audioBlob, 'recording.webm');
|
||||
|
||||
const response = await fetch('https://stt-api.mana.how/transcribe', {
|
||||
method: 'POST',
|
||||
body: formData,
|
||||
});
|
||||
|
||||
const { text } = await response.json();
|
||||
```
|
||||
1
services/mana-stt/app/__init__.py
Normal file
1
services/mana-stt/app/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
# ManaCore STT Service
|
||||
309
services/mana-stt/app/main.py
Normal file
309
services/mana-stt/app/main.py
Normal file
|
|
@ -0,0 +1,309 @@
|
|||
"""
|
||||
ManaCore STT API Service
|
||||
Speech-to-Text with Whisper (MLX) and Voxtral
|
||||
|
||||
Run with: uvicorn app.main:app --host 0.0.0.0 --port 3020
|
||||
"""
|
||||
|
||||
import os
|
||||
import logging
|
||||
from typing import Optional
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Query
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import JSONResponse
|
||||
from pydantic import BaseModel
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Environment
|
||||
PORT = int(os.getenv("PORT", "3020"))
|
||||
DEFAULT_WHISPER_MODEL = os.getenv("WHISPER_MODEL", "large-v3-turbo")
|
||||
PRELOAD_MODELS = os.getenv("PRELOAD_MODELS", "false").lower() == "true"
|
||||
CORS_ORIGINS = os.getenv(
|
||||
"CORS_ORIGINS",
|
||||
"https://mana.how,https://chat.mana.how,http://localhost:5173"
|
||||
).split(",")
|
||||
|
||||
|
||||
# Response models
|
||||
class TranscriptionResponse(BaseModel):
    """Response body for all /transcribe* endpoints."""
    text: str                                  # transcribed text
    language: Optional[str] = None             # detected or requested language code
    model: str                                 # identifier of the model used
    duration_seconds: Optional[float] = None   # audio duration, if known


class HealthResponse(BaseModel):
    """Response body for GET /health."""
    status: str            # "healthy" when the service is up
    whisper_loaded: bool   # True once Whisper has been loaded into memory
    voxtral_loaded: bool   # True once Voxtral has been loaded into memory
    models: dict           # extra model info (e.g. the default Whisper model)


class ModelsResponse(BaseModel):
    """Response body for GET /models."""
    whisper: list          # available Whisper model names
    voxtral: list          # Voxtral language codes (single fixed model)
    default_whisper: str   # model used when the request names none


# Track loaded models.
# Flipped to True by the endpoints / preload hook after a successful load.
models_status = {
    "whisper_loaded": False,
    "voxtral_loaded": False,
}
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Startup and shutdown events.

    On startup, optionally preloads the Whisper and Voxtral models when
    PRELOAD_MODELS=true; otherwise models load lazily on first request.
    Preload failures are logged as warnings and never abort startup.
    """
    logger.info("Starting ManaCore STT Service...")

    # Optionally preload models on startup
    if PRELOAD_MODELS:
        logger.info("Preloading models (PRELOAD_MODELS=true)...")
        try:
            from app.whisper_service import get_whisper_model
            get_whisper_model(DEFAULT_WHISPER_MODEL)
            models_status["whisper_loaded"] = True
            logger.info("Whisper model preloaded")
        except Exception as e:
            logger.warning(f"Failed to preload Whisper: {e}")

        try:
            from app.voxtral_service import get_voxtral_model
            get_voxtral_model()
            models_status["voxtral_loaded"] = True
            logger.info("Voxtral model preloaded")
        except Exception as e:
            logger.warning(f"Failed to preload Voxtral: {e}")
    else:
        logger.info("Models will be loaded on first request (lazy loading)")

    logger.info(f"STT Service ready on port {PORT}")
    # Control returns to FastAPI while the server runs; everything after
    # the yield executes on shutdown.
    yield
    logger.info("Shutting down STT Service...")
|
||||
|
||||
|
||||
# Create FastAPI app.
# `lifespan` handles optional model preloading at startup (see lifespan()).
app = FastAPI(
    title="ManaCore STT Service",
    description="Speech-to-Text API with Whisper (MLX) and Voxtral",
    version="1.0.0",
    lifespan=lifespan,
)

# CORS middleware.
# Allowed origins come from the CORS_ORIGINS env var (comma-separated list).
app.add_middleware(
    CORSMiddleware,
    allow_origins=CORS_ORIGINS,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
||||
|
||||
|
||||
@app.get("/health", response_model=HealthResponse)
|
||||
async def health_check():
|
||||
"""Health check endpoint."""
|
||||
return HealthResponse(
|
||||
status="healthy",
|
||||
whisper_loaded=models_status["whisper_loaded"],
|
||||
voxtral_loaded=models_status["voxtral_loaded"],
|
||||
models={
|
||||
"default_whisper": DEFAULT_WHISPER_MODEL,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
@app.get("/models", response_model=ModelsResponse)
|
||||
async def list_models():
|
||||
"""List available models."""
|
||||
from app.whisper_service import AVAILABLE_MODELS as whisper_models
|
||||
from app.voxtral_service import SUPPORTED_LANGUAGES as voxtral_languages
|
||||
|
||||
return ModelsResponse(
|
||||
whisper=whisper_models,
|
||||
voxtral=voxtral_languages,
|
||||
default_whisper=DEFAULT_WHISPER_MODEL,
|
||||
)
|
||||
|
||||
|
||||
@app.post("/transcribe", response_model=TranscriptionResponse)
|
||||
async def transcribe_whisper(
|
||||
file: UploadFile = File(..., description="Audio file to transcribe"),
|
||||
language: Optional[str] = Form(
|
||||
None,
|
||||
description="Language code (e.g., 'de', 'en'). Auto-detect if not provided."
|
||||
),
|
||||
model: str = Form(
|
||||
None,
|
||||
description="Whisper model to use (default: large-v3-turbo)"
|
||||
),
|
||||
):
|
||||
"""
|
||||
Transcribe audio using Whisper (Lightning MLX).
|
||||
|
||||
Supported formats: mp3, wav, m4a, flac, ogg, webm
|
||||
Max file size: 100MB
|
||||
"""
|
||||
if not file.filename:
|
||||
raise HTTPException(status_code=400, detail="No file provided")
|
||||
|
||||
# Validate file type
|
||||
allowed_extensions = {".mp3", ".wav", ".m4a", ".flac", ".ogg", ".webm", ".mp4"}
|
||||
ext = os.path.splitext(file.filename)[1].lower()
|
||||
if ext not in allowed_extensions:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"Unsupported file type: {ext}. Allowed: {allowed_extensions}"
|
||||
)
|
||||
|
||||
try:
|
||||
from app.whisper_service import transcribe_audio_bytes
|
||||
|
||||
# Read file
|
||||
audio_bytes = await file.read()
|
||||
|
||||
# Check file size (100MB limit)
|
||||
if len(audio_bytes) > 100 * 1024 * 1024:
|
||||
raise HTTPException(status_code=400, detail="File too large (max 100MB)")
|
||||
|
||||
# Use default model if not specified
|
||||
model_name = model or DEFAULT_WHISPER_MODEL
|
||||
|
||||
# Transcribe
|
||||
result = await transcribe_audio_bytes(
|
||||
audio_bytes=audio_bytes,
|
||||
filename=file.filename,
|
||||
language=language,
|
||||
model_name=model_name,
|
||||
)
|
||||
|
||||
models_status["whisper_loaded"] = True
|
||||
|
||||
return TranscriptionResponse(
|
||||
text=result.text,
|
||||
language=result.language,
|
||||
model=f"whisper-{model_name}",
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Transcription error: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@app.post("/transcribe/voxtral", response_model=TranscriptionResponse)
|
||||
async def transcribe_voxtral(
|
||||
file: UploadFile = File(..., description="Audio file to transcribe"),
|
||||
language: str = Form(
|
||||
"de",
|
||||
description="Language code (de, en, fr, es, pt, it, nl, hi)"
|
||||
),
|
||||
):
|
||||
"""
|
||||
Transcribe audio using Voxtral Mini (Mistral AI).
|
||||
|
||||
Best for: German, French, European languages
|
||||
Supported formats: mp3, wav, m4a, flac
|
||||
Max file size: 100MB
|
||||
"""
|
||||
if not file.filename:
|
||||
raise HTTPException(status_code=400, detail="No file provided")
|
||||
|
||||
# Validate language
|
||||
from app.voxtral_service import SUPPORTED_LANGUAGES
|
||||
if language not in SUPPORTED_LANGUAGES:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"Unsupported language: {language}. Supported: {SUPPORTED_LANGUAGES}"
|
||||
)
|
||||
|
||||
try:
|
||||
from app.voxtral_service import transcribe_audio_bytes
|
||||
|
||||
audio_bytes = await file.read()
|
||||
|
||||
if len(audio_bytes) > 100 * 1024 * 1024:
|
||||
raise HTTPException(status_code=400, detail="File too large (max 100MB)")
|
||||
|
||||
result = await transcribe_audio_bytes(
|
||||
audio_bytes=audio_bytes,
|
||||
filename=file.filename,
|
||||
language=language,
|
||||
)
|
||||
|
||||
models_status["voxtral_loaded"] = True
|
||||
|
||||
return TranscriptionResponse(
|
||||
text=result.text,
|
||||
language=result.language,
|
||||
model=result.model,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Voxtral transcription error: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@app.post("/transcribe/auto", response_model=TranscriptionResponse)
|
||||
async def transcribe_auto(
|
||||
file: UploadFile = File(..., description="Audio file to transcribe"),
|
||||
language: Optional[str] = Form(
|
||||
None,
|
||||
description="Language hint (optional)"
|
||||
),
|
||||
prefer: str = Form(
|
||||
"whisper",
|
||||
description="Preferred model: 'whisper' or 'voxtral'"
|
||||
),
|
||||
):
|
||||
"""
|
||||
Transcribe audio with automatic model selection.
|
||||
|
||||
- Uses Whisper by default (faster, more languages)
|
||||
- Falls back to Voxtral if Whisper fails
|
||||
"""
|
||||
if prefer == "voxtral":
|
||||
# Try Voxtral first
|
||||
try:
|
||||
return await transcribe_voxtral(file, language or "de")
|
||||
except Exception as e:
|
||||
logger.warning(f"Voxtral failed, trying Whisper: {e}")
|
||||
# Reset file position
|
||||
await file.seek(0)
|
||||
return await transcribe_whisper(file, language, None)
|
||||
else:
|
||||
# Try Whisper first (default)
|
||||
try:
|
||||
return await transcribe_whisper(file, language, None)
|
||||
except Exception as e:
|
||||
logger.warning(f"Whisper failed, trying Voxtral: {e}")
|
||||
await file.seek(0)
|
||||
return await transcribe_voxtral(file, language or "de")
|
||||
|
||||
|
||||
# Error handlers
|
||||
@app.exception_handler(Exception)
async def global_exception_handler(request, exc):
    # Catch-all for anything not already turned into an HTTPException.
    logger.error(f"Unhandled error: {exc}")
    # NOTE(review): str(exc) is sent back to the client — this can leak
    # internal details (file paths, library error text). Consider logging
    # the traceback server-side and returning only the generic message.
    return JSONResponse(
        status_code=500,
        content={"detail": "Internal server error", "error": str(exc)},
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import uvicorn
|
||||
uvicorn.run(
|
||||
"app.main:app",
|
||||
host="0.0.0.0",
|
||||
port=PORT,
|
||||
reload=False,
|
||||
)
|
||||
198
services/mana-stt/app/voxtral_service.py
Normal file
198
services/mana-stt/app/voxtral_service.py
Normal file
|
|
@ -0,0 +1,198 @@
|
|||
"""
|
||||
Voxtral STT Service using Hugging Face Transformers
|
||||
Mistral AI's Speech-to-Text model (Apache 2.0 License)
|
||||
"""
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
import logging
|
||||
import base64
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from dataclasses import dataclass
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Lazy load to avoid import errors
|
||||
_voxtral_model = None
|
||||
_voxtral_processor = None
|
||||
|
||||
|
||||
@dataclass
class VoxtralTranscriptionResult:
    """Outcome of a single Voxtral transcription run."""
    text: str                        # transcribed text (stripped)
    language: Optional[str] = None   # language the audio was transcribed in
    model: str = "voxtral-mini"      # identifier of the model used
|
||||
|
||||
|
||||
def get_voxtral_model(model_name: str = "mistralai/Voxtral-Mini-3B-2507"):
    """
    Get or create Voxtral model instance.

    Note: Voxtral Mini (3B) is recommended for Mac Mini M4.
    Voxtral Small (24B) requires more VRAM.

    Returns a (model, processor) tuple. The pair is cached module-wide, so
    `model_name` only takes effect on the first call — later calls return
    whatever was loaded first, regardless of the argument.
    """
    global _voxtral_model, _voxtral_processor

    if _voxtral_model is None:
        logger.info(f"Loading Voxtral model: {model_name}")
        try:
            # Heavy imports deferred so this module can be imported without
            # torch/transformers installed.
            import torch
            from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor

            # Determine device (MPS preferred on Apple Silicon)
            if torch.backends.mps.is_available():
                device = "mps"
                torch_dtype = torch.float16
            elif torch.cuda.is_available():
                device = "cuda"
                torch_dtype = torch.float16
            else:
                device = "cpu"
                torch_dtype = torch.float32

            logger.info(f"Using device: {device}")

            # Load processor
            _voxtral_processor = AutoProcessor.from_pretrained(
                model_name,
                trust_remote_code=True,
            )

            # Load model
            # NOTE(review): device_map="auto" lets transformers choose the
            # placement; `device` above is only used for logging and dtype
            # selection, so the log line may not reflect actual placement.
            _voxtral_model = AutoModelForSpeechSeq2Seq.from_pretrained(
                model_name,
                torch_dtype=torch_dtype,
                device_map="auto",
                trust_remote_code=True,
            )

            logger.info(f"Voxtral model loaded successfully on {device}")

        except ImportError as e:
            logger.error(f"Failed to import transformers: {e}")
            raise RuntimeError(
                "transformers not installed. "
                "Run: pip install transformers torch"
            )
        except Exception as e:
            logger.error(f"Failed to load Voxtral model: {e}")
            raise

    return _voxtral_model, _voxtral_processor
|
||||
|
||||
|
||||
def transcribe_audio(
    audio_path: str,
    language: Optional[str] = "de",
    model_name: str = "mistralai/Voxtral-Mini-3B-2507",
) -> VoxtralTranscriptionResult:
    """
    Transcribe audio file using Voxtral.

    Args:
        audio_path: Path to audio file
        language: Target language for transcription
        model_name: Hugging Face model ID

    Returns:
        VoxtralTranscriptionResult with transcribed text

    Raises:
        Exception: re-raised from audio loading or inference after logging.
    """
    import torch
    import soundfile as sf

    model, processor = get_voxtral_model(model_name)

    logger.info(f"Transcribing with Voxtral: {audio_path}")

    try:
        # Load audio
        # NOTE(review): assumes mono input — sf.read returns a 2-D
        # (frames, channels) array for multichannel files; confirm the
        # processor accepts that shape or downmix first.
        audio_array, sample_rate = sf.read(audio_path)

        # Resample to 16kHz if needed
        if sample_rate != 16000:
            import numpy as np
            from scipy import signal

            num_samples = int(len(audio_array) * 16000 / sample_rate)
            audio_array = signal.resample(audio_array, num_samples)
            sample_rate = 16000

        # Process audio
        inputs = processor(
            audio_array,
            sampling_rate=sample_rate,
            return_tensors="pt",
        )

        # Move to same device as model
        device = next(model.parameters()).device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Generate transcription
        # NOTE(review): passing `language=` to generate() assumes the model
        # accepts a language kwarg — confirm against the Voxtral API; some
        # speech models take the language via the processor instead.
        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=448,
                language=language,
            )

        # Decode token ids back to text (first item of the batch)
        text = processor.batch_decode(
            generated_ids,
            skip_special_tokens=True,
        )[0]

        logger.info(f"Voxtral transcription complete: {len(text)} characters")

        return VoxtralTranscriptionResult(
            text=text.strip(),
            language=language,
            model="voxtral-mini",
        )

    except Exception as e:
        logger.error(f"Voxtral transcription failed: {e}")
        raise
|
||||
|
||||
|
||||
async def transcribe_audio_bytes(
    audio_bytes: bytes,
    filename: str,
    language: Optional[str] = "de",
    model_name: str = "mistralai/Voxtral-Mini-3B-2507",
) -> VoxtralTranscriptionResult:
    """
    Transcribe audio from bytes (for API uploads).

    The payload is spilled to a temporary file — keeping the extension from
    `filename` so format detection works — then handed to transcribe_audio().
    The temp file is removed best-effort afterwards.
    """
    suffix = Path(filename).suffix or ".wav"

    handle = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
    try:
        handle.write(audio_bytes)
    finally:
        handle.close()

    try:
        return transcribe_audio(
            audio_path=handle.name,
            language=language,
            model_name=model_name,
        )
    finally:
        # Best-effort cleanup; ignore failures (e.g. already removed).
        try:
            os.unlink(handle.name)
        except Exception:
            pass
|
||||
|
||||
|
||||
# Supported languages by Voxtral (ISO 639-1 codes):
# English, German, French, Spanish, Portuguese, Italian, Dutch, Hindi.
SUPPORTED_LANGUAGES = ["en", "de", "fr", "es", "pt", "it", "nl", "hi"]
|
||||
163
services/mana-stt/app/whisper_service.py
Normal file
163
services/mana-stt/app/whisper_service.py
Normal file
|
|
@ -0,0 +1,163 @@
|
|||
"""
|
||||
Whisper STT Service using Lightning Whisper MLX
|
||||
Optimized for Apple Silicon (M1/M2/M3/M4)
|
||||
"""
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from dataclasses import dataclass
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Lazy load to avoid import errors if not installed
|
||||
_whisper_model = None
|
||||
|
||||
|
||||
@dataclass
class TranscriptionResult:
    """Outcome of a single Whisper transcription run."""
    text: str                          # transcribed text (stripped)
    language: Optional[str] = None     # detected or requested language code
    duration: Optional[float] = None   # audio duration in seconds, if known
    segments: Optional[list] = None    # per-segment details from the model
|
||||
|
||||
|
||||
def get_whisper_model(model_name: str = "large-v3-turbo", batch_size: int = 12):
    """Get or create Whisper model instance (singleton pattern).

    The loaded model is cached module-wide. Bug fix: the original cached
    only the first model ever loaded and silently ignored later calls with
    a *different* model_name — even though the /transcribe endpoint exposes
    a user-selectable `model` field. Now a different name reloads the cache.

    Args:
        model_name: Whisper model identifier (see AVAILABLE_MODELS).
        batch_size: Decoding batch size passed to LightningWhisperMLX.

    Returns:
        The cached LightningWhisperMLX instance.

    Raises:
        RuntimeError: if lightning-whisper-mlx is not installed.
    """
    global _whisper_model

    # Name of the currently cached model; stored as a function attribute so
    # no additional module-level global is required.
    cached_name = getattr(get_whisper_model, "_loaded_name", None)

    if _whisper_model is None or cached_name != model_name:
        logger.info(f"Loading Whisper model: {model_name}")
        try:
            # Deferred import: the package is optional at module-import time.
            from lightning_whisper_mlx import LightningWhisperMLX
            _whisper_model = LightningWhisperMLX(
                model=model_name,
                batch_size=batch_size,
                quant=None  # Use full precision for best quality
            )
            get_whisper_model._loaded_name = model_name
            logger.info(f"Whisper model loaded successfully: {model_name}")
        except ImportError as e:
            logger.error(f"Failed to import lightning_whisper_mlx: {e}")
            raise RuntimeError(
                "lightning-whisper-mlx not installed. "
                "Run: pip install lightning-whisper-mlx"
            ) from e
        except Exception as e:
            logger.error(f"Failed to load Whisper model: {e}")
            raise

    return _whisper_model
|
||||
|
||||
|
||||
def transcribe_audio(
    audio_path: str,
    language: Optional[str] = None,
    model_name: str = "large-v3-turbo",
) -> TranscriptionResult:
    """
    Transcribe audio file using Lightning Whisper MLX.

    Args:
        audio_path: Path to audio file (mp3, wav, m4a, etc.)
        language: Optional language code (e.g., 'de', 'en'). Auto-detect if None.
        model_name: Whisper model to use

    Returns:
        TranscriptionResult with text and metadata

    Raises:
        Exception: re-raised from the underlying model after logging.
    """
    model = get_whisper_model(model_name)

    logger.info(f"Transcribing: {audio_path}")

    try:
        # Lightning Whisper MLX returns dict with 'text' key
        # NOTE(review): the keyword names (audio_path=, language=) assume
        # they match LightningWhisperMLX.transcribe's signature — confirm
        # against the lightning-whisper-mlx API.
        result = model.transcribe(
            audio_path=audio_path,
            language=language,
        )

        # Handle different return formats
        if isinstance(result, dict):
            text = result.get("text", "")
            segments = result.get("segments", [])
            detected_language = result.get("language", language)
        else:
            # Fallback: presumably some versions return a bare string.
            text = str(result)
            segments = []
            detected_language = language

        logger.info(f"Transcription complete: {len(text)} characters")

        return TranscriptionResult(
            text=text.strip(),
            language=detected_language,
            segments=segments,
        )

    except Exception as e:
        logger.error(f"Transcription failed: {e}")
        raise
|
||||
|
||||
|
||||
async def transcribe_audio_bytes(
    audio_bytes: bytes,
    filename: str,
    language: Optional[str] = None,
    model_name: str = "large-v3-turbo",
) -> TranscriptionResult:
    """
    Transcribe an in-memory audio upload by spooling it to a temp file.

    Args:
        audio_bytes: Raw bytes of the uploaded audio file.
        filename: Original upload name; only its extension is used so the
            decoder can pick the right container format.
        language: Optional language code; None auto-detects.
        model_name: Whisper model variant to use.

    Returns:
        TranscriptionResult from transcribe_audio().
    """
    # Fall back to .wav when the upload name carries no extension.
    suffix = Path(filename).suffix or ".wav"

    # NOTE(review): transcribe_audio() is synchronous, so this coroutine
    # blocks the event loop for the duration of the transcription —
    # consider run_in_executor if concurrent requests matter.
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as spool:
        spool.write(audio_bytes)
        spooled_path = spool.name

    try:
        return transcribe_audio(
            audio_path=spooled_path,
            language=language,
            model_name=model_name,
        )
    finally:
        # Best-effort removal of the spool file; failure is non-fatal.
        try:
            os.unlink(spooled_path)
        except Exception:
            pass
||||
# Available models for reference
# Whisper model identifiers accepted as `model_name` by get_whisper_model()
# and transcribe_audio().  ".en" variants are English-only; "distil-*"
# variants are smaller distilled checkpoints.
AVAILABLE_MODELS = [
    "tiny",
    "tiny.en",
    "base",
    "base.en",
    "small",
    "small.en",
    "medium",
    "medium.en",
    "large",
    "large-v2",
    "large-v3",
    "large-v3-turbo",  # Recommended for Mac Mini
    "distil-small.en",
    "distil-medium.en",
    "distil-large-v2",
    "distil-large-v3",
]
25
services/mana-stt/requirements.txt
Normal file
25
services/mana-stt/requirements.txt
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
# ManaCore STT Service Dependencies
|
||||
# For Mac Mini M4 (Apple Silicon)
|
||||
|
||||
# Web Framework
|
||||
fastapi==0.115.6
|
||||
uvicorn[standard]==0.34.0
|
||||
python-multipart==0.0.20
|
||||
|
||||
# Audio Processing
|
||||
pydub==0.25.1
|
||||
soundfile==0.13.1
|
||||
|
||||
# Whisper (Apple Silicon optimized)
|
||||
lightning-whisper-mlx==0.0.10
|
||||
mlx>=0.21.0
|
||||
|
||||
# Voxtral (Hugging Face Transformers)
|
||||
transformers>=4.47.0
|
||||
torch>=2.5.0
|
||||
accelerate>=1.2.0
|
||||
sentencepiece>=0.2.0
|
||||
|
||||
# Utilities
|
||||
numpy>=1.26.0
|
||||
tqdm>=4.67.0
|
||||
123
services/mana-stt/setup.sh
Executable file
123
services/mana-stt/setup.sh
Executable file
|
|
@ -0,0 +1,123 @@
|
|||
#!/bin/bash
# ManaCore STT Service Setup Script
# For Mac Mini M4 (Apple Silicon)
#
# Creates a local virtualenv, installs the STT dependencies and
# pre-downloads the Whisper model. Safe to re-run; prompts before
# recreating an existing virtualenv.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
VENV_DIR="$SCRIPT_DIR/.venv"
readonly PYTHON_VERSION="3.11"

echo "=============================================="
echo " ManaCore STT Service Setup"
echo " Whisper (Lightning MLX) + Voxtral"
echo "=============================================="
echo ""

# Warn (but do not abort) on non-macOS / non-Apple-Silicon hosts:
# the service still works with CPU fallbacks.
if [[ "$(uname)" != "Darwin" ]]; then
  echo "Warning: This script is optimized for macOS (Apple Silicon)"
fi

if [[ "$(uname -m)" != "arm64" ]]; then
  echo "Warning: Not running on Apple Silicon. MLX optimizations won't work."
fi

# Locate a Python interpreter, preferring the pinned minor version.
echo "1. Checking Python installation..."
if command -v "python${PYTHON_VERSION}" &> /dev/null; then
  PYTHON_CMD="python${PYTHON_VERSION}"
elif command -v python3 &> /dev/null; then
  PYTHON_CMD="python3"
  PY_VERSION=$("$PYTHON_CMD" --version 2>&1 | cut -d' ' -f2 | cut -d'.' -f1,2)
  echo " Found Python $PY_VERSION"
else
  echo "Error: Python 3 not found. Please install Python ${PYTHON_VERSION}+"
  echo " brew install python@${PYTHON_VERSION}"
  exit 1
fi

# Create (or optionally recreate) the virtual environment.
echo ""
echo "2. Creating virtual environment..."
if [ -d "$VENV_DIR" ]; then
  echo " Virtual environment already exists at $VENV_DIR"
  # Guard read: with `set -e` a closed/non-interactive stdin would
  # otherwise abort the whole script here.
  REPLY=""
  read -p " Recreate? (y/N) " -n 1 -r || true
  echo
  if [[ $REPLY =~ ^[Yy]$ ]]; then
    rm -rf "$VENV_DIR"
    "$PYTHON_CMD" -m venv "$VENV_DIR"
    echo " Virtual environment recreated"
  fi
else
  "$PYTHON_CMD" -m venv "$VENV_DIR"
  echo " Virtual environment created at $VENV_DIR"
fi

# Activate virtual environment
# shellcheck disable=SC1091
source "$VENV_DIR/bin/activate"

# Upgrade pip
echo ""
echo "3. Upgrading pip..."
pip install --upgrade pip wheel setuptools

# Install dependencies
echo ""
echo "4. Installing dependencies..."
echo " This may take several minutes (downloading large models)..."

# Install PyTorch with MPS support first
pip install torch torchvision torchaudio

# Install MLX for Apple Silicon
pip install mlx

# Install other dependencies
pip install -r "$SCRIPT_DIR/requirements.txt"

# Install scipy for audio resampling (needed by Voxtral)
pip install scipy

echo ""
echo "5. Verifying installation..."

# Test imports — fail fast if the core stack is broken.
python -c "import torch; print(f' PyTorch {torch.__version__} - MPS available: {torch.backends.mps.is_available()}')"
python -c "import mlx; print(f' MLX installed')" 2>/dev/null || echo " MLX not available (CPU fallback)"
python -c "import fastapi; print(f' FastAPI {fastapi.__version__}')"

echo ""
echo "6. Downloading Whisper model (large-v3-turbo)..."
echo " This will download ~1.6 GB on first run..."
# Pre-download the model; failure is non-fatal because the service
# downloads the model lazily on the first transcription request.
python -c "
from lightning_whisper_mlx import LightningWhisperMLX
print(' Initializing Whisper model...')
whisper = LightningWhisperMLX(model='large-v3-turbo', batch_size=12)
print(' Whisper model ready!')
" || echo " Note: Model will be downloaded on first transcription request"

echo ""
echo "=============================================="
echo " Setup Complete!"
echo "=============================================="
echo ""
echo "To start the STT service:"
echo ""
echo " cd $SCRIPT_DIR"
echo " source .venv/bin/activate"
echo " uvicorn app.main:app --host 0.0.0.0 --port 3020"
echo ""
echo "Or use the systemd/launchd service (recommended for production):"
echo ""
echo " ./scripts/mac-mini/setup-stt.sh"
echo ""
echo "API Endpoints:"
echo " POST /transcribe - Whisper transcription"
echo " POST /transcribe/voxtral - Voxtral transcription"
echo " POST /transcribe/auto - Auto-select best model"
echo " GET /health - Health check"
echo " GET /models - List available models"
echo ""
||||
Loading…
Add table
Add a link
Reference in a new issue