diff --git a/services/mana-voice-bot/CLAUDE.md b/services/mana-voice-bot/CLAUDE.md new file mode 100644 index 000000000..5519bea6d --- /dev/null +++ b/services/mana-voice-bot/CLAUDE.md @@ -0,0 +1,132 @@ +# CLAUDE.md - Mana Voice Bot + +## Service Overview + +German voice-to-voice assistant combining: +- **STT**: Whisper via mana-stt (Port 3020) +- **LLM**: Ollama with Gemma/Qwen (Port 11434) +- **TTS**: Edge TTS (Microsoft, cloud API) + +**Port**: 3050 + +## Architecture + +``` +Audio Input → Whisper (STT) → Ollama (LLM) → Edge TTS → Audio Output + ↓ ↓ ↓ ↓ + [WAV/MP3] [German Text] [Response] [MP3 Audio] +``` + +## Commands + +```bash +# Setup +./setup.sh + +# Development +source venv/bin/activate +uvicorn app.main:app --host 0.0.0.0 --port 3050 --reload + +# Production +./start.sh + +# Test +curl http://localhost:3050/health +``` + +## API Endpoints + +| Endpoint | Method | Description | +|----------|--------|-------------| +| `/health` | GET | Service health check | +| `/voices` | GET | List German TTS voices | +| `/models` | GET | List available Ollama models | +| `/transcribe` | POST | Audio → Text (STT only) | +| `/chat` | POST | Text → Text (LLM only) | +| `/chat/audio` | POST | Text → Audio (LLM + TTS) | +| `/tts` | POST | Text → Audio (TTS only) | +| `/voice` | POST | Audio → Audio (Full pipeline) | +| `/voice/metadata` | POST | Audio → JSON (Full pipeline, no audio) | + +## Usage Examples + +### Full Voice Pipeline +```bash +# Record audio and send to voice bot +curl -X POST http://localhost:3050/voice \ + -F "audio=@input.wav" \ + -F "model=gemma3:4b" \ + -F "voice=de-DE-ConradNeural" \ + -o response.mp3 +``` + +### Text to Audio +```bash +curl -X POST http://localhost:3050/chat/audio \ + -H "Content-Type: application/json" \ + -d '{"message": "Was ist die Hauptstadt von Deutschland?", "voice": "de-DE-KatjaNeural"}' \ + -o response.mp3 +``` + +### TTS Only +```bash +curl -X POST http://localhost:3050/tts \ + -F "text=Hallo, wie geht es dir?" 
\ + -F "voice=de-DE-ConradNeural" \ + -o hello.mp3 +``` + +## German Voices + +| Voice ID | Description | +|----------|-------------| +| `de-DE-ConradNeural` | Male - Professional (Default) | +| `de-DE-KatjaNeural` | Female - Natural | +| `de-DE-AmalaNeural` | Female - Friendly | +| `de-DE-BerndNeural` | Male - Calm | +| `de-DE-ChristophNeural` | Male - News | +| `de-DE-ElkeNeural` | Female - Warm | +| `de-DE-KillianNeural` | Male - Casual | +| `de-DE-KlarissaNeural` | Female - Cheerful | +| `de-DE-KlausNeural` | Male - Storyteller | +| `de-DE-LouisaNeural` | Female - Assistant | +| `de-DE-TanjaNeural` | Female - Business | + +## Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `PORT` | `3050` | Service port | +| `STT_URL` | `http://localhost:3020` | mana-stt URL | +| `OLLAMA_URL` | `http://localhost:11434` | Ollama URL | +| `DEFAULT_MODEL` | `gemma3:4b` | Default LLM model | +| `DEFAULT_VOICE` | `de-DE-ConradNeural` | Default TTS voice | +| `SYSTEM_PROMPT` | (German assistant) | LLM system prompt | + +## Dependencies + +- `fastapi` - Web framework +- `uvicorn` - ASGI server +- `aiohttp` - Async HTTP client +- `edge-tts` - Microsoft TTS +- `python-multipart` - File uploads + +## Performance + +Typical latency breakdown: +- STT (Whisper): 0.5-2s +- LLM (Gemma 4B): 1-5s +- TTS (Edge): 0.3-0.5s +- **Total**: 2-7s + +## Mac Mini Deployment + +```bash +# On Mac Mini +cd ~/projects/manacore-monorepo/services/mana-voice-bot +./setup.sh +./start.sh + +# Or with launchd (autostart) +# See scripts/mac-mini/setup-voice-bot.sh +``` diff --git a/services/mana-voice-bot/app/__init__.py b/services/mana-voice-bot/app/__init__.py new file mode 100644 index 000000000..b51cb8ece --- /dev/null +++ b/services/mana-voice-bot/app/__init__.py @@ -0,0 +1 @@ +# Mana Voice Bot diff --git a/services/mana-voice-bot/app/main.py b/services/mana-voice-bot/app/main.py new file mode 100644 index 000000000..5115c98ed --- /dev/null +++ 
# services/mana-voice-bot/app/main.py
"""
Mana Voice Bot - German Voice Assistant

Complete voice-to-voice pipeline:
1. STT: Whisper (mana-stt) - Speech to Text
2. LLM: Ollama (Gemma/Qwen) - Text Processing
3. TTS: Edge TTS - Text to Speech

Optimized for German language.
"""

import asyncio
import logging
import os
import tempfile
import time
from contextlib import asynccontextmanager
from typing import Optional
from urllib.parse import quote

import aiohttp
import edge_tts
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import Response, StreamingResponse
from pydantic import BaseModel, Field

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# Configuration - every setting can be overridden via environment variables.
PORT = int(os.getenv("PORT", "3050"))
STT_URL = os.getenv("STT_URL", "http://localhost:3020")
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434")
DEFAULT_MODEL = os.getenv("DEFAULT_MODEL", "gemma3:4b")
DEFAULT_VOICE = os.getenv("DEFAULT_VOICE", "de-DE-ConradNeural")
SYSTEM_PROMPT = os.getenv("SYSTEM_PROMPT", """Du bist ein freundlicher und hilfreicher deutscher Sprachassistent.
Antworte immer auf Deutsch, kurz und prägnant.
Halte deine Antworten unter 3 Sätzen, es sei denn, der Nutzer fragt nach mehr Details.""")

CORS_ORIGINS = os.getenv(
    "CORS_ORIGINS",
    "https://mana.how,http://localhost:5173,http://localhost:3000",
).split(",")

# German Edge TTS voices: voice id -> human-readable description.
# Unknown voice ids fall back to DEFAULT_VOICE (see synthesize_speech).
GERMAN_VOICES = {
    "de-DE-ConradNeural": "Male - Conrad (Professional)",
    "de-DE-KatjaNeural": "Female - Katja (Natural)",
    "de-DE-AmalaNeural": "Female - Amala (Friendly)",
    "de-DE-BerndNeural": "Male - Bernd (Calm)",
    "de-DE-ChristophNeural": "Male - Christoph (News)",
    "de-DE-ElkeNeural": "Female - Elke (Warm)",
    "de-DE-KillianNeural": "Male - Killian (Casual)",
    "de-DE-KlarissaNeural": "Female - Klarissa (Cheerful)",
    "de-DE-KlausNeural": "Male - Klaus (Storyteller)",
    "de-DE-LouisaNeural": "Female - Louisa (Assistant)",
    "de-DE-TanjaNeural": "Female - Tanja (Business)",
}


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan manager: log effective config on startup/shutdown."""
    logger.info(f"Starting Mana Voice Bot on port {PORT}")
    logger.info(f"STT Service: {STT_URL}")
    logger.info(f"Ollama: {OLLAMA_URL}")
    logger.info(f"Default Model: {DEFAULT_MODEL}")
    logger.info(f"Default Voice: {DEFAULT_VOICE}")
    yield
    logger.info("Shutting down Mana Voice Bot")


app = FastAPI(
    title="Mana Voice Bot",
    description="German voice-to-voice assistant using Whisper + Ollama + Edge TTS",
    version="1.0.0",
    lifespan=lifespan,
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=CORS_ORIGINS,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


# ============================================================================
# Models
# ============================================================================


class ChatRequest(BaseModel):
    """Text chat request."""
    message: str = Field(..., description="User message")
    model: str = Field(DEFAULT_MODEL, description="Ollama model to use")
    voice: str = Field(DEFAULT_VOICE, description="TTS voice")
    system_prompt: Optional[str] = Field(None, description="Custom system prompt")


class TranscribeResponse(BaseModel):
    """Transcription response."""
    text: str
    language: str
    duration: float


class ChatResponse(BaseModel):
    """Chat response with text."""
    user_text: str
    assistant_text: str
    model: str
    processing_time: float


class VoiceResponse(BaseModel):
    """Voice processing response metadata (per-stage timings in seconds)."""
    user_text: str
    assistant_text: str
    model: str
    voice: str
    stt_time: float
    llm_time: float
    tts_time: float
    total_time: float


# ============================================================================
# Service Functions
# ============================================================================


def _header_safe(text: str, limit: int) -> str:
    """Make free-form text safe to send in an HTTP response header.

    HTTP header values must be latin-1; transcripts and LLM output can contain
    arbitrary Unicode (typographic quotes, dashes, etc.) which previously made
    Starlette raise UnicodeEncodeError -> opaque 500. The value is truncated,
    newlines are flattened, and the result is percent-encoded (clients should
    URL-decode the X-*-Text headers).
    """
    return quote(text[:limit].replace("\n", " "), safe=" ")


async def transcribe_audio(audio_bytes: bytes, language: str = "de") -> dict:
    """Transcribe audio using mana-stt (Whisper).

    Raises HTTPException(500) with the upstream error text on STT failure.
    """
    async with aiohttp.ClientSession() as session:
        data = aiohttp.FormData()
        data.add_field("file", audio_bytes, filename="audio.wav", content_type="audio/wav")
        data.add_field("language", language)

        async with session.post(f"{STT_URL}/transcribe", data=data) as response:
            if response.status != 200:
                error = await response.text()
                raise HTTPException(status_code=500, detail=f"STT error: {error}")
            return await response.json()


async def chat_with_ollama(
    message: str,
    model: str = DEFAULT_MODEL,
    system_prompt: str = SYSTEM_PROMPT,
) -> str:
    """Send a single-turn message to Ollama's /api/chat and return the reply text.

    Uses a generous 120s timeout since local LLM inference can be slow.
    Raises HTTPException(500) with the upstream error text on failure.
    """
    async with aiohttp.ClientSession() as session:
        payload = {
            "model": model,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": message},
            ],
            "stream": False,
        }

        async with session.post(
            f"{OLLAMA_URL}/api/chat",
            json=payload,
            timeout=aiohttp.ClientTimeout(total=120),
        ) as response:
            if response.status != 200:
                error = await response.text()
                raise HTTPException(status_code=500, detail=f"Ollama error: {error}")

            result = await response.json()
            return result.get("message", {}).get("content", "")


async def synthesize_speech(text: str, voice: str = DEFAULT_VOICE) -> bytes:
    """Synthesize speech using Edge TTS; returns MP3 bytes.

    Unknown voices silently fall back to DEFAULT_VOICE. edge-tts only writes
    to a file path, so a temp file is used and always cleaned up.
    """
    if voice not in GERMAN_VOICES:
        voice = DEFAULT_VOICE

    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
        output_path = f.name

    try:
        communicate = edge_tts.Communicate(text, voice)
        await communicate.save(output_path)

        with open(output_path, "rb") as f:
            return f.read()
    finally:
        if os.path.exists(output_path):
            os.unlink(output_path)


# ============================================================================
# Endpoints
# ============================================================================


@app.get("/health")
async def health_check():
    """Health check: probes STT and Ollama (5s timeout each); TTS is cloud-hosted."""
    # Check STT service
    stt_ok = False
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(f"{STT_URL}/health", timeout=aiohttp.ClientTimeout(total=5)) as r:
                stt_ok = r.status == 200
    except Exception:
        pass  # best-effort probe; report "unavailable" below instead of failing

    # Check Ollama
    ollama_ok = False
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(f"{OLLAMA_URL}/api/tags", timeout=aiohttp.ClientTimeout(total=5)) as r:
                ollama_ok = r.status == 200
    except Exception:
        pass

    return {
        "status": "healthy" if (stt_ok and ollama_ok) else "degraded",
        "services": {
            "stt": "ok" if stt_ok else "unavailable",
            "ollama": "ok" if ollama_ok else "unavailable",
            "tts": "ok",  # Edge TTS is always available (cloud API)
        },
        "config": {
            "default_model": DEFAULT_MODEL,
            "default_voice": DEFAULT_VOICE,
        },
    }


@app.get("/voices")
async def list_voices():
    """List available German voices."""
    return {
        "voices": GERMAN_VOICES,
        "default": DEFAULT_VOICE,
    }


@app.get("/models")
async def list_models():
    """List available Ollama models; degrades to an error payload if unreachable."""
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(f"{OLLAMA_URL}/api/tags") as response:
                if response.status == 200:
                    data = await response.json()
                    models = [m["name"] for m in data.get("models", [])]
                    return {"models": models, "default": DEFAULT_MODEL}
    except Exception as e:
        logger.error(f"Failed to get models: {e}")

    return {"models": [], "default": DEFAULT_MODEL, "error": "Could not fetch models"}


@app.post("/transcribe", response_model=TranscribeResponse)
async def transcribe(
    audio: UploadFile = File(..., description="Audio file to transcribe"),
    language: str = Form("de", description="Language code"),
):
    """Transcribe audio to text using Whisper (STT only)."""
    start = time.time()

    audio_bytes = await audio.read()
    if len(audio_bytes) == 0:
        raise HTTPException(status_code=400, detail="Audio file is empty")

    result = await transcribe_audio(audio_bytes, language)

    return TranscribeResponse(
        text=result.get("text", ""),
        language=result.get("language", language),
        duration=time.time() - start,
    )


@app.post("/chat", response_model=ChatResponse)
async def chat(request: ChatRequest):
    """Text chat with the LLM (no audio)."""
    start = time.time()

    response_text = await chat_with_ollama(
        message=request.message,
        model=request.model,
        system_prompt=request.system_prompt or SYSTEM_PROMPT,
    )

    return ChatResponse(
        user_text=request.message,
        assistant_text=response_text,
        model=request.model,
        processing_time=time.time() - start,
    )


@app.post("/chat/audio")
async def chat_audio(request: ChatRequest):
    """Text chat with audio response (LLM + TTS).

    Returns MP3 audio; text and timings are exposed via X-* headers
    (X-User-Text / X-Assistant-Text are URL-encoded).
    """
    start = time.time()

    # Get LLM response
    llm_start = time.time()
    response_text = await chat_with_ollama(
        message=request.message,
        model=request.model,
        system_prompt=request.system_prompt or SYSTEM_PROMPT,
    )
    llm_time = time.time() - llm_start

    # Synthesize speech
    tts_start = time.time()
    audio_bytes = await synthesize_speech(response_text, request.voice)
    tts_time = time.time() - tts_start

    return Response(
        content=audio_bytes,
        media_type="audio/mpeg",
        headers={
            "X-User-Text": _header_safe(request.message, 100),
            "X-Assistant-Text": _header_safe(response_text, 200),
            "X-Model": request.model,
            "X-Voice": request.voice,
            "X-LLM-Time": str(round(llm_time, 2)),
            "X-TTS-Time": str(round(tts_time, 2)),
            "X-Total-Time": str(round(time.time() - start, 2)),
        },
    )


@app.post("/voice")
async def voice_to_voice(
    audio: UploadFile = File(..., description="Audio input"),
    model: str = Form(DEFAULT_MODEL, description="Ollama model"),
    voice: str = Form(DEFAULT_VOICE, description="TTS voice"),
    language: str = Form("de", description="Input language"),
    system_prompt: Optional[str] = Form(None, description="Custom system prompt"),
):
    """
    Complete voice-to-voice pipeline.

    1. Transcribe audio input (Whisper)
    2. Process with LLM (Ollama)
    3. Synthesize response (Edge TTS)

    Returns audio response with metadata headers (X-User-Text /
    X-Assistant-Text are URL-encoded).
    """
    total_start = time.time()

    # Read audio
    audio_bytes = await audio.read()
    if len(audio_bytes) == 0:
        raise HTTPException(status_code=400, detail="Audio file is empty")

    # 1. Speech-to-Text
    stt_start = time.time()
    try:
        stt_result = await transcribe_audio(audio_bytes, language)
        user_text = stt_result.get("text", "").strip()
    except HTTPException:
        # Propagate the service's own error unchanged instead of re-wrapping
        # it as "Transcription failed: 500: ...".
        raise
    except Exception as e:
        logger.error(f"STT failed: {e}")
        raise HTTPException(status_code=500, detail=f"Transcription failed: {e}")
    stt_time = time.time() - stt_start

    if not user_text:
        raise HTTPException(status_code=400, detail="Could not transcribe audio - no speech detected")

    logger.info(f"Transcribed: {user_text}")

    # 2. LLM Processing
    llm_start = time.time()
    try:
        assistant_text = await chat_with_ollama(
            message=user_text,
            model=model,
            system_prompt=system_prompt or SYSTEM_PROMPT,
        )
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"LLM failed: {e}")
        raise HTTPException(status_code=500, detail=f"LLM processing failed: {e}")
    llm_time = time.time() - llm_start

    logger.info(f"LLM response: {assistant_text[:100]}...")

    # 3. Text-to-Speech
    tts_start = time.time()
    try:
        response_audio = await synthesize_speech(assistant_text, voice)
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"TTS failed: {e}")
        raise HTTPException(status_code=500, detail=f"Speech synthesis failed: {e}")
    tts_time = time.time() - tts_start

    total_time = time.time() - total_start

    logger.info(f"Pipeline complete - STT: {stt_time:.2f}s, LLM: {llm_time:.2f}s, TTS: {tts_time:.2f}s, Total: {total_time:.2f}s")

    # Return audio with metadata
    return Response(
        content=response_audio,
        media_type="audio/mpeg",
        headers={
            "X-User-Text": _header_safe(user_text, 200),
            "X-Assistant-Text": _header_safe(assistant_text, 500),
            "X-Model": model,
            "X-Voice": voice,
            "X-STT-Time": str(round(stt_time, 2)),
            "X-LLM-Time": str(round(llm_time, 2)),
            "X-TTS-Time": str(round(tts_time, 2)),
            "X-Total-Time": str(round(total_time, 2)),
        },
    )


@app.post("/voice/metadata", response_model=VoiceResponse)
async def voice_to_voice_with_metadata(
    audio: UploadFile = File(..., description="Audio input"),
    model: str = Form(DEFAULT_MODEL, description="Ollama model"),
    voice: str = Form(DEFAULT_VOICE, description="TTS voice"),
    language: str = Form("de", description="Input language"),
    system_prompt: Optional[str] = Form(None, description="Custom system prompt"),
):
    """
    Voice-to-voice pipeline returning JSON metadata (without audio).
    Useful for debugging or when you need the text response.
    """
    total_start = time.time()

    audio_bytes = await audio.read()
    if len(audio_bytes) == 0:
        raise HTTPException(status_code=400, detail="Audio file is empty")

    # STT
    stt_start = time.time()
    stt_result = await transcribe_audio(audio_bytes, language)
    user_text = stt_result.get("text", "").strip()
    stt_time = time.time() - stt_start

    if not user_text:
        raise HTTPException(status_code=400, detail="No speech detected")

    # LLM
    llm_start = time.time()
    assistant_text = await chat_with_ollama(
        message=user_text,
        model=model,
        system_prompt=system_prompt or SYSTEM_PROMPT,
    )
    llm_time = time.time() - llm_start

    # TTS (just measure time, don't return audio)
    tts_start = time.time()
    await synthesize_speech(assistant_text, voice)
    tts_time = time.time() - tts_start

    return VoiceResponse(
        user_text=user_text,
        assistant_text=assistant_text,
        model=model,
        voice=voice,
        stt_time=round(stt_time, 2),
        llm_time=round(llm_time, 2),
        tts_time=round(tts_time, 2),
        total_time=round(time.time() - total_start, 2),
    )


@app.post("/tts")
async def text_to_speech(
    text: str = Form(..., description="Text to synthesize"),
    voice: str = Form(DEFAULT_VOICE, description="Voice to use"),
):
    """Direct text-to-speech synthesis (TTS only)."""
    if not text.strip():
        raise HTTPException(status_code=400, detail="Text is empty")

    audio_bytes = await synthesize_speech(text, voice)

    return Response(
        content=audio_bytes,
        media_type="audio/mpeg",
        headers={
            "X-Voice": voice,
            "X-Text-Length": str(len(text)),
        },
    )


# ============================================================================
# Main
# ============================================================================


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=PORT)
# ---------------------------------------------------------------------------
# services/mana-voice-bot/requirements.txt
# ---------------------------------------------------------------------------
# fastapi>=0.100.0
# uvicorn>=0.20.0
# aiohttp>=3.8.0
# edge-tts>=6.0.0
# python-multipart>=0.0.6
# pydantic>=2.0.0

# ---------------------------------------------------------------------------
# services/mana-voice-bot/setup.sh
# ---------------------------------------------------------------------------
#!/bin/bash
# Setup script for Mana Voice Bot

set -e

echo "Setting up Mana Voice Bot..."

# Create virtual environment (idempotent: reuse an existing one)
if [ ! -d "venv" ]; then
    echo "Creating virtual environment..."
    python3 -m venv venv
fi

# Activate and install dependencies
source venv/bin/activate
pip install --upgrade pip
pip install -r requirements.txt

echo ""
echo "Setup complete!"
echo ""
echo "To start the service:"
echo "  source venv/bin/activate"
echo "  uvicorn app.main:app --host 0.0.0.0 --port 3050 --reload"
echo ""
echo "Or use the start script:"
echo "  ./start.sh"

# ---------------------------------------------------------------------------
# services/mana-voice-bot/start.sh
# ---------------------------------------------------------------------------
#!/bin/bash
# Start Mana Voice Bot

# Fail fast: without this, a missing/broken venv would be silently ignored
# and uvicorn would run against the system Python.
set -e

cd "$(dirname "$0")"
source venv/bin/activate

export PORT=${PORT:-3050}
export STT_URL=${STT_URL:-http://localhost:3020}
export OLLAMA_URL=${OLLAMA_URL:-http://localhost:11434}
export DEFAULT_MODEL=${DEFAULT_MODEL:-gemma3:4b}
export DEFAULT_VOICE=${DEFAULT_VOICE:-de-DE-ConradNeural}

echo "Starting Mana Voice Bot..."
echo "  Port: $PORT"
echo "  STT: $STT_URL"
echo "  Ollama: $OLLAMA_URL"
echo "  Model: $DEFAULT_MODEL"
echo "  Voice: $DEFAULT_VOICE"
echo ""

# Quote $PORT so a malformed value cannot split into extra arguments.
exec uvicorn app.main:app --host 0.0.0.0 --port "$PORT"