managarten/services/mana-voice-bot/app/main.py
Till JS 4cb1bc1827 fix(mana-voice-bot): move default port 3050 → 3024 + Windows GPU deployment notes
mana-voice-bot's source default was 3050, which collided with mana-sync.
Today the collision is latent (voice-bot isn't deployed anywhere), but
sooner or later someone is going to start it on a host that's already
running mana-sync and the second one will refuse to bind. Moving to
3024 puts it inside the AI/ML port range alongside its dependencies
(stt 3020, tts 3022, image-gen 3023, llm 3025) and away from sync.
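
The changed default as it now reads in app/main.py (still overridable via
the PORT env var):

    PORT = int(os.getenv("PORT", "3024"))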

Updated:
- app/main.py — PORT default 3050 → 3024
- start.sh, setup.sh — same fix in the example commands
- CLAUDE.md — full rewrite. Old version described "Mac Mini deployment"
  with launchd; the new version explicitly says "not deployed yet" and
  documents the seven concrete steps to deploy on the Windows GPU box
  alongside the other AI services (Scheduled Task, service.pyw, .env,
  firewall rule, cloudflared route, WINDOWS_GPU_SERVER_SETUP.md update).
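
For the .env step listed above: the env vars the service actually reads (all
defined with defaults in app/main.py) are PORT, STT_URL, OLLAMA_URL,
DEFAULT_MODEL, DEFAULT_VOICE, SYSTEM_PROMPT and CORS_ORIGINS; only values
that differ from those defaults need to land in the Windows .env.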

docs/WINDOWS_GPU_SERVER_SETUP.md:
- Added the missing ManaVideoGen scheduled task to all four
  Start-ScheduledTask snippets — video-gen has been running on the
  Windows GPU but the doc had never picked it up.
- Added a "mana-video-gen (Port 3026)" service section parallel to the
  existing image-gen one, with venv path, repo pointer, model, etc.
- Added a repo-counterparts table mapping C:\mana\services\<svc>\ to the
  corresponding services/<svc>/ directory in the repo, plus a note that
  changes should flow repo→Windows, not the other way around.
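  (Illustrative row, assuming the Windows directory names follow the service
  names: C:\mana\services\mana-video-gen\ ↔ services/mana-video-gen/.)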

docs/PORT_SCHEMA.md:
- Reconciled the warning block with the post-cleanup reality: no more
  active or latent port collisions (image-gen ↔ video-gen and
  voice-bot ↔ sync are both resolved). Listed the actual ports per host
  with public URLs. Kept the planned-vs-actual disclaimer for the
  services that still don't match the aspirational ranges (mana-credits
  3061 vs planned 3002, etc).
2026-04-08 13:14:57 +02:00


"""
Mana Voice Bot - German Voice Assistant
Complete voice-to-voice pipeline:
1. STT: Whisper (mana-stt) - Speech to Text
2. LLM: Ollama (Gemma/Qwen) - Text Processing
3. TTS: Edge TTS - Text to Speech
Optimized for German language.
"""
import asyncio
import logging
import os
import tempfile
import time
from contextlib import asynccontextmanager
from typing import Optional
import aiohttp
import edge_tts
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import Response, StreamingResponse
from pydantic import BaseModel, Field
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
# Configuration
PORT = int(os.getenv("PORT", "3024"))
STT_URL = os.getenv("STT_URL", "http://localhost:3020")
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434")
DEFAULT_MODEL = os.getenv("DEFAULT_MODEL", "gemma3:4b")
DEFAULT_VOICE = os.getenv("DEFAULT_VOICE", "de-DE-ConradNeural")
SYSTEM_PROMPT = os.getenv("SYSTEM_PROMPT", """Du bist ein freundlicher und hilfreicher deutscher Sprachassistent.
Antworte immer auf Deutsch, kurz und prägnant.
Halte deine Antworten unter 3 Sätzen, es sei denn, der Nutzer fragt nach mehr Details.""")

CORS_ORIGINS = os.getenv(
    "CORS_ORIGINS",
    "https://mana.how,http://localhost:5173,http://localhost:3000",
).split(",")

# German Edge TTS voices
GERMAN_VOICES = {
    "de-DE-ConradNeural": "Male - Conrad (Professional)",
    "de-DE-KatjaNeural": "Female - Katja (Natural)",
    "de-DE-AmalaNeural": "Female - Amala (Friendly)",
    "de-DE-BerndNeural": "Male - Bernd (Calm)",
    "de-DE-ChristophNeural": "Male - Christoph (News)",
    "de-DE-ElkeNeural": "Female - Elke (Warm)",
    "de-DE-KillianNeural": "Male - Killian (Casual)",
    "de-DE-KlarissaNeural": "Female - Klarissa (Cheerful)",
    "de-DE-KlausNeural": "Male - Klaus (Storyteller)",
    "de-DE-LouisaNeural": "Female - Louisa (Assistant)",
    "de-DE-TanjaNeural": "Female - Tanja (Business)",
}


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan manager."""
    logger.info(f"Starting Mana Voice Bot on port {PORT}")
    logger.info(f"STT Service: {STT_URL}")
    logger.info(f"Ollama: {OLLAMA_URL}")
    logger.info(f"Default Model: {DEFAULT_MODEL}")
    logger.info(f"Default Voice: {DEFAULT_VOICE}")
    yield
    logger.info("Shutting down Mana Voice Bot")


app = FastAPI(
    title="Mana Voice Bot",
    description="German voice-to-voice assistant using Whisper + Ollama + Edge TTS",
    version="1.0.0",
    lifespan=lifespan,
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=CORS_ORIGINS,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


# ============================================================================
# Models
# ============================================================================

class ChatRequest(BaseModel):
    """Text chat request."""
    message: str = Field(..., description="User message")
    model: str = Field(DEFAULT_MODEL, description="Ollama model to use")
    voice: str = Field(DEFAULT_VOICE, description="TTS voice")
    system_prompt: Optional[str] = Field(None, description="Custom system prompt")


class TranscribeResponse(BaseModel):
    """Transcription response."""
    text: str
    language: str
    duration: float


class ChatResponse(BaseModel):
    """Chat response with text."""
    user_text: str
    assistant_text: str
    model: str
    processing_time: float


class VoiceResponse(BaseModel):
    """Voice processing response metadata."""
    user_text: str
    assistant_text: str
    model: str
    voice: str
    stt_time: float
    llm_time: float
    tts_time: float
    total_time: float


# ============================================================================
# Service Functions
# ============================================================================

async def transcribe_audio(audio_bytes: bytes, language: str = "de") -> dict:
    """Transcribe audio using mana-stt (Whisper)."""
    async with aiohttp.ClientSession() as session:
        data = aiohttp.FormData()
        data.add_field("file", audio_bytes, filename="audio.wav", content_type="audio/wav")
        data.add_field("language", language)
        async with session.post(f"{STT_URL}/transcribe", data=data) as response:
            if response.status != 200:
                error = await response.text()
                raise HTTPException(status_code=500, detail=f"STT error: {error}")
            return await response.json()


async def chat_with_ollama(
    message: str,
    model: str = DEFAULT_MODEL,
    system_prompt: str = SYSTEM_PROMPT,
) -> str:
    """Send message to Ollama and get response."""
    async with aiohttp.ClientSession() as session:
        payload = {
            "model": model,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": message},
            ],
            "stream": False,
        }
        async with session.post(
            f"{OLLAMA_URL}/api/chat",
            json=payload,
            timeout=aiohttp.ClientTimeout(total=120),
        ) as response:
            if response.status != 200:
                error = await response.text()
                raise HTTPException(status_code=500, detail=f"Ollama error: {error}")
            result = await response.json()
            return result.get("message", {}).get("content", "")


async def synthesize_speech(text: str, voice: str = DEFAULT_VOICE) -> bytes:
    """Synthesize speech using Edge TTS."""
    # Unknown voice names silently fall back to the default German voice.
    if voice not in GERMAN_VOICES:
        voice = DEFAULT_VOICE
    # Create the temp file with delete=False so the handle is closed before
    # edge_tts writes to the path (required on Windows), then clean up manually.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
        output_path = f.name
    try:
        communicate = edge_tts.Communicate(text, voice)
        await communicate.save(output_path)
        with open(output_path, "rb") as f:
            return f.read()
    finally:
        if os.path.exists(output_path):
            os.unlink(output_path)


# ============================================================================
# Endpoints
# ============================================================================

@app.get("/health")
async def health_check():
    """Health check endpoint."""
    # Check STT service
    stt_ok = False
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(f"{STT_URL}/health", timeout=aiohttp.ClientTimeout(total=5)) as r:
                stt_ok = r.status == 200
    except Exception:
        pass
    # Check Ollama
    ollama_ok = False
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(f"{OLLAMA_URL}/api/tags", timeout=aiohttp.ClientTimeout(total=5)) as r:
                ollama_ok = r.status == 200
    except Exception:
        pass
    return {
        "status": "healthy" if (stt_ok and ollama_ok) else "degraded",
        "services": {
            "stt": "ok" if stt_ok else "unavailable",
            "ollama": "ok" if ollama_ok else "unavailable",
            "tts": "ok",  # Edge TTS is always available (cloud API)
        },
        "config": {
            "default_model": DEFAULT_MODEL,
            "default_voice": DEFAULT_VOICE,
        },
    }


@app.get("/voices")
async def list_voices():
    """List available German voices."""
    return {
        "voices": GERMAN_VOICES,
        "default": DEFAULT_VOICE,
    }


@app.get("/models")
async def list_models():
    """List available Ollama models."""
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(f"{OLLAMA_URL}/api/tags") as response:
                if response.status == 200:
                    data = await response.json()
                    models = [m["name"] for m in data.get("models", [])]
                    return {"models": models, "default": DEFAULT_MODEL}
                # Non-200 from Ollama: fall through to the empty fallback below
                # instead of implicitly returning None.
                logger.error(f"Ollama /api/tags returned status {response.status}")
    except Exception as e:
        logger.error(f"Failed to get models: {e}")
    return {"models": [], "default": DEFAULT_MODEL, "error": "Could not fetch models"}

@app.post("/transcribe", response_model=TranscribeResponse)
async def transcribe(
audio: UploadFile = File(..., description="Audio file to transcribe"),
language: str = Form("de", description="Language code"),
):
"""Transcribe audio to text using Whisper."""
start = time.time()
audio_bytes = await audio.read()
if len(audio_bytes) == 0:
raise HTTPException(status_code=400, detail="Audio file is empty")
result = await transcribe_audio(audio_bytes, language)
return TranscribeResponse(
text=result.get("text", ""),
language=result.get("language", language),
duration=time.time() - start,
)
@app.post("/chat", response_model=ChatResponse)
async def chat(request: ChatRequest):
"""Text chat with the LLM."""
start = time.time()
response_text = await chat_with_ollama(
message=request.message,
model=request.model,
system_prompt=request.system_prompt or SYSTEM_PROMPT,
)
return ChatResponse(
user_text=request.message,
assistant_text=response_text,
model=request.model,
processing_time=time.time() - start,
)
@app.post("/chat/audio")
async def chat_audio(request: ChatRequest):
"""Text chat with audio response."""
start = time.time()
# Get LLM response
llm_start = time.time()
response_text = await chat_with_ollama(
message=request.message,
model=request.model,
system_prompt=request.system_prompt or SYSTEM_PROMPT,
)
llm_time = time.time() - llm_start
# Synthesize speech
tts_start = time.time()
audio_bytes = await synthesize_speech(response_text, request.voice)
tts_time = time.time() - tts_start
return Response(
content=audio_bytes,
media_type="audio/mpeg",
headers={
"X-User-Text": request.message[:100],
"X-Assistant-Text": response_text[:200],
"X-Model": request.model,
"X-Voice": request.voice,
"X-LLM-Time": str(round(llm_time, 2)),
"X-TTS-Time": str(round(tts_time, 2)),
"X-Total-Time": str(round(time.time() - start, 2)),
},
)
@app.post("/voice")
async def voice_to_voice(
audio: UploadFile = File(..., description="Audio input"),
model: str = Form(DEFAULT_MODEL, description="Ollama model"),
voice: str = Form(DEFAULT_VOICE, description="TTS voice"),
language: str = Form("de", description="Input language"),
system_prompt: Optional[str] = Form(None, description="Custom system prompt"),
):
"""
Complete voice-to-voice pipeline.
1. Transcribe audio input (Whisper)
2. Process with LLM (Ollama)
3. Synthesize response (Edge TTS)
Returns audio response with metadata headers.
"""
total_start = time.time()
# Read audio
audio_bytes = await audio.read()
if len(audio_bytes) == 0:
raise HTTPException(status_code=400, detail="Audio file is empty")
# 1. Speech-to-Text
stt_start = time.time()
try:
stt_result = await transcribe_audio(audio_bytes, language)
user_text = stt_result.get("text", "").strip()
except Exception as e:
logger.error(f"STT failed: {e}")
raise HTTPException(status_code=500, detail=f"Transcription failed: {e}")
stt_time = time.time() - stt_start
if not user_text:
raise HTTPException(status_code=400, detail="Could not transcribe audio - no speech detected")
logger.info(f"Transcribed: {user_text}")
# 2. LLM Processing
llm_start = time.time()
try:
assistant_text = await chat_with_ollama(
message=user_text,
model=model,
system_prompt=system_prompt or SYSTEM_PROMPT,
)
except Exception as e:
logger.error(f"LLM failed: {e}")
raise HTTPException(status_code=500, detail=f"LLM processing failed: {e}")
llm_time = time.time() - llm_start
logger.info(f"LLM response: {assistant_text[:100]}...")
# 3. Text-to-Speech
tts_start = time.time()
try:
response_audio = await synthesize_speech(assistant_text, voice)
except Exception as e:
logger.error(f"TTS failed: {e}")
raise HTTPException(status_code=500, detail=f"Speech synthesis failed: {e}")
tts_time = time.time() - tts_start
total_time = time.time() - total_start
logger.info(f"Pipeline complete - STT: {stt_time:.2f}s, LLM: {llm_time:.2f}s, TTS: {tts_time:.2f}s, Total: {total_time:.2f}s")
# Return audio with metadata
return Response(
content=response_audio,
media_type="audio/mpeg",
headers={
"X-User-Text": user_text[:200].replace("\n", " "),
"X-Assistant-Text": assistant_text[:500].replace("\n", " "),
"X-Model": model,
"X-Voice": voice,
"X-STT-Time": str(round(stt_time, 2)),
"X-LLM-Time": str(round(llm_time, 2)),
"X-TTS-Time": str(round(tts_time, 2)),
"X-Total-Time": str(round(total_time, 2)),
},
)
@app.post("/voice/metadata", response_model=VoiceResponse)
async def voice_to_voice_with_metadata(
audio: UploadFile = File(..., description="Audio input"),
model: str = Form(DEFAULT_MODEL, description="Ollama model"),
voice: str = Form(DEFAULT_VOICE, description="TTS voice"),
language: str = Form("de", description="Input language"),
system_prompt: Optional[str] = Form(None, description="Custom system prompt"),
):
"""
Voice-to-voice pipeline returning JSON metadata (without audio).
Useful for debugging or when you need the text response.
"""
total_start = time.time()
audio_bytes = await audio.read()
if len(audio_bytes) == 0:
raise HTTPException(status_code=400, detail="Audio file is empty")
# STT
stt_start = time.time()
stt_result = await transcribe_audio(audio_bytes, language)
user_text = stt_result.get("text", "").strip()
stt_time = time.time() - stt_start
if not user_text:
raise HTTPException(status_code=400, detail="No speech detected")
# LLM
llm_start = time.time()
assistant_text = await chat_with_ollama(
message=user_text,
model=model,
system_prompt=system_prompt or SYSTEM_PROMPT,
)
llm_time = time.time() - llm_start
# TTS (just measure time, don't return audio)
tts_start = time.time()
await synthesize_speech(assistant_text, voice)
tts_time = time.time() - tts_start
return VoiceResponse(
user_text=user_text,
assistant_text=assistant_text,
model=model,
voice=voice,
stt_time=round(stt_time, 2),
llm_time=round(llm_time, 2),
tts_time=round(tts_time, 2),
total_time=round(time.time() - total_start, 2),
)
@app.post("/tts")
async def text_to_speech(
text: str = Form(..., description="Text to synthesize"),
voice: str = Form(DEFAULT_VOICE, description="Voice to use"),
):
"""Direct text-to-speech synthesis."""
if not text.strip():
raise HTTPException(status_code=400, detail="Text is empty")
audio_bytes = await synthesize_speech(text, voice)
return Response(
content=audio_bytes,
media_type="audio/mpeg",
headers={
"X-Voice": voice,
"X-Text-Length": str(len(text)),
},
)
# ============================================================================
# Main
# ============================================================================
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=PORT)
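
# ----------------------------------------------------------------------------
# Example client (illustrative sketch only, not used by the service).
# Shows one way to call the /voice endpoint above with the `requests` library;
# the file names "frage.wav" / "antwort.mp3" are placeholders.
# ----------------------------------------------------------------------------
#
#   import requests
#
#   with open("frage.wav", "rb") as f:
#       r = requests.post(
#           "http://localhost:3024/voice",
#           files={"audio": ("frage.wav", f, "audio/wav")},
#           data={"model": "gemma3:4b", "voice": "de-DE-KatjaNeural", "language": "de"},
#           timeout=180,
#       )
#   r.raise_for_status()
#   print("User:", r.headers.get("X-User-Text"))
#   print("Assistant:", r.headers.get("X-Assistant-Text"))
#   with open("antwort.mp3", "wb") as out:
#       out.write(r.content)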