managarten/services/mana-tts/app/piper_service.py

"""
German TTS Service - Piper TTS (local, fast) with Edge TTS fallback.

Primary: Piper TTS - 100% local, DSGVO-konform, very fast
Fallback: Edge TTS - Cloud-based (Microsoft), high quality but sends data externally
"""

import logging
import tempfile
import os
import asyncio
from dataclasses import dataclass
from typing import Optional
from pathlib import Path
import numpy as np
import soundfile as sf

logger = logging.getLogger(__name__)

# Paths for Piper models
PIPER_VOICES_DIR = Path(__file__).parent.parent / "piper_voices"

# Available German voices
PIPER_VOICES = {
    # === LOCAL PIPER VOICES (Primary - 100% local) ===
    "de_thorsten": {
        "type": "piper",
        "model": "thorsten_medium.onnx",
        "name": "Thorsten",
        "description": "Deutsche Männerstimme (lokal, schnell)",
        "language": "de",
        "gender": "male",
        "local": True,
    },
    "de_kerstin": {
        "type": "piper",
        "model": "kerstin_low.onnx",
        "name": "Kerstin",
        "description": "Deutsche Frauenstimme (lokal, schnell)",
        "language": "de",
        "gender": "female",
        "local": True,
    },
    # === EDGE TTS VOICES (Fallback - Cloud) ===
    "de_katja": {
        "type": "edge",
        "edge_voice": "de-DE-KatjaNeural",
        "name": "Katja",
        "description": "Deutsche Frauenstimme (Cloud)",
        "language": "de",
        "gender": "female",
        "local": False,
    },
    "de_conrad": {
        "type": "edge",
        "edge_voice": "de-DE-ConradNeural",
        "name": "Conrad",
        "description": "Deutsche Männerstimme (Cloud)",
        "language": "de",
        "gender": "male",
        "local": False,
    },
    "de_amala": {
        "type": "edge",
        "edge_voice": "de-DE-AmalaNeural",
        "name": "Amala",
        "description": "Deutsche Frauenstimme jung (Cloud)",
        "language": "de",
        "gender": "female",
        "local": False,
    },
    "de_florian": {
        "type": "edge",
        "edge_voice": "de-DE-FlorianNeural",
        "name": "Florian",
        "description": "Deutsche Männerstimme jung (Cloud)",
        "language": "de",
        "gender": "male",
        "local": False,
    },
    # Legacy alias - maps to local Thorsten
    "de_anna": {
        "type": "piper",
        "model": "thorsten_medium.onnx",
        "name": "Anna (→ Thorsten)",
        "description": "Alias für Thorsten (lokal)",
        "language": "de",
        "gender": "male",
        "local": True,
    },
}

DEFAULT_PIPER_VOICE = "de_thorsten"

# Cached Piper voice instances (one per model)
_piper_voices: dict = {}
_piper_available = None
_edge_available = None


def _get_piper_model_path(model_name: str) -> Path:
    """Get full path to a Piper model."""
    return PIPER_VOICES_DIR / model_name


def check_piper_available() -> bool:
    """Check if Piper TTS is available."""
    global _piper_available
    if _piper_available is not None:
        return _piper_available

    try:
        from piper import PiperVoice
        model_path = _get_piper_model_path("thorsten_medium.onnx")
        if model_path.exists():
            _piper_available = True
            logger.info(f"Piper TTS available with model: {model_path}")
        else:
            _piper_available = False
            logger.warning(f"Piper model not found: {model_path}")
    except ImportError as e:
        _piper_available = False
        logger.warning(f"Piper TTS not installed: {e}")

    return _piper_available


def _check_edge_available() -> bool:
    """Check if Edge TTS is available."""
    global _edge_available
    if _edge_available is not None:
        return _edge_available

    try:
        import edge_tts
        _edge_available = True
        logger.info("Edge TTS available as fallback")
    except ImportError:
        _edge_available = False
        logger.warning("Edge TTS not installed")

    return _edge_available


def is_piper_loaded() -> bool:
    """Check if any TTS is available."""
    return check_piper_available() or _check_edge_available()


def _get_piper_voice(model_name: str = "thorsten_medium.onnx"):
    """Get or create cached Piper voice instance for a specific model."""
    global _piper_voices

    if model_name in _piper_voices:
        return _piper_voices[model_name]

    if not check_piper_available():
        return None

    try:
        from piper import PiperVoice
        model_path = _get_piper_model_path(model_name)
        config_path = _get_piper_model_path(f"{model_name}.json")

        logger.info(f"Loading Piper voice from {model_path}")
        voice = PiperVoice.load(str(model_path), str(config_path))
        _piper_voices[model_name] = voice
        logger.info(f"Piper voice {model_name} loaded successfully")
        return voice
    except Exception as e:
        logger.error(f"Failed to load Piper voice {model_name}: {e}")
        return None


@dataclass
class PiperSynthesisResult:
    """Result of TTS synthesis."""
    audio: np.ndarray
    sample_rate: int
    duration: float
    voice: str


async def _synthesize_with_piper(
    text: str,
    voice_id: str = "de_thorsten",
    length_scale: float = 1.0,
) -> PiperSynthesisResult:
    """Synthesize using local Piper TTS."""
    # Get the model name for this voice
    voice_config = PIPER_VOICES.get(voice_id, PIPER_VOICES["de_thorsten"])
    model_name = voice_config.get("model", "thorsten_medium.onnx")

    piper_voice = _get_piper_voice(model_name)
    if piper_voice is None:
        raise RuntimeError(f"Piper voice {voice_id} not available")

    logger.debug(f"Piper synthesizing with {voice_id}: \"{text[:50]}...\"")

    # Piper uses length_scale directly (1.0 = normal, >1 = slower)
    # Run in thread pool to not block async
    loop = asyncio.get_event_loop()

    def _synth():
        audio_data = []
        for audio_chunk in piper_voice.synthesize_stream_raw(text, length_scale=length_scale):
            audio_data.append(audio_chunk)
        return b"".join(audio_data)

    audio_bytes = await loop.run_in_executor(None, _synth)

    # Convert to numpy (16-bit PCM)
    audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0
    sample_rate = piper_voice.config.sample_rate

    duration = len(audio) / sample_rate
    logger.debug(f"Piper synthesis complete: {duration:.2f}s, {sample_rate}Hz")

    return PiperSynthesisResult(
        audio=audio,
        sample_rate=sample_rate,
        duration=duration,
        voice=voice_id,
    )


async def _synthesize_with_edge(
    text: str,
    edge_voice: str,
    length_scale: float = 1.0,
) -> PiperSynthesisResult:
    """Synthesize using Edge TTS (cloud fallback)."""
    import edge_tts

    logger.debug(f"Edge TTS synthesizing: \"{text[:50]}...\" with voice={edge_voice}")

    # Convert length_scale to rate string
    rate_percent = int((1.0 / length_scale - 1.0) * 100)
    rate_str = f"{rate_percent:+d}%"

    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp_file:
        tmp_path = tmp_file.name

    try:
        communicate = edge_tts.Communicate(text, edge_voice, rate=rate_str)
        await communicate.save(tmp_path)

        audio, sample_rate = sf.read(tmp_path)

        if len(audio.shape) > 1:
            audio = audio.mean(axis=1)

        audio = audio.astype(np.float32)
        duration = len(audio) / sample_rate

        logger.debug(f"Edge TTS synthesis complete: {duration:.2f}s, {sample_rate}Hz")

        return PiperSynthesisResult(
            audio=audio,
            sample_rate=sample_rate,
            duration=duration,
            voice=edge_voice,
        )
    finally:
        if os.path.exists(tmp_path):
            os.unlink(tmp_path)


async def synthesize_piper(
    text: str,
    voice: str = DEFAULT_PIPER_VOICE,
    length_scale: float = 1.0,
) -> PiperSynthesisResult:
    """
    Synthesize speech - uses local Piper if available, falls back to Edge TTS.

    Args:
        text: Text to synthesize
        voice: Voice ID (e.g., "de_thorsten", "de_katja")
        length_scale: Speed control (1.0 = normal, >1 = slower, <1 = faster)

    Returns:
        PiperSynthesisResult with audio data
    """
    if not text.strip():
        raise ValueError("Text cannot be empty")

    # Get voice config
    if voice not in PIPER_VOICES:
        logger.warning(f"Unknown voice: {voice}, using default {DEFAULT_PIPER_VOICE}")
        voice = DEFAULT_PIPER_VOICE

    voice_config = PIPER_VOICES[voice]
    voice_type = voice_config.get("type", "piper")

    # Try local Piper first for piper-type voices
    if voice_type == "piper" and check_piper_available():
        try:
            return await _synthesize_with_piper(text, voice, length_scale)
        except Exception as e:
            logger.warning(f"Piper synthesis failed, trying Edge fallback: {e}")

    # Use Edge TTS for edge-type voices or as fallback
    if _check_edge_available():
        edge_voice = voice_config.get("edge_voice", "de-DE-ConradNeural")
        if voice_type == "piper":
            # Fallback: use appropriate Edge voice based on gender
            gender = voice_config.get("gender", "male")
            edge_voice = "de-DE-KatjaNeural" if gender == "female" else "de-DE-ConradNeural"
        return await _synthesize_with_edge(text, edge_voice, length_scale)

    raise RuntimeError("No TTS backend available (neither Piper nor Edge TTS)")


def list_piper_voices() -> list[dict]:
    """List all available German voices."""
    voices = []
    piper_available = check_piper_available()
    edge_available = _check_edge_available()

    for voice_id, config in PIPER_VOICES.items():
        # Skip legacy alias
        if voice_id == "de_anna":
            continue

        voice_type = config.get("type", "piper")
        is_available = (voice_type == "piper" and piper_available) or \
                       (voice_type == "edge" and edge_available)

        voices.append({
            "id": voice_id,
            "name": config["name"],
            "description": config["description"],
            "language": config["language"],
            "gender": config.get("gender", "unknown"),
            "local": config.get("local", False),
            "installed": is_available,
            "loaded": is_available,
        })

    # Sort: local voices first
    voices.sort(key=lambda v: (not v["local"], v["id"]))

    return voices


def get_piper_voice(voice_id: str) -> Optional[dict]:
    """Get voice configuration by ID."""
    if voice_id not in PIPER_VOICES:
        return None

    config = PIPER_VOICES[voice_id]
    voice_type = config.get("type", "piper")
    piper_available = check_piper_available()
    edge_available = _check_edge_available()

    is_available = (voice_type == "piper" and piper_available) or \
                   (voice_type == "edge" and edge_available)

    return {
        "id": voice_id,
        "name": config["name"],
        "description": config["description"],
        "language": config["language"],
        "gender": config.get("gender", "unknown"),
        "local": config.get("local", False),
        "installed": is_available,
        "loaded": is_available,
    }


async def download_piper_voice(voice_id: str) -> bool:
    """Check if voice is available."""
    if voice_id not in PIPER_VOICES:
        return False

    config = PIPER_VOICES[voice_id]
    voice_type = config.get("type", "piper")

    if voice_type == "piper":
        return check_piper_available()
    elif voice_type == "edge":
        return _check_edge_available()

    return False