managarten/services/mana-tts/app/kokoro_service.py

"""
Kokoro TTS Service for fast preset voice synthesis.
CUDA version using kokoro PyTorch package.
"""

import logging
from dataclasses import dataclass
from typing import Optional

import numpy as np

logger = logging.getLogger(__name__)

# Global singleton for lazy initialization
_kokoro_pipeline = None

# Default model
DEFAULT_KOKORO_MODEL = "hexgrad/Kokoro-82M"

# Available Kokoro voices (American Female/Male, British Female/Male)
KOKORO_VOICES = {
    # American Female voices
    "af_heart": "American Female - Heart (warm, emotional)",
    "af_alloy": "American Female - Alloy (neutral, professional)",
    "af_aoede": "American Female - Aoede (clear, articulate)",
    "af_bella": "American Female - Bella (friendly, approachable)",
    "af_jessica": "American Female - Jessica (confident, clear)",
    "af_kore": "American Female - Kore (calm, measured)",
    "af_nicole": "American Female - Nicole (bright, energetic)",
    "af_nova": "American Female - Nova (modern, dynamic)",
    "af_river": "American Female - River (smooth, flowing)",
    "af_sarah": "American Female - Sarah (warm, conversational)",
    "af_sky": "American Female - Sky (light, airy)",
    # American Male voices
    "am_adam": "American Male - Adam (deep, authoritative)",
    "am_echo": "American Male - Echo (resonant, clear)",
    "am_eric": "American Male - Eric (professional, neutral)",
    "am_fenrir": "American Male - Fenrir (strong, commanding)",
    "am_liam": "American Male - Liam (friendly, casual)",
    "am_michael": "American Male - Michael (warm, trustworthy)",
    "am_onyx": "American Male - Onyx (deep, smooth)",
    "am_puck": "American Male - Puck (playful, light)",
    # British Female voices
    "bf_alice": "British Female - Alice (refined, elegant)",
    "bf_emma": "British Female - Emma (clear, professional)",
    "bf_isabella": "British Female - Isabella (sophisticated, warm)",
    "bf_lily": "British Female - Lily (soft, gentle)",
    # British Male voices
    "bm_daniel": "British Male - Daniel (classic, authoritative)",
    "bm_fable": "British Male - Fable (storyteller, expressive)",
    "bm_george": "British Male - George (traditional, clear)",
    "bm_lewis": "British Male - Lewis (modern, approachable)",
}

DEFAULT_VOICE = "af_heart"


@dataclass
class KokoroResult:
    """Result from Kokoro TTS synthesis."""

    audio: np.ndarray
    sample_rate: int
    voice: str
    duration: float


def get_kokoro_model(model_name: str = DEFAULT_KOKORO_MODEL):
    """Get or create Kokoro pipeline instance (singleton pattern)."""
    global _kokoro_pipeline

    if _kokoro_pipeline is not None:
        return _kokoro_pipeline

    logger.info(f"Loading Kokoro model: {model_name}")

    try:
        from kokoro import KPipeline

        _kokoro_pipeline = KPipeline(lang_code="a")  # 'a' for American English
        logger.info("Kokoro pipeline loaded successfully")
        return _kokoro_pipeline

    except ImportError as e:
        logger.error(f"Failed to import kokoro: {e}")
        raise RuntimeError(
            "kokoro not installed. Run: pip install kokoro"
        )
    except Exception as e:
        logger.error(f"Failed to load Kokoro model: {e}")
        raise


def is_kokoro_loaded() -> bool:
    """Check if Kokoro model is currently loaded."""
    return _kokoro_pipeline is not None


def get_available_voices() -> dict[str, str]:
    """Get dictionary of available Kokoro voices."""
    return KOKORO_VOICES.copy()


async def synthesize_kokoro(
    text: str,
    voice: str = DEFAULT_VOICE,
    speed: float = 1.0,
    model_name: str = DEFAULT_KOKORO_MODEL,
) -> KokoroResult:
    """
    Synthesize speech using Kokoro TTS.

    Args:
        text: Text to synthesize
        voice: Voice ID from KOKORO_VOICES
        speed: Speech speed multiplier (0.5-2.0)
        model_name: Model identifier

    Returns:
        KokoroResult with audio data
    """
    # Validate voice
    if voice not in KOKORO_VOICES:
        logger.warning(f"Unknown voice '{voice}', using default '{DEFAULT_VOICE}'")
        voice = DEFAULT_VOICE

    # Clamp speed to valid range
    speed = max(0.5, min(2.0, speed))

    # Get model
    pipeline = get_kokoro_model(model_name)

    logger.info(f"Synthesizing with Kokoro: voice={voice}, speed={speed}, text_length={len(text)}")

    try:
        # Generate audio using kokoro pipeline
        audio_chunks = []
        sample_rate = 24000  # Kokoro default

        for result in pipeline(text, voice=voice, speed=speed):
            # result is a KPipelineResult with .audio (tensor) and .graphemes/.phonemes
            audio_np = result.audio.numpy()
            audio_chunks.append(audio_np)

        # Concatenate all chunks
        if audio_chunks:
            full_audio = np.concatenate(audio_chunks)
        else:
            raise RuntimeError("No audio generated")

        # Calculate duration from audio length
        total_duration = len(full_audio) / sample_rate

        logger.info(f"Kokoro synthesis complete: duration={total_duration:.2f}s")

        return KokoroResult(
            audio=full_audio,
            sample_rate=sample_rate,
            voice=voice,
            duration=total_duration,
        )

    except Exception as e:
        logger.error(f"Kokoro synthesis failed: {e}")
        raise RuntimeError(f"TTS synthesis failed: {e}")