""" Kokoro TTS Service for fast preset voice synthesis. Uses mlx-audio's Kokoro implementation optimized for Apple Silicon. """ import logging from dataclasses import dataclass from typing import Optional import numpy as np logger = logging.getLogger(__name__) # Global singleton for lazy initialization _kokoro_model = None _kokoro_model_name = None # Default model DEFAULT_KOKORO_MODEL = "mlx-community/Kokoro-82M-bf16" # Available Kokoro voices (American Female/Male, British Female/Male) KOKORO_VOICES = { # American Female voices "af_heart": "American Female - Heart (warm, emotional)", "af_alloy": "American Female - Alloy (neutral, professional)", "af_aoede": "American Female - Aoede (clear, articulate)", "af_bella": "American Female - Bella (friendly, approachable)", "af_jessica": "American Female - Jessica (confident, clear)", "af_kore": "American Female - Kore (calm, measured)", "af_nicole": "American Female - Nicole (bright, energetic)", "af_nova": "American Female - Nova (modern, dynamic)", "af_river": "American Female - River (smooth, flowing)", "af_sarah": "American Female - Sarah (warm, conversational)", "af_sky": "American Female - Sky (light, airy)", # American Male voices "am_adam": "American Male - Adam (deep, authoritative)", "am_echo": "American Male - Echo (resonant, clear)", "am_eric": "American Male - Eric (professional, neutral)", "am_fenrir": "American Male - Fenrir (strong, commanding)", "am_liam": "American Male - Liam (friendly, casual)", "am_michael": "American Male - Michael (warm, trustworthy)", "am_onyx": "American Male - Onyx (deep, smooth)", "am_puck": "American Male - Puck (playful, light)", # British Female voices "bf_alice": "British Female - Alice (refined, elegant)", "bf_emma": "British Female - Emma (clear, professional)", "bf_isabella": "British Female - Isabella (sophisticated, warm)", "bf_lily": "British Female - Lily (soft, gentle)", # British Male voices "bm_daniel": "British Male - Daniel (classic, authoritative)", "bm_fable": "British Male - Fable (storyteller, expressive)", "bm_george": "British Male - George (traditional, clear)", "bm_lewis": "British Male - Lewis (modern, approachable)", } DEFAULT_VOICE = "af_heart" @dataclass class KokoroResult: """Result from Kokoro TTS synthesis.""" audio: np.ndarray sample_rate: int voice: str duration: float def get_kokoro_model(model_name: str = DEFAULT_KOKORO_MODEL): """ Get or create Kokoro model instance (singleton pattern). Args: model_name: HuggingFace model identifier Returns: Kokoro model instance """ global _kokoro_model, _kokoro_model_name # Return existing model if same model name if _kokoro_model is not None and _kokoro_model_name == model_name: return _kokoro_model logger.info(f"Loading Kokoro model: {model_name}") try: from mlx_audio.tts import load _kokoro_model = load(model_name) _kokoro_model_name = model_name logger.info("Kokoro model loaded successfully") return _kokoro_model except ImportError as e: logger.error(f"Failed to import mlx_audio: {e}") raise RuntimeError( "mlx-audio not installed. Run: pip install mlx-audio" ) except Exception as e: logger.error(f"Failed to load Kokoro model: {e}") raise def is_kokoro_loaded() -> bool: """Check if Kokoro model is currently loaded.""" return _kokoro_model is not None def get_available_voices() -> dict[str, str]: """Get dictionary of available Kokoro voices.""" return KOKORO_VOICES.copy() async def synthesize_kokoro( text: str, voice: str = DEFAULT_VOICE, speed: float = 1.0, model_name: str = DEFAULT_KOKORO_MODEL, ) -> KokoroResult: """ Synthesize speech using Kokoro TTS. Args: text: Text to synthesize voice: Voice ID from KOKORO_VOICES speed: Speech speed multiplier (0.5-2.0) model_name: HuggingFace model identifier Returns: KokoroResult with audio data """ # Validate voice if voice not in KOKORO_VOICES: logger.warning(f"Unknown voice '{voice}', using default '{DEFAULT_VOICE}'") voice = DEFAULT_VOICE # Clamp speed to valid range speed = max(0.5, min(2.0, speed)) # Get model model = get_kokoro_model(model_name) logger.info(f"Synthesizing with Kokoro: voice={voice}, speed={speed}, text_length={len(text)}") try: # Generate audio using mlx-audio's generate method # Returns a generator of GenerationResult objects result_gen = model.generate( text=text, voice=voice, speed=speed, ) # Collect all audio chunks from the generator audio_chunks = [] sample_rate = 24000 # Default, will be updated from result for result in result_gen: # Each result has audio, sample_rate, audio_duration (string) sample_rate = result.sample_rate # Convert MLX array to numpy audio_np = np.array(result.audio, dtype=np.float32) audio_chunks.append(audio_np) # Concatenate all chunks if audio_chunks: full_audio = np.concatenate(audio_chunks) else: raise RuntimeError("No audio generated") # Calculate duration from audio length total_duration = len(full_audio) / sample_rate logger.info(f"Kokoro synthesis complete: duration={total_duration:.2f}s") return KokoroResult( audio=full_audio, sample_rate=sample_rate, voice=voice, duration=total_duration, ) except Exception as e: logger.error(f"Kokoro synthesis failed: {e}") raise RuntimeError(f"TTS synthesis failed: {e}")