managarten/services/mana-tts/app/kokoro_service.py
Till-JS 5a0815708c 🌐 feat: add i18n support to 6 web apps
Add internationalization (DE + EN) to previously missing apps:
- todo: task management translations
- skilltree: skill/XP system translations
- nutriphi: nutrition tracking translations
- planta: plant care translations
- questions: research app translations
- matrix: chat client translations (layout integration)

Each app includes:
- svelte-i18n setup with SSR support
- localStorage persistence ({app}_locale pattern)
- i18n loading state in +layout.svelte
- German (default) and English translations

Updated CONSISTENCY_REPORT.md to mark i18n task as complete.

Also includes:
- mana-tts service placeholder files
2026-01-29 14:48:35 +01:00

187 lines
5.8 KiB
Python

"""
Kokoro TTS Service for fast preset voice synthesis.
Uses mlx-audio's Kokoro implementation optimized for Apple Silicon.
"""
import logging
from dataclasses import dataclass
from typing import Optional
import numpy as np
logger = logging.getLogger(__name__)

# Lazily populated module state: the loaded model object and the identifier
# it was loaded from. Both remain None until get_kokoro_model() succeeds.
_kokoro_model = None
_kokoro_model_name: Optional[str] = None

# Model identifier used when a caller does not pass one explicitly.
DEFAULT_KOKORO_MODEL = "mlx-community/Kokoro-82M-bf16"

# Preset voice catalogue: voice ID -> human-readable description.
# ID prefixes: af/am = American female/male, bf/bm = British female/male.
KOKORO_VOICES: dict[str, str] = {
    # --- American, female ---
    "af_heart": "American Female - Heart (warm, emotional)",
    "af_alloy": "American Female - Alloy (neutral, professional)",
    "af_aoede": "American Female - Aoede (clear, articulate)",
    "af_bella": "American Female - Bella (friendly, approachable)",
    "af_jessica": "American Female - Jessica (confident, clear)",
    "af_kore": "American Female - Kore (calm, measured)",
    "af_nicole": "American Female - Nicole (bright, energetic)",
    "af_nova": "American Female - Nova (modern, dynamic)",
    "af_river": "American Female - River (smooth, flowing)",
    "af_sarah": "American Female - Sarah (warm, conversational)",
    "af_sky": "American Female - Sky (light, airy)",
    # --- American, male ---
    "am_adam": "American Male - Adam (deep, authoritative)",
    "am_echo": "American Male - Echo (resonant, clear)",
    "am_eric": "American Male - Eric (professional, neutral)",
    "am_fenrir": "American Male - Fenrir (strong, commanding)",
    "am_liam": "American Male - Liam (friendly, casual)",
    "am_michael": "American Male - Michael (warm, trustworthy)",
    "am_onyx": "American Male - Onyx (deep, smooth)",
    "am_puck": "American Male - Puck (playful, light)",
    # --- British, female ---
    "bf_alice": "British Female - Alice (refined, elegant)",
    "bf_emma": "British Female - Emma (clear, professional)",
    "bf_isabella": "British Female - Isabella (sophisticated, warm)",
    "bf_lily": "British Female - Lily (soft, gentle)",
    # --- British, male ---
    "bm_daniel": "British Male - Daniel (classic, authoritative)",
    "bm_fable": "British Male - Fable (storyteller, expressive)",
    "bm_george": "British Male - George (traditional, clear)",
    "bm_lewis": "British Male - Lewis (modern, approachable)",
}

# Voice substituted when a requested ID is not a key of KOKORO_VOICES.
DEFAULT_VOICE = "af_heart"
@dataclass
class KokoroResult:
    """Output of a single Kokoro synthesis call.

    Attributes:
        audio: Synthesized waveform samples.
        sample_rate: Sample rate reported for ``audio``.
        voice: ID of the voice used for synthesis.
        duration: Length of ``audio`` in seconds.
    """

    audio: np.ndarray
    sample_rate: int
    voice: str
    duration: float
def get_kokoro_model(model_name: str = DEFAULT_KOKORO_MODEL):
    """
    Get or create the Kokoro model instance (module-level singleton).

    The loaded model is cached in module globals and reused for as long as
    the same ``model_name`` is requested. Requesting a different name loads
    that model and replaces the cached one.

    Args:
        model_name: HuggingFace model identifier

    Returns:
        Kokoro model instance

    Raises:
        RuntimeError: If ``mlx_audio`` cannot be imported, or if loading the
            model fails with an ImportError. The original error is attached
            as ``__cause__``.
        Exception: Any other error raised while loading the model is logged
            and re-raised unchanged.
    """
    global _kokoro_model, _kokoro_model_name

    # Reuse the cached instance when the same model is requested again.
    if _kokoro_model is not None and _kokoro_model_name == model_name:
        return _kokoro_model

    logger.info("Loading Kokoro model: %s", model_name)

    # Imported inside the function so that importing this module does not
    # require mlx-audio. Only the import itself is guarded here, so the
    # "not installed" message is never produced by a failure inside load().
    try:
        from mlx_audio.tts import load
    except ImportError as e:
        logger.error("Failed to import mlx_audio: %s", e)
        raise RuntimeError(
            "mlx-audio not installed. Run: pip install mlx-audio"
        ) from e

    try:
        model = load(model_name)
    except ImportError as e:
        # mlx_audio itself imported fine; something load() needs is missing.
        # Still surfaced as RuntimeError so callers see the same exception
        # type as before, but with a message naming the real failure.
        logger.error("Failed to load Kokoro model: %s", e)
        raise RuntimeError(
            f"Failed to load Kokoro model '{model_name}': {e}"
        ) from e
    except Exception as e:
        logger.error("Failed to load Kokoro model: %s", e)
        raise

    # Update the cache only after a successful load, so a failed attempt
    # leaves any previously cached model in place.
    _kokoro_model = model
    _kokoro_model_name = model_name
    logger.info("Kokoro model loaded successfully")
    return model
def is_kokoro_loaded() -> bool:
    """Report whether a Kokoro model instance is currently cached."""
    if _kokoro_model is None:
        return False
    return True
def get_available_voices() -> dict[str, str]:
    """Return a shallow copy of the voice ID -> description mapping.

    A new dict is returned on each call, so callers may modify the result
    without affecting KOKORO_VOICES.
    """
    voices = dict(KOKORO_VOICES)
    return voices
async def synthesize_kokoro(
    text: str,
    voice: str = DEFAULT_VOICE,
    speed: float = 1.0,
    model_name: str = DEFAULT_KOKORO_MODEL,
) -> KokoroResult:
    """
    Synthesize speech using Kokoro TTS.

    Args:
        text: Text to synthesize
        voice: Voice ID from KOKORO_VOICES; an unknown ID is replaced by
            DEFAULT_VOICE after logging a warning
        speed: Speech speed multiplier; clamped to the range 0.5-2.0
        model_name: HuggingFace model identifier

    Returns:
        KokoroResult holding the concatenated audio, the sample rate taken
        from the generated results, the voice actually used, and the
        duration in seconds

    Raises:
        RuntimeError: If generation raises or yields no audio. The
            underlying exception is attached as ``__cause__``.
        Exception: Errors raised by get_kokoro_model() propagate unchanged,
            because the model is obtained outside the guarded section.
    """
    # NOTE(review): this coroutine contains no ``await``, so the whole
    # synthesis runs synchronously inside the caller's event loop - confirm
    # that is acceptable for the service's latency requirements.

    # Fall back to the default voice rather than failing on an unknown ID.
    if voice not in KOKORO_VOICES:
        logger.warning(
            "Unknown voice '%s', using default '%s'", voice, DEFAULT_VOICE
        )
        voice = DEFAULT_VOICE

    # Clamp speed to the supported range.
    speed = max(0.5, min(2.0, speed))

    model = get_kokoro_model(model_name)

    logger.info(
        "Synthesizing with Kokoro: voice=%s, speed=%s, text_length=%s",
        voice,
        speed,
        len(text),
    )

    try:
        audio_chunks = []
        sample_rate = 24000  # Fallback; overwritten by every generated result

        # generate() is iterated as a stream of result objects; each one is
        # read for its ``audio`` and ``sample_rate`` attributes.
        for result in model.generate(text=text, voice=voice, speed=speed):
            sample_rate = result.sample_rate
            # Convert the chunk to a float32 numpy array so that all chunks
            # can be concatenated below.
            audio_chunks.append(np.array(result.audio, dtype=np.float32))

        if not audio_chunks:
            raise RuntimeError("No audio generated")

        full_audio = np.concatenate(audio_chunks)

        # Duration is derived from the sample count, not from the results.
        total_duration = len(full_audio) / sample_rate

        logger.info(
            "Kokoro synthesis complete: duration=%.2fs", total_duration
        )

        return KokoroResult(
            audio=full_audio,
            sample_rate=sample_rate,
            voice=voice,
            duration=total_duration,
        )
    except Exception as e:
        logger.error("Kokoro synthesis failed: %s", e)
        # Chain explicitly so the root cause stays attached to the wrapper.
        raise RuntimeError(f"TTS synthesis failed: {e}") from e