managarten/services/mana-tts/app/f5_service.py
Till-JS 5a0815708c 🌐 feat: add i18n support to 6 web apps
Add internationalization (DE + EN) to previously missing apps:
- todo: task management translations
- skilltree: skill/XP system translations
- nutriphi: nutrition tracking translations
- planta: plant care translations
- questions: research app translations
- matrix: chat client translations (layout integration)

Each app includes:
- svelte-i18n setup with SSR support
- localStorage persistence ({app}_locale pattern)
- i18n loading state in +layout.svelte
- German (default) and English translations

Updated CONSISTENCY_REPORT.md to mark i18n task as complete.

Also includes:
- mana-tts service placeholder files
2026-01-29 14:48:35 +01:00

208 lines
5.3 KiB
Python

"""
F5-TTS Service for voice cloning synthesis.
Uses f5-tts-mlx optimized for Apple Silicon.
"""
import logging
import os
import tempfile
from dataclasses import dataclass
from pathlib import Path
from typing import Optional
import numpy as np
# Logger scoped to this module's import name.
logger = logging.getLogger(__name__)

# Model cache shared by the functions in this module: the loaded model object
# and the identifier it was loaded with. Both start empty.
_f5_model = None
_f5_model_name = None

# Model identifier used when a caller does not supply one. The F5_MODEL
# environment variable, when set, takes precedence over the built-in value.
DEFAULT_F5_MODEL = os.environ.get("F5_MODEL", "lucasnewman/f5-tts-mlx")

# Fallback generation parameters.
DEFAULT_DURATION = 10.0  # seconds
DEFAULT_STEPS = 32
DEFAULT_CFG_STRENGTH = 2.0
DEFAULT_SWAY_COEF = -1.0
DEFAULT_SPEED = 1.0
@dataclass
class F5Result:
    """Container for the output of one F5-TTS synthesis call."""

    # Generated audio samples.
    audio: np.ndarray
    # Sample rate reported alongside the audio.
    sample_rate: int
    # Length of the generated audio in seconds.
    duration: float
    # Optional identifier of the voice used; unset unless the caller fills it in.
    voice_id: Optional[str] = None
def get_f5_model(model_name: str = DEFAULT_F5_MODEL):
    """
    Get or create the F5-TTS model instance (module-level cache).

    The loaded model is kept in the module globals ``_f5_model`` and
    ``_f5_model_name``. A call with the same ``model_name`` returns the cached
    instance; a call with a different name loads that model and replaces the
    cache. If loading fails, the previously cached model (if any) is left
    untouched because the globals are only assigned after construction
    succeeds.

    Args:
        model_name: Model identifier handed to ``F5TTS(model_name=...)``.

    Returns:
        The cached or freshly loaded F5TTS instance.

    Raises:
        RuntimeError: If an ImportError occurs while importing ``f5_tts_mlx``
            or while constructing the model. The ImportError is attached as
            ``__cause__``.
        Exception: Any other error raised during loading is logged and
            re-raised unchanged.
    """
    global _f5_model, _f5_model_name

    # Fast path: the requested model is already loaded.
    if _f5_model is not None and _f5_model_name == model_name:
        return _f5_model

    logger.info(f"Loading F5-TTS model: {model_name}")
    try:
        # Imported here rather than at module top so that importing this
        # module does not require f5_tts_mlx to be installed.
        from f5_tts_mlx import F5TTS

        _f5_model = F5TTS(model_name=model_name)
        _f5_model_name = model_name
        logger.info("F5-TTS model loaded successfully")
        return _f5_model
    except ImportError as e:
        logger.error(f"Failed to import f5_tts_mlx: {e}")
        # Chain explicitly so the traceback names the ImportError as the
        # direct cause instead of an incidental "during handling" context.
        raise RuntimeError(
            "f5-tts-mlx not installed. Run: pip install f5-tts-mlx"
        ) from e
    except Exception as e:
        logger.error(f"Failed to load F5-TTS model: {e}")
        raise
def is_f5_loaded() -> bool:
    """Report whether the module-level model cache currently holds a model."""
    cache_is_empty = _f5_model is None
    return not cache_is_empty
async def synthesize_f5(
    text: str,
    reference_audio_path: str,
    reference_text: str,
    duration: Optional[float] = None,
    steps: int = DEFAULT_STEPS,
    cfg_strength: float = DEFAULT_CFG_STRENGTH,
    sway_coef: float = DEFAULT_SWAY_COEF,
    speed: float = DEFAULT_SPEED,
    model_name: str = DEFAULT_F5_MODEL,
) -> F5Result:
    """
    Synthesize speech using F5-TTS with voice cloning.

    Note: although declared ``async``, this function never awaits;
    ``model.generate`` runs synchronously and blocks the event loop for the
    length of the generation.

    Args:
        text: Text to synthesize.
        reference_audio_path: Path to the reference audio file.
        reference_text: Transcript of the reference audio.
        duration: Target duration in seconds. ``None`` is forwarded to
            ``model.generate`` unchanged; this function does not compute a
            value itself (presumably the model picks one - TODO confirm).
        steps: Number of diffusion steps.
        cfg_strength: Classifier-free guidance strength.
        sway_coef: Sway sampling coefficient.
        speed: Speech speed multiplier.
        model_name: Model identifier passed to ``get_f5_model``.

    Returns:
        F5Result holding the audio, its sample rate and its duration in
        seconds. ``voice_id`` is left at its default.

    Raises:
        RuntimeError: If generation fails for any reason; the underlying
            exception is attached as ``__cause__``. Errors from loading the
            model are raised by ``get_f5_model`` and are not wrapped here.
    """
    # Model loading happens outside the try block on purpose: its errors
    # propagate as raised by get_f5_model.
    model = get_f5_model(model_name)

    logger.info(
        f"Synthesizing with F5-TTS: text_length={len(text)}, "
        f"ref_audio={reference_audio_path}, steps={steps}"
    )

    try:
        audio, sample_rate = model.generate(
            text=text,
            ref_audio_path=reference_audio_path,
            ref_audio_text=reference_text,
            duration=duration,
            steps=steps,
            cfg_strength=cfg_strength,
            sway_coef=sway_coef,
            speed=speed,
        )

        # Assumes len(audio) is the number of samples - TODO confirm the
        # layout of the array returned by model.generate.
        audio_duration = len(audio) / sample_rate
        logger.info(f"F5-TTS synthesis complete: duration={audio_duration:.2f}s")

        return F5Result(
            audio=audio,
            sample_rate=sample_rate,
            duration=audio_duration,
        )
    except Exception as e:
        logger.error(f"F5-TTS synthesis failed: {e}")
        # Chain explicitly so the original failure is preserved as the cause.
        raise RuntimeError(f"Voice cloning synthesis failed: {e}") from e
async def synthesize_f5_from_bytes(
    text: str,
    reference_audio_bytes: bytes,
    reference_text: str,
    audio_extension: str = ".wav",
    **kwargs,
) -> F5Result:
    """
    Synthesize speech using F5-TTS with the reference audio given as bytes.

    The bytes are written to a temporary file whose path is passed to
    ``synthesize_f5``. The file is removed afterwards whether writing or
    synthesis succeeds or fails.

    Args:
        text: Text to synthesize.
        reference_audio_bytes: Reference audio as bytes.
        reference_text: Transcript of the reference audio.
        audio_extension: Suffix used for the temp file name (e.g. ".wav").
        **kwargs: Additional arguments passed through to ``synthesize_f5``.

    Returns:
        F5Result with audio data, as returned by ``synthesize_f5``.

    Raises:
        Whatever ``synthesize_f5`` raises, or an OSError if the reference
        audio cannot be written to the temp file.
    """
    # delete=False: the file has to survive being closed so that it can be
    # reopened by path during synthesis.
    tmp = tempfile.NamedTemporaryFile(suffix=audio_extension, delete=False)
    tmp_path = tmp.name

    try:
        # The write sits inside the try so that a failed write (e.g. disk
        # full) does not leave the temp file behind.
        with tmp:
            tmp.write(reference_audio_bytes)

        return await synthesize_f5(
            text=text,
            reference_audio_path=tmp_path,
            reference_text=reference_text,
            **kwargs,
        )
    finally:
        # Best-effort cleanup: never mask the synthesis result or error, but
        # do not hide unexpected cleanup failures either.
        try:
            Path(tmp_path).unlink()
        except FileNotFoundError:
            # Already gone; nothing to clean up.
            pass
        except OSError as e:
            logger.warning(
                f"Failed to remove temporary reference audio {tmp_path}: {e}"
            )
def estimate_duration(text: str, speed: float = 1.0) -> float:
    """
    Estimate audio duration from text length.

    Uses a rough heuristic of ~150 spoken words per minute at normal speed
    and ~5 characters per word, then scales by ``speed``.

    Args:
        text: Text to synthesize.
        speed: Speech speed multiplier; must be greater than zero.

    Returns:
        Estimated duration in seconds (0.0 for empty text).

    Raises:
        ValueError: If ``speed`` is zero, negative or NaN.
    """
    # `not speed > 0` (rather than `speed <= 0`) also rejects NaN.
    if not speed > 0:
        raise ValueError(f"speed must be a positive number, got {speed!r}")

    chars_per_word = 5
    words_per_minute = 150

    words = len(text) / chars_per_word
    minutes = words / words_per_minute
    return minutes * 60 / speed