# managarten/services/mana-tts/app/f5_service.py

"""
F5-TTS Service for voice cloning synthesis.
Uses f5-tts-mlx optimized for Apple Silicon.
"""

import logging
import os
import tempfile
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

import numpy as np

# Module-wide logger, named after this module.
logger = logging.getLogger(__name__)

# Lazily-populated singleton state: the loaded model object and the model
# identifier it was loaded from. Both stay None until get_f5_model() succeeds.
_f5_model = None
_f5_model_name = None

# Model identifier used when a caller does not pass one; can be overridden
# through the F5_MODEL environment variable.
DEFAULT_F5_MODEL = os.environ.get("F5_MODEL", "lucasnewman/f5-tts-mlx")

# Fallback generation parameters
DEFAULT_STEPS = 32
DEFAULT_CFG_STRENGTH = 2.0
DEFAULT_SWAY_COEF = -1.0
DEFAULT_SPEED = 1.0
DEFAULT_DURATION = 10.0  # seconds


@dataclass
class F5Result:
    """
    Container for the output of a single F5-TTS synthesis call.

    Attributes:
        audio: Synthesized audio samples.
        sample_rate: Sampling rate that belongs to ``audio``
            (presumably in Hz - confirm against the model output).
        duration: Length of ``audio`` in seconds.
        voice_id: Optional identifier of the voice used; defaults to None.
    """

    audio: np.ndarray
    sample_rate: int
    duration: float
    voice_id: Optional[str] = None


def get_f5_model(model_name: str = DEFAULT_F5_MODEL):
    """
    Get or create F5-TTS model instance (singleton pattern).

    The instance is cached in module globals. Calling again with the same
    ``model_name`` returns the cached instance; calling with a different
    name loads a new model and replaces the cached one.

    Args:
        model_name: HuggingFace model identifier

    Returns:
        F5TTS model instance

    Raises:
        RuntimeError: If importing ``f5_tts_mlx`` fails. The original
            ImportError is attached as the exception's ``__cause__``.
        Exception: Any other error raised while loading the model is logged
            and re-raised unchanged.
    """
    global _f5_model, _f5_model_name

    # Return existing model if same model name
    if _f5_model is not None and _f5_model_name == model_name:
        return _f5_model

    logger.info(f"Loading F5-TTS model: {model_name}")

    try:
        # Imported lazily so this module can be imported even when the
        # optional dependency is missing.
        from f5_tts_mlx import F5TTS

        # NOTE(review): confirm this constructor signature against the
        # installed f5-tts-mlx version.
        _f5_model = F5TTS(model_name=model_name)
        _f5_model_name = model_name
        logger.info("F5-TTS model loaded successfully")
        return _f5_model

    except ImportError as e:
        logger.error(f"Failed to import f5_tts_mlx: {e}")
        # Chain explicitly so the underlying import failure stays visible
        # as the cause in tracebacks.
        raise RuntimeError(
            "f5-tts-mlx not installed. Run: pip install f5-tts-mlx"
        ) from e
    except Exception as e:
        logger.error(f"Failed to load F5-TTS model: {e}")
        raise


def is_f5_loaded() -> bool:
    """Report whether a model instance is currently cached in this module."""
    if _f5_model is None:
        return False
    return True


async def synthesize_f5(
    text: str,
    reference_audio_path: str,
    reference_text: str,
    duration: Optional[float] = None,
    steps: int = DEFAULT_STEPS,
    cfg_strength: float = DEFAULT_CFG_STRENGTH,
    sway_coef: float = DEFAULT_SWAY_COEF,
    speed: float = DEFAULT_SPEED,
    model_name: str = DEFAULT_F5_MODEL,
) -> F5Result:
    """
    Synthesize speech using F5-TTS with voice cloning.

    Note that ``model.generate`` is called synchronously inside this
    coroutine; nothing is awaited while generation runs.

    Args:
        text: Text to synthesize
        reference_audio_path: Path to reference audio file
        reference_text: Transcript of the reference audio
        duration: Target duration in seconds. Passed to the model unchanged;
            None presumably lets the model choose - confirm against the
            model's generate() contract.
        steps: Number of diffusion steps
        cfg_strength: Classifier-free guidance strength
        sway_coef: Sway sampling coefficient
        speed: Speech speed multiplier
        model_name: HuggingFace model identifier

    Returns:
        F5Result with audio data, sample rate and the duration computed as
        ``len(audio) / sample_rate``. ``voice_id`` is left at its default.

    Raises:
        RuntimeError: If the model cannot be imported (from get_f5_model),
            or if generation fails; in the latter case the original
            exception is attached as ``__cause__``.
    """
    # Get model (errors from loading propagate as raised by get_f5_model)
    model = get_f5_model(model_name)

    logger.info(
        f"Synthesizing with F5-TTS: text_length={len(text)}, "
        f"ref_audio={reference_audio_path}, steps={steps}"
    )

    try:
        # Generate audio
        audio, sample_rate = model.generate(
            text=text,
            ref_audio_path=reference_audio_path,
            ref_audio_text=reference_text,
            duration=duration,
            steps=steps,
            cfg_strength=cfg_strength,
            sway_coef=sway_coef,
            speed=speed,
        )

        # Duration of the generated audio, derived from the sample count
        audio_duration = len(audio) / sample_rate

        logger.info(f"F5-TTS synthesis complete: duration={audio_duration:.2f}s")

        return F5Result(
            audio=audio,
            sample_rate=sample_rate,
            duration=audio_duration,
        )

    except Exception as e:
        logger.error(f"F5-TTS synthesis failed: {e}")
        # Chain explicitly so the root cause is preserved for callers and
        # shows up in tracebacks.
        raise RuntimeError(f"Voice cloning synthesis failed: {e}") from e


async def synthesize_f5_from_bytes(
    text: str,
    reference_audio_bytes: bytes,
    reference_text: str,
    audio_extension: str = ".wav",
    **kwargs,
) -> F5Result:
    """
    Synthesize speech using F5-TTS with reference audio as bytes.

    The bytes are written to a temporary file, which is handed to
    synthesize_f5 and removed afterwards, whether synthesis succeeds or not.

    Args:
        text: Text to synthesize
        reference_audio_bytes: Reference audio as bytes
        reference_text: Transcript of the reference audio
        audio_extension: File extension for temp file. A missing leading
            dot is added, so "wav" and ".wav" are equivalent.
        **kwargs: Additional arguments passed to synthesize_f5

    Returns:
        F5Result with audio data

    Raises:
        OSError: If the temporary file cannot be created or written.
        RuntimeError: Propagated from synthesize_f5.
    """
    # Make sure the temp file gets a real extension even if the caller
    # passed it without the dot.
    suffix = audio_extension
    if suffix and not suffix.startswith("."):
        suffix = f".{suffix}"

    # delete=False because the file must outlive the handle: it is reopened
    # by path during synthesis. Removal is done manually in ``finally``.
    tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
    tmp_path = tmp.name

    try:
        # Writing happens inside the try so a failed write still triggers
        # the cleanup below instead of leaking the file.
        with tmp:
            tmp.write(reference_audio_bytes)

        return await synthesize_f5(
            text=text,
            reference_audio_path=tmp_path,
            reference_text=reference_text,
            **kwargs,
        )
    finally:
        # Best-effort cleanup: never mask the synthesis result or error,
        # but leave a trace if the file could not be removed.
        try:
            Path(tmp_path).unlink()
        except FileNotFoundError:
            pass  # already gone, nothing to clean up
        except OSError as e:
            logger.warning(
                f"Failed to remove temporary reference audio {tmp_path}: {e}"
            )


def estimate_duration(
    text: str,
    speed: float = 1.0,
    *,
    words_per_minute: float = 150.0,
    chars_per_word: float = 5.0,
) -> float:
    """
    Estimate audio duration from text.

    The estimate is purely character-based: the word count is approximated
    as ``len(text) / chars_per_word`` and converted to seconds using
    ``words_per_minute``, then divided by ``speed``.

    Args:
        text: Text to synthesize
        speed: Speech speed multiplier (must be > 0)
        words_per_minute: Assumed speaking rate at speed 1.0 (must be > 0)
        chars_per_word: Assumed average word length in characters
            (must be > 0)

    Returns:
        Estimated duration in seconds (0.0 for empty text)

    Raises:
        ValueError: If speed, words_per_minute or chars_per_word is not a
            positive number.
    """
    # ``not x > 0`` also rejects NaN, which ``x <= 0`` would let through.
    if not speed > 0:
        raise ValueError(f"speed must be positive, got {speed!r}")
    if not words_per_minute > 0:
        raise ValueError(
            f"words_per_minute must be positive, got {words_per_minute!r}"
        )
    if not chars_per_word > 0:
        raise ValueError(
            f"chars_per_word must be positive, got {chars_per_word!r}"
        )

    # Rough estimate: character count -> words -> minutes -> seconds
    words = len(text) / chars_per_word
    minutes = words / words_per_minute
    seconds = minutes * 60
    return seconds / speed