# managarten/services/mana-tts/app/f5_service.py

"""
F5-TTS Service for voice cloning synthesis.
CUDA version using f5-tts PyTorch package.
"""

import logging
import os
import tempfile
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

import numpy as np

# Module-level logger, named after this module.
logger: logging.Logger = logging.getLogger(__name__)

# Process-wide F5-TTS API instance; stays None until get_f5_model()
# has successfully created it.
_f5_api = None

# Model identifier, overridable through the F5_MODEL environment variable.
DEFAULT_F5_MODEL: str = os.environ.get("F5_MODEL", "F5-TTS")

# Generation defaults used as the default arguments of synthesize_f5().
DEFAULT_STEPS: int = 32
DEFAULT_CFG_STRENGTH: float = 2.0
DEFAULT_SWAY_COEF: float = -1.0
DEFAULT_SPEED: float = 1.0


@dataclass
class F5Result:
    """Result from F5-TTS synthesis.

    Attributes:
        audio: Synthesized waveform samples as a NumPy array.
        sample_rate: Number of samples per second of ``audio``.
        duration: Length of the audio in seconds
            (``len(audio) / sample_rate`` when built by ``synthesize_f5``).
        voice_id: Optional voice identifier; defaults to None and is not
            set anywhere in this module (presumably filled in by callers —
            verify against the call sites).
    """

    audio: np.ndarray
    sample_rate: int
    duration: float
    voice_id: Optional[str] = None


def get_f5_model(model_name: str = DEFAULT_F5_MODEL):
    """Get or create the shared F5-TTS API instance (singleton pattern).

    The first successful call lazily imports ``f5_tts``, builds the API
    object and caches it in the module-level ``_f5_api``. Every later call
    returns that cached object without reloading anything.

    Args:
        model_name: Model identifier. NOTE(review): this value is currently
            only written to the log; the instance is always created with
            ``model_type="F5-TTS"``, and an already-cached instance is
            returned regardless of the name requested.

    Returns:
        The cached ``F5TTS`` API instance.

    Raises:
        RuntimeError: If an ImportError occurs while importing or
            constructing F5-TTS. The original ImportError is attached as
            ``__cause__``.
        Exception: Any other error raised while loading the model is logged
            and re-raised unchanged.
    """
    global _f5_api

    if _f5_api is not None:
        return _f5_api

    logger.info(f"Loading F5-TTS model: {model_name}")

    try:
        # Imported lazily so the module can be loaded without f5-tts present.
        from f5_tts.api import F5TTS

        _f5_api = F5TTS(model_type="F5-TTS")
        logger.info("F5-TTS model loaded successfully (CUDA)")
        return _f5_api

    except ImportError as e:
        logger.error(f"Failed to import f5_tts: {e}")
        # Chain explicitly so the underlying import failure stays visible.
        raise RuntimeError(
            "f5-tts not installed. Run: pip install f5-tts"
        ) from e
    except Exception as e:
        logger.error(f"Failed to load F5-TTS model: {e}")
        raise


def is_f5_loaded() -> bool:
    """Report whether the shared F5-TTS instance has already been created."""
    loaded = _f5_api is not None
    return loaded


async def synthesize_f5(
    text: str,
    reference_audio_path: str,
    reference_text: str,
    duration: Optional[float] = None,
    steps: int = DEFAULT_STEPS,
    cfg_strength: float = DEFAULT_CFG_STRENGTH,
    sway_coef: float = DEFAULT_SWAY_COEF,
    speed: float = DEFAULT_SPEED,
    model_name: str = DEFAULT_F5_MODEL,
) -> F5Result:
    """
    Synthesize speech using F5-TTS with voice cloning.

    The blocking ``api.infer`` call is run in the event loop's default
    executor so the loop stays responsive while the model generates audio.

    Args:
        text: Text to synthesize
        reference_audio_path: Path to reference audio file
        reference_text: Transcript of the reference audio
        duration: Target duration in seconds. NOTE(review): accepted for
            interface compatibility but currently not forwarded to F5-TTS.
        steps: Number of diffusion steps (passed as ``nfe_step``)
        cfg_strength: Classifier-free guidance strength
        sway_coef: Sway sampling coefficient (passed as ``sway_sampling_coef``)
        speed: Speech speed multiplier
        model_name: Model identifier handed to ``get_f5_model``

    Returns:
        F5Result with the audio array, its sample rate and its duration
        in seconds (``voice_id`` is left at its default).

    Raises:
        RuntimeError: If inference or post-processing fails; the original
            exception is attached as ``__cause__``. Errors raised by
            ``get_f5_model`` propagate unchanged.
    """
    import asyncio

    api = get_f5_model(model_name)

    logger.info(
        f"Synthesizing with F5-TTS: text_length={len(text)}, "
        f"ref_audio={reference_audio_path}, steps={steps}"
    )

    def _generate():
        # The third element of the returned tuple is not needed here.
        wav, sr, _ = api.infer(
            ref_file=reference_audio_path,
            ref_text=reference_text,
            gen_text=text,
            nfe_step=steps,
            cfg_strength=cfg_strength,
            # The F5-TTS API spells this keyword with a single trailing "f".
            sway_sampling_coef=sway_coef,
            speed=speed,
        )
        return wav, sr

    try:
        # Inside a coroutine there is always a running loop; infer() is
        # synchronous, so offload it to the default thread pool.
        loop = asyncio.get_running_loop()
        audio, sample_rate = await loop.run_in_executor(None, _generate)

        # Convert to numpy if needed
        if not isinstance(audio, np.ndarray):
            audio = np.array(audio, dtype=np.float32)

        # Calculate duration
        audio_duration = len(audio) / sample_rate

        logger.info(f"F5-TTS synthesis complete: duration={audio_duration:.2f}s")

        return F5Result(
            audio=audio,
            sample_rate=sample_rate,
            duration=audio_duration,
        )

    except Exception as e:
        logger.error(f"F5-TTS synthesis failed: {e}")
        raise RuntimeError(f"Voice cloning synthesis failed: {e}") from e


async def synthesize_f5_from_bytes(
    text: str,
    reference_audio_bytes: bytes,
    reference_text: str,
    audio_extension: str = ".wav",
    **kwargs,
) -> F5Result:
    """Synthesize speech using F5-TTS with reference audio as bytes.

    The bytes are written to a temporary file, which is passed to
    ``synthesize_f5`` by path and removed afterwards, whether synthesis
    succeeds or fails.

    Args:
        text: Text to synthesize.
        reference_audio_bytes: Raw contents of the reference audio file.
        reference_text: Transcript of the reference audio.
        audio_extension: Suffix given to the temporary file.
        **kwargs: Extra keyword arguments forwarded to ``synthesize_f5``.

    Returns:
        The F5Result produced by ``synthesize_f5``.

    Raises:
        RuntimeError: Propagated from ``synthesize_f5`` on synthesis failure.
        OSError: If the temporary file cannot be created or written.
    """
    # delete=False: the file has to survive being closed so that it can be
    # reopened by path during synthesis; it is removed manually below.
    tmp = tempfile.NamedTemporaryFile(suffix=audio_extension, delete=False)
    tmp_path = tmp.name

    try:
        # Writing happens inside the try so a failed write still cleans up.
        with tmp:
            tmp.write(reference_audio_bytes)

        return await synthesize_f5(
            text=text,
            reference_audio_path=tmp_path,
            reference_text=reference_text,
            **kwargs,
        )
    finally:
        # Best-effort cleanup: never let it mask the synthesis outcome.
        try:
            Path(tmp_path).unlink()
        except FileNotFoundError:
            pass  # already removed, nothing left to do
        except OSError as e:
            logger.warning(
                "Could not remove temporary reference audio %s: %s", tmp_path, e
            )


def estimate_duration(text: str, speed: float = 1.0) -> float:
    """Roughly estimate how long spoken ``text`` will last, in seconds.

    Assumes 5 characters per word and 150 words per minute, then divides
    the result by ``speed``.
    """
    chars_per_word = 5
    words_per_minute = 150

    word_count = len(text) / chars_per_word
    base_seconds = word_count / words_per_minute * 60
    return base_seconds / speed