mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-14 20:21:09 +02:00
The Windows GPU server has been the actual production home for these
services for some time, and the running code there has drifted ahead of
the repo. This sync pulls the live versions back into the repo so the
Windows box is no longer the only place those changes exist.
Pulled from C:\mana\services\* on mana-server-gpu (192.168.178.11):
mana-llm:
- src/main.py, src/config.py — small fixes (auth wiring, config tweaks)
- src/api_auth.py — NEW (cross-service GPU_API_KEY validator)
- service.pyw — Windows runner used by the ManaLLM scheduled task
(sets up logging redirect, loads .env, calls uvicorn)
mana-stt:
- app/main.py — substantial cleanup (684→392 lines), drops the
whisperx-as-separate-backend branching now that whisper_service.py
rolls whisperx in directly
- app/whisper_service.py — full CUDA + whisperx rewrite (158→358 lines)
- app/auth.py + external_auth.py — significantly expanded auth
- app/vram_manager.py — NEW (shared VRAM accounting helper)
- service.pyw — Windows runner with CUDA pre-init, FFmpeg PATH
injection, .env loading
- removed: app/whisper_service_cuda.py (folded into whisper_service.py)
- removed: app/whisperx_service.py (folded into whisper_service.py)
mana-tts:
- app/auth.py, external_auth.py — same auth expansion as stt
- app/f5_service.py, kokoro_service.py — Windows tweaks
- app/vram_manager.py — NEW (same shared helper as stt)
- service.pyw — Windows runner
mana-video-gen:
- service.pyw — Windows runner (no other changes; the .py code on the
GPU box is byte-identical to what's already in the repo)
The service.pyw files contain absolute Windows paths
(C:\mana\services\<svc>) and a hardcoded FFmpeg PATH for the tills user
profile. Kept as-is intentionally — they exist to be deployed to that
one machine and any abstraction layer would just hide what's actually
happening. Anyone redeploying to a different layout will need to edit
the path strings, which is a known and obvious change.
Mac-Mini infrastructure for these services (launchd plists, install
scripts, scripts/mac-mini/setup-{stt,tts}.sh, the Mac-flux2c image-gen
implementation) is still on disk and will be removed in a follow-up
commit, along with replacing mana-image-gen with the Windows
diffusers+CUDA implementation. This commit is just the live-code sync.
178 lines
4.6 KiB
Python
178 lines
4.6 KiB
Python
"""
|
|
F5-TTS Service for voice cloning synthesis.
|
|
CUDA version using f5-tts PyTorch package.
|
|
"""
|
|
|
|
import logging
|
|
import os
|
|
import tempfile
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
import numpy as np
|
|
|
|
# Logger named after this module.
logger = logging.getLogger(__name__)

# Slot for the shared F5-TTS API object. It starts out empty and is filled
# by get_f5_model() the first time a model is loaded successfully.
_f5_api = None

# Model identifier; the F5_MODEL environment variable overrides the fallback.
DEFAULT_F5_MODEL = os.environ.get("F5_MODEL", "F5-TTS")

# Fallback generation settings used when a caller does not supply its own.
DEFAULT_STEPS = 32            # number of diffusion steps
DEFAULT_CFG_STRENGTH = 2.0    # classifier-free guidance strength
DEFAULT_SWAY_COEF = -1.0      # sway sampling coefficient
DEFAULT_SPEED = 1.0           # speech speed multiplier
|
|
|
|
|
|
@dataclass
class F5Result:
    """Container for the output of one F5-TTS synthesis call."""

    # Generated waveform as a NumPy array.
    audio: np.ndarray
    # Sample rate reported by the model alongside the waveform.
    sample_rate: int
    # Length of the clip; synthesize_f5 fills this with
    # len(audio) / sample_rate.
    duration: float
    # Optional identifier of the voice used; left as None when not supplied.
    voice_id: Optional[str] = None
|
|
|
|
|
|
def get_f5_model(model_name: str = DEFAULT_F5_MODEL):
    """Return the shared F5-TTS API instance, creating it on first use.

    The instance is cached in the module-level ``_f5_api`` variable, so every
    call after the first successful load returns the same object.

    Args:
        model_name: Model identifier. It is only written to the log; the
            instance is always constructed with ``model_type="F5-TTS"``.
            NOTE(review): confirm whether this argument was meant to select
            the model before wiring it through.

    Returns:
        The cached ``F5TTS`` API object.

    Raises:
        RuntimeError: If an ``ImportError`` occurs while importing or
            constructing the model (chained to the original error).
        Exception: Any other error raised while constructing the model is
            logged and re-raised unchanged.
    """
    global _f5_api

    if _f5_api is not None:
        return _f5_api

    logger.info(f"Loading F5-TTS model: {model_name}")

    try:
        # Imported here rather than at module level so that importing this
        # module does not require f5-tts to be installed.
        from f5_tts.api import F5TTS

        _f5_api = F5TTS(model_type="F5-TTS")
        logger.info("F5-TTS model loaded successfully (CUDA)")
        return _f5_api

    except ImportError as e:
        logger.error(f"Failed to import f5_tts: {e}")
        # Chain the original error so the missing module stays visible in
        # the traceback.
        raise RuntimeError(
            "f5-tts not installed. Run: pip install f5-tts"
        ) from e
    except Exception as e:
        # logger.exception records the traceback alongside the message.
        logger.exception(f"Failed to load F5-TTS model: {e}")
        raise
|
|
|
|
|
|
def is_f5_loaded() -> bool:
    """Report whether the shared F5-TTS instance has already been created."""
    nothing_cached = _f5_api is None
    return not nothing_cached
|
|
|
|
|
|
async def synthesize_f5(
    text: str,
    reference_audio_path: str,
    reference_text: str,
    duration: Optional[float] = None,
    steps: int = DEFAULT_STEPS,
    cfg_strength: float = DEFAULT_CFG_STRENGTH,
    sway_coef: float = DEFAULT_SWAY_COEF,
    speed: float = DEFAULT_SPEED,
    model_name: str = DEFAULT_F5_MODEL,
) -> F5Result:
    """
    Synthesize speech using F5-TTS with voice cloning.

    The blocking ``api.infer`` call is run in the event loop's default
    executor so the coroutine does not block the loop while the model works.

    Args:
        text: Text to synthesize (passed as ``gen_text``).
        reference_audio_path: Path to reference audio file (``ref_file``).
        reference_text: Transcript of the reference audio (``ref_text``).
        duration: Target duration in seconds. Accepted for interface
            compatibility; this function does not currently forward it to
            the model.
        steps: Number of diffusion steps (passed as ``nfe_step``).
        cfg_strength: Classifier-free guidance strength.
        sway_coef: Sway sampling coefficient (``sway_sampling_coeff``).
        speed: Speech speed multiplier.
        model_name: Model identifier handed to ``get_f5_model``.

    Returns:
        F5Result with the audio as a NumPy array, the sample rate returned
        by the model, and ``duration = len(audio) / sample_rate``.

    Raises:
        RuntimeError: If inference or post-processing fails; the original
            exception is attached as ``__cause__``. Errors raised by
            ``get_f5_model`` propagate unwrapped.
    """
    import asyncio

    api = get_f5_model(model_name)

    logger.info(
        f"Synthesizing with F5-TTS: text_length={len(text)}, "
        f"ref_audio={reference_audio_path}, steps={steps}"
    )

    try:
        # Inside a coroutine the running loop is the right one to use;
        # get_running_loop() never creates a new loop implicitly.
        loop = asyncio.get_running_loop()

        def _generate():
            # api.infer returns three values; the third is not needed here.
            wav, sr, _ = api.infer(
                ref_file=reference_audio_path,
                ref_text=reference_text,
                gen_text=text,
                nfe_step=steps,
                cfg_strength=cfg_strength,
                sway_sampling_coeff=sway_coef,
                speed=speed,
            )
            return wav, sr

        # None selects the loop's default executor.
        audio, sample_rate = await loop.run_in_executor(None, _generate)

        # Convert to numpy if needed
        if not isinstance(audio, np.ndarray):
            audio = np.array(audio, dtype=np.float32)

        # Calculate duration
        # NOTE(review): len() counts the first axis; this assumes a 1-D
        # waveform — confirm against what api.infer returns.
        audio_duration = len(audio) / sample_rate

        logger.info(f"F5-TTS synthesis complete: duration={audio_duration:.2f}s")

        return F5Result(
            audio=audio,
            sample_rate=sample_rate,
            duration=audio_duration,
        )

    except Exception as e:
        # Log with traceback, then re-raise as RuntimeError with the
        # original error chained as the cause.
        logger.exception(f"F5-TTS synthesis failed: {e}")
        raise RuntimeError(f"Voice cloning synthesis failed: {e}") from e
|
|
|
|
|
|
async def synthesize_f5_from_bytes(
    text: str,
    reference_audio_bytes: bytes,
    reference_text: str,
    audio_extension: str = ".wav",
    **kwargs,
) -> F5Result:
    """Synthesize speech using F5-TTS with reference audio as bytes.

    The bytes are written to a temporary file whose path is handed to
    ``synthesize_f5``; the file is removed afterwards whether or not
    synthesis succeeded.

    Args:
        text: Text to synthesize.
        reference_audio_bytes: Raw contents of the reference audio file.
        reference_text: Transcript of the reference audio.
        audio_extension: Suffix given to the temporary file.
        **kwargs: Forwarded unchanged to ``synthesize_f5``.

    Returns:
        The F5Result produced by ``synthesize_f5``.

    Raises:
        Whatever ``synthesize_f5`` raises, plus any error from writing the
        temporary file.
    """
    # delete=False so the file survives being closed and can be reopened by
    # path; removing it is handled explicitly in the finally block below.
    tmp = tempfile.NamedTemporaryFile(suffix=audio_extension, delete=False)
    tmp_path = tmp.name

    try:
        # Writing happens inside the try so a failed write still triggers
        # cleanup; the with-block closes the handle before the path is used.
        with tmp:
            tmp.write(reference_audio_bytes)

        return await synthesize_f5(
            text=text,
            reference_audio_path=tmp_path,
            reference_text=reference_text,
            **kwargs,
        )
    finally:
        # Cleanup is best-effort: never let it mask the real result or error.
        try:
            Path(tmp_path).unlink()
        except FileNotFoundError:
            pass
        except OSError as e:
            logger.warning(f"Could not remove temporary file {tmp_path}: {e}")
|
|
|
|
|
|
def estimate_duration(text: str, speed: float = 1.0) -> float:
    """Give a rough spoken length for ``text``, in seconds.

    The estimate treats every 5 characters as one word and assumes 150 words
    per minute, then divides the result by ``speed``.
    """
    chars_per_word = 5
    words_per_minute = 150
    seconds_per_minute = 60

    word_count = len(text) / chars_per_word
    base_seconds = word_count / words_per_minute * seconds_per_minute
    return base_seconds / speed
|