mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-14 18:01:09 +02:00
Voice-based interview for the profile module — users choose between text, voice (question read aloud + mic for answer), or conversation mode (fully automatic flow with auto-save). Interview audio: - 92 pre-rendered MP3 files (23 questions × 4 voices) via Edge TTS - Voices: Seraphina (DE-f), Florian (DE-m), Leni (CH-f), Jan (CH-m) - User picks voice via dropdown, persisted in localStorage - Web Speech API fallback for missing audio files Profile UI: - Interview hero block on overview with 3 start modes (text/voice/conversation) - Voice/conversation toggle + voice picker in interview view - Mic button on text/textarea/tags inputs for per-question voice input - Conversation mode: auto-save + auto-advance after STT transcription - Recording/transcribing/speaking state indicators mana-tts service: - New Orpheus TTS backend (German finetune, SNAC codec) - New Zonos TTS backend (Zyphra, 200k hours, emotion control) - Endpoints: POST /synthesize/orpheus, POST /synthesize/zonos - espeak-ng installed on GPU server for Zonos phonemizer - Compare script for side-by-side voice quality testing Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
205 lines
5.5 KiB
Python
205 lines
5.5 KiB
Python
"""
Zonos TTS — Expressive multilingual speech synthesis by Zyphra.

Trained on 200k hours of speech data with explicit German support.
Fine-grained control over pitch, speaking rate, and emotions.

Model: Zyphra/Zonos-v0.1-transformer (HuggingFace)
VRAM: ~5 GB (fits comfortably on RTX 3090)
"""
|
|
|
|
import logging
|
|
import asyncio
|
|
import os
|
|
from dataclasses import dataclass
|
|
from typing import Optional
|
|
|
|
import numpy as np
|
|
|
|
# torch.compile is switched off: it needs MSVC's cl.exe on Windows, which is
# not available on the target machine.
os.environ["TORCHDYNAMO_DISABLE"] = "1"

logger = logging.getLogger(__name__)

# Module-level cache for the lazily loaded model and its "ready" flag.
_model = None
_loaded = False

MODEL_ID = "Zyphra/Zonos-v0.1-transformer"
SAMPLE_RATE = 44100  # Zonos outputs 44.1 kHz audio

# Emotion presets used in the interview context. Every preset is a vector of
# eight weights; per the notes below, slot 0 carries happiness and slot 7
# carries neutral.
# NOTE(review): the meaning of slots 1-6 is not visible in this file. "curious"
# puts weight on slot 1 — confirm against the Zonos emotion ordering that this
# is the intended slot.
EMOTION_PRESETS = {
    # neutral dominant
    "neutral": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
    # happiness + neutral
    "friendly": [0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5],
    # slight warmth
    "warm": [0.3, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7],
    # interested
    "curious": [0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7],
}

# Preset applied when the caller does not pick one.
DEFAULT_EMOTION = "friendly"
|
|
|
|
|
|
@dataclass
class ZonosResult:
    """Container for the outcome of one Zonos synthesis call."""

    # Generated waveform samples.
    audio: np.ndarray
    # Sample rate of ``audio`` in Hz.
    sample_rate: int
    # Length of ``audio`` in seconds.
    duration: float
    # Emotion preset name reported for this synthesis.
    emotion: str
|
|
|
|
|
|
def is_zonos_loaded() -> bool:
    """Report whether the Zonos model has already been loaded."""
    return _loaded
|
|
|
|
|
|
def get_zonos_model():
    """Return the shared Zonos model, loading it onto the GPU on first use.

    The model is cached in the module-level ``_model`` / ``_loaded`` pair, so
    only the first successful call pays the loading cost.

    Returns:
        The loaded model object: built by the ``zonos`` package when it can
        be imported, otherwise by ``transformers.AutoModel``.

    Raises:
        RuntimeError: If loading fails for any reason. The underlying
            exception is attached as ``__cause__``.
    """
    global _model, _loaded

    if _loaded:
        return _model

    logger.info("Loading Zonos model: %s", MODEL_ID)

    try:
        import torch

        # Only the import decides which loader is used. from_pretrained() is
        # kept out of this inner try so that an ImportError raised while the
        # model is being loaded surfaces as a load failure instead of being
        # mistaken for "zonos is not installed".
        try:
            from zonos.model import Zonos
        except ImportError:
            # If zonos package not installed, use transformers
            logger.info("zonos package not found, trying transformers loading")
            from transformers import AutoModel

            # NOTE(review): trust_remote_code=True lets code shipped with the
            # model repository run locally — confirm this is acceptable here.
            model = AutoModel.from_pretrained(
                MODEL_ID,
                torch_dtype=torch.float32,
                trust_remote_code=True,
            ).to("cuda")
        else:
            # Zonos provides its own loader
            model = Zonos.from_pretrained(MODEL_ID, device="cuda")
    except Exception as e:
        # Log with traceback and keep the original error chained for callers.
        logger.exception("Failed to load Zonos model: %s", e)
        raise RuntimeError(f"Failed to load Zonos model: {e}") from e

    # Publish the model only after loading fully succeeded.
    _model = model
    _loaded = True
    logger.info("Zonos model loaded successfully")
    return _model
|
|
|
|
|
|
def unload_zonos():
    """Release the cached Zonos model so its VRAM can be reclaimed."""
    global _model, _loaded
    import torch

    # Rebinding the global drops this module's reference to the model; when
    # nothing was loaded this is a no-op.
    _model = None
    _loaded = False

    # Hand cached, now-unused GPU memory back from torch's allocator.
    torch.cuda.empty_cache()
    logger.info("Zonos model unloaded")
|
|
|
|
|
|
async def synthesize_zonos(
    text: str,
    language: str = "de",
    emotion: str = DEFAULT_EMOTION,
    speaking_rate: float = 13.0,
    pitch_std: float = 20.0,
    speaker_audio: Optional[bytes] = None,
) -> ZonosResult:
    """
    Synthesize speech using Zonos TTS without blocking the event loop.

    The synthesis itself is synchronous, so it is handed to the event loop's
    default executor and awaited.

    Args:
        text: Text to synthesize
        language: Language code (default: 'de' for German)
        emotion: Emotion preset name (a key of EMOTION_PRESETS); unknown
            names fall back to the default preset
        speaking_rate: Speaking rate in phonemes/sec (default 13.0, range ~8-20)
        pitch_std: Pitch variation in Hz (default 20.0, range ~5-50)
        speaker_audio: Optional reference audio bytes for voice cloning

    Returns ZonosResult with audio as numpy float32 array.
    """
    # Inside a coroutine a loop is always running; get_running_loop() is the
    # supported way to obtain it (get_event_loop() is discouraged here).
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(
        None,
        _synthesize_sync,
        text,
        language,
        emotion,
        speaking_rate,
        pitch_std,
        speaker_audio,
    )
|
|
|
|
|
|
def _synthesize_sync(
    text: str,
    language: str,
    emotion: str,
    speaking_rate: float,
    pitch_std: float,
    speaker_audio: Optional[bytes],
) -> ZonosResult:
    """Synchronous synthesis (runs in thread pool).

    Args:
        text: Text to synthesize.
        language: Language code; a few short codes are mapped to the codes
            passed on to Zonos, anything else is passed through unchanged.
        emotion: Name of a preset in EMOTION_PRESETS. Unknown names are
            replaced by DEFAULT_EMOTION and a warning is logged.
        speaking_rate: Forwarded to Zonos' conditioning helper.
        pitch_std: Forwarded to Zonos' conditioning helper.
        speaker_audio: Optional reference audio bytes; when given, a speaker
            embedding is computed from it.

    Returns:
        ZonosResult with float32 audio, SAMPLE_RATE, the duration in seconds
        and the name of the emotion preset that was actually applied.
    """
    import torch
    from zonos.conditioning import make_cond_dict

    model = get_zonos_model()

    # Resolve emotion preset; report the fallback instead of hiding it.
    if emotion in EMOTION_PRESETS:
        applied_emotion = emotion
    else:
        logger.warning(
            "Unknown emotion preset %r, falling back to %r",
            emotion,
            DEFAULT_EMOTION,
        )
        applied_emotion = DEFAULT_EMOTION
    emotion_values = EMOTION_PRESETS[applied_emotion]

    # Build speaker embedding if reference audio provided
    speaker_embedding = None
    if speaker_audio:
        speaker_embedding = _embed_speaker(speaker_audio, model)

    # Map language codes: Zonos expects espeak language codes like 'de' or 'en-us'
    lang_map = {"de": "de", "en": "en-us", "fr": "fr-fr", "es": "es", "it": "it"}
    espeak_lang = lang_map.get(language, language)

    # Build conditioning using Zonos's own helper
    cond = make_cond_dict(
        text=text,
        language=espeak_lang,
        emotion=emotion_values,
        speaking_rate=speaking_rate,
        pitch_std=pitch_std,
        speaker=speaker_embedding,
    )

    # Generate (no gradients are needed for inference)
    with torch.no_grad():
        conditioning = model.prepare_conditioning(cond)
        codes = model.generate(conditioning)
        waveform = model.autoencoder.decode(codes).squeeze().cpu().numpy()

    # atleast_1d guards the degenerate case where squeeze() leaves a 0-d array.
    audio = np.atleast_1d(waveform).astype(np.float32)

    # Count samples along the last axis; for the usual 1-D result this equals
    # len(audio). Assumes time is the last axis if more dims survive — TODO confirm.
    duration = audio.shape[-1] / SAMPLE_RATE

    return ZonosResult(
        audio=audio,
        sample_rate=SAMPLE_RATE,
        duration=duration,
        emotion=applied_emotion,
    )
|
|
|
|
|
|
def _embed_speaker(audio_bytes: bytes, model) -> "torch.Tensor":
    """Build a speaker embedding from encoded reference audio.

    Args:
        audio_bytes: Encoded audio file contents, decoded with soundfile.
        model: Loaded model exposing ``make_speaker_embedding``.

    Returns:
        Whatever ``model.make_speaker_embedding`` returns for the decoded
        waveform and its sample rate.
    """
    import torch
    import io
    import soundfile as sf

    samples, sample_rate = sf.read(io.BytesIO(audio_bytes))

    # Multi-channel input arrives as a 2-D array; average across axis 1 to
    # get a mono signal.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    # Move the mono signal to the GPU and add a leading axis of size one.
    waveform = torch.tensor(samples, dtype=torch.float32, device="cuda")
    batched = waveform.unsqueeze(0)

    return model.make_speaker_embedding(batched, sample_rate)
|