managarten/services/mana-tts/app/orpheus_service.py
Till JS 8823cc0bf0 feat(profile): voice interview with pre-rendered TTS audio + Orpheus/Zonos backends
Voice-based interview for the profile module — users choose between text,
voice (question read aloud + mic for answer), or conversation mode (fully
automatic flow with auto-save).

Interview audio:
- 92 pre-rendered MP3 files (23 questions × 4 voices) via Edge TTS
- Voices: Seraphina (DE-f), Florian (DE-m), Leni (CH-f), Jan (CH-m)
- User picks voice via dropdown, persisted in localStorage
- Web Speech API fallback for missing audio files
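
A minimal sketch of how the pre-rendered files above can be generated with
edge-tts (illustrative only; the voice IDs, question list, and file naming
are assumptions, not taken from the repo):

    # prerender_interview_audio.py (hypothetical helper)
    import asyncio
    import edge_tts

    VOICES = {
        "seraphina": "de-DE-SeraphinaMultilingualNeural",
        "florian": "de-DE-FlorianMultilingualNeural",
        "leni": "de-CH-LeniNeural",
        "jan": "de-CH-JanNeural",
    }
    QUESTIONS = {1: "Wie wuerdest du dich selbst beschreiben?"}  # placeholder; the real 23 questions live in the profile module

    async def render_all():
        for qid, text in QUESTIONS.items():
            for name, voice in VOICES.items():
                # One MP3 per question/voice combination (23 x 4 = 92 files).
                await edge_tts.Communicate(text, voice).save(f"q{qid:02d}_{name}.mp3")

    asyncio.run(render_all())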

Profile UI:
- Interview hero block on overview with 3 start modes (text/voice/conversation)
- Voice/conversation toggle + voice picker in interview view
- Mic button on text/textarea/tags inputs for per-question voice input
- Conversation mode: auto-save + auto-advance after STT transcription
- Recording/transcribing/speaking state indicators

mana-tts service:
- New Orpheus TTS backend (German finetune, SNAC codec)
- New Zonos TTS backend (Zyphra, 200k hours, emotion control)
- Endpoints: POST /synthesize/orpheus, POST /synthesize/zonos
- espeak-ng installed on GPU server for Zonos phonemizer
- Compare script for side-by-side voice quality testing
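
Calling the new Orpheus endpoint from Python might look like the sketch
below (the JSON field names mirror the synthesize_orpheus() parameters and
are an assumption; the actual request/response schema is defined by the
FastAPI route, which is not part of this file):

    import requests

    resp = requests.post(
        "http://localhost:8000/synthesize/orpheus",  # host/port are placeholders
        json={"text": "Wie wuerdest du dich selbst beschreiben?", "voice": "tara"},
        timeout=300,
    )
    resp.raise_for_status()
    # Whether the route returns raw audio bytes or JSON with encoded audio
    # is decided in the route handler, not shown here.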

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-17 15:22:52 +02:00

229 lines
6.3 KiB
Python

"""
Orpheus TTS — High-quality German speech synthesis.
Uses the Orpheus-TTS model with German finetune for natural-sounding
interview question generation. Not optimized for real-time — quality first.
Model: Kartoffel_Orpheus-3B_german_natural-v0.1 (HuggingFace)
VRAM: ~8 GB (fits comfortably on RTX 3090 alongside other models)
"""
import logging
import asyncio
from dataclasses import dataclass
from typing import Optional
import numpy as np
logger = logging.getLogger(__name__)
# Lazy-loaded model state
_model = None
_tokenizer = None
_loaded = False
MODEL_ID = "Vishalshendge3198/orpheus-3b-tts-german-emotional-merged"
SAMPLE_RATE = 24000
# Available voices (Orpheus built-in speaker tags)
ORPHEUS_VOICES = {
    "tara": "Female, warm and clear (default)",
    "leah": "Female, soft and friendly",
    "jess": "Female, energetic",
    "leo": "Male, calm and professional",
    "dan": "Male, deep and warm",
    "mia": "Female, young and bright",
    "zac": "Male, confident",
    "emma": "Female, neutral",
}

DEFAULT_VOICE = "tara"


@dataclass
class OrpheusResult:
    audio: np.ndarray
    sample_rate: int
    duration: float
    voice: str


def is_orpheus_loaded() -> bool:
    return _loaded


def get_orpheus_model():
    """Load the Orpheus German model (lazy, first call only)."""
    global _model, _tokenizer, _loaded
    if _loaded:
        return _model, _tokenizer

    logger.info(f"Loading Orpheus German model: {MODEL_ID}")
    try:
        from transformers import AutoTokenizer, AutoModelForCausalLM
        import torch

        _tokenizer = AutoTokenizer.from_pretrained(
            MODEL_ID,
            trust_remote_code=True,
        )
        _model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.bfloat16,
            device_map="cuda",
            trust_remote_code=True,
        )
        _model.eval()
        _loaded = True
        logger.info("Orpheus German model loaded successfully")
        return _model, _tokenizer
    except Exception as e:
        logger.error(f"Failed to load Orpheus model: {e}")
        raise RuntimeError(f"Failed to load Orpheus model: {e}")


def unload_orpheus():
    """Free VRAM by unloading the model."""
    global _model, _tokenizer, _loaded
    import torch

    if _model is not None:
        del _model
        _model = None
    if _tokenizer is not None:
        del _tokenizer
        _tokenizer = None
    _loaded = False
    torch.cuda.empty_cache()
    logger.info("Orpheus model unloaded")


async def synthesize_orpheus(
    text: str,
    voice: str = DEFAULT_VOICE,
    temperature: float = 0.6,
    top_p: float = 0.95,
    max_new_tokens: int = 4096,
) -> OrpheusResult:
    """
    Synthesize German speech using Orpheus TTS.

    Returns OrpheusResult with audio as a numpy float32 array.
    """
    # Generation is blocking and GPU-bound, so run it in the default thread pool.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(
        None,
        _synthesize_sync,
        text,
        voice,
        temperature,
        top_p,
        max_new_tokens,
    )


def _synthesize_sync(
    text: str,
    voice: str,
    temperature: float,
    top_p: float,
    max_new_tokens: int,
) -> OrpheusResult:
    """Synchronous synthesis (runs in thread pool)."""
    import torch

    model, tokenizer = get_orpheus_model()

    # Orpheus uses a specific prompt format with speaker tags
    prompt = f"<|speaker:{voice}|>{text}"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
        )

    # Extract the generated audio tokens (everything after the prompt)
    audio_tokens = outputs[0][inputs["input_ids"].shape[1]:]

    # Decode audio tokens to waveform.
    # Orpheus uses a SNAC-based codec — tokens map to audio via the model's decode method
    if hasattr(model, "decode_audio"):
        audio_np = model.decode_audio(audio_tokens).cpu().numpy().flatten()
    else:
        # Fallback: decode manually through the SNAC codec if the model does not
        # expose decode_audio. This handles different Orpheus model versions.
        audio_np = _decode_orpheus_tokens(audio_tokens, model)

    duration = len(audio_np) / SAMPLE_RATE
    return OrpheusResult(
        audio=audio_np,
        sample_rate=SAMPLE_RATE,
        duration=duration,
        voice=voice,
    )


def _decode_orpheus_tokens(tokens, model) -> np.ndarray:
    """
    Decode Orpheus audio tokens using the SNAC codec.

    Orpheus generates special audio tokens that need to be decoded through
    the SNAC vocoder to produce the final waveform. Following the reference
    Orpheus decoder, the model emits 7 audio tokens per SNAC frame, mapped
    onto the three SNAC codebooks as 1 + 2 + 4 entries, with a per-position
    offset of 4096 on top of the audio-token base id 128266.
    """
    import torch
    try:
        from snac import SNAC

        snac = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to(model.device).eval()

        # Keep only tokens inside the audio range (7 positions x 4096 codes each).
        audio_base = 128266
        audio_token_ids = [
            t for t in tokens.tolist() if audio_base <= t < audio_base + 7 * 4096
        ]
        # Trim to whole 7-token frames.
        audio_token_ids = audio_token_ids[: (len(audio_token_ids) // 7) * 7]
        if not audio_token_ids:
            logger.warning("No audio tokens generated")
            return np.zeros(SAMPLE_RATE, dtype=np.float32)  # 1s silence

        # Redistribute each 7-token frame into the three SNAC codebook levels:
        # frame[0] -> level 0, frame[1]/frame[4] -> level 1, the rest -> level 2.
        codes_0, codes_1, codes_2 = [], [], []
        for i in range(0, len(audio_token_ids), 7):
            frame = [audio_token_ids[i + j] - audio_base - j * 4096 for j in range(7)]
            if any(c < 0 or c >= 4096 for c in frame):
                # Misaligned frame (generation did not start on a frame boundary); skip it.
                continue
            codes_0.append(frame[0])
            codes_1.extend([frame[1], frame[4]])
            codes_2.extend([frame[2], frame[3], frame[5], frame[6]])

        if not codes_0:
            return np.zeros(SAMPLE_RATE, dtype=np.float32)

        codes = [
            torch.tensor(codes_0, device=model.device).unsqueeze(0),
            torch.tensor(codes_1, device=model.device).unsqueeze(0),
            torch.tensor(codes_2, device=model.device).unsqueeze(0),
        ]
        with torch.no_grad():
            audio = snac.decode(codes).squeeze().cpu().numpy()
        return audio.astype(np.float32)
    except ImportError:
        logger.error("snac package not installed — pip install snac")
        raise RuntimeError("snac package required for Orpheus audio decoding")