mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-14 22:41:09 +02:00
✨ feat(tts): add de_kerstin female German voice and set as default
- Download and configure Kerstin Piper voice (63MB, local)
- Update piper_service.py to support multiple voice models
- Set de_kerstin as default voice for TTS bot
- Update help text with new voice options

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
01b011235f
commit
4b950b7083
4 changed files with 54 additions and 31 deletions
|
|
@ -991,7 +991,7 @@ services:
|
|||
MATRIX_ALLOWED_ROOMS: ${MATRIX_TTS_BOT_ROOMS:-}
|
||||
TTS_URL: http://host.docker.internal:3022
|
||||
TTS_API_KEY: ${TTS_INTERNAL_API_KEY:-}
|
||||
DEFAULT_VOICE: de_thorsten
|
||||
DEFAULT_VOICE: de_kerstin
|
||||
DEFAULT_SPEED: 1.0
|
||||
MAX_TEXT_LENGTH: 500
|
||||
volumes:
|
||||
|
|
|
|||
|
|
@ -32,6 +32,15 @@ PIPER_VOICES = {
|
|||
"gender": "male",
|
||||
"local": True,
|
||||
},
|
||||
"de_kerstin": {
|
||||
"type": "piper",
|
||||
"model": "kerstin_low.onnx",
|
||||
"name": "Kerstin",
|
||||
"description": "Deutsche Frauenstimme (lokal, schnell)",
|
||||
"language": "de",
|
||||
"gender": "female",
|
||||
"local": True,
|
||||
},
|
||||
# === EDGE TTS VOICES (Fallback - Cloud) ===
|
||||
"de_katja": {
|
||||
"type": "edge",
|
||||
|
|
@ -83,8 +92,8 @@ PIPER_VOICES = {
|
|||
|
||||
DEFAULT_PIPER_VOICE = "de_thorsten"
|
||||
|
||||
# Cached Piper voice instance
|
||||
_piper_voice = None
|
||||
# Cached Piper voice instances (one per model)
|
||||
_piper_voices: dict = {}
|
||||
_piper_available = None
|
||||
_edge_available = None
|
||||
|
||||
|
|
@ -138,26 +147,28 @@ def is_piper_loaded() -> bool:
|
|||
return check_piper_available() or _check_edge_available()
|
||||
|
||||
|
||||
def _get_piper_voice():
|
||||
"""Get or create cached Piper voice instance."""
|
||||
global _piper_voice
|
||||
if _piper_voice is not None:
|
||||
return _piper_voice
|
||||
def _get_piper_voice(model_name: str = "thorsten_medium.onnx"):
|
||||
"""Get or create cached Piper voice instance for a specific model."""
|
||||
global _piper_voices
|
||||
|
||||
if model_name in _piper_voices:
|
||||
return _piper_voices[model_name]
|
||||
|
||||
if not check_piper_available():
|
||||
return None
|
||||
|
||||
try:
|
||||
from piper import PiperVoice
|
||||
model_path = _get_piper_model_path("thorsten_medium.onnx")
|
||||
config_path = _get_piper_model_path("thorsten_medium.onnx.json")
|
||||
model_path = _get_piper_model_path(model_name)
|
||||
config_path = _get_piper_model_path(f"{model_name}.json")
|
||||
|
||||
logger.info(f"Loading Piper voice from {model_path}")
|
||||
_piper_voice = PiperVoice.load(str(model_path), str(config_path))
|
||||
logger.info("Piper voice loaded successfully")
|
||||
return _piper_voice
|
||||
voice = PiperVoice.load(str(model_path), str(config_path))
|
||||
_piper_voices[model_name] = voice
|
||||
logger.info(f"Piper voice {model_name} loaded successfully")
|
||||
return voice
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to load Piper voice: {e}")
|
||||
logger.error(f"Failed to load Piper voice {model_name}: {e}")
|
||||
return None
|
||||
|
||||
|
||||
|
|
@ -172,14 +183,19 @@ class PiperSynthesisResult:
|
|||
|
||||
async def _synthesize_with_piper(
|
||||
text: str,
|
||||
voice_id: str = "de_thorsten",
|
||||
length_scale: float = 1.0,
|
||||
) -> PiperSynthesisResult:
|
||||
"""Synthesize using local Piper TTS."""
|
||||
voice = _get_piper_voice()
|
||||
if voice is None:
|
||||
raise RuntimeError("Piper voice not available")
|
||||
# Get the model name for this voice
|
||||
voice_config = PIPER_VOICES.get(voice_id, PIPER_VOICES["de_thorsten"])
|
||||
model_name = voice_config.get("model", "thorsten_medium.onnx")
|
||||
|
||||
logger.debug(f"Piper synthesizing: \"{text[:50]}...\"")
|
||||
piper_voice = _get_piper_voice(model_name)
|
||||
if piper_voice is None:
|
||||
raise RuntimeError(f"Piper voice {voice_id} not available")
|
||||
|
||||
logger.debug(f"Piper synthesizing with {voice_id}: \"{text[:50]}...\"")
|
||||
|
||||
# Piper uses length_scale directly (1.0 = normal, >1 = slower)
|
||||
# Run in thread pool to not block async
|
||||
|
|
@ -187,7 +203,7 @@ async def _synthesize_with_piper(
|
|||
|
||||
def _synth():
|
||||
audio_data = []
|
||||
for audio_chunk in voice.synthesize_stream_raw(text, length_scale=length_scale):
|
||||
for audio_chunk in piper_voice.synthesize_stream_raw(text, length_scale=length_scale):
|
||||
audio_data.append(audio_chunk)
|
||||
return b"".join(audio_data)
|
||||
|
||||
|
|
@ -195,7 +211,7 @@ async def _synthesize_with_piper(
|
|||
|
||||
# Convert to numpy (16-bit PCM)
|
||||
audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0
|
||||
sample_rate = voice.config.sample_rate
|
||||
sample_rate = piper_voice.config.sample_rate
|
||||
|
||||
duration = len(audio) / sample_rate
|
||||
logger.debug(f"Piper synthesis complete: {duration:.2f}s, {sample_rate}Hz")
|
||||
|
|
@ -204,7 +220,7 @@ async def _synthesize_with_piper(
|
|||
audio=audio,
|
||||
sample_rate=sample_rate,
|
||||
duration=duration,
|
||||
voice="de_thorsten",
|
||||
voice=voice_id,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -280,7 +296,7 @@ async def synthesize_piper(
|
|||
# Try local Piper first for piper-type voices
|
||||
if voice_type == "piper" and check_piper_available():
|
||||
try:
|
||||
return await _synthesize_with_piper(text, length_scale)
|
||||
return await _synthesize_with_piper(text, voice, length_scale)
|
||||
except Exception as e:
|
||||
logger.warning(f"Piper synthesis failed, trying Edge fallback: {e}")
|
||||
|
||||
|
|
@ -288,8 +304,9 @@ async def synthesize_piper(
|
|||
if _check_edge_available():
|
||||
edge_voice = voice_config.get("edge_voice", "de-DE-ConradNeural")
|
||||
if voice_type == "piper":
|
||||
# Fallback: use Conrad for male voices
|
||||
edge_voice = "de-DE-ConradNeural"
|
||||
# Fallback: use appropriate Edge voice based on gender
|
||||
gender = voice_config.get("gender", "male")
|
||||
edge_voice = "de-DE-KatjaNeural" if gender == "female" else "de-DE-ConradNeural"
|
||||
return await _synthesize_with_edge(text, edge_voice, length_scale)
|
||||
|
||||
raise RuntimeError("No TTS backend available (neither Piper nor Edge TTS)")
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ export default () => ({
|
|||
tts: {
|
||||
url: process.env.TTS_URL || 'http://localhost:3022',
|
||||
apiKey: process.env.TTS_API_KEY || '',
|
||||
defaultVoice: process.env.DEFAULT_VOICE || 'de_thorsten',
|
||||
defaultVoice: process.env.DEFAULT_VOICE || 'de_kerstin',
|
||||
defaultSpeed: parseFloat(process.env.DEFAULT_SPEED || '1.0'),
|
||||
maxTextLength: parseInt(process.env.MAX_TEXT_LENGTH || '500', 10),
|
||||
},
|
||||
|
|
@ -31,10 +31,11 @@ Schreibe einfach eine Nachricht und ich sende dir die Sprachausgabe zurück.
|
|||
Die Sprache wird automatisch erkannt (Deutsch/Englisch).
|
||||
|
||||
**Deutsche Stimmen:**
|
||||
- \`de_kerstin\` - Deutsch weiblich (lokal, Standard)
|
||||
- \`de_thorsten\` - Deutsch männlich (lokal)
|
||||
- \`de_katja\` - Deutsch weiblich
|
||||
- \`de_conrad\` - Deutsch männlich
|
||||
- \`de_florian\` - Deutsch männlich jung
|
||||
- \`de_katja\` - Deutsch weiblich (Cloud)
|
||||
- \`de_conrad\` - Deutsch männlich (Cloud)
|
||||
- \`de_florian\` - Deutsch männlich jung (Cloud)
|
||||
|
||||
**Englische Stimmen:**
|
||||
- \`af_heart\` - Amerikanisch weiblich (warm)
|
||||
|
|
|
|||
|
|
@ -15,14 +15,15 @@ export interface VoicesResponse {
|
|||
|
||||
// German voice mapping
|
||||
const GERMAN_VOICES: Record<string, string> = {
|
||||
de_thorsten: 'de_thorsten', // Local Piper
|
||||
de_kerstin: 'de_kerstin', // Local Piper female
|
||||
de_thorsten: 'de_thorsten', // Local Piper male
|
||||
de_katja: 'de_katja', // Edge TTS female
|
||||
de_conrad: 'de_conrad', // Edge TTS male
|
||||
de_amala: 'de_amala', // Edge TTS female young
|
||||
de_florian: 'de_florian', // Edge TTS male young
|
||||
};
|
||||
|
||||
const DEFAULT_GERMAN_VOICE = 'de_thorsten';
|
||||
const DEFAULT_GERMAN_VOICE = 'de_kerstin';
|
||||
|
||||
// Common German words for language detection
|
||||
const GERMAN_INDICATORS = [
|
||||
|
|
@ -122,7 +123,11 @@ export class TtsService {
|
|||
/**
|
||||
* Synthesize text to speech - auto-detects language
|
||||
*/
|
||||
async synthesize(text: string, voice: string = 'de_thorsten', speed: number = 1.0): Promise<Buffer> {
|
||||
async synthesize(
|
||||
text: string,
|
||||
voice: string = 'de_thorsten',
|
||||
speed: number = 1.0
|
||||
): Promise<Buffer> {
|
||||
// Auto-detect language if using English voice but text is German
|
||||
const textIsGerman = this.isGerman(text);
|
||||
const voiceIsGerman = this.isGermanVoice(voice);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue