feat(tts): add de_kerstin female German voice and set as default

- Download and configure Kerstin Piper voice (63MB, local)
- Update piper_service.py to support multiple voice models
- Set de_kerstin as default voice for TTS bot
- Update help text with new voice options

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Till-JS 2026-02-14 12:16:58 +01:00
parent 01b011235f
commit 4b950b7083
4 changed files with 54 additions and 31 deletions

View file

@ -991,7 +991,7 @@ services:
MATRIX_ALLOWED_ROOMS: ${MATRIX_TTS_BOT_ROOMS:-}
TTS_URL: http://host.docker.internal:3022
TTS_API_KEY: ${TTS_INTERNAL_API_KEY:-}
DEFAULT_VOICE: de_thorsten
DEFAULT_VOICE: de_kerstin
DEFAULT_SPEED: 1.0
MAX_TEXT_LENGTH: 500
volumes:

View file

@ -32,6 +32,15 @@ PIPER_VOICES = {
"gender": "male",
"local": True,
},
"de_kerstin": {
"type": "piper",
"model": "kerstin_low.onnx",
"name": "Kerstin",
"description": "Deutsche Frauenstimme (lokal, schnell)",
"language": "de",
"gender": "female",
"local": True,
},
# === EDGE TTS VOICES (Fallback - Cloud) ===
"de_katja": {
"type": "edge",
@ -83,8 +92,8 @@ PIPER_VOICES = {
DEFAULT_PIPER_VOICE = "de_thorsten"
# Cached Piper voice instance
_piper_voice = None
# Cached Piper voice instances (one per model)
_piper_voices: dict = {}
_piper_available = None
_edge_available = None
@ -138,26 +147,28 @@ def is_piper_loaded() -> bool:
return check_piper_available() or _check_edge_available()
def _get_piper_voice():
"""Get or create cached Piper voice instance."""
global _piper_voice
if _piper_voice is not None:
return _piper_voice
def _get_piper_voice(model_name: str = "thorsten_medium.onnx"):
"""Get or create cached Piper voice instance for a specific model."""
global _piper_voices
if model_name in _piper_voices:
return _piper_voices[model_name]
if not check_piper_available():
return None
try:
from piper import PiperVoice
model_path = _get_piper_model_path("thorsten_medium.onnx")
config_path = _get_piper_model_path("thorsten_medium.onnx.json")
model_path = _get_piper_model_path(model_name)
config_path = _get_piper_model_path(f"{model_name}.json")
logger.info(f"Loading Piper voice from {model_path}")
_piper_voice = PiperVoice.load(str(model_path), str(config_path))
logger.info("Piper voice loaded successfully")
return _piper_voice
voice = PiperVoice.load(str(model_path), str(config_path))
_piper_voices[model_name] = voice
logger.info(f"Piper voice {model_name} loaded successfully")
return voice
except Exception as e:
logger.error(f"Failed to load Piper voice: {e}")
logger.error(f"Failed to load Piper voice {model_name}: {e}")
return None
@ -172,14 +183,19 @@ class PiperSynthesisResult:
async def _synthesize_with_piper(
text: str,
voice_id: str = "de_thorsten",
length_scale: float = 1.0,
) -> PiperSynthesisResult:
"""Synthesize using local Piper TTS."""
voice = _get_piper_voice()
if voice is None:
raise RuntimeError("Piper voice not available")
# Get the model name for this voice
voice_config = PIPER_VOICES.get(voice_id, PIPER_VOICES["de_thorsten"])
model_name = voice_config.get("model", "thorsten_medium.onnx")
logger.debug(f"Piper synthesizing: \"{text[:50]}...\"")
piper_voice = _get_piper_voice(model_name)
if piper_voice is None:
raise RuntimeError(f"Piper voice {voice_id} not available")
logger.debug(f"Piper synthesizing with {voice_id}: \"{text[:50]}...\"")
# Piper uses length_scale directly (1.0 = normal, >1 = slower)
# Run in thread pool to not block async
@ -187,7 +203,7 @@ async def _synthesize_with_piper(
def _synth():
audio_data = []
for audio_chunk in voice.synthesize_stream_raw(text, length_scale=length_scale):
for audio_chunk in piper_voice.synthesize_stream_raw(text, length_scale=length_scale):
audio_data.append(audio_chunk)
return b"".join(audio_data)
@ -195,7 +211,7 @@ async def _synthesize_with_piper(
# Convert to numpy (16-bit PCM)
audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0
sample_rate = voice.config.sample_rate
sample_rate = piper_voice.config.sample_rate
duration = len(audio) / sample_rate
logger.debug(f"Piper synthesis complete: {duration:.2f}s, {sample_rate}Hz")
@ -204,7 +220,7 @@ async def _synthesize_with_piper(
audio=audio,
sample_rate=sample_rate,
duration=duration,
voice="de_thorsten",
voice=voice_id,
)
@ -280,7 +296,7 @@ async def synthesize_piper(
# Try local Piper first for piper-type voices
if voice_type == "piper" and check_piper_available():
try:
return await _synthesize_with_piper(text, length_scale)
return await _synthesize_with_piper(text, voice, length_scale)
except Exception as e:
logger.warning(f"Piper synthesis failed, trying Edge fallback: {e}")
@ -288,8 +304,9 @@ async def synthesize_piper(
if _check_edge_available():
edge_voice = voice_config.get("edge_voice", "de-DE-ConradNeural")
if voice_type == "piper":
# Fallback: use Conrad for male voices
edge_voice = "de-DE-ConradNeural"
# Fallback: use appropriate Edge voice based on gender
gender = voice_config.get("gender", "male")
edge_voice = "de-DE-KatjaNeural" if gender == "female" else "de-DE-ConradNeural"
return await _synthesize_with_edge(text, edge_voice, length_scale)
raise RuntimeError("No TTS backend available (neither Piper nor Edge TTS)")

View file

@ -9,7 +9,7 @@ export default () => ({
tts: {
url: process.env.TTS_URL || 'http://localhost:3022',
apiKey: process.env.TTS_API_KEY || '',
defaultVoice: process.env.DEFAULT_VOICE || 'de_thorsten',
defaultVoice: process.env.DEFAULT_VOICE || 'de_kerstin',
defaultSpeed: parseFloat(process.env.DEFAULT_SPEED || '1.0'),
maxTextLength: parseInt(process.env.MAX_TEXT_LENGTH || '500', 10),
},
@ -31,10 +31,11 @@ Schreibe einfach eine Nachricht und ich sende dir die Sprachausgabe zurück.
Die Sprache wird automatisch erkannt (Deutsch/Englisch).
**Deutsche Stimmen:**
- \`de_kerstin\` - Deutsch weiblich (lokal, Standard)
- \`de_thorsten\` - Deutsch männlich (lokal)
- \`de_katja\` - Deutsch weiblich
- \`de_conrad\` - Deutsch männlich
- \`de_florian\` - Deutsch männlich jung
- \`de_katja\` - Deutsch weiblich (Cloud)
- \`de_conrad\` - Deutsch männlich (Cloud)
- \`de_florian\` - Deutsch männlich jung (Cloud)
**Englische Stimmen:**
- \`af_heart\` - Amerikanisch weiblich (warm)

View file

@ -15,14 +15,15 @@ export interface VoicesResponse {
// German voice mapping
const GERMAN_VOICES: Record<string, string> = {
de_thorsten: 'de_thorsten', // Local Piper
de_kerstin: 'de_kerstin', // Local Piper female
de_thorsten: 'de_thorsten', // Local Piper male
de_katja: 'de_katja', // Edge TTS female
de_conrad: 'de_conrad', // Edge TTS male
de_amala: 'de_amala', // Edge TTS female young
de_florian: 'de_florian', // Edge TTS male young
};
const DEFAULT_GERMAN_VOICE = 'de_thorsten';
const DEFAULT_GERMAN_VOICE = 'de_kerstin';
// Common German words for language detection
const GERMAN_INDICATORS = [
@ -122,7 +123,11 @@ export class TtsService {
/**
* Synthesize text to speech - auto-detects language
*/
async synthesize(text: string, voice: string = 'de_thorsten', speed: number = 1.0): Promise<Buffer> {
async synthesize(
text: string,
voice: string = 'de_thorsten',
speed: number = 1.0
): Promise<Buffer> {
// Auto-detect language if using English voice but text is German
const textIsGerman = this.isGerman(text);
const voiceIsGerman = this.isGermanVoice(voice);