✨ feat(tts): add de_kerstin female German voice and set as default

- Download and configure Kerstin Piper voice (63MB, local)
- Update piper_service.py to support multiple voice models
- Set de_kerstin as default voice for TTS bot
- Update help text with new voice options

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-05-14 22:41:09 +02:00 · 2026-02-14 12:16:58 +01:00 · 2026-02-14 12:16:58 +01:00 · 4b950b7083
commit 4b950b7083
parent 01b011235f
4 changed files with 54 additions and 31 deletions
--- a/docker-compose.macmini.yml
+++ b/docker-compose.macmini.yml
@ -991,7 +991,7 @@ services:
      MATRIX_ALLOWED_ROOMS: ${MATRIX_TTS_BOT_ROOMS:-}
      TTS_URL: http://host.docker.internal:3022
      TTS_API_KEY: ${TTS_INTERNAL_API_KEY:-}
-      DEFAULT_VOICE: de_thorsten
+      DEFAULT_VOICE: de_kerstin
      DEFAULT_SPEED: 1.0
      MAX_TEXT_LENGTH: 500
    volumes:
--- a/services/mana-tts/app/piper_service.py
+++ b/services/mana-tts/app/piper_service.py
@ -32,6 +32,15 @@ PIPER_VOICES = {
        "gender": "male",
        "local": True,
    },
+    "de_kerstin": {
+        "type": "piper",
+        "model": "kerstin_low.onnx",
+        "name": "Kerstin",
+        "description": "Deutsche Frauenstimme (lokal, schnell)",
+        "language": "de",
+        "gender": "female",
+        "local": True,
+    },
    # === EDGE TTS VOICES (Fallback - Cloud) ===
    "de_katja": {
        "type": "edge",
@ -83,8 +92,8 @@ PIPER_VOICES = {

 DEFAULT_PIPER_VOICE = "de_thorsten"

-# Cached Piper voice instance
-_piper_voice = None
+# Cached Piper voice instances (one per model)
+_piper_voices: dict = {}
 _piper_available = None
 _edge_available = None

@ -138,26 +147,28 @@ def is_piper_loaded() -> bool:
    return check_piper_available() or _check_edge_available()


-def _get_piper_voice():
-    """Get or create cached Piper voice instance."""
-    global _piper_voice
-    if _piper_voice is not None:
-        return _piper_voice
+def _get_piper_voice(model_name: str = "thorsten_medium.onnx"):
+    """Get or create cached Piper voice instance for a specific model."""
+    global _piper_voices
+
+    if model_name in _piper_voices:
+        return _piper_voices[model_name]

    if not check_piper_available():
        return None

    try:
        from piper import PiperVoice
-        model_path = _get_piper_model_path("thorsten_medium.onnx")
-        config_path = _get_piper_model_path("thorsten_medium.onnx.json")
+        model_path = _get_piper_model_path(model_name)
+        config_path = _get_piper_model_path(f"{model_name}.json")

        logger.info(f"Loading Piper voice from {model_path}")
-        _piper_voice = PiperVoice.load(str(model_path), str(config_path))
-        logger.info("Piper voice loaded successfully")
-        return _piper_voice
+        voice = PiperVoice.load(str(model_path), str(config_path))
+        _piper_voices[model_name] = voice
+        logger.info(f"Piper voice {model_name} loaded successfully")
+        return voice
    except Exception as e:
-        logger.error(f"Failed to load Piper voice: {e}")
+        logger.error(f"Failed to load Piper voice {model_name}: {e}")
        return None


@ -172,14 +183,19 @@ class PiperSynthesisResult:

 async def _synthesize_with_piper(
    text: str,
+    voice_id: str = "de_thorsten",
    length_scale: float = 1.0,
 ) -> PiperSynthesisResult:
    """Synthesize using local Piper TTS."""
-    voice = _get_piper_voice()
-    if voice is None:
-        raise RuntimeError("Piper voice not available")
+    # Get the model name for this voice
+    voice_config = PIPER_VOICES.get(voice_id, PIPER_VOICES["de_thorsten"])
+    model_name = voice_config.get("model", "thorsten_medium.onnx")

-    logger.debug(f"Piper synthesizing: \"{text[:50]}...\"")
+    piper_voice = _get_piper_voice(model_name)
+    if piper_voice is None:
+        raise RuntimeError(f"Piper voice {voice_id} not available")
+
+    logger.debug(f"Piper synthesizing with {voice_id}: \"{text[:50]}...\"")

    # Piper uses length_scale directly (1.0 = normal, >1 = slower)
    # Run in thread pool to not block async
@ -187,7 +203,7 @@ async def _synthesize_with_piper(

    def _synth():
        audio_data = []
-        for audio_chunk in voice.synthesize_stream_raw(text, length_scale=length_scale):
+        for audio_chunk in piper_voice.synthesize_stream_raw(text, length_scale=length_scale):
            audio_data.append(audio_chunk)
        return b"".join(audio_data)

@ -195,7 +211,7 @@ async def _synthesize_with_piper(

    # Convert to numpy (16-bit PCM)
    audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0
-    sample_rate = voice.config.sample_rate
+    sample_rate = piper_voice.config.sample_rate

    duration = len(audio) / sample_rate
    logger.debug(f"Piper synthesis complete: {duration:.2f}s, {sample_rate}Hz")
@ -204,7 +220,7 @@ async def _synthesize_with_piper(
        audio=audio,
        sample_rate=sample_rate,
        duration=duration,
-        voice="de_thorsten",
+        voice=voice_id,
    )


@ -280,7 +296,7 @@ async def synthesize_piper(
    # Try local Piper first for piper-type voices
    if voice_type == "piper" and check_piper_available():
        try:
-            return await _synthesize_with_piper(text, length_scale)
+            return await _synthesize_with_piper(text, voice, length_scale)
        except Exception as e:
            logger.warning(f"Piper synthesis failed, trying Edge fallback: {e}")

@ -288,8 +304,9 @@ async def synthesize_piper(
    if _check_edge_available():
        edge_voice = voice_config.get("edge_voice", "de-DE-ConradNeural")
        if voice_type == "piper":
-            # Fallback: use Conrad for male voices
-            edge_voice = "de-DE-ConradNeural"
+            # Fallback: use appropriate Edge voice based on gender
+            gender = voice_config.get("gender", "male")
+            edge_voice = "de-DE-KatjaNeural" if gender == "female" else "de-DE-ConradNeural"
        return await _synthesize_with_edge(text, edge_voice, length_scale)

    raise RuntimeError("No TTS backend available (neither Piper nor Edge TTS)")
--- a/services/matrix-tts-bot/src/config/configuration.ts
+++ b/services/matrix-tts-bot/src/config/configuration.ts
@ -9,7 +9,7 @@ export default () => ({
 	tts: {
 		url: process.env.TTS_URL || 'http://localhost:3022',
 		apiKey: process.env.TTS_API_KEY || '',
-		defaultVoice: process.env.DEFAULT_VOICE || 'de_thorsten',
+		defaultVoice: process.env.DEFAULT_VOICE || 'de_kerstin',
 		defaultSpeed: parseFloat(process.env.DEFAULT_SPEED || '1.0'),
 		maxTextLength: parseInt(process.env.MAX_TEXT_LENGTH || '500', 10),
 	},
@ -31,10 +31,11 @@ Schreibe einfach eine Nachricht und ich sende dir die Sprachausgabe zurück.
 Die Sprache wird automatisch erkannt (Deutsch/Englisch).

 **Deutsche Stimmen:**
+- \`de_kerstin\` - Deutsch weiblich (lokal, Standard)
 - \`de_thorsten\` - Deutsch männlich (lokal)
- \`de_katja\` - Deutsch weiblich
- \`de_conrad\` - Deutsch männlich
- \`de_florian\` - Deutsch männlich jung
+- \`de_katja\` - Deutsch weiblich (Cloud)
+- \`de_conrad\` - Deutsch männlich (Cloud)
+- \`de_florian\` - Deutsch männlich jung (Cloud)

 **Englische Stimmen:**
 - \`af_heart\` - Amerikanisch weiblich (warm)
--- a/services/matrix-tts-bot/src/tts/tts.service.ts
+++ b/services/matrix-tts-bot/src/tts/tts.service.ts
@ -15,14 +15,15 @@ export interface VoicesResponse {

 // German voice mapping
 const GERMAN_VOICES: Record<string, string> = {
-	de_thorsten: 'de_thorsten', // Local Piper
+	de_kerstin: 'de_kerstin', // Local Piper female
+	de_thorsten: 'de_thorsten', // Local Piper male
 	de_katja: 'de_katja', // Edge TTS female
 	de_conrad: 'de_conrad', // Edge TTS male
 	de_amala: 'de_amala', // Edge TTS female young
 	de_florian: 'de_florian', // Edge TTS male young
 };

-const DEFAULT_GERMAN_VOICE = 'de_thorsten';
+const DEFAULT_GERMAN_VOICE = 'de_kerstin';

 // Common German words for language detection
 const GERMAN_INDICATORS = [
@ -122,7 +123,11 @@ export class TtsService {
 	/**
 	 * Synthesize text to speech - auto-detects language
 	 */
-	async synthesize(text: string, voice: string = 'de_thorsten', speed: number = 1.0): Promise<Buffer> {
+	async synthesize(
+		text: string,
+		voice: string = 'de_thorsten',
+		speed: number = 1.0
+	): Promise<Buffer> {
 		// Auto-detect language if using English voice but text is German
 		const textIsGerman = this.isGerman(text);
 		const voiceIsGerman = this.isGermanVoice(voice);