feat(matrix-mana-bot): add voice input support (Phase 1)

- Add VoiceModule and VoiceService for STT integration
- Override handleAudioMessage to process voice notes
- Transcribe audio via mana-stt (Whisper)
- Route transcribed text through CommandRouter
- Add voice configuration and environment variables
- Update help text and documentation

Voice flow:
1. User sends voice note
2. Bot downloads and transcribes audio
3. Shows transcription: 🎤 *"text"*
4. Routes as normal text command
5. Returns text response

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-05-14 20:41:09 +02:00 · 2026-02-01 02:59:13 +01:00 · 2026-02-01 02:59:13 +01:00 · db07b5613d
commit db07b5613d
parent f04c27fe26
9 changed files with 833 additions and 13 deletions
--- a/.claude/plans/voice-integration-mana-bot.md
+++ b/.claude/plans/voice-integration-mana-bot.md
@ -0,0 +1,475 @@
+# Voice Integration für matrix-mana-bot
+
+## Übersicht
+
+Integration des mana-voice-bot Service (Port 3050) in den matrix-mana-bot Gateway, um vollständige Voice-to-Voice Interaktion zu ermöglichen.
+
+## Architektur
+
+```
+┌─────────────────────────────────────────────────────────────────────────┐
+│                         Matrix Client (Element)                          │
+│                                                                          │
+│  ┌────────────────┐   ┌────────────────┐   ┌────────────────┐          │
+│  │  Text Message  │   │  Voice Note    │   │  Audio Reply   │          │
+│  │  "!heute"      │   │  🎤 "Was..."   │   │  🔊 Response   │          │
+│  └───────┬────────┘   └───────┬────────┘   └───────▲────────┘          │
+└──────────┼────────────────────┼────────────────────┼────────────────────┘
+           │                    │                    │
+           ▼                    ▼                    │
+┌──────────────────────────────────────────────────────────────────────────┐
+│                        matrix-mana-bot (Port 3310)                        │
+│                                                                           │
+│  ┌─────────────────────────────────────────────────────────────────────┐ │
+│  │                         MatrixService                                │ │
+│  │  handleTextMessage()  │  handleAudioMessage()  │  sendAudioReply() │ │
+│  └───────────┬───────────────────┬───────────────────────▲─────────────┘ │
+│              │                   │                       │               │
+│              ▼                   ▼                       │               │
+│  ┌───────────────────────────────────────────────────────┐               │
+│  │              VoiceService (NEU)                       │               │
+│  │  • transcribeAudio() → mana-stt (3020)               │               │
+│  │  • synthesizeSpeech() → mana-voice-bot (3050)        │               │
+│  │  • User preferences (voice, speed)                   │               │
+│  └───────────────────────────────────────────────────────┘               │
+│              │                                                           │
+│              ▼                                                           │
+│  ┌───────────────────────────────────────────────────────────────────┐   │
+│  │                     CommandRouter                                  │   │
+│  │  route(ctx) → AI | Todo | Calendar | Clock | Orchestration        │   │
+│  └───────────────────────────────────────────────────────────────────┘   │
+└──────────────────────────────────────────────────────────────────────────┘
+```
+
+## User Flow
+
+### Flow 1: Voice Input → Text + Audio Output
+
+```
+User                           Bot                          Services
+  │                             │                              │
+  │  🎤 Voice Note              │                              │
+  │  "Was steht heute an?"      │                              │
+  │ ───────────────────────────>│                              │
+  │                             │                              │
+  │                             │  Download Audio              │
+  │                             │ ────────────────────────────>│ Matrix
+  │                             │<──────────────────────────── │
+  │                             │                              │
+  │                             │  POST /transcribe            │
+  │                             │ ────────────────────────────>│ mana-stt
+  │                             │  "Was steht heute an?"       │
+  │                             │<──────────────────────────── │
+  │                             │                              │
+  │                             │  route("Was steht heute an?")│
+  │                             │ ────────────────────────────>│ CommandRouter
+  │                             │  📋 Termine + Aufgaben       │
+  │                             │<──────────────────────────── │
+  │                             │                              │
+  │  📝 Text Response           │                              │
+  │  "Heute hast du..."         │                              │
+  │<─────────────────────────── │                              │
+  │                             │                              │
+  │                             │  POST /tts                   │
+  │                             │ ────────────────────────────>│ mana-voice-bot
+  │                             │  [audio/mpeg]                │
+  │                             │<──────────────────────────── │
+  │                             │                              │
+  │                             │  Upload Audio                │
+  │                             │ ────────────────────────────>│ Matrix
+  │                             │  mxc://...                   │
+  │                             │<──────────────────────────── │
+  │                             │                              │
+  │  🔊 Audio Response          │                              │
+  │  "Heute hast du..."         │                              │
+  │<─────────────────────────── │                              │
+```
+
+### Flow 2: Text Input mit Voice Response (Optional)
+
+```
+User                           Bot
+  │                             │
+  │  "!heute"                   │
+  │ ───────────────────────────>│
+  │                             │
+  │  📝 Text: "Heute hast..."   │
+  │<─────────────────────────── │
+  │                             │
+  │  (Voice Response optional)  │
+  │  🔊 Audio wenn aktiviert    │
+  │<─────────────────────────── │
+```
+
+## Neue Befehle
+
+### Voice-Einstellungen
+
+| Befehl                     | Beschreibung                          |
+| -------------------------- | ------------------------------------- |
+| `!voice`                   | Zeigt aktuelle Voice-Einstellungen    |
+| `!voice an` / `!voice aus` | Aktiviert/deaktiviert Audio-Antworten |
+| `!stimme [name]`           | Wählt TTS-Stimme                      |
+| `!stimmen`                 | Zeigt verfügbare Stimmen              |
+| `!speed [0.5-2.0]`         | Sprechgeschwindigkeit                 |
+
+### Beispiel-Session
+
+```
+User: 🎤 "Mana, was habe ich heute vor?"
+
+Bot:  📝 **Dein Tag heute (15. Februar):**
+
+      **Termine:**
+      • 10:00 - Team Meeting
+      • 14:30 - Zahnarzt
+
+      **Aufgaben:**
+      1. Einkaufen gehen !p1
+      2. Report fertigstellen @heute
+
+Bot:  🔊 [Audio: "Heute hast du zwei Termine: Um zehn Uhr Team Meeting
+          und um halb drei Zahnarzt. Außerdem stehen zwei Aufgaben an:
+          Einkaufen gehen mit hoher Priorität und Report fertigstellen."]
+```
+
+## UX-Prinzipien
+
+### 1. Text + Audio (Dual Response)
+
+Bei Voice-Input immer **beides** senden:
+
+- **Text zuerst** → Sofortiges visuelles Feedback, scrollbar, kopierbar
+- **Audio danach** → Natürliche Sprachausgabe
+
+Vorteile:
+
+- User kann sofort lesen während Audio lädt
+- Referenz bleibt im Chat-Verlauf
+- Accessibility für verschiedene Situationen
+
+### 2. Intelligente Audio-Länge
+
+| Antwort-Typ            | Audio                   | Begründung          |
+| ---------------------- | ----------------------- | ------------------- |
+| Kurz (< 200 Zeichen)   | Ja                      | Schnell, natürlich  |
+| Mittel (< 500 Zeichen) | Ja                      | Noch angenehm       |
+| Lang (> 500 Zeichen)   | Zusammenfassung         | Voller Text zu lang |
+| Listen (> 5 Items)     | Top 3 + "und X weitere" | Fokus auf Wichtiges |
+| Fehler                 | Kurze Erklärung         | Klar und hilfreich  |
+
+### 3. Kontext-Sensitive Antworten
+
+```text
+// Kurze Bestätigung
+"Aufgabe hinzugefügt: Einkaufen gehen"
+→ 🔊 "Erledigt, Einkaufen gehen wurde hinzugefügt."
+
+// Liste mit vielen Items
+"Du hast 12 Aufgaben..."
+→ 🔊 "Du hast zwölf Aufgaben, davon drei mit hoher Priorität.
+       Die wichtigsten sind: Erstens, Report fertigstellen.
+       Zweitens, Meeting vorbereiten. Drittens, E-Mails beantworten."
+
+// AI-Antwort
+[Lange Erklärung...]
+→ 🔊 [Gekürzte Version, max 30 Sekunden]
+```
+
+### 4. Natürliche Deutsche Sprache
+
+Voice-Antworten werden für Sprache optimiert:
+
+```typescript
+// Text-Format
+'10:00 - Team Meeting';
+'14:30 - Zahnarzt';
+
+// Voice-Format
+'Um zehn Uhr Team Meeting und um halb drei Zahnarzt';
+
+// Text-Format
+'!p1 @heute #arbeit';
+
+// Voice-Format
+'mit hoher Priorität, fällig heute, im Projekt Arbeit';
+```
+
+### 5. Feedback-Sounds
+
+Kurze Audio-Cues für Aktionen:
+
+| Aktion           | Sound                  |
+| ---------------- | ---------------------- |
+| Aufgabe erledigt | ✅ Kurzer "Done"-Sound |
+| Timer gestartet  | 🔔 Start-Ton           |
+| Timer abgelaufen | 🔔 Alarm-Ton           |
+| Fehler           | ❌ Sanfter Error-Ton   |
+
+## User Preferences
+
+### Persistente Einstellungen pro User
+
+```typescript
+interface VoicePreferences {
+	// Voice Response
+	voiceEnabled: boolean; // Default: true bei Voice-Input
+	alwaysVoice: boolean; // Default: false (nur bei Voice-Input)
+
+	// TTS Settings
+	voice: string; // Default: "de-DE-ConradNeural"
+	speed: number; // Default: 1.0
+
+	// Behavior
+	readLongTexts: boolean; // Default: false (Zusammenfassung)
+	maxAudioLength: number; // Default: 30 (Sekunden)
+	feedbackSounds: boolean; // Default: true
+}
+```
+
+### Speicherung
+
+- In-Memory für aktuelle Session
+- Optional: Persistierung in User-Settings-Datei
+
+## Implementierungs-Plan
+
+### Phase 1: Grundlegende Voice-Input
+
+**Ziel:** Voice Notes werden transkribiert und als Text verarbeitet
+
+1. `VoiceModule` erstellen
+2. `VoiceService` mit STT-Integration
+3. `handleAudioMessage()` in MatrixService überschreiben
+4. Transkribierte Nachricht durch CommandRouter leiten
+
+**Aufwand:** ~2-3 Stunden
+
+### Phase 2: Voice-Output
+
+**Ziel:** Antworten werden als Audio zurückgesendet
+
+1. TTS-Integration in VoiceService
+2. Audio-Upload zu Matrix
+3. `sendAudioReply()` Methode
+4. Dual-Response (Text + Audio)
+
+**Aufwand:** ~2-3 Stunden
+
+### Phase 3: Smart Formatting
+
+**Ziel:** Antworten werden für Sprache optimiert
+
+1. `VoiceFormatter` Service
+2. Zahlen → Wörter ("10:00" → "zehn Uhr")
+3. Listen-Zusammenfassung
+4. Markdown-Entfernung für TTS
+
+**Aufwand:** ~2 Stunden
+
+### Phase 4: User Preferences
+
+**Ziel:** User können Voice-Einstellungen anpassen
+
+1. Preference-Speicherung
+2. `!voice`, `!stimme`, `!stimmen` Befehle
+3. Automatische Aktivierung bei Voice-Input
+
+**Aufwand:** ~1-2 Stunden
+
+### Phase 5: Polish & Testing
+
+**Ziel:** Optimierte User Experience
+
+1. Latenz-Optimierung (parallel Processing)
+2. Error Handling
+3. Edge Cases (leere Audio, etc.)
+4. Testing mit verschiedenen Stimmen
+
+**Aufwand:** ~2 Stunden
+
+## Technische Details
+
+### Neue Dateien
+
+```
+services/matrix-mana-bot/src/
+├── voice/
+│   ├── voice.module.ts
+│   ├── voice.service.ts        # STT + TTS Orchestration
+│   ├── voice-formatter.ts      # Text → Speech-optimized
+│   └── voice-preferences.ts    # User Settings
+```
+
+### Environment Variables
+
+```env
+# Voice Bot (bestehend)
+VOICE_BOT_URL=http://localhost:3050
+
+# STT (bestehend)
+STT_URL=http://localhost:3020
+
+# Voice Settings
+VOICE_ENABLED=true
+DEFAULT_VOICE=de-DE-ConradNeural
+DEFAULT_SPEED=1.0
+MAX_AUDIO_LENGTH=30
+```
+
+### Dependencies
+
+Keine neuen Dependencies nötig - alles via HTTP APIs:
+
+- mana-stt (Port 3020) - bereits vorhanden
+- mana-voice-bot (Port 3050) - gerade erstellt
+
+## Audio-Nachricht Format
+
+### Matrix Audio Message
+
+```typescript
+// Upload Audio zu Matrix
+const mxcUrl = await this.client.uploadContent(audioBuffer, 'audio/mpeg', 'response.mp3');
+
+// Send Audio Message
+await this.client.sendMessage(roomId, {
+	msgtype: 'm.audio',
+	body: 'Voice Response',
+	url: mxcUrl,
+	info: {
+		mimetype: 'audio/mpeg',
+		size: audioBuffer.length,
+		duration: durationMs, // Optional
+	},
+	// Reply to original message
+	'm.relates_to': {
+		'm.in_reply_to': {
+			event_id: originalEventId,
+		},
+	},
+});
+```
+
+## Performance-Optimierungen
+
+### Parallel Processing
+
+```typescript
+async handleVoiceMessage(roomId: string, event: MatrixRoomEvent) {
+  // 1. Download + Transcribe
+  const audioBuffer = await this.downloadMedia(event.content.url);
+  const transcript = await this.voiceService.transcribe(audioBuffer);
+
+  // 2. Process Command (get text response)
+  const textResponse = await this.commandRouter.route({
+    roomId,
+    userId: event.sender,
+    message: transcript,
+    event,
+  });
+
+  // 3. Send Text immediately
+  await this.sendReply(roomId, event, textResponse);
+
+  // 4. Generate audio in the background (fire-and-forget: the text reply is already sent, so TTS latency never blocks the user)
+  this.generateAndSendAudio(roomId, event, textResponse)
+    .catch(err => this.logger.error('Audio generation failed:', err));
+}
+```
+
+### Caching
+
+- Voice-Preferences pro User cachen
+- Häufige kurze Antworten cachen ("Erledigt", "Hinzugefügt", etc.)
+
+## Fallback-Verhalten
+
+| Situation            | Verhalten                         |
+| -------------------- | --------------------------------- |
+| STT nicht erreichbar | Fehlermeldung, nur Text           |
+| TTS nicht erreichbar | Nur Text-Antwort, kein Audio      |
+| Leeres Audio         | "Ich konnte dich nicht verstehen" |
+| Zu langes Audio      | Transkribieren + Warnung          |
+| Unbekannte Sprache   | Auf Deutsch antworten             |
+
+## Beispiel-Interaktionen
+
+### Morgen-Routine
+
+```
+User: 🎤 "Guten Morgen Mana, was steht heute an?"
+
+Bot:  📝 ☀️ **Guten Morgen!**
+
+      **Deine Termine:**
+      • 09:00 Daily Standup
+      • 11:00 Code Review
+      • 15:00 Sprint Planning
+
+      **Wichtige Aufgaben:**
+      1. Bug-Fix für Login !p1 @heute
+      2. Dokumentation aktualisieren
+
+      **Dein Tag sieht machbar aus!** 💪
+
+Bot:  🔊 "Guten Morgen! Heute hast du drei Termine: Um neun das Daily,
+          um elf Code Review und um drei Sprint Planning.
+          Außerdem zwei wichtige Aufgaben: Der Bug-Fix für den Login
+          hat hohe Priorität und die Dokumentation sollte aktualisiert werden.
+          Dein Tag sieht machbar aus!"
+```
+
+### Quick Task
+
+```
+User: 🎤 "Neue Aufgabe: Milch kaufen"
+
+Bot:  📝 ✅ Aufgabe hinzugefügt:
+      **Milch kaufen** (Inbox)
+
+Bot:  🔊 "Erledigt, Milch kaufen wurde hinzugefügt."
+```
+
+### Timer
+
+```
+User: 🎤 "Timer 25 Minuten für Pomodoro"
+
+Bot:  📝 ⏱️ Timer gestartet: **25 Minuten** (Pomodoro)
+      Endet um 14:55
+
+Bot:  🔊 [Start-Sound] + "Timer für 25 Minuten gestartet."
+
+--- 25 Minuten später ---
+
+Bot:  📝 🔔 **Timer abgelaufen!** Pomodoro (25 min)
+
+Bot:  🔊 [Alarm-Sound] + "Dein Pomodoro Timer ist abgelaufen."
+```
+
+## Erfolgs-Metriken
+
+- **Latenz:** Voice-Input → Text-Response < 3s
+- **Latenz:** Text-Response → Audio-Response < 2s
+- **Transkription:** > 95% Genauigkeit für Deutsche Sprache
+- **Audio-Qualität:** Natürlich klingende Stimme
+
+## Offene Fragen
+
+1. **Wakeword?**
+   - Optional: "Hey Mana" am Anfang der Voice Note?
+   - Oder: Jede Voice Note wird verarbeitet?
+
+2. **Audio-Format?**
+   - MP3 (klein, universell) ✓
+   - WAV (schneller zu generieren)
+   - Opus (noch kleiner, nicht überall unterstützt)
+
+3. **Stimmen-Auswahl?**
+   - Alle 11 deutschen Stimmen anbieten?
+   - Oder nur 3-4 beste?
+
+4. **Multi-User Room?**
+   - Voice-Antwort nur an den fragenden User?
+   - Oder für alle im Room?
--- a/services/matrix-mana-bot/.env.example
+++ b/services/matrix-mana-bot/.env.example
@ -22,3 +22,10 @@ CLOCK_API_URL=http://localhost:3017/api/v1
 # Storage paths
 TODO_STORAGE_PATH=./data/todos.json
 CALENDAR_STORAGE_PATH=./data/calendar.json
+
+# Voice Services
+STT_URL=http://localhost:3020
+VOICE_BOT_URL=http://localhost:3050
+DEFAULT_VOICE=de-DE-ConradNeural
+DEFAULT_SPEED=1.0
+VOICE_ENABLED=true
--- a/services/matrix-mana-bot/CLAUDE.md
+++ b/services/matrix-mana-bot/CLAUDE.md
@ -53,6 +53,7 @@ Unified Matrix bot that combines all features in one. Users can interact with a
 | **Calendar** | `!cal`, `!week`, `!event`, `!calendars` | Event scheduling |
 | **Timers** | `!timer`, `!timers`, `!stop`, `!alarm`, `!alarms` | Time management |
 | **Smart** | `!summary`, `!ai-todo` | Cross-feature AI features |
+| **Voice** | Send voice note | Speech-to-text via Whisper |

 ## Commands

@ -143,6 +144,20 @@ Was ist TypeScript?
 !time tokyo
 ```

+### Voice Input
+
+```
+# Send a voice note in Matrix - bot transcribes and responds
+🎤 "Was steht heute an?"
+→ Bot shows: 🎤 *"Was steht heute an?"*
+→ Bot responds with today's events and tasks
+
+# Voice commands work naturally
+🎤 "Neue Aufgabe: Einkaufen gehen"
+🎤 "Timer 25 Minuten"
+🎤 "Was sind meine Termine diese Woche?"
+```
+
 ### Smart Features (Cross-Feature)

 ```
@ -211,6 +226,9 @@ src/
 │   ├── bot.module.ts
 │   ├── matrix.service.ts      # Matrix connection
 │   └── command-router.service.ts  # Command routing
+├── voice/
+│   ├── voice.module.ts
+│   └── voice.service.ts       # STT/TTS integration
 ├── handlers/
 │   ├── handlers.module.ts
 │   ├── ai.handler.ts          # AI/Ollama commands
@ -304,3 +322,7 @@ All bots share the same `@manacore/bot-services` package, so data is consistent.
 | `CLOCK_API_URL` | No | localhost:3017 | Clock backend |
 | `TODO_STORAGE_PATH` | No | ./data/todos.json | Todo storage |
 | `CALENDAR_STORAGE_PATH` | No | ./data/calendar.json | Calendar storage |
+| `STT_URL` | No | localhost:3020 | Speech-to-text (Whisper) |
+| `VOICE_BOT_URL` | No | localhost:3050 | Voice bot (TTS) |
+| `DEFAULT_VOICE` | No | de-DE-ConradNeural | Default TTS voice |
+| `DEFAULT_SPEED` | No | 1.0 | Default TTS speed |
+| `VOICE_ENABLED` | No | true | Enable voice processing |
--- a/services/matrix-mana-bot/src/bot/bot.module.ts
+++ b/services/matrix-mana-bot/src/bot/bot.module.ts
@ -3,9 +3,10 @@ import { MatrixService } from './matrix.service';
 import { CommandRouterService } from './command-router.service';
 import { HandlersModule } from '../handlers/handlers.module';
 import { OrchestrationModule } from '../orchestration/orchestration.module';
+import { VoiceModule } from '../voice/voice.module';

@Module({
-	imports: [forwardRef(() => HandlersModule), forwardRef(() => OrchestrationModule)],
+	imports: [forwardRef(() => HandlersModule), forwardRef(() => OrchestrationModule), VoiceModule],
 	providers: [MatrixService, CommandRouterService],
 	exports: [MatrixService, CommandRouterService],
 })
--- a/services/matrix-mana-bot/src/bot/command-router.service.ts
+++ b/services/matrix-mana-bot/src/bot/command-router.service.ts
@ -11,6 +11,7 @@ export interface CommandContext {
 	userId: string;
 	message: string;
 	event: any;
+	isVoice?: boolean; // True if message came from voice input
 }

 interface CommandRoute {
@ -23,7 +24,10 @@ interface CommandRoute {
 const KEYWORD_COMMANDS: { keywords: string[]; command: string }[] = [
 	{ keywords: ['hilfe', 'help', 'was kannst du', 'befehle'], command: '!help' },
 	{ keywords: ['modelle', 'models', 'welche modelle'], command: '!models' },
-	{ keywords: ['meine aufgaben', 'zeige aufgaben', 'todo liste', 'was muss ich'], command: '!list' },
+	{
+		keywords: ['meine aufgaben', 'zeige aufgaben', 'todo liste', 'was muss ich'],
+		command: '!list',
+	},
 	{ keywords: ['heute', 'was steht heute an'], command: '!today' },
 	{ keywords: ['termine', 'kalender', 'meine termine'], command: '!cal' },
 	{ keywords: ['timer', 'stoppuhr'], command: '!timers' },
@ -97,7 +101,7 @@ export class CommandRouterService {
 			{
 				patterns: ['!today', '!heute'],
 				handler: (ctx) => this.todoHandler.today(ctx),
-				description: 'Today\'s todos',
+				description: "Today's todos",
 			},
 			{
 				patterns: ['!inbox'],
@ -124,7 +128,7 @@ export class CommandRouterService {
 			{
 				patterns: ['!cal', '!termine'],
 				handler: (ctx) => this.calendarHandler.today(ctx),
-				description: 'Today\'s events',
+				description: "Today's events",
 			},
 			{
 				patterns: ['!week', '!woche'],
--- a/services/matrix-mana-bot/src/bot/matrix.service.ts
+++ b/services/matrix-mana-bot/src/bot/matrix.service.ts
@ -1,28 +1,32 @@
 import { Injectable, Inject, forwardRef } from '@nestjs/common';
 import { ConfigService } from '@nestjs/config';
-import {
-	BaseMatrixService,
-	MatrixBotConfig,
-	MatrixRoomEvent,
-} from '@manacore/matrix-bot-common';
+import { BaseMatrixService, MatrixBotConfig, MatrixRoomEvent } from '@manacore/matrix-bot-common';
 import { CommandRouterService, CommandContext } from './command-router.service';
+import { VoiceService } from '../voice/voice.service';
 import { HELP_TEXT, WELCOME_TEXT, BOT_INTRODUCTION } from '../config/configuration';

@Injectable()
 export class MatrixService extends BaseMatrixService {
+	private voiceEnabled: boolean;
+
 	constructor(
 		configService: ConfigService,
 		@Inject(forwardRef(() => CommandRouterService))
-		private commandRouter: CommandRouterService
+		private commandRouter: CommandRouterService,
+		@Inject(forwardRef(() => VoiceService))
+		private voiceService: VoiceService
 	) {
 		super(configService);
+		this.voiceEnabled = configService.get('voice.enabled') !== false;
 	}

 	protected getConfig(): MatrixBotConfig {
 		return {
-			homeserverUrl: this.configService.get<string>('matrix.homeserverUrl') || 'http://localhost:8008',
+			homeserverUrl:
+				this.configService.get<string>('matrix.homeserverUrl') || 'http://localhost:8008',
 			accessToken: this.configService.get<string>('matrix.accessToken') || '',
-			storagePath: this.configService.get<string>('matrix.storagePath') || './data/mana-bot-storage.json',
+			storagePath:
+				this.configService.get<string>('matrix.storagePath') || './data/mana-bot-storage.json',
 			allowedRooms: this.configService.get<string[]>('matrix.allowedRooms') || [],
 		};
 	}
@ -99,6 +103,81 @@ export class MatrixService extends BaseMatrixService {
 		}
 	}

+	/**
+	 * Handle voice note messages - transcribe and process as text.
+	 *
+	 * Flow: download the audio referenced by the event, transcribe it through
+	 * VoiceService, echo the transcription back to the room, then route the
+	 * text through the CommandRouter (with `isVoice: true`) and reply with the
+	 * router's response, if any.
+	 *
+	 * The typing indicator is purely cosmetic, so failures to set or clear it
+	 * are logged and never abort the request or mask the real error.
+	 *
+	 * @param roomId Room the audio message was sent in
+	 * @param event Audio event; its `content.url` is passed to downloadMedia
+	 * @param sender User ID of the sender, used as `userId` for routing
+	 */
+	protected async handleAudioMessage(
+		roomId: string,
+		event: MatrixRoomEvent,
+		sender: string
+	): Promise<void> {
+		if (!this.voiceEnabled) {
+			return;
+		}
+
+		const audioUrl = event.content?.url;
+		if (!audioUrl) {
+			this.logger.warn('Audio message without URL');
+			return;
+		}
+
+		try {
+			// Set typing indicator (best effort)
+			await this.setTypingSafely(roomId, true);
+
+			// Download audio from Matrix
+			this.logger.debug(`Downloading audio from ${audioUrl}`);
+			const audioBuffer = await this.downloadMedia(audioUrl);
+
+			// Transcribe audio
+			this.logger.debug(`Transcribing ${audioBuffer.length} bytes`);
+			const transcription = await this.voiceService.transcribe(audioBuffer);
+
+			// Missing and whitespace-only transcriptions are both treated as "nothing understood"
+			const message = transcription.text?.trim() ?? '';
+			if (message === '') {
+				await this.setTypingSafely(roomId, false);
+				await this.sendReply(
+					roomId,
+					event,
+					'🎤 Ich konnte leider nichts verstehen. Bitte versuche es noch einmal.'
+				);
+				return;
+			}
+
+			this.logger.log(`Transcribed from ${sender}: "${message}"`);
+
+			// Show what was understood
+			await this.sendReply(roomId, event, `🎤 *"${message}"*`);
+
+			// Create context and route
+			const ctx: CommandContext = {
+				roomId,
+				userId: sender,
+				message,
+				event,
+				isVoice: true, // Flag for voice input
+			};
+
+			// Route the transcribed message
+			const response = await this.commandRouter.route(ctx);
+
+			// Stop typing before the answer is posted
+			await this.setTypingSafely(roomId, false);
+
+			if (response) {
+				await this.sendReply(roomId, event, response);
+			}
+		} catch (error) {
+			// Log first so the root cause is recorded even if the cleanup below fails
+			this.logger.error(`Error handling voice message:`, error);
+			await this.setTypingSafely(roomId, false);
+			await this.sendReply(
+				roomId,
+				event,
+				'❌ Spracherkennung fehlgeschlagen. Bitte versuche es noch einmal.'
+			);
+		}
+	}
+
+	/**
+	 * Set or clear the typing indicator without ever throwing; failures are
+	 * only logged as warnings.
+	 */
+	private async setTypingSafely(roomId: string, typing: boolean): Promise<void> {
+		try {
+			if (typing) {
+				await this.client.setTyping(roomId, true, 60000);
+			} else {
+				await this.client.setTyping(roomId, false);
+			}
+		} catch (error) {
+			this.logger.warn(`Failed to update typing indicator in ${roomId}: ${error}`);
+		}
+	}
+
 	private async sendWelcomeMessage(roomId: string, userId: string) {
 		try {
 			await this.sendMessage(roomId, WELCOME_TEXT);
--- a/services/matrix-mana-bot/src/config/configuration.ts
+++ b/services/matrix-mana-bot/src/config/configuration.ts
@ -24,6 +24,13 @@ export default () => ({
 			storagePath: process.env.CALENDAR_STORAGE_PATH || './data/calendar.json',
 		},
 	},
+	voice: {
+		sttUrl: process.env.STT_URL || 'http://localhost:3020',
+		voiceBotUrl: process.env.VOICE_BOT_URL || 'http://localhost:3050',
+		defaultVoice: process.env.DEFAULT_VOICE || 'de-DE-ConradNeural',
+		defaultSpeed: parseFloat(process.env.DEFAULT_SPEED) || 1.0,
+		enabled: process.env.VOICE_ENABLED !== 'false',
+	},
 });

 // Help text for the unified bot
@ -57,6 +64,12 @@ Schreib einfach eine Nachricht - ich antworte!
 • \`!summary\` - Tages-Zusammenfassung (AI)
 • \`!ai-todo [text]\` - AI extrahiert Todos aus Text

+**🎤 Spracheingabe**
+Sende eine Sprachnachricht - ich verstehe dich!
+• Natürliche Befehle: "Was steht heute an?"
+• Aufgaben: "Neue Aufgabe: Einkaufen gehen"
+• Timer: "Timer 25 Minuten"
+
 **💡 Tipps**
 • Natürliche Sprache funktioniert: "Was sind meine Todos?"
 • Prioritäten: \`!todo Wichtig !p1\`
@ -73,8 +86,9 @@ Ich bin dein persönlicher Assistent mit vielen Funktionen:
 • 📋 Todo-Verwaltung
 • 📅 Kalender
 • ⏱️ Timer & Alarme
+• 🎤 Spracherkennung

-Schreib einfach eine Nachricht oder sag "hilfe" für alle Befehle!`;
+Schreib einfach eine Nachricht, sende eine Sprachnachricht, oder sag "hilfe" für alle Befehle!`;

 export const BOT_INTRODUCTION = `🤖 **Hallo! Ich bin Mana, euer All-in-One Assistent.**

--- a/services/matrix-mana-bot/src/voice/voice.module.ts
+++ b/services/matrix-mana-bot/src/voice/voice.module.ts
@ -0,0 +1,8 @@
+import { Module } from '@nestjs/common';
+import { VoiceService } from './voice.service';
+
+@Module({
+	providers: [VoiceService], // the only provider of this module
+	exports: [VoiceService], // exported so importing modules can inject VoiceService
+})
+export class VoiceModule {} // NestJS module that bundles the voice (STT/TTS) service
--- a/services/matrix-mana-bot/src/voice/voice.service.ts
+++ b/services/matrix-mana-bot/src/voice/voice.service.ts
@ -0,0 +1,210 @@
+import { Injectable, Logger } from '@nestjs/common';
+import { ConfigService } from '@nestjs/config';
+
+export interface TranscriptionResult { // Result of VoiceService.transcribe()
+	text: string; // Transcribed text; empty string when the STT response has none
+	language: string; // Language from the STT response, else the requested language
+	duration?: number; // Wall-clock time of the STT request in ms (not the audio length)
+}
+
+export interface VoicePreferences { // Per-user voice settings kept by VoiceService
+	voiceEnabled: boolean; // Toggled via VoiceService.setVoiceEnabled(); defaults to true
+	voice: string; // TTS voice name sent to the voice bot as the `voice` form field
+	speed: number; // Speaking speed; VoiceService.setSpeed() clamps it to 0.5–2.0
+}
+
+/**
+ * HTTP client for the voice backends plus an in-memory store of per-user
+ * voice preferences.
+ *
+ * - Speech-to-text: `POST {sttUrl}/transcribe`
+ * - Text-to-speech: `POST {voiceBotUrl}/tts`, voices via `GET {voiceBotUrl}/voices`
+ *
+ * Preferences live in a Map on this instance, so they are lost on restart.
+ */
+@Injectable()
+export class VoiceService {
+	private readonly logger = new Logger(VoiceService.name);
+	private readonly sttUrl: string;
+	private readonly voiceBotUrl: string;
+	private readonly defaultVoice: string;
+	private readonly defaultSpeed: number;
+
+	// User preferences (in-memory for now)
+	private readonly userPreferences = new Map<string, VoicePreferences>();
+
+	constructor(private configService: ConfigService) {
+		this.sttUrl = this.configService.get<string>('voice.sttUrl') || 'http://localhost:3020';
+		this.voiceBotUrl =
+			this.configService.get<string>('voice.voiceBotUrl') || 'http://localhost:3050';
+		this.defaultVoice =
+			this.configService.get<string>('voice.defaultVoice') || 'de-DE-ConradNeural';
+		// `||` on purpose: a configured speed of 0 (or NaN) is not usable and falls back to 1.0
+		this.defaultSpeed = this.configService.get<number>('voice.defaultSpeed') || 1.0;
+
+		this.logger.log(`Voice Service initialized`);
+		this.logger.log(`STT URL: ${this.sttUrl}`);
+		this.logger.log(`Voice Bot URL: ${this.voiceBotUrl}`);
+	}
+
+	/**
+	 * Transcribe audio to text using mana-stt (Whisper).
+	 *
+	 * @param audioBuffer Raw audio bytes; uploaded as multipart field `file` named `audio.ogg`
+	 * @param language Language hint sent to the STT service (default `de`)
+	 * @returns Transcribed text (empty string if the response has none), the
+	 *          language reported by the service (or the hint), and the request
+	 *          time in milliseconds
+	 * @throws Error when the request fails or the service answers with a non-2xx status
+	 */
+	async transcribe(audioBuffer: Buffer, language = 'de'): Promise<TranscriptionResult> {
+		const startTime = Date.now();
+
+		try {
+			const formData = new FormData();
+			// Convert Buffer to Uint8Array for Blob compatibility
+			const uint8Array = new Uint8Array(audioBuffer);
+			formData.append('file', new Blob([uint8Array]), 'audio.ogg');
+			formData.append('language', language);
+
+			const response = await fetch(`${this.sttUrl}/transcribe`, {
+				method: 'POST',
+				body: formData,
+			});
+
+			if (!response.ok) {
+				const error = await response.text();
+				throw new Error(`STT error: ${response.status} - ${error}`);
+			}
+
+			// The payload comes from another service: accept only string fields
+			const result = (await response.json()) as { text?: unknown; language?: unknown } | null;
+			const text = typeof result?.text === 'string' ? result.text : '';
+			const detectedLanguage =
+				typeof result?.language === 'string' && result.language !== ''
+					? result.language
+					: language;
+			const duration = Date.now() - startTime;
+
+			// Only mark the preview as truncated when it actually is
+			const preview = text.length > 50 ? `${text.substring(0, 50)}...` : text;
+			this.logger.debug(`Transcribed in ${duration}ms: "${preview}"`);
+
+			return {
+				text,
+				language: detectedLanguage,
+				duration,
+			};
+		} catch (error) {
+			this.logger.error(`Transcription failed: ${error}`);
+			throw error;
+		}
+	}
+
+	/**
+	 * Synthesize speech from text using mana-voice-bot (Edge TTS).
+	 *
+	 * @param text Text to speak
+	 * @param userId Optional user whose preferred voice is used; defaults apply otherwise
+	 * @returns The response body of the TTS request as a Buffer
+	 * @throws Error when the request fails or the service answers with a non-2xx status
+	 */
+	async synthesize(text: string, userId?: string): Promise<Buffer> {
+		const prefs = this.getUserPreferences(userId);
+		const startTime = Date.now();
+
+		try {
+			const formData = new FormData();
+			formData.append('text', text);
+			formData.append('voice', prefs.voice);
+			// NOTE(review): prefs.speed is stored and clamped but never sent to the TTS
+			// service, so setSpeed() currently has no audible effect. Forward it once the
+			// field name/format expected by POST /tts is confirmed.
+
+			const response = await fetch(`${this.voiceBotUrl}/tts`, {
+				method: 'POST',
+				body: formData,
+			});
+
+			if (!response.ok) {
+				const error = await response.text();
+				throw new Error(`TTS error: ${response.status} - ${error}`);
+			}
+
+			const arrayBuffer = await response.arrayBuffer();
+			const buffer = Buffer.from(arrayBuffer);
+			const duration = Date.now() - startTime;
+
+			this.logger.debug(`Synthesized ${buffer.length} bytes in ${duration}ms`);
+
+			return buffer;
+		} catch (error) {
+			this.logger.error(`Synthesis failed: ${error}`);
+			throw error;
+		}
+	}
+
+	/**
+	 * Get available TTS voices.
+	 *
+	 * @returns The `voices` object of the voice bot's `/voices` response, or an
+	 *          empty object when the request fails or the field is missing
+	 */
+	async getVoices(): Promise<Record<string, string>> {
+		try {
+			const response = await fetch(`${this.voiceBotUrl}/voices`);
+			if (!response.ok) {
+				throw new Error(`Failed to get voices: ${response.status}`);
+			}
+			const data = (await response.json()) as { voices?: Record<string, string> } | null;
+			return data?.voices || {};
+		} catch (error) {
+			this.logger.error(`Failed to get voices: ${error}`);
+			return {};
+		}
+	}
+
+	/**
+	 * Check if voice services are available.
+	 *
+	 * Both `/health` endpoints are probed concurrently, each with a 5 s timeout.
+	 *
+	 * @returns Reachability of the STT and TTS services; never rejects
+	 */
+	async checkHealth(): Promise<{ stt: boolean; tts: boolean }> {
+		const [stt, tts] = await Promise.all([
+			this.isHealthy(this.sttUrl),
+			this.isHealthy(this.voiceBotUrl),
+		]);
+
+		return { stt, tts };
+	}
+
+	/**
+	 * Probe `{baseUrl}/health`; any error or timeout counts as unhealthy.
+	 */
+	private async isHealthy(baseUrl: string): Promise<boolean> {
+		try {
+			const response = await fetch(`${baseUrl}/health`, {
+				signal: AbortSignal.timeout(5000),
+			});
+			return response.ok;
+		} catch {
+			return false;
+		}
+	}
+
+	/**
+	 * Build a fresh preferences object from the configured defaults.
+	 */
+	private defaultPreferences(): VoicePreferences {
+		return {
+			voiceEnabled: true,
+			voice: this.defaultVoice,
+			speed: this.defaultSpeed,
+		};
+	}
+
+	/**
+	 * Get user voice preferences.
+	 *
+	 * @param userId User to look up; without an ID the defaults are returned
+	 * @returns A copy of the stored preferences, or the defaults if none are stored
+	 */
+	getUserPreferences(userId?: string): VoicePreferences {
+		const stored = userId ? this.userPreferences.get(userId) : undefined;
+		// Hand out a copy so callers cannot mutate the stored entry by accident
+		return stored ? { ...stored } : this.defaultPreferences();
+	}
+
+	/**
+	 * Update user voice preferences; fields not present in `prefs` keep their
+	 * current (or default) value.
+	 */
+	setUserPreferences(userId: string, prefs: Partial<VoicePreferences>): void {
+		const current = this.getUserPreferences(userId);
+		this.userPreferences.set(userId, { ...current, ...prefs });
+	}
+
+	/**
+	 * Enable/disable voice responses for user
+	 */
+	setVoiceEnabled(userId: string, enabled: boolean): void {
+		this.setUserPreferences(userId, { voiceEnabled: enabled });
+	}
+
+	/**
+	 * Set user's preferred voice
+	 */
+	setVoice(userId: string, voice: string): void {
+		this.setUserPreferences(userId, { voice });
+	}
+
+	/**
+	 * Set user's preferred speed, clamped to the range 0.5–2.0.
+	 *
+	 * NaN (e.g. from parsing non-numeric user input) is ignored, because
+	 * Math.min/Math.max would pass it through and store an unusable speed.
+	 */
+	setSpeed(userId: string, speed: number): void {
+		if (Number.isNaN(speed)) {
+			this.logger.warn(`Ignoring invalid speed for ${userId}`);
+			return;
+		}
+		const clampedSpeed = Math.max(0.5, Math.min(2.0, speed));
+		this.setUserPreferences(userId, { speed: clampedSpeed });
+	}
+}