feat(matrix-mana-bot): add voice input support (Phase 1)

- Add VoiceModule and VoiceService for STT integration
- Override handleAudioMessage to process voice notes
- Transcribe audio via mana-stt (Whisper)
- Route transcribed text through CommandRouter
- Add voice configuration and environment variables
- Update help text and documentation

Voice flow:
1. User sends voice note
2. Bot downloads and transcribes audio
3. Shows transcription: 🎤 *"text"*
4. Routes as normal text command
5. Returns text response

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Till-JS 2026-02-01 02:59:13 +01:00
parent f04c27fe26
commit db07b5613d
9 changed files with 833 additions and 13 deletions

View file

@ -22,3 +22,10 @@ CLOCK_API_URL=http://localhost:3017/api/v1
# Storage paths
TODO_STORAGE_PATH=./data/todos.json
CALENDAR_STORAGE_PATH=./data/calendar.json
# Voice Services
STT_URL=http://localhost:3020
VOICE_BOT_URL=http://localhost:3050
DEFAULT_VOICE=de-DE-ConradNeural
DEFAULT_SPEED=1.0
VOICE_ENABLED=true

View file

@ -53,6 +53,7 @@ Unified Matrix bot that combines all features in one. Users can interact with a
| **Calendar** | `!cal`, `!week`, `!event`, `!calendars` | Event scheduling |
| **Timers** | `!timer`, `!timers`, `!stop`, `!alarm`, `!alarms` | Time management |
| **Smart** | `!summary`, `!ai-todo` | Cross-feature AI features |
| **Voice** | Send voice note | Speech-to-text via Whisper |
## Commands
@ -143,6 +144,20 @@ Was ist TypeScript?
!time tokyo
```
### Voice Input
```
# Send a voice note in Matrix - bot transcribes and responds
🎤 "Was steht heute an?"
→ Bot shows: 🎤 *"Was steht heute an?"*
→ Bot responds with today's events and tasks
# Voice commands work naturally
🎤 "Neue Aufgabe: Einkaufen gehen"
🎤 "Timer 25 Minuten"
🎤 "Was sind meine Termine diese Woche?"
```
### Smart Features (Cross-Feature)
```
@ -211,6 +226,9 @@ src/
│ ├── bot.module.ts
│ ├── matrix.service.ts # Matrix connection
│ └── command-router.service.ts # Command routing
├── voice/
│ ├── voice.module.ts
│ └── voice.service.ts # STT/TTS integration
├── handlers/
│ ├── handlers.module.ts
│ ├── ai.handler.ts # AI/Ollama commands
@ -304,3 +322,7 @@ All bots share the same `@manacore/bot-services` package, so data is consistent.
| `CLOCK_API_URL` | No | localhost:3017 | Clock backend |
| `TODO_STORAGE_PATH` | No | ./data/todos.json | Todo storage |
| `CALENDAR_STORAGE_PATH` | No | ./data/calendar.json | Calendar storage |
| `STT_URL` | No | localhost:3020 | Speech-to-text (Whisper) |
| `VOICE_BOT_URL` | No | localhost:3050 | Voice bot (TTS) |
| `DEFAULT_VOICE` | No | de-DE-ConradNeural | Default TTS voice |
| `DEFAULT_SPEED` | No | 1.0 | Default TTS speaking speed |
| `VOICE_ENABLED` | No | true | Enable voice processing |

View file

@ -3,9 +3,10 @@ import { MatrixService } from './matrix.service';
import { CommandRouterService } from './command-router.service';
import { HandlersModule } from '../handlers/handlers.module';
import { OrchestrationModule } from '../orchestration/orchestration.module';
import { VoiceModule } from '../voice/voice.module';
@Module({
imports: [forwardRef(() => HandlersModule), forwardRef(() => OrchestrationModule)],
imports: [forwardRef(() => HandlersModule), forwardRef(() => OrchestrationModule), VoiceModule],
providers: [MatrixService, CommandRouterService],
exports: [MatrixService, CommandRouterService],
})

View file

@ -11,6 +11,7 @@ export interface CommandContext {
userId: string;
message: string;
event: any;
isVoice?: boolean; // True if message came from voice input
}
interface CommandRoute {
@ -23,7 +24,10 @@ interface CommandRoute {
const KEYWORD_COMMANDS: { keywords: string[]; command: string }[] = [
{ keywords: ['hilfe', 'help', 'was kannst du', 'befehle'], command: '!help' },
{ keywords: ['modelle', 'models', 'welche modelle'], command: '!models' },
{ keywords: ['meine aufgaben', 'zeige aufgaben', 'todo liste', 'was muss ich'], command: '!list' },
{
keywords: ['meine aufgaben', 'zeige aufgaben', 'todo liste', 'was muss ich'],
command: '!list',
},
{ keywords: ['heute', 'was steht heute an'], command: '!today' },
{ keywords: ['termine', 'kalender', 'meine termine'], command: '!cal' },
{ keywords: ['timer', 'stoppuhr'], command: '!timers' },
@ -97,7 +101,7 @@ export class CommandRouterService {
{
patterns: ['!today', '!heute'],
handler: (ctx) => this.todoHandler.today(ctx),
description: 'Today\'s todos',
description: "Today's todos",
},
{
patterns: ['!inbox'],
@ -124,7 +128,7 @@ export class CommandRouterService {
{
patterns: ['!cal', '!termine'],
handler: (ctx) => this.calendarHandler.today(ctx),
description: 'Today\'s events',
description: "Today's events",
},
{
patterns: ['!week', '!woche'],

View file

@ -1,28 +1,32 @@
import { Injectable, Inject, forwardRef } from '@nestjs/common';
import { ConfigService } from '@nestjs/config';
import {
BaseMatrixService,
MatrixBotConfig,
MatrixRoomEvent,
} from '@manacore/matrix-bot-common';
import { BaseMatrixService, MatrixBotConfig, MatrixRoomEvent } from '@manacore/matrix-bot-common';
import { CommandRouterService, CommandContext } from './command-router.service';
import { VoiceService } from '../voice/voice.service';
import { HELP_TEXT, WELCOME_TEXT, BOT_INTRODUCTION } from '../config/configuration';
@Injectable()
export class MatrixService extends BaseMatrixService {
private voiceEnabled: boolean;
constructor(
configService: ConfigService,
@Inject(forwardRef(() => CommandRouterService))
private commandRouter: CommandRouterService
private commandRouter: CommandRouterService,
@Inject(forwardRef(() => VoiceService))
private voiceService: VoiceService
) {
super(configService);
this.voiceEnabled = configService.get('voice.enabled') !== false;
}
protected getConfig(): MatrixBotConfig {
return {
homeserverUrl: this.configService.get<string>('matrix.homeserverUrl') || 'http://localhost:8008',
homeserverUrl:
this.configService.get<string>('matrix.homeserverUrl') || 'http://localhost:8008',
accessToken: this.configService.get<string>('matrix.accessToken') || '',
storagePath: this.configService.get<string>('matrix.storagePath') || './data/mana-bot-storage.json',
storagePath:
this.configService.get<string>('matrix.storagePath') || './data/mana-bot-storage.json',
allowedRooms: this.configService.get<string[]>('matrix.allowedRooms') || [],
};
}
@ -99,6 +103,81 @@ export class MatrixService extends BaseMatrixService {
}
}
/**
 * Handle voice note messages: download the audio, transcribe it, echo the
 * transcription back to the room and route it like a typed text message.
 *
 * Does nothing when voice processing is disabled or the event carries no
 * media URL. Any failure while downloading, transcribing or routing is
 * logged and answered with a generic error reply in the room.
 *
 * @param roomId Room the audio event was received in.
 * @param event  The audio event; its `content.url` is passed to `downloadMedia`.
 * @param sender User id used as `userId` in the routed command context.
 */
protected async handleAudioMessage(
  roomId: string,
  event: MatrixRoomEvent,
  sender: string
): Promise<void> {
  if (!this.voiceEnabled) {
    return;
  }

  const audioUrl = event.content?.url;
  if (!audioUrl) {
    this.logger.warn('Audio message without URL');
    return;
  }

  // Typing notifications are purely cosmetic. A failure to set or clear them
  // must neither abort the transcription nor (inside the catch below) prevent
  // the error from being logged and reported to the user.
  const setTyping = async (typing: boolean): Promise<void> => {
    try {
      if (typing) {
        await this.client.setTyping(roomId, true, 60000);
      } else {
        await this.client.setTyping(roomId, false);
      }
    } catch (typingError) {
      this.logger.warn(`Failed to update typing indicator: ${typingError}`);
    }
  };

  try {
    await setTyping(true);

    // Download audio from Matrix
    this.logger.debug(`Downloading audio from ${audioUrl}`);
    const audioBuffer = await this.downloadMedia(audioUrl);

    // Transcribe audio
    this.logger.debug(`Transcribing ${audioBuffer.length} bytes`);
    const transcription = await this.voiceService.transcribe(audioBuffer);

    if (!transcription.text || transcription.text.trim() === '') {
      await setTyping(false);
      await this.sendReply(
        roomId,
        event,
        '🎤 Ich konnte leider nichts verstehen. Bitte versuche es noch einmal.'
      );
      return;
    }

    const message = transcription.text.trim();
    this.logger.log(`Transcribed from ${sender}: "${message}"`);

    // Show the user what was understood before acting on it
    await this.sendReply(roomId, event, `🎤 *"${message}"*`);

    // Route the transcription exactly like a typed message, flagged as voice
    const ctx: CommandContext = {
      roomId,
      userId: sender,
      message,
      event,
      isVoice: true,
    };
    const response = await this.commandRouter.route(ctx);

    await setTyping(false);

    if (response) {
      await this.sendReply(roomId, event, response);
    }
  } catch (error) {
    await setTyping(false);
    this.logger.error(`Error handling voice message:`, error);
    await this.sendReply(
      roomId,
      event,
      '❌ Spracherkennung fehlgeschlagen. Bitte versuche es noch einmal.'
    );
  }
}
private async sendWelcomeMessage(roomId: string, userId: string) {
try {
await this.sendMessage(roomId, WELCOME_TEXT);

View file

@ -24,6 +24,13 @@ export default () => ({
storagePath: process.env.CALENDAR_STORAGE_PATH || './data/calendar.json',
},
},
voice: {
sttUrl: process.env.STT_URL || 'http://localhost:3020',
voiceBotUrl: process.env.VOICE_BOT_URL || 'http://localhost:3050',
defaultVoice: process.env.DEFAULT_VOICE || 'de-DE-ConradNeural',
defaultSpeed: parseFloat(process.env.DEFAULT_SPEED) || 1.0,
enabled: process.env.VOICE_ENABLED !== 'false',
},
});
// Help text for the unified bot
@ -57,6 +64,12 @@ Schreib einfach eine Nachricht - ich antworte!
\`!summary\` - Tages-Zusammenfassung (AI)
\`!ai-todo [text]\` - AI extrahiert Todos aus Text
**🎤 Spracheingabe**
Sende eine Sprachnachricht - ich verstehe dich!
Natürliche Befehle: "Was steht heute an?"
Aufgaben: "Neue Aufgabe: Einkaufen gehen"
Timer: "Timer 25 Minuten"
**💡 Tipps**
Natürliche Sprache funktioniert: "Was sind meine Todos?"
Prioritäten: \`!todo Wichtig !p1\`
@ -73,8 +86,9 @@ Ich bin dein persönlicher Assistent mit vielen Funktionen:
📋 Todo-Verwaltung
📅 Kalender
Timer & Alarme
🎤 Spracherkennung
Schreib einfach eine Nachricht oder sag "hilfe" für alle Befehle!`;
Schreib einfach eine Nachricht, sende eine Sprachnachricht, oder sag "hilfe" für alle Befehle!`;
export const BOT_INTRODUCTION = `🤖 **Hallo! Ich bin Mana, euer All-in-One Assistent.**

View file

@ -0,0 +1,8 @@
import { Module } from '@nestjs/common';
import { VoiceService } from './voice.service';
/**
 * Nest module for voice features: registers `VoiceService` as a provider and
 * exports it so that importing modules can inject it.
 */
@Module({
  exports: [VoiceService],
  providers: [VoiceService],
})
export class VoiceModule {}

View file

@ -0,0 +1,210 @@
import { Injectable, Logger } from '@nestjs/common';
import { ConfigService } from '@nestjs/config';
/**
 * Result of a speech-to-text request, as returned by `VoiceService.transcribe`.
 */
export interface TranscriptionResult {
// Recognized text; `transcribe` substitutes '' when the service returns none.
text: string;
// Language reported by the service, falling back to the language that was requested.
language: string;
// Set by `transcribe` to the elapsed wall-clock time of the request in
// milliseconds (this is NOT the length of the audio).
duration?: number;
}
/**
 * Per-user voice settings kept by `VoiceService`.
 */
export interface VoicePreferences {
// Whether voice responses are enabled for the user; defaults to true.
voiceEnabled: boolean;
// Voice identifier sent to the TTS service as the `voice` form field.
voice: string;
// Preferred speed; `VoiceService.setSpeed` clamps values to the range 0.5-2.0.
speed: number;
}
/**
 * Client for the external voice services used by the bot:
 * speech-to-text at `voice.sttUrl` and text-to-speech at `voice.voiceBotUrl`.
 * Also keeps per-user voice preferences (in memory only).
 */
@Injectable()
export class VoiceService {
  private readonly logger = new Logger(VoiceService.name);
  private readonly sttUrl: string;
  private readonly voiceBotUrl: string;
  private readonly defaultVoice: string;
  private readonly defaultSpeed: number;

  /** Per-user overrides keyed by user id. In-memory only, so lost on restart. */
  private readonly userPreferences = new Map<string, VoicePreferences>();

  constructor(private configService: ConfigService) {
    this.sttUrl = this.configService.get<string>('voice.sttUrl') || 'http://localhost:3020';
    this.voiceBotUrl =
      this.configService.get<string>('voice.voiceBotUrl') || 'http://localhost:3050';
    this.defaultVoice =
      this.configService.get<string>('voice.defaultVoice') || 'de-DE-ConradNeural';
    this.defaultSpeed = this.configService.get<number>('voice.defaultSpeed') || 1.0;

    this.logger.log(`Voice Service initialized`);
    this.logger.log(`STT URL: ${this.sttUrl}`);
    this.logger.log(`Voice Bot URL: ${this.voiceBotUrl}`);
  }

  /**
   * Transcribe audio to text using mana-stt (Whisper).
   *
   * The audio is posted as multipart form data (`file`, `language`) to
   * `<sttUrl>/transcribe`.
   *
   * @param audioBuffer Raw audio bytes; uploaded under the file name `audio.ogg`.
   * @param language    Language hint sent to the service (default `'de'`).
   * @returns The recognized text ('' if the service returned none), the language
   *          reported by the service (or the requested one) and the elapsed
   *          request time in milliseconds.
   * @throws Error when the request fails or the service answers with a non-2xx status.
   */
  async transcribe(audioBuffer: Buffer, language = 'de'): Promise<TranscriptionResult> {
    const startTime = Date.now();

    try {
      const formData = new FormData();
      // Convert Buffer to Uint8Array for Blob compatibility
      const uint8Array = new Uint8Array(audioBuffer);
      formData.append('file', new Blob([uint8Array]), 'audio.ogg');
      formData.append('language', language);

      const response = await fetch(`${this.sttUrl}/transcribe`, {
        method: 'POST',
        body: formData,
      });

      if (!response.ok) {
        const error = await response.text();
        throw new Error(`STT error: ${response.status} - ${error}`);
      }

      // The response body is untrusted JSON: only accept string fields so that
      // callers can safely call string methods such as `.trim()` on `text`.
      const result = VoiceService.asRecord(await response.json());
      const text = typeof result.text === 'string' ? result.text : '';
      const detectedLanguage =
        typeof result.language === 'string' && result.language !== ''
          ? result.language
          : language;
      const duration = Date.now() - startTime;

      // Only mark the preview as truncated when it actually was truncated.
      const preview = text.length > 50 ? `${text.substring(0, 50)}...` : text;
      this.logger.debug(`Transcribed in ${duration}ms: "${preview}"`);

      return {
        text,
        language: detectedLanguage,
        duration,
      };
    } catch (error) {
      this.logger.error(`Transcription failed: ${error}`);
      throw error;
    }
  }

  /**
   * Synthesize speech from text using mana-voice-bot (Edge TTS).
   *
   * Posts `text` and the user's preferred `voice` as form data to
   * `<voiceBotUrl>/tts` and returns the response body.
   *
   * @param text   Text to speak.
   * @param userId Optional user whose voice preference is applied; defaults are
   *               used when omitted or when the user has no stored preferences.
   * @returns The response body bytes.
   * @throws Error when the request fails or the service answers with a non-2xx status.
   */
  async synthesize(text: string, userId?: string): Promise<Buffer> {
    const prefs = this.getUserPreferences(userId);
    const startTime = Date.now();

    try {
      const formData = new FormData();
      formData.append('text', text);
      formData.append('voice', prefs.voice);
      // NOTE(review): `prefs.speed` is stored and clamped but never sent to the
      // TTS service - confirm the field name the service expects and forward it.

      const response = await fetch(`${this.voiceBotUrl}/tts`, {
        method: 'POST',
        body: formData,
      });

      if (!response.ok) {
        const error = await response.text();
        throw new Error(`TTS error: ${response.status} - ${error}`);
      }

      const arrayBuffer = await response.arrayBuffer();
      const buffer = Buffer.from(arrayBuffer);
      const duration = Date.now() - startTime;

      this.logger.debug(`Synthesized ${buffer.length} bytes in ${duration}ms`);

      return buffer;
    } catch (error) {
      this.logger.error(`Synthesis failed: ${error}`);
      throw error;
    }
  }

  /**
   * Get available TTS voices from `<voiceBotUrl>/voices`.
   *
   * Best-effort: any failure is logged and an empty object is returned.
   *
   * @returns The `voices` object of the response, or `{}` when unavailable.
   */
  async getVoices(): Promise<Record<string, string>> {
    try {
      const response = await fetch(`${this.voiceBotUrl}/voices`);
      if (!response.ok) {
        throw new Error(`Failed to get voices: ${response.status}`);
      }
      const data = VoiceService.asRecord(await response.json());
      const voices = data.voices;
      return typeof voices === 'object' && voices !== null
        ? (voices as Record<string, string>)
        : {};
    } catch (error) {
      this.logger.error(`Failed to get voices: ${error}`);
      return {};
    }
  }

  /**
   * Check if voice services are available.
   *
   * Both `/health` endpoints are probed in parallel, each with a 5 second
   * timeout, so the check takes at most about 5 seconds in total.
   *
   * @returns Whether each service answered its health check with a 2xx status.
   */
  async checkHealth(): Promise<{ stt: boolean; tts: boolean }> {
    const [stt, tts] = await Promise.all([
      this.isHealthy(`${this.sttUrl}/health`),
      this.isHealthy(`${this.voiceBotUrl}/health`),
    ]);
    return { stt, tts };
  }

  /**
   * Get user voice preferences.
   *
   * @param userId User to look up; omit (or pass '') to get the defaults.
   * @returns A copy of the stored preferences, or the configured defaults.
   *          Always a fresh object: mutate via the setters, not the return value.
   */
  getUserPreferences(userId?: string): VoicePreferences {
    const stored = userId ? this.userPreferences.get(userId) : undefined;
    return stored ? { ...stored } : this.defaultPreferences();
  }

  /**
   * Update user voice preferences by merging `prefs` over the current values.
   */
  setUserPreferences(userId: string, prefs: Partial<VoicePreferences>): void {
    const current = this.getUserPreferences(userId);
    this.userPreferences.set(userId, { ...current, ...prefs });
  }

  /**
   * Enable/disable voice responses for user
   */
  setVoiceEnabled(userId: string, enabled: boolean): void {
    this.setUserPreferences(userId, { voiceEnabled: enabled });
  }

  /**
   * Set user's preferred voice
   */
  setVoice(userId: string, voice: string): void {
    this.setUserPreferences(userId, { voice });
  }

  /**
   * Set user's preferred speed, clamped to the range 0.5-2.0.
   *
   * `NaN` (e.g. from parsing invalid user input) would pass through
   * `Math.min`/`Math.max` unchanged, so it is replaced by the default speed.
   */
  setSpeed(userId: string, speed: number): void {
    const requested = Number.isNaN(speed) ? this.defaultSpeed : speed;
    const clampedSpeed = Math.max(0.5, Math.min(2.0, requested));
    this.setUserPreferences(userId, { speed: clampedSpeed });
  }

  /** Build a fresh preferences object from the configured defaults. */
  private defaultPreferences(): VoicePreferences {
    return {
      voiceEnabled: true,
      voice: this.defaultVoice,
      speed: this.defaultSpeed,
    };
  }

  /** Probe a health endpoint; any error or timeout (5 s) counts as unhealthy. */
  private async isHealthy(url: string): Promise<boolean> {
    try {
      const response = await fetch(url, { signal: AbortSignal.timeout(5000) });
      return response.ok;
    } catch {
      return false;
    }
  }

  /** View parsed JSON as a plain record; non-objects become an empty record. */
  private static asRecord(value: unknown): Record<string, unknown> {
    return typeof value === 'object' && value !== null
      ? (value as Record<string, unknown>)
      : {};
  }
}