feat(matrix-bots): add voice note transcription via mana-stt

- Switch matrix-project-doc-bot from OpenAI Whisper to local mana-stt
- Add voice note support to matrix-nutriphi-bot (auto-analyze meals)
- Add voice note support to matrix-todo-bot (create todos via voice)
- All bots now use STT_URL config for the mana-stt service (port 3020)
This commit is contained in:
Till-JS 2026-01-28 16:10:20 +01:00
parent 57b9d4cb37
commit 3b9d99ccd9
15 changed files with 339 additions and 43 deletions

View file

@ -12,6 +12,9 @@ MATRIX_STORAGE_PATH=./data/bot-storage.json
NUTRIPHI_BACKEND_URL=http://localhost:3023
NUTRIPHI_API_PREFIX=/api/v1
# Speech-to-Text (mana-stt service)
STT_URL=http://localhost:3020
# Mana Core Auth
MANA_CORE_AUTH_URL=http://localhost:3001

View file

@ -2,9 +2,10 @@ import { Module } from '@nestjs/common';
import { MatrixService } from './matrix.service';
import { NutriPhiModule } from '../nutriphi/nutriphi.module';
import { SessionModule } from '../session/session.module';
import { TranscriptionModule } from '../transcription/transcription.module';
@Module({
imports: [NutriPhiModule, SessionModule],
imports: [NutriPhiModule, SessionModule, TranscriptionModule],
providers: [MatrixService],
exports: [MatrixService],
})

View file

@ -14,6 +14,7 @@ import {
WeeklyStats,
} from '../nutriphi/nutriphi.service';
import { SessionService } from '../session/session.service';
import { TranscriptionService } from '../transcription/transcription.service';
import { HELP_MESSAGE, MEAL_TYPE_LABELS } from '../config/configuration';
// Natural language keywords that trigger commands (German + English)
@ -37,7 +38,8 @@ export class MatrixService implements OnModuleInit, OnModuleDestroy {
constructor(
private configService: ConfigService,
private nutriphiService: NutriPhiService,
private sessionService: SessionService
private sessionService: SessionService,
private transcriptionService: TranscriptionService
) {
this.allowedRooms = this.configService.get<string[]>('matrix.allowedRooms') || [];
}
@ -129,7 +131,7 @@ Sag "hilfe" fur alle Befehle!`;
msgtype?: string;
body?: string;
url?: string;
info?: { mimetype?: string };
info?: { mimetype?: string; duration?: number };
};
// Handle image messages
@ -147,6 +149,12 @@ Sag "hilfe" fur alle Befehle!`;
return;
}
// Handle audio/voice messages
if (content.msgtype === 'm.audio' && content.url) {
await this.handleAudioMessage(roomId, event.sender, content);
return;
}
// Only handle text messages
if (content.msgtype !== 'm.text') return;
@ -661,6 +669,63 @@ ${!isLoggedIn ? 'Nutze `!login email passwort` um dich anzumelden.' : ''}`;
}
}
/**
 * Processes a voice note sent to the bot: downloads the audio referenced by
 * `content.url`, transcribes it through the transcription service and passes
 * the transcript to the NutriPhi text analysis, replying in the room at each step.
 *
 * Users without a stored session token are only told to log in; nothing is
 * downloaded for them. Any failure after the typing indicator was switched on
 * is logged and reported back to the room as an error message.
 *
 * @param roomId  Room the voice note arrived in; all replies go to this room.
 * @param sender  Matrix user id used to look up the session token.
 * @param content Event content; `url` is expected to be set by the caller.
 */
private async handleAudioMessage(
  roomId: string,
  sender: string,
  content: { url?: string; info?: { mimetype?: string; duration?: number } }
) {
  const authToken = this.sessionService.getToken(sender);
  if (!authToken) {
    await this.sendMessage(
      roomId,
      `Du bist nicht angemeldet. Nutze \`!login email passwort\` um dich anzumelden.`
    );
    return;
  }

  await this.sendMessage(roomId, 'Verarbeite Sprachnotiz...');
  await this.client.setTyping(roomId, true, 60000);

  try {
    // Resolve the mxc:// reference to a plain HTTP URL and fetch the raw audio.
    const downloadUrl = this.client.mxcToHttp(content.url!);
    this.logger.log(`Downloading audio from ${downloadUrl}`);

    const download = await fetch(downloadUrl);
    if (!download.ok) {
      throw new Error(`Failed to download audio: ${download.status}`);
    }
    const audioBytes = await download.arrayBuffer();

    // Speech-to-text
    const spokenText = await this.transcriptionService.transcribe(Buffer.from(audioBytes));
    this.logger.log(`Transcription: ${spokenText.substring(0, 50)}...`);

    if (spokenText.trim().length === 0) {
      await this.client.setTyping(roomId, false);
      await this.sendMessage(roomId, 'Konnte keine Sprache erkennen. Bitte versuche es erneut.');
      return;
    }

    // Echo the transcript, then treat it like a typed meal description.
    await this.sendMessage(roomId, `Transkription: "${spokenText}"\n\nAnalysiere...`);
    const analysis = await this.nutriphiService.analyzeText(spokenText, authToken);

    await this.client.setTyping(roomId, false);
    await this.sendMessage(roomId, this.formatAnalysisResult(analysis));
  } catch (error) {
    await this.client.setTyping(roomId, false);
    this.logger.error('Audio processing failed:', error);

    let reason = 'Unbekannter Fehler';
    if (error instanceof Error) {
      reason = error.message;
    }
    await this.sendMessage(roomId, `Fehler bei der Verarbeitung: ${reason}`);
  }
}
private async downloadMatrixImage(mxcUrl: string): Promise<string> {
const httpUrl = this.client.mxcToHttp(mxcUrl);
this.logger.log(`Downloading image from ${httpUrl}`);

View file

@ -15,6 +15,9 @@ export default () => ({
devBypass: process.env.DEV_BYPASS_AUTH === 'true',
devUserId: process.env.DEV_USER_ID || '',
},
stt: {
url: process.env.STT_URL || 'http://localhost:3020',
},
});
export const HELP_MESSAGE = `**NutriPhi Bot - KI-Ernahrungsassistent**
@ -22,7 +25,7 @@ export const HELP_MESSAGE = `**NutriPhi Bot - KI-Ernahrungsassistent**
**Befehle:**
- \`!help\` - Diese Hilfe anzeigen
- \`!login email passwort\` - Bei NutriPhi anmelden
- \`!analyze [beschreibung]\` - Foto/Text analysieren
- \`!analyze [beschreibung]\` - Foto/Text/Sprache analysieren
- \`!today\` / \`heute\` - Tages-Zusammenfassung
- \`!week\` / \`woche\` - Wochen-Statistik
- \`!goals\` / \`ziele\` - Aktuelle Ziele
@ -31,9 +34,10 @@ export const HELP_MESSAGE = `**NutriPhi Bot - KI-Ernahrungsassistent**
- \`!tips\` / \`tipps\` - KI-Empfehlungen
- \`!status\` - Bot-Status
**Bild-Analyse:**
1. Sende ein Foto deiner Mahlzeit
2. Dann: \`!analyze\` oder \`!analyze Spaghetti mit Sauce\`
**Mahlzeit erfassen:**
- Foto senden + \`!analyze\`
- Sprachnotiz senden (wird automatisch transkribiert & analysiert)
- \`!analyze Spaghetti mit Sauce\` (Textbeschreibung)
**Beispiele:**
- "heute" - Zeigt Tages-Ubersicht

View file

@ -0,0 +1,8 @@
import { Module } from '@nestjs/common';
import { TranscriptionService } from './transcription.service';
/**
 * Nest module that registers `TranscriptionService` as a provider and exports
 * it, so modules importing `TranscriptionModule` can inject the service.
 */
@Module({
  exports: [TranscriptionService],
  providers: [TranscriptionService],
})
export class TranscriptionModule {}

View file

@ -0,0 +1,54 @@
import { Injectable, Logger } from '@nestjs/common';
import { ConfigService } from '@nestjs/config';
/** JSON body this client reads from the STT service's `/transcribe` response. */
interface SttResponse {
  text: string;
  language?: string;
  model?: string;
}

/** Runtime check that a parsed JSON value carries a string `text` field. */
function isSttResponse(value: unknown): value is SttResponse {
  return (
    typeof value === 'object' &&
    value !== null &&
    typeof (value as { text?: unknown }).text === 'string'
  );
}

/**
 * HTTP client for the speech-to-text service configured under `stt.url`
 * (falls back to `http://localhost:3020` when the setting is empty).
 */
@Injectable()
export class TranscriptionService {
  private readonly logger = new Logger(TranscriptionService.name);
  private readonly sttUrl: string;

  constructor(private configService: ConfigService) {
    const configuredUrl = this.configService.get<string>('stt.url') || 'http://localhost:3020';
    // Drop trailing slashes so "http://host:3020/" does not produce "//transcribe".
    this.sttUrl = configuredUrl.replace(/\/+$/, '');
    this.logger.log(`STT Service URL: ${this.sttUrl}`);
  }

  /**
   * Uploads the audio as multipart form data (`file`, `language`) to
   * `<sttUrl>/transcribe` and returns the `text` field of the JSON reply.
   *
   * @param audioBuffer Raw audio bytes; sent as `audio.ogg` with type `audio/ogg`.
   * @param language    Value of the `language` form field (default `'de'`).
   * @returns The transcript text reported by the service.
   * @throws Error when the request fails, the service answers with a non-OK
   *         status, or the reply has no string `text` field. Every failure is
   *         logged before being rethrown.
   */
  async transcribe(audioBuffer: Buffer, language: string = 'de'): Promise<string> {
    const formData = new FormData();
    const blob = new Blob([new Uint8Array(audioBuffer)], { type: 'audio/ogg' });
    formData.append('file', blob, 'audio.ogg');
    formData.append('language', language);

    try {
      const response = await fetch(`${this.sttUrl}/transcribe`, {
        method: 'POST',
        body: formData,
      });

      if (!response.ok) {
        const errorText = await response.text();
        throw new Error(`STT service error: ${response.status} - ${errorText}`);
      }

      // The body comes from another service: validate it instead of trusting
      // the declared shape, otherwise a missing `text` surfaces as a TypeError.
      const payload: unknown = await response.json();
      if (!isSttResponse(payload)) {
        throw new Error('STT service returned a response without a "text" string');
      }

      this.logger.log(`Transcription completed: ${payload.text.substring(0, 50)}...`);
      return payload.text;
    } catch (error) {
      this.logger.error('Transcription failed:', error);
      throw error;
    }
  }

  /**
   * Probes `<sttUrl>/health`.
   *
   * @returns `true` when the response status is OK; `false` for a non-OK
   *          status or when the request itself throws (deliberately swallowed).
   */
  async checkHealth(): Promise<boolean> {
    try {
      const response = await fetch(`${this.sttUrl}/health`);
      return response.ok;
    } catch {
      return false;
    }
  }
}

View file

@ -17,7 +17,9 @@ S3_ACCESS_KEY=minioadmin
S3_SECRET_KEY=minioadmin
S3_BUCKET=project-doc-bot
# OpenAI
# Speech-to-Text (mana-stt service)
STT_URL=http://localhost:3020
# OpenAI (for blog generation)
OPENAI_API_KEY=
OPENAI_MODEL=gpt-4o-mini
OPENAI_WHISPER_MODEL=whisper-1

View file

@ -16,10 +16,12 @@ export default () => ({
secretKey: process.env.S3_SECRET_KEY || 'minioadmin',
bucket: process.env.S3_BUCKET || 'project-doc-bot',
},
stt: {
url: process.env.STT_URL || 'http://localhost:3020',
},
openai: {
apiKey: process.env.OPENAI_API_KEY || '',
model: process.env.OPENAI_MODEL || 'gpt-4o-mini',
whisperModel: process.env.OPENAI_WHISPER_MODEL || 'whisper-1',
},
});

View file

@ -1,40 +1,47 @@
import { Injectable, Logger } from '@nestjs/common';
import { ConfigService } from '@nestjs/config';
import OpenAI from 'openai';
import { Readable } from 'stream';
interface SttResponse {
text: string;
language?: string;
model?: string;
}
@Injectable()
export class TranscriptionService {
private readonly logger = new Logger(TranscriptionService.name);
private readonly openai: OpenAI;
private readonly model: string;
private readonly sttUrl: string;
constructor(private configService: ConfigService) {
const apiKey = this.configService.get<string>('openai.apiKey');
if (!apiKey) {
this.logger.warn('OPENAI_API_KEY not configured - transcription disabled');
}
this.openai = new OpenAI({ apiKey });
this.model = this.configService.get<string>('openai.whisperModel') || 'whisper-1';
this.sttUrl = this.configService.get<string>('stt.url') || 'http://localhost:3020';
this.logger.log(`STT Service URL: ${this.sttUrl}`);
}
async transcribe(audioBuffer: Buffer): Promise<string> {
const apiKey = this.configService.get<string>('openai.apiKey');
if (!apiKey) {
throw new Error('OpenAI API key not configured');
async transcribe(audioBuffer: Buffer, language: string = 'de'): Promise<string> {
const formData = new FormData();
const blob = new Blob([new Uint8Array(audioBuffer)], { type: 'audio/ogg' });
formData.append('file', blob, 'audio.ogg');
formData.append('language', language);
try {
const response = await fetch(`${this.sttUrl}/transcribe`, {
method: 'POST',
body: formData,
});
if (!response.ok) {
const errorText = await response.text();
throw new Error(`STT service error: ${response.status} - ${errorText}`);
}
const result: SttResponse = await response.json();
this.logger.log(
`Transcription completed (${result.model || 'whisper'}): ${result.text.substring(0, 50)}...`
);
return result.text;
} catch (error) {
this.logger.error('Transcription failed:', error);
throw error;
}
// Create a File-like object for the API
const file = new File([new Uint8Array(audioBuffer)], 'audio.ogg', { type: 'audio/ogg' });
const response = await this.openai.audio.transcriptions.create({
file,
model: this.model,
language: 'de',
});
return response.text;
}
}

View file

@ -0,0 +1,15 @@
# Server
PORT=3314
# Matrix
MATRIX_HOMESERVER_URL=http://localhost:8008
MATRIX_ACCESS_TOKEN=syt_xxx_your_bot_token
MATRIX_ALLOWED_ROOMS=#todo:matrix.mana.how
MATRIX_STORAGE_PATH=./data/bot-storage.json
# Todo API (optional, for external todo service)
TODO_API_URL=http://localhost:3010/api/v1
TODO_SERVICE_KEY=
# Speech-to-Text (mana-stt service)
STT_URL=http://localhost:3020

View file

@ -1,9 +1,10 @@
import { Module } from '@nestjs/common';
import { MatrixService } from './matrix.service';
import { TodoModule } from '../todo/todo.module';
import { TranscriptionModule } from '../transcription/transcription.module';
@Module({
imports: [TodoModule],
imports: [TodoModule, TranscriptionModule],
providers: [MatrixService],
exports: [MatrixService],
})

View file

@ -9,6 +9,7 @@ import {
import * as path from 'path';
import * as fs from 'fs';
import { TodoService, Task } from '../todo/todo.service';
import { TranscriptionService } from '../transcription/transcription.service';
import { HELP_TEXT, WELCOME_TEXT, BOT_INTRODUCTION } from '../config/configuration';
// Natural language keywords that trigger commands (German + English)
@ -35,7 +36,8 @@ export class MatrixService implements OnModuleInit, OnModuleDestroy {
constructor(
private configService: ConfigService,
private todoService: TodoService
private todoService: TodoService,
private transcriptionService: TranscriptionService
) {
this.homeserverUrl = this.configService.get<string>(
'matrix.homeserverUrl',
@ -141,14 +143,21 @@ export class MatrixService implements OnModuleInit, OnModuleDestroy {
return;
}
const userId = event.sender;
const msgtype = event.content?.msgtype;
// Handle audio/voice messages
if (msgtype === 'm.audio' && event.content?.url) {
await this.handleAudioMessage(roomId, event, userId);
return;
}
// Only handle text messages
if (event.content?.msgtype !== 'm.text') return;
if (msgtype !== 'm.text') return;
const body = event.content.body?.trim();
if (!body) return;
const userId = event.sender;
try {
// Check for natural language keywords first
const keywordCommand = this.detectKeywordCommand(body);
@ -546,6 +555,64 @@ Bot: ✅ Online`;
}
}
/**
 * Turns a voice note into a todo: downloads the audio referenced by
 * `event.content.url`, transcribes it, parses the transcript with
 * `todoService.parseTaskInput` and creates the task for `userId`.
 *
 * Progress, the result and any error are sent as replies to the original event.
 *
 * @param roomId Room the voice note arrived in.
 * @param event  The Matrix event being answered; its `content.url` is read.
 * @param userId Owner passed to `todoService.createTask`.
 */
private async handleAudioMessage(roomId: string, event: any, userId: string) {
  try {
    await this.sendReply(roomId, event, 'Verarbeite Sprachnotiz...');

    // Resolve the mxc:// reference to a plain HTTP URL and fetch the raw audio.
    const downloadUrl = this.client.mxcToHttp(event.content.url);
    this.logger.log(`Downloading audio from ${downloadUrl}`);

    const download = await fetch(downloadUrl);
    if (!download.ok) {
      throw new Error(`Failed to download audio: ${download.status}`);
    }
    const audioBytes = await download.arrayBuffer();

    // Speech-to-text
    const spokenText = await this.transcriptionService.transcribe(Buffer.from(audioBytes));
    this.logger.log(`Transcription: ${spokenText.substring(0, 50)}...`);

    if (spokenText.trim().length === 0) {
      await this.sendReply(
        roomId,
        event,
        'Konnte keine Sprache erkennen. Bitte versuche es erneut.'
      );
      return;
    }

    // The transcript goes through the same parser as typed task input.
    const parsed = this.todoService.parseTaskInput(spokenText);
    const created = await this.todoService.createTask(userId, parsed.title, {
      priority: parsed.priority,
      dueDate: parsed.dueDate,
      project: parsed.project,
    });

    // Optional detail line: only the attributes that were actually recognised.
    const detailParts = [
      parsed.priority < 4 ? `Prioritat ${parsed.priority}` : null,
      parsed.dueDate ? `Datum: ${this.formatDate(parsed.dueDate)}` : null,
      parsed.project ? `Projekt: ${parsed.project}` : null,
    ].filter((part): part is string => part !== null);

    const headline = `Transkription: "${spokenText}"\n\n✅ Aufgabe erstellt: **${created.title}**`;
    const detailLine = detailParts.length > 0 ? `\n${detailParts.join(' | ')}` : '';

    await this.sendReply(roomId, event, headline + detailLine);
  } catch (error) {
    this.logger.error('Audio processing failed:', error);

    let reason = 'Unbekannter Fehler';
    if (error instanceof Error) {
      reason = error.message;
    }
    await this.sendReply(roomId, event, `Fehler bei der Verarbeitung: ${reason}`);
  }
}
private markdownToHtml(text: string): string {
return text
.replace(/\*\*(.+?)\*\*/g, '<strong>$1</strong>')

View file

@ -10,12 +10,16 @@ export default () => ({
apiUrl: process.env.TODO_API_URL || 'http://localhost:3010/api/v1',
serviceKey: process.env.TODO_SERVICE_KEY || '',
},
stt: {
url: process.env.STT_URL || 'http://localhost:3020',
},
});
export const HELP_TEXT = `🎯 **Todo Bot - Hilfe**
**Aufgaben verwalten:**
\`!add [Aufgabe]\` - Neue Aufgabe hinzufügen
Sprachnotiz senden - Aufgabe per Sprache erstellen
\`!list\` oder \`!heute\` - Heutige Aufgaben anzeigen
\`!inbox\` - Aufgaben ohne Datum anzeigen
\`!done [Nr]\` - Aufgabe als erledigt markieren
@ -34,7 +38,8 @@ export const HELP_TEXT = `🎯 **Todo Bot - Hilfe**
\`!help\` oder \`hilfe\` - Diese Hilfe anzeigen
**Natürliche Sprache:**
Du kannst auch einfach "hilfe", "zeige aufgaben", "was muss ich heute machen?" schreiben.`;
Du kannst auch einfach "hilfe", "zeige aufgaben", "was muss ich heute machen?" schreiben.
Oder sende eine Sprachnotiz mit deiner Aufgabe!`;
export const WELCOME_TEXT = `👋 **Willkommen beim Todo Bot!**

View file

@ -0,0 +1,8 @@
import { Module } from '@nestjs/common';
import { TranscriptionService } from './transcription.service';
/**
 * Nest module that registers `TranscriptionService` as a provider and exports
 * it, so modules importing `TranscriptionModule` can inject the service.
 */
@Module({
  exports: [TranscriptionService],
  providers: [TranscriptionService],
})
export class TranscriptionModule {}

View file

@ -0,0 +1,54 @@
import { Injectable, Logger } from '@nestjs/common';
import { ConfigService } from '@nestjs/config';
/** JSON body this client reads from the STT service's `/transcribe` response. */
interface SttResponse {
  text: string;
  language?: string;
  model?: string;
}

/** Runtime check that a parsed JSON value carries a string `text` field. */
function isSttResponse(value: unknown): value is SttResponse {
  return (
    typeof value === 'object' &&
    value !== null &&
    typeof (value as { text?: unknown }).text === 'string'
  );
}

/**
 * HTTP client for the speech-to-text service configured under `stt.url`
 * (falls back to `http://localhost:3020` when the setting is empty).
 */
@Injectable()
export class TranscriptionService {
  private readonly logger = new Logger(TranscriptionService.name);
  private readonly sttUrl: string;

  constructor(private configService: ConfigService) {
    const configuredUrl = this.configService.get<string>('stt.url') || 'http://localhost:3020';
    // Drop trailing slashes so "http://host:3020/" does not produce "//transcribe".
    this.sttUrl = configuredUrl.replace(/\/+$/, '');
    this.logger.log(`STT Service URL: ${this.sttUrl}`);
  }

  /**
   * Uploads the audio as multipart form data (`file`, `language`) to
   * `<sttUrl>/transcribe` and returns the `text` field of the JSON reply.
   *
   * @param audioBuffer Raw audio bytes; sent as `audio.ogg` with type `audio/ogg`.
   * @param language    Value of the `language` form field (default `'de'`).
   * @returns The transcript text reported by the service.
   * @throws Error when the request fails, the service answers with a non-OK
   *         status, or the reply has no string `text` field. Every failure is
   *         logged before being rethrown.
   */
  async transcribe(audioBuffer: Buffer, language: string = 'de'): Promise<string> {
    const formData = new FormData();
    const blob = new Blob([new Uint8Array(audioBuffer)], { type: 'audio/ogg' });
    formData.append('file', blob, 'audio.ogg');
    formData.append('language', language);

    try {
      const response = await fetch(`${this.sttUrl}/transcribe`, {
        method: 'POST',
        body: formData,
      });

      if (!response.ok) {
        const errorText = await response.text();
        throw new Error(`STT service error: ${response.status} - ${errorText}`);
      }

      // The body comes from another service: validate it instead of trusting
      // the declared shape, otherwise a missing `text` surfaces as a TypeError.
      const payload: unknown = await response.json();
      if (!isSttResponse(payload)) {
        throw new Error('STT service returned a response without a "text" string');
      }

      this.logger.log(`Transcription completed: ${payload.text.substring(0, 50)}...`);
      return payload.text;
    } catch (error) {
      this.logger.error('Transcription failed:', error);
      throw error;
    }
  }

  /**
   * Probes `<sttUrl>/health`.
   *
   * @returns `true` when the response status is OK; `false` for a non-OK
   *          status or when the request itself throws (deliberately swallowed).
   */
  async checkHealth(): Promise<boolean> {
    try {
      const response = await fetch(`${this.sttUrl}/health`);
      return response.ok;
    } catch {
      return false;
    }
  }
}