From 02bd9d3117bdb7d75be861e22d1a6b0a739998ab Mon Sep 17 00:00:00 2001
From: Till JS
Date: Fri, 27 Mar 2026 21:59:46 +0100
Subject: [PATCH] feat(apps): integrate GPU services into Picture and Chat apps
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Picture App:
- Update LocalImageGenService to use GPU server (gpu-img.mana.how)
- Add API key authentication (GPU_API_KEY)
- Increase timeout to 120s (the first request may need to load the model into VRAM)

Chat App:
- Add VoiceModule with STT/TTS integration via GPU server
- POST /api/v1/voice/transcribe — Upload audio, get text + word timestamps
- POST /api/v1/voice/synthesize — Send text, get audio response
- GET /api/v1/voice/health — Check GPU voice services availability
- Supports speaker diarization and language selection

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 apps/chat/apps/backend/src/app.module.ts      |   2 +
 .../backend/src/voice/voice.controller.ts     |  91 ++++++++++
 .../apps/backend/src/voice/voice.module.ts    |  11 ++
 .../apps/backend/src/voice/voice.service.ts   | 167 ++++++++++++++++++
 .../src/generate/local-image-gen.service.ts   |  12 +-
 5 files changed, 280 insertions(+), 3 deletions(-)
 create mode 100644 apps/chat/apps/backend/src/voice/voice.controller.ts
 create mode 100644 apps/chat/apps/backend/src/voice/voice.module.ts
 create mode 100644 apps/chat/apps/backend/src/voice/voice.service.ts

diff --git a/apps/chat/apps/backend/src/app.module.ts b/apps/chat/apps/backend/src/app.module.ts
index 8ec99fd0b..4fc687630 100644
--- a/apps/chat/apps/backend/src/app.module.ts
+++ b/apps/chat/apps/backend/src/app.module.ts
@@ -12,6 +12,7 @@ import { SpaceModule } from './space/space.module';
 import { DocumentModule } from './document/document.module';
 import { ModelModule } from './model/model.module';
 import { AdminModule } from './admin/admin.module';
+import { VoiceModule } from './voice/voice.module';
 import { HealthModule } from '@manacore/shared-nestjs-health';
 
 @Module({
@@ -51,6 +52,7 @@ import { HealthModule } from '@manacore/shared-nestjs-health';
 		DocumentModule,
 		ModelModule,
 		AdminModule,
+		VoiceModule,
 		HealthModule.forRoot({ serviceName: 'chat-backend' }),
 	],
 })
diff --git a/apps/chat/apps/backend/src/voice/voice.controller.ts b/apps/chat/apps/backend/src/voice/voice.controller.ts
new file mode 100644
index 000000000..6a8af64a5
--- /dev/null
+++ b/apps/chat/apps/backend/src/voice/voice.controller.ts
@@ -0,0 +1,91 @@
+import {
+	BadRequestException,
+	Controller,
+	Post,
+	Body,
+	Get,
+	UseGuards,
+	UseInterceptors,
+	UploadedFile,
+	Res,
+	Query,
+} from '@nestjs/common';
+import { FileInterceptor } from '@nestjs/platform-express';
+import type { Response } from 'express';
+import { JwtAuthGuard } from '@manacore/shared-nestjs-auth';
+import { VoiceService } from './voice.service';
+
+/**
+ * HTTP endpoints for speech-to-text and text-to-speech, backed by VoiceService.
+ * Every route is protected by JwtAuthGuard.
+ */
+@Controller('voice')
+@UseGuards(JwtAuthGuard)
+export class VoiceController {
+	constructor(private readonly voiceService: VoiceService) {}
+
+	/** Check GPU voice services availability. */
+	@Get('health')
+	async health() {
+		return this.voiceService.healthCheck();
+	}
+
+	/**
+	 * Transcribe audio to text.
+	 * POST /api/v1/voice/transcribe
+	 *
+	 * Body: multipart/form-data with "file" field
+	 * Query: ?language=de&diarize=true
+	 *
+	 * @throws BadRequestException (HTTP 400) when no "file" part was uploaded.
+	 */
+	@Post('transcribe')
+	@UseInterceptors(FileInterceptor('file'))
+	async transcribe(
+		@UploadedFile() file: Express.Multer.File | undefined,
+		@Query('language') language?: string,
+		@Query('diarize') diarize?: string
+	) {
+		if (!file) {
+			// Answer with a real 400 rather than a success status carrying an error payload.
+			throw new BadRequestException('No audio file provided');
+		}
+
+		return this.voiceService.transcribe(file.buffer, file.originalname, {
+			language: language || 'de',
+			diarize: diarize === 'true',
+		});
+	}
+
+	/**
+	 * Synthesize text to speech.
+	 * POST /api/v1/voice/synthesize
+	 *
+	 * Returns audio file directly.
+	 *
+	 * @throws BadRequestException (HTTP 400) when "text" is missing or blank.
+	 */
+	@Post('synthesize')
+	async synthesize(
+		@Body() body: { text: string; voice?: string; speed?: number; format?: 'wav' | 'mp3' },
+		@Res() res: Response
+	) {
+		// The declared body type is not enforced at runtime, so check "text" explicitly.
+		if (typeof body?.text !== 'string' || body.text.trim() === '') {
+			throw new BadRequestException('No text provided');
+		}
+
+		const result = await this.voiceService.synthesize(body.text, {
+			voice: body.voice,
+			speed: body.speed,
+			format: body.format,
+		});
+
+		res.set({
+			'Content-Type': result.contentType,
+			'Content-Length': result.audio.length.toString(),
+			'X-Duration': result.duration.toString(),
+		});
+		res.send(result.audio);
+	}
+}
diff --git a/apps/chat/apps/backend/src/voice/voice.module.ts b/apps/chat/apps/backend/src/voice/voice.module.ts
new file mode 100644
index 000000000..08527e9e0
--- /dev/null
+++ b/apps/chat/apps/backend/src/voice/voice.module.ts
@@ -0,0 +1,11 @@
+import { Module } from '@nestjs/common';
+import { VoiceController } from './voice.controller';
+import { VoiceService } from './voice.service';
+
+/** Registers the voice HTTP endpoints and exports VoiceService to importing modules. */
+@Module({
+	controllers: [VoiceController],
+	providers: [VoiceService],
+	exports: [VoiceService],
+})
+export class VoiceModule {}
diff --git a/apps/chat/apps/backend/src/voice/voice.service.ts b/apps/chat/apps/backend/src/voice/voice.service.ts
new file mode 100644
index 000000000..2c3230d53
--- /dev/null
+++ b/apps/chat/apps/backend/src/voice/voice.service.ts
@@ -0,0 +1,167 @@
+import { Injectable, Logger } from '@nestjs/common';
+import { ConfigService } from '@nestjs/config';
+
+/**
+ * Voice service for speech-to-text and text-to-speech
+ * using the GPU server's mana-stt and mana-tts services.
+ *
+ * Endpoints and credentials are read from GPU_STT_URL, GPU_TTS_URL and GPU_API_KEY;
+ * the two URLs default to the gpu-stt / gpu-tts hosts on mana.how when unset.
+ */
+@Injectable()
+export class VoiceService {
+	private readonly logger = new Logger(VoiceService.name);
+	private readonly sttUrl: string;
+	private readonly ttsUrl: string;
+	private readonly apiKey: string;
+	private readonly timeout: number;
+
+	constructor(private configService: ConfigService) {
+		this.sttUrl = this.configService.get<string>('GPU_STT_URL') || 'https://gpu-stt.mana.how';
+		this.ttsUrl = this.configService.get<string>('GPU_TTS_URL') || 'https://gpu-tts.mana.how';
+		this.apiKey = this.configService.get<string>('GPU_API_KEY') || '';
+		this.timeout = 60_000; // 60s per STT/TTS request, including reading the response body
+	}
+
+	/** Auth headers for GPU requests; empty when no API key is configured. */
+	private authHeaders(): Record<string, string> {
+		const headers: Record<string, string> = {};
+		if (this.apiKey) headers['X-API-Key'] = this.apiKey;
+		return headers;
+	}
+
+	/**
+	 * Transcribe audio to text using WhisperX on the GPU server.
+	 * Supports word-level timestamps and speaker diarization.
+	 *
+	 * @param audioBuffer Raw audio bytes, uploaded as the multipart "file" field.
+	 * @param filename File name reported to the STT service.
+	 * @param options Optional language hint and diarization flag (default: off).
+	 * @returns The STT service's JSON response, passed through without validation.
+	 * @throws Error when the service answers with a non-2xx status; the fetch
+	 * rejects if the request is aborted after the timeout.
+	 */
+	async transcribe(
+		audioBuffer: Buffer,
+		filename: string,
+		options: {
+			language?: string;
+			diarize?: boolean;
+		} = {}
+	): Promise<{
+		text: string;
+		language?: string;
+		words?: Array<{ word: string; start: number; end: number; speaker?: string }>;
+		speakers?: string[];
+		latencyMs?: number;
+	}> {
+		const formData = new FormData();
+		formData.append('file', new Blob([audioBuffer]), filename);
+		if (options.language) formData.append('language', options.language);
+		formData.append('align', 'true');
+		formData.append('diarize', String(options.diarize ?? false));
+
+		const controller = new AbortController();
+		const timer = setTimeout(() => controller.abort(), this.timeout);
+
+		try {
+			const response = await fetch(`${this.sttUrl}/transcribe`, {
+				method: 'POST',
+				headers: this.authHeaders(),
+				body: formData,
+				signal: controller.signal,
+			});
+
+			if (!response.ok) {
+				const error = await response.text().catch(() => '');
+				this.logger.warn(`STT request failed with status ${response.status}`);
+				throw new Error(`STT error ${response.status}: ${error}`);
+			}
+
+			return await response.json();
+		} finally {
+			clearTimeout(timer);
+		}
+	}
+
+	/**
+	 * Synthesize text to speech using the GPU server's TTS service.
+	 * Returns audio as a Buffer.
+	 *
+	 * @param text Text to speak.
+	 * @param options Voice (default 'de_katja'), speed (default 1.0), format (default 'mp3').
+	 * @returns Audio bytes, the response's content type ('audio/mpeg' if absent) and the
+	 * duration from the "x-duration" header (0 if absent or not a number).
+	 * @throws Error when the service answers with a non-2xx status; the fetch
+	 * rejects if the request is aborted after the timeout.
+	 */
+	async synthesize(
+		text: string,
+		options: {
+			voice?: string;
+			speed?: number;
+			format?: 'wav' | 'mp3';
+		} = {}
+	): Promise<{
+		audio: Buffer;
+		contentType: string;
+		duration: number;
+	}> {
+		const controller = new AbortController();
+		const timer = setTimeout(() => controller.abort(), this.timeout);
+
+		try {
+			const response = await fetch(`${this.ttsUrl}/synthesize/auto`, {
+				method: 'POST',
+				headers: {
+					'Content-Type': 'application/json',
+					...this.authHeaders(),
+				},
+				body: JSON.stringify({
+					text,
+					voice: options.voice ?? 'de_katja',
+					speed: options.speed ?? 1.0,
+					output_format: options.format ?? 'mp3',
+				}),
+				signal: controller.signal,
+			});
+
+			if (!response.ok) {
+				const error = await response.text().catch(() => '');
+				this.logger.warn(`TTS request failed with status ${response.status}`);
+				throw new Error(`TTS error ${response.status}: ${error}`);
+			}
+
+			const arrayBuffer = await response.arrayBuffer();
+			// A missing or malformed header must not leak NaN into the result.
+			const duration = Number.parseFloat(response.headers.get('x-duration') ?? '');
+			return {
+				audio: Buffer.from(arrayBuffer),
+				contentType: response.headers.get('content-type') ?? 'audio/mpeg',
+				duration: Number.isFinite(duration) ? duration : 0,
+			};
+		} finally {
+			clearTimeout(timer);
+		}
+	}
+
+	/**
+	 * Check if GPU voice services are available.
+	 * Each service's /health endpoint gets 5 seconds; any failure counts as unavailable.
+	 */
+	async healthCheck(): Promise<{ stt: boolean; tts: boolean }> {
+		const check = async (url: string): Promise<boolean> => {
+			try {
+				const res = await fetch(`${url}/health`, {
+					signal: AbortSignal.timeout(5000),
+				});
+				return res.ok;
+			} catch {
+				return false;
+			}
+		};
+
+		const [stt, tts] = await Promise.all([check(this.sttUrl), check(this.ttsUrl)]);
+		return { stt, tts };
+	}
+}
diff --git a/apps/picture/apps/backend/src/generate/local-image-gen.service.ts b/apps/picture/apps/backend/src/generate/local-image-gen.service.ts
index 107cfdd99..50552af64 100644
--- a/apps/picture/apps/backend/src/generate/local-image-gen.service.ts
+++ b/apps/picture/apps/backend/src/generate/local-image-gen.service.ts
@@ -16,10 +16,13 @@ export class LocalImageGenService {
 	private readonly timeout: number;
 	private isAvailable = false;
 
+	private readonly apiKey?: string;
+
 	constructor(private configService: ConfigService) {
 		this.baseUrl =
-			this.configService.get<string>('IMAGE_GEN_SERVICE_URL') || 'http://localhost:3025';
-		this.timeout = 60_000; // 60s (FLUX.2 klein is fast, but allow margin)
+			this.configService.get<string>('IMAGE_GEN_SERVICE_URL') || 'https://gpu-img.mana.how';
+		this.apiKey = this.configService.get<string>('GPU_API_KEY');
+		this.timeout = 120_000; // 120s (first request may need to load model into VRAM)
 		this.checkHealth();
 	}
 
@@ -63,9 +66,12 @@ export class LocalImageGenService {
 		const controller = new AbortController();
 		setTimeout(() => controller.abort(), this.timeout);
 
+		const headers: Record<string, string> = { 'Content-Type': 'application/json' };
+		if (this.apiKey) headers['X-API-Key'] = this.apiKey;
+
 		const response = await fetch(`${this.baseUrl}/generate`, {
 			method: 'POST',
-			headers: { 'Content-Type': 'application/json' },
+			headers,
 			body: JSON.stringify({
 				prompt: params.prompt,
 				width: params.width || 1024,