feat(apps): integrate GPU services into Picture and Chat apps

Picture App:
- Update LocalImageGenService to use GPU server (gpu-img.mana.how)
- Add API key authentication (GPU_API_KEY)
- Increase timeout to 120s (first request may need to load the model into VRAM)

Chat App:
- Add VoiceModule with STT/TTS integration via GPU server
- POST /api/v1/voice/transcribe — Upload audio, get text + word timestamps
- POST /api/v1/voice/synthesize — Send text, get audio response
- GET /api/v1/voice/health — Check GPU voice services availability
- Supports speaker diarization and language selection

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Till JS 2026-03-27 21:59:46 +01:00
parent 4b0f5a29fd
commit 02bd9d3117
5 changed files with 239 additions and 3 deletions

View file

@ -12,6 +12,7 @@ import { SpaceModule } from './space/space.module';
import { DocumentModule } from './document/document.module';
import { ModelModule } from './model/model.module';
import { AdminModule } from './admin/admin.module';
import { VoiceModule } from './voice/voice.module';
import { HealthModule } from '@manacore/shared-nestjs-health';
@Module({
@ -51,6 +52,7 @@ import { HealthModule } from '@manacore/shared-nestjs-health';
DocumentModule,
ModelModule,
AdminModule,
VoiceModule,
HealthModule.forRoot({ serviceName: 'chat-backend' }),
],
})

View file

@ -0,0 +1,76 @@
import {
Controller,
Post,
Body,
Get,
UseGuards,
UseInterceptors,
UploadedFile,
Res,
Query,
} from '@nestjs/common';
import { FileInterceptor } from '@nestjs/platform-express';
import type { Response } from 'express';
import { JwtAuthGuard } from '@manacore/shared-nestjs-auth';
import { VoiceService } from './voice.service';
@Controller('voice')
@UseGuards(JwtAuthGuard)
export class VoiceController {
constructor(private readonly voiceService: VoiceService) {}
/** Check GPU voice services availability. */
@Get('health')
async health() {
return this.voiceService.healthCheck();
}
/**
* Transcribe audio to text.
* POST /api/v1/voice/transcribe
*
* Body: multipart/form-data with "file" field
* Query: ?language=de&diarize=true
*/
@Post('transcribe')
@UseInterceptors(FileInterceptor('file'))
async transcribe(
@UploadedFile() file: Express.Multer.File,
@Query('language') language?: string,
@Query('diarize') diarize?: string
) {
if (!file) {
return { error: 'No audio file provided' };
}
return this.voiceService.transcribe(file.buffer, file.originalname, {
language: language || 'de',
diarize: diarize === 'true',
});
}
/**
* Synthesize text to speech.
* POST /api/v1/voice/synthesize
*
* Returns audio file directly.
*/
@Post('synthesize')
async synthesize(
@Body() body: { text: string; voice?: string; speed?: number; format?: 'wav' | 'mp3' },
@Res() res: Response
) {
const result = await this.voiceService.synthesize(body.text, {
voice: body.voice,
speed: body.speed,
format: body.format,
});
res.set({
'Content-Type': result.contentType,
'Content-Length': result.audio.length.toString(),
'X-Duration': result.duration.toString(),
});
res.send(result.audio);
}
}

View file

@ -0,0 +1,10 @@
import { Module } from '@nestjs/common';
import { VoiceController } from './voice.controller';
import { VoiceService } from './voice.service';
/**
 * Bundles the voice (STT/TTS) controller and service.
 * VoiceService is exported so other feature modules can inject it directly.
 */
@Module({
  providers: [VoiceService],
  controllers: [VoiceController],
  exports: [VoiceService],
})
export class VoiceModule {}

View file

@ -0,0 +1,142 @@
import { Injectable, Logger } from '@nestjs/common';
import { ConfigService } from '@nestjs/config';
/**
 * Voice service for speech-to-text and text-to-speech
 * using the GPU server's mana-stt and mana-tts services.
 *
 * Both calls share one timeout/abort/error-handling path (see request()).
 */
@Injectable()
export class VoiceService {
  private readonly logger = new Logger(VoiceService.name);
  private readonly sttUrl: string;
  private readonly ttsUrl: string;
  private readonly apiKey: string;
  private readonly timeout: number;

  constructor(private readonly configService: ConfigService) {
    this.sttUrl = this.configService.get<string>('GPU_STT_URL') || 'https://gpu-stt.mana.how';
    this.ttsUrl = this.configService.get<string>('GPU_TTS_URL') || 'https://gpu-tts.mana.how';
    this.apiKey = this.configService.get<string>('GPU_API_KEY') || '';
    this.timeout = 60_000; // per-request budget, covers the response-body read too
  }

  /** API-key header for the GPU services; empty object when no key is configured. */
  private authHeaders(): Record<string, string> {
    const headers: Record<string, string> = {};
    if (this.apiKey) headers['X-API-Key'] = this.apiKey;
    return headers;
  }

  /**
   * POST to a GPU endpoint with an abort-based timeout.
   *
   * The `consume` callback reads the response body while the timeout is still
   * armed, so a stalled download is also aborted (same coverage the previous
   * inlined code had). Shared by transcribe() and synthesize().
   *
   * @param url     Full endpoint URL.
   * @param init    fetch options (method/headers/body); signal is added here.
   * @param label   Service name used in error messages ("STT" / "TTS").
   * @param consume Reads and maps the successful response body.
   * @throws Error with upstream status + body text on non-2xx responses.
   * @throws AbortError (DOMException) when the timeout elapses.
   */
  private async request<T>(
    url: string,
    init: RequestInit,
    label: string,
    consume: (res: Response) => Promise<T>
  ): Promise<T> {
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), this.timeout);
    try {
      const response = await fetch(url, { ...init, signal: controller.signal });
      if (!response.ok) {
        const error = await response.text().catch(() => '');
        throw new Error(`${label} error ${response.status}: ${error}`);
      }
      return await consume(response);
    } finally {
      clearTimeout(timer);
    }
  }

  /**
   * Transcribe audio to text using WhisperX on the GPU server.
   * Supports word-level timestamps and speaker diarization.
   *
   * @param audioBuffer Raw audio bytes as uploaded by the client.
   * @param filename    Original filename, forwarded so the server can detect the format.
   * @param options     language: ISO hint forwarded when set; diarize: label speakers
   *                    (default false).
   * @returns Parsed JSON from the STT service — NOTE(review): shape is assumed to
   *          match the declared type; the response is not validated here.
   */
  async transcribe(
    audioBuffer: Buffer,
    filename: string,
    options: {
      language?: string;
      diarize?: boolean;
    } = {}
  ): Promise<{
    text: string;
    language?: string;
    words?: Array<{ word: string; start: number; end: number; speaker?: string }>;
    speakers?: string[];
    latencyMs?: number;
  }> {
    const formData = new FormData();
    formData.append('file', new Blob([audioBuffer]), filename);
    if (options.language) formData.append('language', options.language);
    formData.append('align', 'true'); // always request word-level timestamps
    formData.append('diarize', String(options.diarize ?? false));

    return this.request(
      `${this.sttUrl}/transcribe`,
      { method: 'POST', headers: this.authHeaders(), body: formData },
      'STT',
      (res) => res.json()
    );
  }

  /**
   * Synthesize text to speech using the GPU server's TTS service.
   * Returns audio as a Buffer.
   *
   * @param text    Text to speak.
   * @param options voice (default 'de_katja'), speed (default 1.0),
   *                format (default 'mp3').
   * @returns audio bytes, content type, and duration (seconds, from the
   *          x-duration response header; 0 when absent).
   */
  async synthesize(
    text: string,
    options: {
      voice?: string;
      speed?: number;
      format?: 'wav' | 'mp3';
    } = {}
  ): Promise<{
    audio: Buffer;
    contentType: string;
    duration: number;
  }> {
    return this.request(
      `${this.ttsUrl}/synthesize/auto`,
      {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          ...this.authHeaders(),
        },
        body: JSON.stringify({
          text,
          voice: options.voice ?? 'de_katja',
          speed: options.speed ?? 1.0,
          output_format: options.format ?? 'mp3',
        }),
      },
      'TTS',
      async (res) => {
        const arrayBuffer = await res.arrayBuffer();
        return {
          audio: Buffer.from(arrayBuffer),
          contentType: res.headers.get('content-type') ?? 'audio/mpeg',
          duration: parseFloat(res.headers.get('x-duration') ?? '0'),
        };
      }
    );
  }

  /** Check if GPU voice services are available (5s cap per probe; never throws). */
  async healthCheck(): Promise<{ stt: boolean; tts: boolean }> {
    const check = async (url: string): Promise<boolean> => {
      try {
        const res = await fetch(`${url}/health`, {
          signal: AbortSignal.timeout(5000),
        });
        return res.ok;
      } catch {
        return false; // unreachable/timed-out service reported as down, not an error
      }
    };
    const [stt, tts] = await Promise.all([check(this.sttUrl), check(this.ttsUrl)]);
    return { stt, tts };
  }
}

View file

@ -16,10 +16,13 @@ export class LocalImageGenService {
private readonly timeout: number;
private isAvailable = false;
private readonly apiKey?: string;
constructor(private configService: ConfigService) {
this.baseUrl =
this.configService.get<string>('IMAGE_GEN_SERVICE_URL') || 'http://localhost:3025';
this.timeout = 60_000; // 60s (FLUX.2 klein is fast, but allow margin)
this.configService.get<string>('IMAGE_GEN_SERVICE_URL') || 'https://gpu-img.mana.how';
this.apiKey = this.configService.get<string>('GPU_API_KEY');
this.timeout = 120_000; // 120s (first request may need to load model into VRAM)
this.checkHealth();
}
@ -63,9 +66,12 @@ export class LocalImageGenService {
const controller = new AbortController();
setTimeout(() => controller.abort(), this.timeout);
const headers: Record<string, string> = { 'Content-Type': 'application/json' };
if (this.apiKey) headers['X-API-Key'] = this.apiKey;
const response = await fetch(`${this.baseUrl}/generate`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
headers,
body: JSON.stringify({
prompt: params.prompt,
width: params.width || 1024,