feat(apps): integrate GPU services into Picture and Chat apps

Picture App:
- Update LocalImageGenService to use GPU server (gpu-img.mana.how)
- Add API key authentication (GPU_API_KEY)
- Increase timeout to 120s (first request may need to load the model into VRAM)

Chat App:
- Add VoiceModule with STT/TTS integration via GPU server
- POST /api/v1/voice/transcribe — Upload audio, get text + word timestamps
- POST /api/v1/voice/synthesize — Send text, get audio response
- GET /api/v1/voice/health — Check GPU voice services availability
- Supports speaker diarization and language selection

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Till JS 2026-03-27 21:59:46 +01:00
parent 4b0f5a29fd
commit 02bd9d3117
5 changed files with 239 additions and 3 deletions

View file

@ -12,6 +12,7 @@ import { SpaceModule } from './space/space.module';
import { DocumentModule } from './document/document.module';
import { ModelModule } from './model/model.module';
import { AdminModule } from './admin/admin.module';
import { VoiceModule } from './voice/voice.module';
import { HealthModule } from '@manacore/shared-nestjs-health';
@Module({
@ -51,6 +52,7 @@ import { HealthModule } from '@manacore/shared-nestjs-health';
DocumentModule,
ModelModule,
AdminModule,
VoiceModule,
HealthModule.forRoot({ serviceName: 'chat-backend' }),
],
})

View file

@ -0,0 +1,76 @@
import {
Controller,
Post,
Body,
Get,
UseGuards,
UseInterceptors,
UploadedFile,
Res,
Query,
} from '@nestjs/common';
import { FileInterceptor } from '@nestjs/platform-express';
import type { Response } from 'express';
import { JwtAuthGuard } from '@manacore/shared-nestjs-auth';
import { VoiceService } from './voice.service';
@Controller('voice')
@UseGuards(JwtAuthGuard)
export class VoiceController {
constructor(private readonly voiceService: VoiceService) {}
/** Check GPU voice services availability. */
@Get('health')
async health() {
return this.voiceService.healthCheck();
}
/**
* Transcribe audio to text.
* POST /api/v1/voice/transcribe
*
* Body: multipart/form-data with "file" field
* Query: ?language=de&diarize=true
*/
@Post('transcribe')
@UseInterceptors(FileInterceptor('file'))
async transcribe(
@UploadedFile() file: Express.Multer.File,
@Query('language') language?: string,
@Query('diarize') diarize?: string
) {
if (!file) {
return { error: 'No audio file provided' };
}
return this.voiceService.transcribe(file.buffer, file.originalname, {
language: language || 'de',
diarize: diarize === 'true',
});
}
/**
* Synthesize text to speech.
* POST /api/v1/voice/synthesize
*
* Returns audio file directly.
*/
@Post('synthesize')
async synthesize(
@Body() body: { text: string; voice?: string; speed?: number; format?: 'wav' | 'mp3' },
@Res() res: Response
) {
const result = await this.voiceService.synthesize(body.text, {
voice: body.voice,
speed: body.speed,
format: body.format,
});
res.set({
'Content-Type': result.contentType,
'Content-Length': result.audio.length.toString(),
'X-Duration': result.duration.toString(),
});
res.send(result.audio);
}
}

View file

@ -0,0 +1,10 @@
import { Module } from '@nestjs/common';
import { VoiceController } from './voice.controller';
import { VoiceService } from './voice.service';
/**
 * Bundles the voice (STT/TTS) controller and service.
 * VoiceService is exported so other feature modules can inject it directly.
 */
@Module({
  providers: [VoiceService],
  controllers: [VoiceController],
  exports: [VoiceService],
})
export class VoiceModule {}

View file

@ -0,0 +1,142 @@
import { Injectable, Logger } from '@nestjs/common';
import { ConfigService } from '@nestjs/config';
/**
 * Voice service for speech-to-text and text-to-speech
 * using the GPU server's mana-stt and mana-tts services.
 *
 * Both calls share one timeout/abort/error-handling path (see request()).
 */
@Injectable()
export class VoiceService {
  private readonly logger = new Logger(VoiceService.name);
  private readonly sttUrl: string;
  private readonly ttsUrl: string;
  private readonly apiKey: string;
  private readonly timeout: number;

  constructor(private readonly configService: ConfigService) {
    this.sttUrl = this.configService.get<string>('GPU_STT_URL') || 'https://gpu-stt.mana.how';
    this.ttsUrl = this.configService.get<string>('GPU_TTS_URL') || 'https://gpu-tts.mana.how';
    this.apiKey = this.configService.get<string>('GPU_API_KEY') || '';
    this.timeout = 60_000; // per-request budget, covers the response-body read too
  }

  /** API-key header for the GPU services; empty object when no key is configured. */
  private authHeaders(): Record<string, string> {
    const headers: Record<string, string> = {};
    if (this.apiKey) headers['X-API-Key'] = this.apiKey;
    return headers;
  }

  /**
   * POST to a GPU endpoint with an abort-based timeout.
   *
   * The `consume` callback reads the response body while the timeout is still
   * armed, so a stalled download is also aborted (same coverage the previous
   * inlined code had). Shared by transcribe() and synthesize().
   *
   * @param url     Full endpoint URL.
   * @param init    fetch options (method/headers/body); signal is added here.
   * @param label   Service name used in error messages ("STT" / "TTS").
   * @param consume Reads and maps the successful response body.
   * @throws Error with upstream status + body text on non-2xx responses.
   * @throws AbortError (DOMException) when the timeout elapses.
   */
  private async request<T>(
    url: string,
    init: RequestInit,
    label: string,
    consume: (res: Response) => Promise<T>
  ): Promise<T> {
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), this.timeout);
    try {
      const response = await fetch(url, { ...init, signal: controller.signal });
      if (!response.ok) {
        const error = await response.text().catch(() => '');
        throw new Error(`${label} error ${response.status}: ${error}`);
      }
      return await consume(response);
    } finally {
      clearTimeout(timer);
    }
  }

  /**
   * Transcribe audio to text using WhisperX on the GPU server.
   * Supports word-level timestamps and speaker diarization.
   *
   * @param audioBuffer Raw audio bytes as uploaded by the client.
   * @param filename    Original filename, forwarded so the server can detect the format.
   * @param options     language: ISO hint forwarded when set; diarize: label speakers
   *                    (default false).
   * @returns Parsed JSON from the STT service — NOTE(review): shape is assumed to
   *          match the declared type; the response is not validated here.
   */
  async transcribe(
    audioBuffer: Buffer,
    filename: string,
    options: {
      language?: string;
      diarize?: boolean;
    } = {}
  ): Promise<{
    text: string;
    language?: string;
    words?: Array<{ word: string; start: number; end: number; speaker?: string }>;
    speakers?: string[];
    latencyMs?: number;
  }> {
    const formData = new FormData();
    formData.append('file', new Blob([audioBuffer]), filename);
    if (options.language) formData.append('language', options.language);
    formData.append('align', 'true'); // always request word-level timestamps
    formData.append('diarize', String(options.diarize ?? false));

    return this.request(
      `${this.sttUrl}/transcribe`,
      { method: 'POST', headers: this.authHeaders(), body: formData },
      'STT',
      (res) => res.json()
    );
  }

  /**
   * Synthesize text to speech using the GPU server's TTS service.
   * Returns audio as a Buffer.
   *
   * @param text    Text to speak.
   * @param options voice (default 'de_katja'), speed (default 1.0),
   *                format (default 'mp3').
   * @returns audio bytes, content type, and duration (seconds, from the
   *          x-duration response header; 0 when absent).
   */
  async synthesize(
    text: string,
    options: {
      voice?: string;
      speed?: number;
      format?: 'wav' | 'mp3';
    } = {}
  ): Promise<{
    audio: Buffer;
    contentType: string;
    duration: number;
  }> {
    return this.request(
      `${this.ttsUrl}/synthesize/auto`,
      {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          ...this.authHeaders(),
        },
        body: JSON.stringify({
          text,
          voice: options.voice ?? 'de_katja',
          speed: options.speed ?? 1.0,
          output_format: options.format ?? 'mp3',
        }),
      },
      'TTS',
      async (res) => {
        const arrayBuffer = await res.arrayBuffer();
        return {
          audio: Buffer.from(arrayBuffer),
          contentType: res.headers.get('content-type') ?? 'audio/mpeg',
          duration: parseFloat(res.headers.get('x-duration') ?? '0'),
        };
      }
    );
  }

  /** Check if GPU voice services are available (5s cap per probe; never throws). */
  async healthCheck(): Promise<{ stt: boolean; tts: boolean }> {
    const check = async (url: string): Promise<boolean> => {
      try {
        const res = await fetch(`${url}/health`, {
          signal: AbortSignal.timeout(5000),
        });
        return res.ok;
      } catch {
        return false; // unreachable/timed-out service reported as down, not an error
      }
    };
    const [stt, tts] = await Promise.all([check(this.sttUrl), check(this.ttsUrl)]);
    return { stt, tts };
  }
}

View file

@ -16,10 +16,13 @@ export class LocalImageGenService {
private readonly timeout: number;
private isAvailable = false;
private readonly apiKey?: string;
constructor(private configService: ConfigService) {
this.baseUrl =
this.configService.get<string>('IMAGE_GEN_SERVICE_URL') || 'http://localhost:3025';
this.timeout = 60_000; // 60s (FLUX.2 klein is fast, but allow margin)
this.configService.get<string>('IMAGE_GEN_SERVICE_URL') || 'https://gpu-img.mana.how';
this.apiKey = this.configService.get<string>('GPU_API_KEY');
this.timeout = 120_000; // 120s (first request may need to load model into VRAM)
this.checkHealth();
}
@ -63,9 +66,12 @@ export class LocalImageGenService {
const controller = new AbortController();
setTimeout(() => controller.abort(), this.timeout);
const headers: Record<string, string> = { 'Content-Type': 'application/json' };
if (this.apiKey) headers['X-API-Key'] = this.apiKey;
const response = await fetch(`${this.baseUrl}/generate`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
headers,
body: JSON.stringify({
prompt: params.prompt,
width: params.width || 1024,