From 16e0d99c5a59912cdbdf63a925c6a02354aa4d1d Mon Sep 17 00:00:00 2001 From: Till JS Date: Fri, 27 Mar 2026 21:35:30 +0100 Subject: [PATCH] feat(gpu-server): complete GPU server setup with AI services, monitoring, and public access - Set up 5 AI services on Windows GPU server (RTX 3090): - mana-llm (Port 3025): OpenAI-compatible LLM gateway via Ollama - mana-stt (Port 3020): WhisperX with word timestamps + speaker diarization - mana-tts (Port 3022): Kokoro (EN) + Edge TTS (DE) + Piper (local DE) - mana-image-gen (Port 3023): FLUX.2 klein 4B image generation - Ollama (Port 11434): gemma3:4b/12b, qwen2.5-coder:14b, nomic-embed-text - Add @manacore/shared-gpu TypeScript client package with SttClient, TtsClient, ImageClient - Add CUDA-compatible whisper_service using faster-whisper for Windows - Configure public access via Cloudflare Tunnel (gpu-llm/stt/tts/img.mana.how) - Add Loki log aggregator (Docker on Mac Mini) + log shipper on GPU server - Add GPU scrape targets to Prometheus/VictoriaMetrics config - Add Grafana Loki datasource for GPU service logs - Add health check with auto-restart, log rotation, and log shipping - Document complete setup: Always-On config, troubleshooting, architecture Co-Authored-By: Claude Opus 4.6 (1M context) --- .../grafana/provisioning/datasources/loki.yml | 9 + docker/loki/local-config.yaml | 36 ++ docs/WINDOWS_GPU_SERVER_SETUP.md | 532 +++++++++++++++++- packages/shared-gpu/package.json | 28 + packages/shared-gpu/src/gpu-client.ts | 56 ++ packages/shared-gpu/src/image-client.ts | 72 +++ packages/shared-gpu/src/index.ts | 24 + packages/shared-gpu/src/resolve-url.ts | 31 + packages/shared-gpu/src/stt-client.ts | 59 ++ packages/shared-gpu/src/tts-client.ts | 67 +++ packages/shared-gpu/src/types.ts | 142 +++++ packages/shared-gpu/tsconfig.json | 21 + services/mana-stt/app/whisper_service_cuda.py | 175 ++++++ 13 files changed, 1245 insertions(+), 7 deletions(-) create mode 100644 docker/grafana/provisioning/datasources/loki.yml create 
mode 100644 docker/loki/local-config.yaml create mode 100644 packages/shared-gpu/package.json create mode 100644 packages/shared-gpu/src/gpu-client.ts create mode 100644 packages/shared-gpu/src/image-client.ts create mode 100644 packages/shared-gpu/src/index.ts create mode 100644 packages/shared-gpu/src/resolve-url.ts create mode 100644 packages/shared-gpu/src/stt-client.ts create mode 100644 packages/shared-gpu/src/tts-client.ts create mode 100644 packages/shared-gpu/src/types.ts create mode 100644 packages/shared-gpu/tsconfig.json create mode 100644 services/mana-stt/app/whisper_service_cuda.py diff --git a/docker/grafana/provisioning/datasources/loki.yml b/docker/grafana/provisioning/datasources/loki.yml new file mode 100644 index 000000000..11f3fa87a --- /dev/null +++ b/docker/grafana/provisioning/datasources/loki.yml @@ -0,0 +1,9 @@ +apiVersion: 1 + +datasources: + - name: Loki + type: loki + access: proxy + url: http://loki:3100 + isDefault: false + editable: true diff --git a/docker/loki/local-config.yaml b/docker/loki/local-config.yaml new file mode 100644 index 000000000..b1997b71c --- /dev/null +++ b/docker/loki/local-config.yaml @@ -0,0 +1,36 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + +common: + path_prefix: /loki + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + replication_factor: 1 + ring: + kvstore: + store: inmemory + +schema_config: + configs: + - from: 2024-01-01 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +limits_config: + retention_period: 30d + max_query_length: 721h + +compactor: + working_directory: /loki/compactor + compaction_interval: 10m + retention_enabled: true + retention_delete_delay: 2h + delete_request_store: filesystem diff --git a/docs/WINDOWS_GPU_SERVER_SETUP.md b/docs/WINDOWS_GPU_SERVER_SETUP.md index 4c4bd431f..0b7f72290 100644 --- a/docs/WINDOWS_GPU_SERVER_SETUP.md +++ b/docs/WINDOWS_GPU_SERVER_SETUP.md @@ -7,7 
+7,58 @@ Danach kann alles Weitere (Ollama, AI-Services, Cloudflare Tunnel) per SSH erled --- -## Schritt 1: Computername setzen +## Checkliste: Nach jedem Neustart + +> **Wichtig:** Bis der Server-Modus (Schritt 9) vollständig konfiguriert ist, müssen nach einem Neustart unter Umständen einige Dinge manuell geprüft/repariert werden. + +PowerShell **als Administrator** ausführen: + +```powershell +# 1. Netzwerkprofil auf "Privat" setzen +# (Windows setzt es nach Neustart manchmal auf "Öffentlich" zurück, +# was die Firewall verschärft und SSH/Ping blockt) +Get-NetConnectionProfile | Set-NetConnectionProfile -NetworkCategory Private + +# 2. SSH-Dienst prüfen (muss "Running" sein) +Get-Service sshd +# Falls "Stopped": Start-Service sshd + +# 3. AI-Services prüfen +python C:\mana\status.py +# Falls Services nicht laufen: +Start-ScheduledTask -TaskName "ManaLLM" +Start-ScheduledTask -TaskName "ManaSTT" +Start-ScheduledTask -TaskName "ManaTTS" +Start-ScheduledTask -TaskName "ManaImageGen" +``` + +Wenn Schritt 9 (Server-Modus) korrekt konfiguriert ist, sollte der PC: +- Nie in den Ruhezustand gehen +- Nach Neustart automatisch einloggen +- Alle Services automatisch starten (Scheduled Tasks mit AtLogOn) +- Netzwerkprofil dauerhaft auf "Privat" stehen + +### Netzwerkprofil dauerhaft auf "Privat" fixieren + +Damit das Netzwerk nach Neustart nicht wieder auf "Öffentlich" springt: + +```powershell +# Variante 1: Per Registry (empfohlen) +# Erst die aktuelle Netzwerk-ID herausfinden: +Get-NetConnectionProfile + +# Dann in der Registry fixieren (Category: 1=Privat, 0=Öffentlich): +$profiles = Get-ChildItem "HKLM:\SOFTWARE\Microsoft\Windows NT\CurrentVersion\NetworkList\Profiles" +foreach ($profile in $profiles) { + Set-ItemProperty -Path $profile.PSPath -Name Category -Value 1 +} +``` + +--- + +## Erstinstallation + +### Schritt 1: Computername setzen PowerShell **als Administrator** öffnen (Rechtsklick → Als Administrator ausführen): @@ -134,15 +185,110 @@ mkdir C:\mana\models --- 
-## Schritt 9: Neustart +## Schritt 9: Server-Modus konfigurieren (Always-On) + +Windows ist standardmäßig als Desktop-PC konfiguriert und geht in den Ruhezustand — das führt dazu, dass SSH und alle AI-Services nicht mehr erreichbar sind. Für den Server-Betrieb muss das komplett deaktiviert werden. + +PowerShell **als Administrator**: + +```powershell +# ============================================ +# 1. Energiesparplan auf "Höchstleistung" setzen +# ============================================ +powercfg /setactive 8c5e7fda-e8bf-4a96-9a85-a6e23a8c635c + +# ============================================ +# 2. Ruhezustand komplett deaktivieren +# ============================================ +powercfg /hibernate off + +# Standby deaktivieren (Netzbetrieb) +powercfg /change standby-timeout-ac 0 +# Bildschirm-Timeout (optional: 0 = nie, oder 30 = 30 Min) +powercfg /change monitor-timeout-ac 30 +# Festplatten-Timeout deaktivieren +powercfg /change disk-timeout-ac 0 + +# ============================================ +# 3. Netzwerkadapter darf NICHT in Energiesparmodus +# ============================================ +# Alle Netzwerkadapter: Energiesparen deaktivieren +Get-NetAdapter -Physical | ForEach-Object { + $name = $_.Name + # Disable "Allow the computer to turn off this device to save power" + $adapter = Get-WmiObject -Class Win32_NetworkAdapter | Where-Object { $_.NetConnectionID -eq $name } + if ($adapter) { + $deviceID = $adapter.PNPDeviceID + $key = "HKLM:\SYSTEM\CurrentControlSet\Enum\$deviceID\Device Parameters\WDF" + # Setze PnPCapabilities = 24 (disable power management) + $regPath = "HKLM:\SYSTEM\CurrentControlSet\Enum\$deviceID" + if (Test-Path "$regPath\Device Parameters") { + # Alternative: über Geräte-Manager manuell deaktivieren + Write-Host "Bitte im Geräte-Manager für '$name' Energiesparen manuell deaktivieren" + } + } +} + +# ============================================ +# 4. 
USB-Energiesparen deaktivieren (für Peripherie) +# ============================================ +powercfg /setacvalueindex SCHEME_CURRENT 2a737441-1930-4402-8d77-b2bebba308a3 48e6b7a6-50f5-4782-a5d4-53bb8f07e226 0 +powercfg /setactive SCHEME_CURRENT + +# ============================================ +# 5. Schnellstart deaktivieren (verursacht Probleme mit SSH nach Neustart) +# ============================================ +reg add "HKLM\SYSTEM\CurrentControlSet\Control\Session Manager\Power" /v HiberbootEnabled /t REG_DWORD /d 0 /f + +# ============================================ +# 6. Automatischen Neustart nach Windows Update verhindern +# ============================================ +# Aktive Stunden auf Maximum setzen (verhindert Neustarts tagsüber) +reg add "HKLM\SOFTWARE\Microsoft\WindowsUpdate\UX\Settings" /v ActiveHoursStart /t REG_DWORD /d 6 /f +reg add "HKLM\SOFTWARE\Microsoft\WindowsUpdate\UX\Settings" /v ActiveHoursEnd /t REG_DWORD /d 2 /f + +# ============================================ +# 7. Auto-Login nach Neustart (für Scheduled Tasks) +# ============================================ +# WICHTIG: Scheduled Tasks mit "AtLogOn" brauchen eine aktive Sitzung. +# Nach einem Neustart muss der User automatisch eingeloggt werden. +# Über GUI: netplwiz → Haken bei "Benutzer müssen Benutzernamen und Kennwort eingeben" entfernen +# Oder per Registry (Passwort wird hier im Klartext gespeichert!): +# reg add "HKLM\SOFTWARE\Microsoft\Windows NT\CurrentVersion\Winlogon" /v AutoAdminLogon /t REG_SZ /d 1 /f +# reg add "HKLM\SOFTWARE\Microsoft\Windows NT\CurrentVersion\Winlogon" /v DefaultUserName /t REG_SZ /d "tills" /f +# reg add "HKLM\SOFTWARE\Microsoft\Windows NT\CurrentVersion\Winlogon" /v DefaultPassword /t REG_SZ /d "DEIN_PASSWORT" /f +``` + +### Manuell im Geräte-Manager prüfen + +1. **Geräte-Manager** öffnen → **Netzwerkadapter** → Rechtsklick auf den Ethernet-Adapter +2. **Eigenschaften** → **Energieverwaltung** +3. 
Haken bei **"Computer kann das Gerät ausschalten, um Energie zu sparen"** entfernen + +### Energiesparplan verifizieren + +```powershell +# Aktiven Energiesparplan anzeigen +powercfg /getactivescheme +# Sollte "Höchstleistung" zeigen + +# Alle Einstellungen prüfen +powercfg /query +``` + +--- + +## Schritt 10: Neustart ```powershell Restart-Computer ``` +Nach dem Neustart sollte der PC automatisch einloggen (falls Auto-Login konfiguriert) und die Energiespareinstellungen aktiv sein. + --- -## Schritt 10: SSH-Key einrichten (passwortloser Zugriff) +## Schritt 11: SSH-Key einrichten (passwortloser Zugriff) Auf dem **Windows-PC** in PowerShell **als Administrator** ausführen: @@ -166,7 +312,7 @@ icacls C:\ProgramData\ssh\administrators_authorized_keys /grant "${adminGroup}:( --- -## Schritt 11: SSH testen +## Schritt 12: SSH testen **Vom Mac (dev-Rechner)** aus testen: @@ -202,30 +348,402 @@ Nach diesen Schritten hat der Windows-PC: - [x] Python 3.11 - [x] Git - [x] Arbeitsverzeichnis `C:\mana\` +- [x] Server-Modus: Kein Ruhezustand, kein Standby, Höchstleistung +- [x] Auto-Login nach Neustart (für Scheduled Tasks) +- [x] Schnellstart deaktiviert (sauberer SSH-Boot) **Alles Weitere (Ollama, AI-Services, Cloudflare Tunnel) wird dann per SSH gemacht.** --- +## AI-Services Einrichtung (per SSH) + +Nach der Grundinstallation wurden folgende AI-Services eingerichtet: + +### Ollama (LLM Inference) + +- **Port**: 11434 +- **Host-Binding**: `0.0.0.0` (LAN-Zugriff via `OLLAMA_HOST` System-Umgebungsvariable) +- **Installierte Modelle**: + - `gemma3:4b` (3.3 GB) - Schnelles Chat-Modell + - `gemma3:12b` (8.1 GB) - Bessere Qualität + - `qwen2.5-coder:14b` (9.0 GB) - Code-Generierung + - `nomic-embed-text` - Embedding-Modell + +Ollama startet automatisch beim Login (System Tray). + +### mana-llm (Port 3025) + +Zentraler LLM-Gateway mit OpenAI-kompatiblem API. Routet Anfragen an Ollama und Cloud-Provider. 
+ +- **Verzeichnis**: `C:\mana\services\mana-llm\` +- **venv**: `C:\mana\venvs\llm\` +- **Config**: `C:\mana\services\mana-llm\.env` +- **Log**: `C:\mana\services\mana-llm\service.log` +- **Autostart**: Windows Scheduled Task "ManaLLM" (AtLogOn) + +### mana-stt (Port 3020) + +Speech-to-Text mit faster-whisper (CUDA-beschleunigt auf RTX 3090). + +- **Verzeichnis**: `C:\mana\services\mana-stt\` +- **venv**: `C:\mana\venvs\stt\` +- **Config**: `C:\mana\services\mana-stt\.env` +- **Log**: `C:\mana\services\mana-stt\service.log` +- **Autostart**: Windows Scheduled Task "ManaSTT" (AtLogOn) +- **Backend**: `faster-whisper` mit CTranslate2 (CUDA float16) +- **Default-Modell**: `large-v3-turbo` (~1.6 GB, wird beim ersten Request geladen) + +### mana-tts (Port 3022) + +Text-to-Speech mit mehreren Backends: + +- **Verzeichnis**: `C:\mana\services\mana-tts\` +- **venv**: `C:\mana\venvs\tts\` (PyTorch 2.5.1+cu121) +- **Config**: `C:\mana\services\mana-tts\.env` +- **Log**: `C:\mana\services\mana-tts\service.log` +- **Autostart**: Windows Scheduled Task "ManaTTS" (AtLogOn) +- **Backends**: + - **Kokoro** (82M, CUDA) — Englische Stimmen, ~1s Latenz, 27 Stimmen + - **Edge TTS** (Cloud) — Deutsche Stimmen (Katja, Conrad, etc.), ~2s Latenz + - **Piper** (CPU/ONNX) — Lokale deutsche Stimmen (Thorsten, Kerstin), schnell + - **F5-TTS** (CUDA) — Voice Cloning mit Referenz-Audio + +### mana-image-gen (Port 3023) + +Bildgenerierung mit FLUX.1-schnell (12B Parameter) via HuggingFace diffusers. 
+ +- **Verzeichnis**: `C:\mana\services\mana-image-gen\` +- **venv**: `C:\mana\venvs\image-gen\` (PyTorch 2.5.1+cu121) +- **Config**: `C:\mana\services\mana-image-gen\.env` +- **Log**: `C:\mana\services\mana-image-gen\service.log` +- **Autostart**: Windows Scheduled Task "ManaImageGen" (AtLogOn) +- **Modell**: FLUX.1-schnell (Apache 2.0, 4-bit quantisiert via BitsAndBytes) +- **HuggingFace**: Erfordert Login + Lizenzakzeptanz für gated Model + +### Management-Skripte + +```powershell +# Status aller Services anzeigen +python C:\mana\status.py + +# Alle Services starten (falls nicht via Scheduled Task) +python C:\mana\start-all.py + +# Alle Services stoppen +python C:\mana\stop-all.py + +# Scheduled Tasks manuell starten/stoppen +Start-ScheduledTask -TaskName "ManaLLM" +Start-ScheduledTask -TaskName "ManaSTT" +Start-ScheduledTask -TaskName "ManaTTS" +Start-ScheduledTask -TaskName "ManaImageGen" + +# Alle Scheduled Tasks auf einmal anzeigen +Get-ScheduledTask -TaskName "Mana*" | Format-Table TaskName, State +``` + +### Zugriff: Öffentliche URLs (von überall) + +Die GPU-Services sind über den Cloudflare Tunnel des Mac Mini öffentlich erreichbar. +Auf dem Mac Mini läuft ein TCP-Proxy (`~/gpu-proxy.py` als LaunchAgent), der den Traffic +an den GPU-Server im LAN weiterleitet. 
+ +``` +Internet → Cloudflare → Mac Mini (gpu-proxy.py) → GPU Server (LAN) +``` + +| Service | Öffentliche URL | +|---------|----------------| +| mana-llm | `https://gpu-llm.mana.how` | +| mana-stt | `https://gpu-stt.mana.how` | +| mana-tts | `https://gpu-tts.mana.how` | +| mana-image-gen | `https://gpu-img.mana.how` | +| Ollama | `https://gpu-ollama.mana.how` | + +```bash +# LLM API +curl https://gpu-llm.mana.how/health +curl -X POST https://gpu-llm.mana.how/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"model":"ollama/gemma3:12b","messages":[{"role":"user","content":"Hallo"}]}' + +# STT API (WhisperX mit Word-Timestamps + Speaker Diarization) +curl https://gpu-stt.mana.how/health +curl -X POST https://gpu-stt.mana.how/transcribe \ + -F "file=@recording.wav" -F "language=de" -F "align=true" -F "diarize=true" + +# TTS API +curl https://gpu-tts.mana.how/health +curl -X POST https://gpu-tts.mana.how/synthesize/auto \ + -H "Content-Type: application/json" \ + -d '{"text":"Hallo Welt","voice":"de_katja"}' --output hello.wav + +# Image Generation (FLUX.2 klein 4B) +curl -X POST https://gpu-img.mana.how/generate \ + -H "Content-Type: application/json" \ + -d '{"prompt":"A cat","width":1024,"height":1024}' + +# Ollama direkt +curl https://gpu-ollama.mana.how/api/tags +``` + +### Zugriff: LAN (direkt, schneller) + +Vom Mac Mini oder anderen Geräten im gleichen Netzwerk: + +```bash +curl http://192.168.178.11:3025/health # mana-llm +curl http://192.168.178.11:3020/health # mana-stt +curl http://192.168.178.11:3022/health # mana-tts +curl http://192.168.178.11:3023/health # mana-image-gen +curl http://192.168.178.11:11434/api/tags # Ollama +``` + +### Verzeichnisstruktur + +``` +C:\mana\ +├── services\ +│ ├── mana-llm\ # LLM Gateway Service (Port 3025) +│ │ ├── src\ # Python-Quellcode (FastAPI + Provider Router) +│ │ ├── .env # Konfiguration +│ │ ├── service.pyw # Service Runner +│ │ └── service.log # Log +│ ├── mana-stt\ # Speech-to-Text Service (Port 
3020) +│ │ ├── app\ # Python-Quellcode (faster-whisper CUDA) +│ │ ├── .env +│ │ ├── service.pyw +│ │ └── service.log +│ ├── mana-tts\ # Text-to-Speech Service (Port 3022) +│ │ ├── app\ # Python-Quellcode (Kokoro + Edge TTS + Piper) +│ │ ├── .env +│ │ ├── service.pyw +│ │ └── service.log +│ └── mana-image-gen\ # Bildgenerierung (Port 3023) +│ ├── app\ # Python-Quellcode (diffusers + FLUX) +│ ├── output\ # Generierte Bilder (temporär) +│ ├── .env +│ ├── service.pyw +│ └── service.log +├── venvs\ +│ ├── llm\ # Python venv für mana-llm +│ ├── stt\ # Python venv für mana-stt (faster-whisper) +│ ├── tts\ # Python venv für mana-tts (PyTorch+CUDA) +│ └── image-gen\ # Python venv für mana-image-gen (PyTorch+CUDA+diffusers) +├── models\ # (reserviert für lokale Modelle) +├── start-all.py # Alle Services starten +├── stop-all.py # Alle Services stoppen +├── status.py # Status-Übersicht +├── healthcheck.py # Health Check + Auto-Restart (alle 5 Min) +├── healthcheck.log # Health Check Log +├── log-rotate.py # Log-Rotation (>10MB → .log.1/.2/.3) +├── log-shipper.py # Logs an Loki auf Mac Mini senden +└── log-shipper-state.json # Letzte Leseposition pro Log +``` + +--- + +## Health Check & Auto-Restart + +Ein Health-Check-Skript prüft alle 5 Minuten ob die Services laufen und startet gefallene neu. +Zusätzlich werden Log-Rotation und Log-Shipping (an Loki) ausgeführt. 
+ +- **Skript**: `C:\mana\healthcheck.py` +- **Log**: `C:\mana\healthcheck.log` +- **Scheduled Task**: `ManaHealthCheck` (alle 5 Min) +- **Ausführungsreihenfolge**: Health Check → Log-Rotation → Log-Shipping + +```powershell +# Manuell ausführen +python C:\mana\healthcheck.py + +# Log ansehen +Get-Content C:\mana\healthcheck.log -Tail 20 +``` + +### Log-Rotation + +- **Skript**: `C:\mana\log-rotate.py` +- **Trigger**: Wird vom Health-Check alle 5 Min aufgerufen +- **Schwelle**: Rotiert ab 10 MB +- **Backups**: 3 Kopien (service.log.1, .2, .3) + +### Monitoring & Logs (Loki + Grafana) + +GPU-Service-Logs werden alle 5 Minuten an **Loki** auf dem Mac Mini geschickt und sind über **Grafana** durchsuchbar. + +``` +GPU Server (healthcheck.py → log-shipper.py) + → HTTP POST → Mac Mini (Loki :3100) + → Grafana (grafana.mana.how) +``` + +**Komponenten:** + +| Komponente | Wo | Beschreibung | +|---|---|---| +| `C:\mana\log-shipper.py` | GPU Server | Liest service.log Dateien, pusht neue Zeilen an Loki | +| `C:\mana\log-shipper-state.json` | GPU Server | Merkt sich letzte Leseposition pro Log | +| Loki (Docker) | Mac Mini | Log-Aggregator, 30 Tage Retention | +| VictoriaMetrics | Mac Mini | Scraped `/metrics` und `/health` der GPU-Services alle 15-30s | +| Grafana | Mac Mini | Dashboard unter `grafana.mana.how` | + +**Loki-Queries in Grafana** (Explore → Datasource: Loki): + +``` +{job="mana-stt"} # Alle STT-Logs +{job="mana-llm"} |= "error" # LLM-Fehler +{host="gpu-server"} # Alle GPU-Server-Logs +{job="healthcheck"} |= "DOWN" # Service-Ausfälle +``` + +**Prometheus-Metriken** (VictoriaMetrics scraped über LAN): + +| Target | Job | Port | +|---|---|---| +| GPU LLM | `gpu-llm` | 3025 (`/metrics`) | +| GPU STT | `gpu-stt` | 3020 (`/health`) | +| GPU TTS | `gpu-tts` | 3022 (`/health`) | +| GPU Image Gen | `gpu-image-gen` | 3023 (`/health`) | + +--- + +## TypeScript Client (`@manacore/shared-gpu`) + +Shared Package im Monorepo (`packages/shared-gpu/`) für alle GPU-Services: + 
+```typescript +import { GpuClient, GPU_PUBLIC_URLS } from '@manacore/shared-gpu'; + +// Öffentlich (von überall) +const gpu = new GpuClient({ baseUrl: 'https://gpu.mana.how' }); + +// Oder LAN (direkt, schneller) +const gpuLan = new GpuClient({ baseUrl: 'http://192.168.178.11' }); + +// Speech-to-Text (mit Word-Timestamps + Speaker Diarization) +const transcript = await gpu.stt.transcribe(audioBuffer, 'recording.wav', { + language: 'de', + diarize: true, + maxSpeakers: 3, +}); +// → { text, words: [{ word, start, end, speaker }], speakers: ['SPEAKER_00', ...] } + +// Text-to-Speech (Deutsch: Edge TTS, Englisch: Kokoro) +const { audio } = await gpu.tts.synthesize({ text: 'Hallo Welt', voice: 'de_katja' }); + +// Image Generation (FLUX.2 klein 4B, ~3s @ 1024x1024) +const image = await gpu.image.generate({ prompt: 'A futuristic city', width: 1024, height: 1024 }); +const imageUrl = gpu.image.imageUrl(image.image_url); + +// Health Check aller Services +const health = await gpu.healthCheck(); +// → { stt: true, tts: true, image: true } +``` + +--- + ## Fehlerbehebung -### SSH verbindet nicht +### Server nicht erreichbar (kein Ping, kein SSH) + +**Häufigste Ursache: Ruhezustand/Energiesparen nicht deaktiviert.** + +1. Am PC physisch aufwecken (Taste drücken) +2. Schritt 9 (Server-Modus) erneut durchführen +3. 
Prüfen: + +```powershell +# Aktiven Energiesparplan anzeigen +powercfg /getactivescheme +# Muss "Höchstleistung" zeigen + +# Ruhezustand-Status prüfen +powercfg /availablesleepstates +# "Ruhezustand" sollte NICHT aufgeführt sein + +# Standby-Timeout prüfen (muss 0 sein) +powercfg /query SCHEME_CURRENT SUB_SLEEP STANDBYIDLE +``` + +### SSH verbindet nicht (PC ist aber an) ```powershell # Auf dem Windows-PC prüfen: Get-Service sshd # Muss "Running" zeigen Test-NetConnection -ComputerName localhost -Port 22 # Muss "TcpTestSucceeded: True" zeigen + +# SSH-Dienst neu starten +Restart-Service sshd +``` + +### Services laufen nicht nach Neustart + +Die Services nutzen Scheduled Tasks mit `AtLogOn` — der User muss eingeloggt sein. + +```powershell +# Prüfen ob Tasks registriert sind +Get-ScheduledTask -TaskName "Mana*" | Format-Table TaskName, State + +# Manuell starten +Start-ScheduledTask -TaskName "ManaLLM" +Start-ScheduledTask -TaskName "ManaSTT" +Start-ScheduledTask -TaskName "ManaTTS" +Start-ScheduledTask -TaskName "ManaImageGen" + +# Status prüfen +python C:\mana\status.py +``` + +Falls Tasks als "Ready" statt "Running" angezeigt werden: +- Auto-Login ist nicht konfiguriert → Schritt 9, Punkt 7 +- Oder: manuell am PC einloggen + +### Service-Logs prüfen + +```powershell +# Die letzten 20 Zeilen eines Logs anzeigen +Get-Content C:\mana\services\mana-llm\service.log -Tail 20 +Get-Content C:\mana\services\mana-stt\service.log -Tail 20 +Get-Content C:\mana\services\mana-tts\service.log -Tail 20 +Get-Content C:\mana\services\mana-image-gen\service.log -Tail 20 ``` ### nvidia-smi zeigt Fehler -- Treiber neu installieren +- Treiber neu installieren: https://www.nvidia.com/Download/index.aspx - PC neu starten - Prüfen ob die GPU im Geräte-Manager sichtbar ist +### GPU VRAM voll + +```powershell +# GPU-Auslastung prüfen +nvidia-smi + +# Alle Services stoppen, um VRAM freizugeben +python C:\mana\stop-all.py + +# Oder gezielt einen Service neu starten +Stop-ScheduledTask 
-TaskName "ManaImageGen" +Start-ScheduledTask -TaskName "ManaImageGen" +``` + ### IP-Adresse stimmt nicht ```powershell ipconfig -# → Ethernet-Adapter prüfen, IPv4-Adresse muss die statische sein +# → Ethernet-Adapter prüfen, IPv4-Adresse muss 192.168.178.11 sein +``` + +### Port-Konflikte prüfen + +```powershell +# Alle lauschenden Ports anzeigen (deutsch: "ABHÖREN"; Muster "ABH" trifft auch bei abweichender Umlaut-Kodierung) +netstat -ano | Select-String "ABH" + +# Welcher Prozess nutzt einen bestimmten Port? +netstat -ano | Select-String "3025" +Get-Process -Id <PID> ``` diff --git a/packages/shared-gpu/package.json b/packages/shared-gpu/package.json new file mode 100644 index 000000000..6f8be5be2 --- /dev/null +++ b/packages/shared-gpu/package.json @@ -0,0 +1,28 @@ +{ + "name": "@manacore/shared-gpu", + "version": "1.0.0", + "private": true, + "description": "Client library for Mana GPU services (STT, TTS, Image Generation)", + "main": "dist/index.js", + "types": "dist/index.d.ts", + "exports": { + ".": { + "types": "./dist/index.d.ts", + "import": "./dist/index.js", + "require": "./dist/index.js" + } + }, + "scripts": { + "build": "tsc", + "dev": "tsc --watch", + "clean": "rm -rf dist", + "type-check": "tsc --noEmit" + }, + "devDependencies": { + "@types/node": "^20.0.0", + "typescript": "^5.0.0" + }, + "files": [ + "dist" + ] +} diff --git a/packages/shared-gpu/src/gpu-client.ts b/packages/shared-gpu/src/gpu-client.ts new file mode 100644 index 000000000..1f292642e --- /dev/null +++ b/packages/shared-gpu/src/gpu-client.ts @@ -0,0 +1,56 @@ +import type { GpuServiceConfig } from './types'; +import { SttClient } from './stt-client'; +import { TtsClient } from './tts-client'; +import { ImageClient } from './image-client'; + +/** + * Unified client for all Mana GPU services. 
+ * + * @example Public URLs (from anywhere): + * ```ts + * const gpu = new GpuClient({ baseUrl: 'https://gpu.mana.how' }); + * ``` + * + * @example LAN (direct): + * ```ts + * const gpu = new GpuClient({ baseUrl: 'http://192.168.178.11' }); + * ``` + * + * @example Custom URLs: + * ```ts + * const gpu = new GpuClient({ + * baseUrl: '', + * urls: { stt: 'https://gpu-stt.mana.how', tts: 'https://gpu-tts.mana.how' }, + * }); + * ``` + */ +export class GpuClient { + public readonly stt: SttClient; + public readonly tts: TtsClient; + public readonly image: ImageClient; + + constructor(config: GpuServiceConfig) { + this.stt = new SttClient(config); + this.tts = new TtsClient(config); + this.image = new ImageClient(config); + } + + /** Check health of all GPU services. */ + async healthCheck(): Promise<{ + stt: boolean; + tts: boolean; + image: boolean; + }> { + const [sttHealth, ttsHealth, imageHealth] = await Promise.allSettled([ + this.stt.health(), + this.tts.health(), + this.image.health(), + ]); + + return { + stt: sttHealth.status === 'fulfilled' && sttHealth.value.status === 'healthy', + tts: ttsHealth.status === 'fulfilled' && ttsHealth.value.status === 'healthy', + image: imageHealth.status === 'fulfilled' && imageHealth.value.status === 'healthy', + }; + } +} diff --git a/packages/shared-gpu/src/image-client.ts b/packages/shared-gpu/src/image-client.ts new file mode 100644 index 000000000..fbb5953f3 --- /dev/null +++ b/packages/shared-gpu/src/image-client.ts @@ -0,0 +1,72 @@ +import type { + GenerateImageOptions, + GenerateImageResult, + ImageGenHealthResponse, + GpuServiceConfig, +} from './types'; +import { resolveServiceUrl } from './resolve-url'; + +export class ImageClient { + private baseUrl: string; + private timeout: number; + + constructor(config: GpuServiceConfig) { + this.baseUrl = resolveServiceUrl(config, 'image'); + this.timeout = config.timeout ?? 120_000; + } + + /** Generate an image from a text prompt. 
*/ + async generate(options: GenerateImageOptions): Promise { + const controller = new AbortController(); + const timer = setTimeout(() => controller.abort(), this.timeout); + + try { + const response = await fetch(`${this.baseUrl}/generate`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + prompt: options.prompt, + width: options.width ?? 1024, + height: options.height ?? 1024, + steps: options.steps ?? 4, + seed: options.seed, + output_format: options.outputFormat ?? 'png', + }), + signal: controller.signal, + }); + + if (!response.ok) { + const error = await response.json().catch(() => ({ detail: response.statusText })); + throw new Error( + `Image generation error ${response.status}: ${(error as { detail: string }).detail}` + ); + } + + return (await response.json()) as GenerateImageResult; + } finally { + clearTimeout(timer); + } + } + + /** Get the full URL for a generated image. */ + imageUrl(relativePath: string): string { + return `${this.baseUrl}${relativePath}`; + } + + /** Download a generated image as ArrayBuffer. */ + async downloadImage(relativePath: string): Promise { + const response = await fetch(this.imageUrl(relativePath), { + signal: AbortSignal.timeout(30_000), + }); + if (!response.ok) throw new Error(`Failed to download image: ${response.status}`); + return response.arrayBuffer(); + } + + /** Check if the image generation service is healthy. 
*/ + async health(): Promise { + const response = await fetch(`${this.baseUrl}/health`, { + signal: AbortSignal.timeout(5000), + }); + return (await response.json()) as ImageGenHealthResponse; + } +} diff --git a/packages/shared-gpu/src/index.ts b/packages/shared-gpu/src/index.ts new file mode 100644 index 000000000..c1e7514ca --- /dev/null +++ b/packages/shared-gpu/src/index.ts @@ -0,0 +1,24 @@ +export { GpuClient } from './gpu-client'; +export { SttClient } from './stt-client'; +export { TtsClient } from './tts-client'; +export { ImageClient } from './image-client'; +export { resolveServiceUrl } from './resolve-url'; +export { GPU_PUBLIC_URLS, GPU_LAN_URLS } from './types'; +export type { + // Config + GpuServiceConfig, + // STT + TranscriptionResult, + TranscribeOptions, + WordTimestamp, + Segment, + // TTS + SynthesizeOptions, + TTSVoice, + TTSVoiceType, + TTSHealthResponse, + // Image + GenerateImageOptions, + GenerateImageResult, + ImageGenHealthResponse, +} from './types'; diff --git a/packages/shared-gpu/src/resolve-url.ts b/packages/shared-gpu/src/resolve-url.ts new file mode 100644 index 000000000..a9205b69f --- /dev/null +++ b/packages/shared-gpu/src/resolve-url.ts @@ -0,0 +1,31 @@ +import type { GpuServiceConfig } from './types'; +import { GPU_PUBLIC_URLS } from './types'; + +type ServiceKey = 'llm' | 'stt' | 'tts' | 'image' | 'ollama'; + +const LAN_PORTS: Record = { + llm: 3025, + stt: 3020, + tts: 3022, + image: 3023, + ollama: 11434, +}; + +/** Resolve the URL for a specific GPU service based on config. */ +export function resolveServiceUrl(config: GpuServiceConfig, service: ServiceKey): string { + // 1. Explicit override + if (config.urls?.[service]) { + // eslint-disable-next-line @typescript-eslint/no-non-null-assertion + return config.urls[service]!; + } + + const base = config.baseUrl; + + // 2. Public mode: "https://gpu.mana.how" → "https://gpu-stt.mana.how" + if (base.includes('gpu.mana.how')) { + return GPU_PUBLIC_URLS[service]; + } + + // 3. 
// ============================================================================
// File: packages/shared-gpu/src/stt-client.ts
// ============================================================================

import type { TranscriptionResult, TranscribeOptions, GpuServiceConfig } from './types';
import { resolveServiceUrl } from './resolve-url';

/** HTTP client for the speech-to-text service. */
export class SttClient {
  private baseUrl: string;
  private timeout: number;

  /**
   * @param config Shared GPU configuration. The service URL comes from
   *   `resolveServiceUrl(config, 'stt')`; `config.timeout` (ms) falls back to
   *   60 000 when omitted.
   */
  constructor(config: GpuServiceConfig) {
    this.baseUrl = resolveServiceUrl(config, 'stt');
    this.timeout = config.timeout ?? 60_000;
  }

  /**
   * Transcribe audio with optional word timestamps and speaker diarization.
   *
   * Sends a multipart `POST {baseUrl}/transcribe`. `align` defaults to `true`
   * and `diarize` to `false`; `language`, `model`, `min_speakers` and
   * `max_speakers` are only sent when provided.
   *
   * @param audioBuffer Audio payload; a `Buffer` is copied into a `Blob` first.
   * @param filename    File name attached to the multipart `file` field.
   * @param options     Optional transcription settings.
   * @returns The parsed JSON response body.
   * @throws Error `STT error <status>: <detail>` on a non-2xx response; the
   *   fetch rejects with an abort error once the configured timeout elapses.
   */
  async transcribe(
    audioBuffer: Buffer | Blob,
    filename: string,
    options: TranscribeOptions = {}
  ): Promise<TranscriptionResult> {
    const formData = new FormData();
    const blob =
      audioBuffer instanceof Blob ? audioBuffer : new Blob([new Uint8Array(audioBuffer)]);
    formData.append('file', blob, filename);

    if (options.language) formData.append('language', options.language);
    if (options.model) formData.append('model', options.model);
    formData.append('align', String(options.align ?? true));
    formData.append('diarize', String(options.diarize ?? false));
    if (options.minSpeakers != null) formData.append('min_speakers', String(options.minSpeakers));
    if (options.maxSpeakers != null) formData.append('max_speakers', String(options.maxSpeakers));

    // Manual AbortController so the timer can be cleared as soon as the request settles.
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), this.timeout);

    try {
      const response = await fetch(`${this.baseUrl}/transcribe`, {
        method: 'POST',
        body: formData,
        signal: controller.signal,
      });

      if (!response.ok) {
        // Error bodies are expected to be JSON with a `detail` field; fall back to the status text.
        const error = await response.json().catch(() => ({ detail: response.statusText }));
        throw new Error(`STT error ${response.status}: ${(error as { detail: string }).detail}`);
      }

      return (await response.json()) as TranscriptionResult;
    } finally {
      clearTimeout(timer);
    }
  }

  /** Check if the STT service is healthy (`GET {baseUrl}/health`, 5 s timeout). */
  async health(): Promise<{ status: string; whisperx: boolean }> {
    const response = await fetch(`${this.baseUrl}/health`, {
      signal: AbortSignal.timeout(5000),
    });
    return (await response.json()) as { status: string; whisperx: boolean };
  }
}

// ============================================================================
// File: packages/shared-gpu/src/tts-client.ts
// ============================================================================

import type { SynthesizeOptions, TTSVoice, TTSHealthResponse, GpuServiceConfig } from './types';
import { resolveServiceUrl } from './resolve-url';

/** HTTP client for the text-to-speech service. */
export class TtsClient {
  private baseUrl: string;
  private timeout: number;

  /**
   * @param config Shared GPU configuration. The service URL comes from
   *   `resolveServiceUrl(config, 'tts')`; `config.timeout` (ms) falls back to
   *   30 000 when omitted.
   */
  constructor(config: GpuServiceConfig) {
    this.baseUrl = resolveServiceUrl(config, 'tts');
    this.timeout = config.timeout ?? 30_000;
  }

  /**
   * Synthesize speech. Returns audio as ArrayBuffer.
   *
   * Sends a JSON `POST {baseUrl}/synthesize/auto`. `speed` defaults to `1.0`
   * and `output_format` to `'wav'`.
   *
   * @returns The audio bytes plus metadata read from the response headers:
   *   `content-type` (fallback `'audio/wav'`), `x-voice` (fallback: the
   *   requested voice, then `'default'`) and `x-duration` (fallback `0`).
   * @throws Error `TTS error <status>: <detail>` on a non-2xx response; the
   *   fetch rejects with an abort error once the configured timeout elapses.
   */
  async synthesize(options: SynthesizeOptions): Promise<{
    audio: ArrayBuffer;
    contentType: string;
    voice: string;
    duration: number;
  }> {
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), this.timeout);

    try {
      const response = await fetch(`${this.baseUrl}/synthesize/auto`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          text: options.text,
          voice: options.voice,
          speed: options.speed ?? 1.0,
          output_format: options.outputFormat ?? 'wav',
        }),
        signal: controller.signal,
      });

      if (!response.ok) {
        const error = await response.json().catch(() => ({ detail: response.statusText }));
        throw new Error(`TTS error ${response.status}: ${(error as { detail: string }).detail}`);
      }

      return {
        audio: await response.arrayBuffer(),
        contentType: response.headers.get('content-type') ?? 'audio/wav',
        voice: response.headers.get('x-voice') ?? options.voice ?? 'default',
        duration: parseFloat(response.headers.get('x-duration') ?? '0'),
      };
    } finally {
      clearTimeout(timer);
    }
  }

  /** Get available voices (`GET {baseUrl}/voices`, 5 s timeout). */
  async voices(): Promise<{ kokoro_voices: TTSVoice[]; custom_voices: TTSVoice[] }> {
    const response = await fetch(`${this.baseUrl}/voices`, {
      signal: AbortSignal.timeout(5000),
    });
    return (await response.json()) as { kokoro_voices: TTSVoice[]; custom_voices: TTSVoice[] };
  }

  /** Check if the TTS service is healthy (`GET {baseUrl}/health`, 5 s timeout). */
  async health(): Promise<TTSHealthResponse> {
    const response = await fetch(`${this.baseUrl}/health`, {
      signal: AbortSignal.timeout(5000),
    });
    return (await response.json()) as TTSHealthResponse;
  }
}

// ============================================================================
// File: packages/shared-gpu/src/types.ts (STT and TTS types)
// ============================================================================

// ============================================================================
// STT Types
// ============================================================================

/** One recognised word with its timing and, when present, score and speaker label. */
export interface WordTimestamp {
  word: string;
  start: number;
  end: number;
  score?: number;
  speaker?: string;
}

/** A contiguous span of transcribed text, optionally attributed to a speaker. */
export interface Segment {
  start: number;
  end: number;
  text: string;
  speaker?: string;
}

/** Response body of the STT `/transcribe` endpoint. */
export interface TranscriptionResult {
  text: string;
  language?: string;
  model: string;
  latency_ms?: number;
  duration_seconds?: number;
  words?: WordTimestamp[];
  segments?: Segment[];
  speakers?: string[];
}

/** Optional settings accepted by `SttClient.transcribe`. */
export interface TranscribeOptions {
  language?: string;
  model?: string;
  /** Enable word-level timestamp alignment (default: true) */
  align?: boolean;
  /** Enable speaker diarization (default: false) */
  diarize?: boolean;
  minSpeakers?: number;
  maxSpeakers?: number;
}

// ============================================================================
// TTS Types
// ============================================================================

/** Request options accepted by `TtsClient.synthesize`. */
export interface SynthesizeOptions {
  text: string;
  voice?: string;
  speed?: number;
  outputFormat?: 'wav' | 'mp3';
}

export type TTSVoiceType = 'kokoro' | 'piper' | 'edge' | 'f5_custom';

/** A voice entry as listed by the TTS `/voices` endpoint. */
export interface TTSVoice {
  id: string;
  name: string;
  description: string;
  type: TTSVoiceType;
}

/** Response body of the TTS `/health` endpoint. */
export interface TTSHealthResponse {
  status: string;
  service: string;
  // NOTE(review): the type arguments were missing in this copy of the file;
  // <string, boolean> is assumed from the field name - confirm against the
  // actual /health payload.
  models_loaded: Record<string, boolean>;
  auth_required: boolean;
}
// ============================================================================
// Image Generation Types
// ============================================================================

/** Request options for image generation; only `prompt` is required. */
export interface GenerateImageOptions {
  prompt: string;
  width?: number;
  height?: number;
  steps?: number;
  seed?: number;
  outputFormat?: 'png' | 'jpg';
}

/** Result payload of an image generation request. */
export interface GenerateImageResult {
  success: boolean;
  image_url: string;
  prompt: string;
  width: number;
  height: number;
  steps: number;
  seed: number;
  generation_time: number;
}

/** Health payload of the image generation service. */
export interface ImageGenHealthResponse {
  status: string;
  service: string;
  flux_available: boolean;
}

// ============================================================================
// GPU Service Config
// ============================================================================

/** Connection settings shared by the GPU service clients. */
export interface GpuServiceConfig {
  /**
   * Base URL of the GPU server.
   *
   * LAN mode (single host, different ports):
   *   `http://192.168.178.11` → :3025 (llm), :3020 (stt), :3022 (tts), :3023 (image)
   *
   * Public mode (different hostnames):
   *   `https://gpu.mana.how` → gpu-llm.mana.how, gpu-stt.mana.how, etc.
   */
  baseUrl: string;
  /** Override individual service URLs (takes precedence over baseUrl) */
  urls?: {
    llm?: string;
    stt?: string;
    tts?: string;
    image?: string;
    ollama?: string;
  };
  /**
   * Request timeout in ms. When omitted, each client applies its own default:
   * SttClient uses 60000 and TtsClient uses 30000.
   */
  timeout?: number;
}

/** Default public URLs */
export const GPU_PUBLIC_URLS = {
  llm: 'https://gpu-llm.mana.how',
  stt: 'https://gpu-stt.mana.how',
  tts: 'https://gpu-tts.mana.how',
  image: 'https://gpu-img.mana.how',
  ollama: 'https://gpu-ollama.mana.how',
} as const;

/** Default LAN URLs */
export const GPU_LAN_URLS = {
  llm: 'http://192.168.178.11:3025',
  stt: 'http://192.168.178.11:3020',
  tts: 'http://192.168.178.11:3022',
  image: 'http://192.168.178.11:3023',
  ollama: 'http://192.168.178.11:11434',
} as const;
"""
Whisper STT Service using faster-whisper (CUDA)
Optimized for NVIDIA GPUs (RTX 3090 etc.)

Drop-in replacement for whisper_service.py (MLX version).
Uses faster-whisper with CTranslate2 for GPU-accelerated inference.
"""

import logging
import os
import tempfile
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

logger = logging.getLogger(__name__)

# faster-whisper is imported lazily (inside get_whisper_model) so importing
# this module does not fail when the package is not installed.
_whisper_model = None
# Name of the model currently held in _whisper_model (None when unknown/unset).
_whisper_model_name: Optional[str] = None


@dataclass
class TranscriptionResult:
    """Outcome of a single transcription."""

    # Full transcript: segment texts joined by single spaces.
    text: str
    # Language reported by the model, or the requested one if no info is returned.
    language: Optional[str] = None
    # Audio duration as reported by faster-whisper's transcription info.
    duration: Optional[float] = None
    # One dict per segment with the keys "start", "end" and "text".
    segments: Optional[list] = None


def get_whisper_model(model_name: str = "large-v3", **kwargs):
    """Return the shared faster-whisper model, loading it when necessary.

    A single model instance is kept at module level. If ``model_name``
    differs from the model that is currently loaded, the cached reference is
    dropped and the requested model is loaded instead, so callers always get
    the model they asked for (previously the first loaded model was returned
    regardless of ``model_name``).

    Device and precision come from the ``WHISPER_DEVICE`` (default ``cuda``)
    and ``WHISPER_COMPUTE_TYPE`` (default ``float16``) environment variables.

    Args:
        model_name: Model identifier passed to ``WhisperModel``.
        **kwargs: Accepted for call compatibility; currently unused.

    Returns:
        The loaded ``faster_whisper.WhisperModel`` instance.

    Raises:
        RuntimeError: If faster-whisper cannot be imported.
        Exception: Any other error raised while loading the model is logged
            and re-raised unchanged.
    """
    global _whisper_model, _whisper_model_name

    # A cached model whose name is unknown (e.g. assigned from outside this
    # function) is kept as-is, matching the old "return whatever is cached"
    # behaviour.
    if _whisper_model is not None and _whisper_model_name in (None, model_name):
        return _whisper_model

    if _whisper_model is not None:
        logger.info(
            "Switching Whisper model: %s -> %s", _whisper_model_name, model_name
        )
        # Drop our reference before loading so both models are not held by
        # this module at the same time.
        _whisper_model = None
        _whisper_model_name = None

    logger.info("Loading Whisper model: %s", model_name)
    try:
        from faster_whisper import WhisperModel

        # Use CUDA with float16 for RTX 3090
        compute_type = os.getenv("WHISPER_COMPUTE_TYPE", "float16")
        device = os.getenv("WHISPER_DEVICE", "cuda")

        loaded_model = WhisperModel(
            model_name,
            device=device,
            compute_type=compute_type,
        )
    except ImportError as e:
        logger.error("Failed to import faster_whisper: %s", e)
        raise RuntimeError(
            "faster-whisper not installed. "
            "Run: pip install faster-whisper"
        ) from e
    except Exception as e:
        logger.error("Failed to load Whisper model: %s", e)
        raise

    _whisper_model = loaded_model
    _whisper_model_name = model_name
    logger.info(
        "Whisper model loaded: %s on %s (%s)", model_name, device, compute_type
    )
    return _whisper_model


def transcribe_audio(
    audio_path: str,
    language: Optional[str] = None,
    model_name: str = "large-v3",
) -> TranscriptionResult:
    """
    Transcribe audio file using faster-whisper (CUDA).

    Args:
        audio_path: Path to audio file (mp3, wav, m4a, etc.)
        language: Optional language code (e.g., 'de', 'en'). Auto-detect if None.
        model_name: Whisper model to use

    Returns:
        TranscriptionResult with text and metadata

    Raises:
        RuntimeError: If faster-whisper is not installed.
        Exception: Any error raised by the model is logged and re-raised.
    """
    model = get_whisper_model(model_name)

    logger.info("Transcribing: %s", audio_path)

    try:
        segments, info = model.transcribe(
            audio_path,
            language=language,
            beam_size=5,
            vad_filter=True,  # Filter out silence
        )

        # `segments` is consumed inside the try block so that errors raised
        # while iterating it are logged like any other transcription failure.
        all_segments = [
            {"start": segment.start, "end": segment.end, "text": segment.text}
            for segment in segments
        ]

        # Segment texts usually carry their own leading whitespace; strip each
        # piece before joining so the transcript has single spaces only.
        pieces = (entry["text"].strip() for entry in all_segments)
        text = " ".join(piece for piece in pieces if piece)
        detected_language = info.language if info else language

        logger.info(
            "Transcription complete: %d characters, language=%s",
            len(text),
            detected_language,
        )

        return TranscriptionResult(
            text=text,
            language=detected_language,
            duration=info.duration if info else None,
            segments=all_segments,
        )

    except Exception as e:
        logger.error("Transcription failed: %s", e)
        raise


async def transcribe_audio_bytes(
    audio_bytes: bytes,
    filename: str,
    language: Optional[str] = None,
    model_name: str = "large-v3",
) -> TranscriptionResult:
    """
    Transcribe audio from bytes (for API uploads).

    The bytes are written to a temporary file (keeping the original file
    extension, ``.wav`` when there is none), transcribed, and the temporary
    file is removed afterwards - also when writing or transcription fails.

    Args:
        audio_bytes: Raw audio file bytes
        filename: Original filename (for extension detection)
        language: Optional language code
        model_name: Whisper model to use

    Returns:
        TranscriptionResult
    """
    # NOTE(review): transcribe_audio is synchronous, so this coroutine blocks
    # the event loop for the whole inference. Left as-is because offloading
    # would change how concurrent requests are serialised on the GPU.

    # Tolerate uploads without a filename.
    ext = Path(filename or "").suffix or ".wav"

    tmp_path = None
    try:
        # delete=False: the file is closed before transcription so it can be
        # reopened by name (an open NamedTemporaryFile cannot be reopened on
        # Windows); it is removed manually in the finally block.
        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
            tmp_path = tmp.name
            tmp.write(audio_bytes)

        return transcribe_audio(
            audio_path=tmp_path,
            language=language,
            model_name=model_name,
        )
    finally:
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError as e:
                # Best-effort cleanup: never mask the transcription outcome.
                logger.warning("Could not remove temp file %s: %s", tmp_path, e)


# Available models for faster-whisper
AVAILABLE_MODELS = [
    "tiny",
    "tiny.en",
    "base",
    "base.en",
    "small",
    "small.en",
    "medium",
    "medium.en",
    "large-v1",
    "large-v2",
    "large-v3",
    "large-v3-turbo",
    "distil-large-v2",
    "distil-large-v3",
]