/**
 * Mana-server backend — calls services/mana-llm with an Ollama model
 * string. mana-llm's ProviderRouter recognizes plain Ollama model names
 * (no provider prefix) and routes them to its configured Ollama
 * instance, with automatic Google Gemini fallback if Ollama is
 * overloaded.
 *
 * Where the inference actually runs (subtle, easy to misread):
 *
 *   mana-llm container's `OLLAMA_URL` points at
 *   `host.docker.internal:13434`. That is NOT the Mac Mini's local
 *   Ollama — it's a Python TCP forwarder (`~/gpu-proxy.py`, running
 *   as a LaunchAgent on the Mac Mini host) that pipes the traffic to
 *   `192.168.178.11:11434` over the LAN, where Ollama is running on
 *   the Windows GPU server with the RTX 3090 (24 GB VRAM). All
 *   inference happens there, not on the Mac Mini's M4 Metal GPU.
 *
 *   See docs/MAC_MINI_SERVER.md and docs/WINDOWS_GPU_SERVER_SETUP.md
 *   (specifically the "Auf dem Mac Mini läuft ein TCP-Proxy" section)
 *   for the full topology. The Mac Mini's brew-installed Ollama
 *   binary is NOT on the inference path — it's just a local CLI for
 *   inspecting the proxied daemon.
 *
 * The default model is gemma4:e4b — Google's Gemma 4 "Effective 4B"
 * variant, released 2026-04-02. Same family as @mana/local-llm's
 * browser tier model (Gemma 4 E2B is the smaller sibling) so prompts
 * behave consistently when a task auto-falls between tiers. e4b is
 * the right Mana-Server default because:
 *   - 9.6 GB on disk fits comfortably on the 3090's 24 GB VRAM
 *   - 128K context window covers all current title/summarize tasks
 *   - The "Effective 4B" architecture punches well above its weight
 *     class (better than gemma3:4b on most German prompts)
 *   - It's a reasoning model — uses message.reasoning for
 *     chain-of-thought when given enough max_tokens budget; remote.ts
 *     has a fallback parser for that field
 *   - The tier name we surface in the source label stays "Gemma 4"
 *     family for both browser and mana-server, so the UX is coherent
 */

import type { GenerateResult, LlmBackend, LlmTaskRequest } from '../types';
import { callManaLlmStreaming, resolveLlmBaseUrl } from './remote';

/** Construction options for {@link ManaServerBackend}. */
export interface ManaServerBackendOptions {
  /** Ollama model name to send to mana-llm. Default 'gemma4:e4b'. */
  defaultModel?: string;
}

/**
 * `LlmBackend` implementation for the 'mana-server' tier. Holds only the
 * model name; every request is delegated to `callManaLlmStreaming`.
 */
export class ManaServerBackend implements LlmBackend {
  /** Tier identifier passed along with every request. */
  readonly tier = 'mana-server' as const;

  /** Model name sent with every request; fixed at construction time. */
  private readonly defaultModel: string;

  /**
   * @param opts Optional overrides. When `defaultModel` is `null` or
   *   `undefined`, 'gemma4:e4b' is used.
   */
  constructor(opts: ManaServerBackendOptions = {}) {
    this.defaultModel = opts.defaultModel ?? 'gemma4:e4b';
  }

  /**
   * Reports whether a mana-llm base URL is configured.
   *
   * @returns `true` when `resolveLlmBaseUrl()` yields a non-empty value.
   */
  isAvailable(): boolean {
    // Available if we have a base URL configured at all. We don't
    // ping /health here — that adds latency to every isAvailable()
    // check. The first real call will fail loudly if mana-llm is down.
    return resolveLlmBaseUrl().length > 0;
  }

  /**
   * Reports readiness, which for this backend is the same as availability.
   *
   * @returns The result of {@link ManaServerBackend.isAvailable}.
   */
  isReady(): boolean {
    // Stateless from our side — assume ready if available.
    return this.isAvailable();
  }

  /**
   * Runs a task by forwarding it to `callManaLlmStreaming` together with
   * this backend's tier and model name.
   *
   * @param req The task request, passed through unchanged.
   * @returns The promise produced by `callManaLlmStreaming`; any rejection
   *   from that call propagates to the caller.
   */
  async generate(req: LlmTaskRequest): Promise<GenerateResult> {
    return callManaLlmStreaming(this.tier, this.defaultModel, req);
  }
}