diff --git a/apps/mana/apps/web/src/lib/data/ai/missions/runner.ts b/apps/mana/apps/web/src/lib/data/ai/missions/runner.ts index 9dce42673..ff0c17dbc 100644 --- a/apps/mana/apps/web/src/lib/data/ai/missions/runner.ts +++ b/apps/mana/apps/web/src/lib/data/ai/missions/runner.ts @@ -51,6 +51,11 @@ const RESEARCH_TRIGGER = /\b(recherchier|research|news|finde|suche|aktuelle|neue * 5 is generous for read-act-refine patterns ("list_notes → tag them") * without running the LLM bill dry on stuck missions. */ const MAX_REASONING_LOOP_ITERATIONS = 5; + +/** Min interval between Dexie phaseDetail writes during streaming. + * 50 tokens/s × 500ms = ~25 tokens between writes — frequent enough + * for the UI to feel live, infrequent enough to avoid Dexie thrashing. */ +const STREAMING_PHASE_THROTTLE_MS = 500; /** Singleton row id of the kontext doc — kept in sync with * `modules/kontext/types.ts` (KONTEXT_SINGLETON_ID). */ const KONTEXT_SINGLETON_ID = 'singleton'; @@ -274,8 +279,32 @@ export async function runMission( : `Planner Runde ${loopIndex + 1}/${MAX_REASONING_LOOP_ITERATIONS}` ); let plan: AiPlanOutput; + + // Streaming: show live token progress while waiting for the + // planner response. Throttled to avoid Dexie write floods. + let streamTokenCount = 0; + let lastStreamWrite = 0; + const roundLabel = loopIndex === 0 ? '' : ` (Runde ${loopIndex + 1})`; + const onToken = (_delta: string) => { + streamTokenCount++; + const now = Date.now(); + if (now - lastStreamWrite < STREAMING_PHASE_THROTTLE_MS) return; + lastStreamWrite = now; + void setIterationPhase( + mission!.id, + iterationId, + 'calling-llm', + `empfange Plan${roundLabel}… ${streamTokenCount} tokens` + ); + }; + try { - plan = await deps.plan({ mission: mission!, resolvedInputs: loopInputs, availableTools }); + plan = await deps.plan({ + mission: mission!, + resolvedInputs: loopInputs, + availableTools, + onToken, + }); } catch (err) { if (isAiDebugEnabled()) { void recordAiDebug({ diff --git a/apps/mana/apps/web/src/lib/llm-tasks/ai-plan.ts b/apps/mana/apps/web/src/lib/llm-tasks/ai-plan.ts index ee7104fbc..edfb7b135 100644 --- a/apps/mana/apps/web/src/lib/llm-tasks/ai-plan.ts +++ b/apps/mana/apps/web/src/lib/llm-tasks/ai-plan.ts @@ -51,6 +51,7 @@ export const aiPlanTask: LlmTask = { // (e.g. 10 notes → 10 add_tag_to_note calls). 4096 fits ~15-20 // step objects while still fast on browser tier. maxTokens: 4096, + onToken: input.onToken, }); // Always populate debug payload (cheap — strings already in memory). diff --git a/apps/mana/apps/web/src/lib/modules/credits/ListView.svelte b/apps/mana/apps/web/src/lib/modules/credits/ListView.svelte index fc075be83..89fba5632 100644 --- a/apps/mana/apps/web/src/lib/modules/credits/ListView.svelte +++ b/apps/mana/apps/web/src/lib/modules/credits/ListView.svelte @@ -27,6 +27,7 @@ type CreditOperationType, } from '@mana/credits'; import { ManaEvents } from '@mana/shared-utils/analytics'; + import { authStore } from '$lib/stores/auth.svelte'; let balance = $state(null); let transactions = $state([]); @@ -135,21 +136,35 @@ async function loadData() { loading = true; error = null; - try { - const [balanceData, transactionsData, packagesData] = await Promise.all([ - creditsService.getBalance(), - creditsService.getTransactions(20), - creditsService.getPackages(), - ]); - balance = balanceData; - transactions = transactionsData; - packages = packagesData.filter((p) => p.active).sort((a, b) => a.sortOrder - b.sortOrder); - } catch (e) { - error = e instanceof Error ? 
e.message : $_('common.error_loading'); - console.error('Failed to load credits data:', e); - } finally { + + // API calls require auth — skip for guests, still show costs tab (static data) + if (!authStore.isAuthenticated) { loading = false; + return; } + + // Load each independently so a single 401/failure doesn't blank the whole page + const results = await Promise.allSettled([ + creditsService.getBalance(), + creditsService.getTransactions(20), + creditsService.getPackages(), + ]); + + if (results[0].status === 'fulfilled') balance = results[0].value; + if (results[1].status === 'fulfilled') transactions = results[1].value; + if (results[2].status === 'fulfilled') { + packages = results[2].value.filter((p) => p.active).sort((a, b) => a.sortOrder - b.sortOrder); + } + + // Only show error if ALL three failed + const allFailed = results.every((r) => r.status === 'rejected'); + if (allFailed) { + const firstErr = results.find((r) => r.status === 'rejected') as PromiseRejectedResult; + error = + firstErr.reason instanceof Error ? firstErr.reason.message : $_('common.error_loading'); + } + + loading = false; } // ── Helpers ──────────────────────────────────────────── diff --git a/apps/mana/apps/web/src/lib/modules/playground/llm.ts b/apps/mana/apps/web/src/lib/modules/playground/llm.ts index f6df7558f..7c6b2b1a9 100644 --- a/apps/mana/apps/web/src/lib/modules/playground/llm.ts +++ b/apps/mana/apps/web/src/lib/modules/playground/llm.ts @@ -2,16 +2,12 @@ * Playground LLM client — thin wrapper around mana-llm's OpenAI-compatible * `/v1/chat/completions` (streaming) and `/v1/models` endpoints. * - * Lives next to the playground UI rather than in a shared package because - * the playground is the only consumer right now. If chat / todo enrichment - * / period insights end up calling the same surface in the future, lift - * this into `$lib/data/llm-client.ts`. - * - * The chunk parser is hand-rolled rather than pulled from a library: the - * SSE wire format from mana-llm is straight OpenAI (`data: {…}\n\n` lines - * with a sentinel `[DONE]`), so a 30-line reader is simpler than a dep. + * The SSE chunk parser lives in `@mana/shared-llm/sse-parser` and is shared + * with the LLM orchestrator's remote backend (backends/remote.ts). */ +import { consumeSSEStream } from '@mana/shared-llm/sse-parser'; + const DEFAULT_LLM_URL = 'http://localhost:3025'; /** Resolve the mana-llm base URL from the window-injected env, falling @@ -91,49 +87,42 @@ export async function* streamCompletion(opts: CompletionOptions): AsyncGenerator throw new Error(`mana-llm: ${res.status} ${res.statusText}${text ? ` — ${text}` : ''}`); } - const reader = res.body.getReader(); - const decoder = new TextDecoder(); - let buffer = ''; + // Collect chunks via the shared SSE parser, then yield them. + // We use a queue pattern so the async generator can yield chunks as + // they arrive from the callback-based consumeSSEStream. 
+  const chunks: StreamChunk[] = [];
+  let resolve: (() => void) | null = null;
+  let done = false;
-  while (true) {
-    const { value, done } = await reader.read();
-    if (done) break;
-    buffer += decoder.decode(value, { stream: true });
+  const streamPromise = consumeSSEStream(
+    res.body,
+    (content) => {
+      chunks.push({ type: 'delta', content });
+      resolve?.();
+    },
+    (usage) => {
+      chunks.push({
+        type: 'usage',
+        promptTokens: usage.promptTokens,
+        completionTokens: usage.completionTokens,
+      });
+      resolve?.();
+    }
+  ).finally(() => {
+    // Settle on success AND failure; otherwise a stream error would
+    // leave `done` false and the yield loop below waiting forever.
+    done = true;
+    resolve?.();
+  });
-    // SSE frames are separated by blank lines. Process complete frames
-    // and leave any partial trailing frame in the buffer for the next
-    // chunk.
-    let sep: number;
-    while ((sep = buffer.indexOf('\n\n')) !== -1) {
-      const frame = buffer.slice(0, sep);
-      buffer = buffer.slice(sep + 2);
-
-      for (const line of frame.split('\n')) {
-        if (!line.startsWith('data:')) continue;
-        const data = line.slice(5).trim();
-        if (!data || data === '[DONE]') continue;
-        try {
-          const json = JSON.parse(data) as {
-            choices?: Array<{ delta?: { content?: string } }>;
-            usage?: { prompt_tokens?: number; completion_tokens?: number };
-          };
-          const delta = json.choices?.[0]?.delta?.content;
-          if (delta) yield { type: 'delta', content: delta };
-
-          // Usage stats — typically in the final chunk
-          if (json.usage?.prompt_tokens != null) {
-            yield {
-              type: 'usage',
-              promptTokens: json.usage.prompt_tokens,
-              completionTokens: json.usage.completion_tokens ?? 0,
-            };
-          }
-        } catch {
-          // Malformed frame — skip silently. mana-llm occasionally
-          // emits keepalive comments and we don't want them to
-          // crash the stream.
-        }
-      }
+  while (!done || chunks.length > 0) {
+    if (chunks.length > 0) {
+      yield chunks.shift()!;
+    } else {
+      await new Promise<void>((r) => {
+        resolve = () => r();
+      });
     }
   }
+
+  // Ensure the stream promise settles (propagate any errors).
+  await streamPromise;
 }
diff --git a/docs/reports/ai-agent-architecture-comparison.md b/docs/reports/ai-agent-architecture-comparison.md
new file mode 100644
index 000000000..677a5f313
--- /dev/null
+++ b/docs/reports/ai-agent-architecture-comparison.md
@@ -0,0 +1,416 @@
+# AI Agent Architecture: Mana vs. Industry Frameworks
+
+**As of:** 2026-04-16
+**Purpose:** Compare the Mana AI workbench architecture with Google A2A, MCP, the OpenAI Agents SDK, LangGraph, CrewAI, and the Microsoft Agent Framework. Identify strengths, highlight room for improvement.
+
+---
+
+## 1. Summary of the Mana architecture
+
+### What we have
+
+Mana implements a **dual-runtime AI agent system**:
+
+| Component | Description |
+|------------|-------------|
+| **Foreground Runner** (browser) | `runner.ts` — reasoning loop with up to 5 planner iterations, direct Dexie writes, E2E-encrypted |
+| **Background Runner** (mana-ai, port 3067) | `tick.ts` — 60s cron tick, scans due missions, plans via mana-llm, writes back through sync_changes |
+| **Planner** | Shared prompt template (`@mana/shared-ai/src/planner/`) → OpenAI-compatible API on mana-llm |
+| **Tool system** | 13 tools (todo, calendar, places, notes, news, drink), policy-gated (auto/propose/deny) |
+| **Agents** | Named personas with their own systemPrompt, memory, policy, budget, and concurrency limits |
+| **Proposals** | Mutations under the `propose` policy produce proposals → the user must approve |
+| **Actor system** | Every write carries an immutable actor (user/ai/system) with a frozen displayName |
+| **Encryption** | Mission key grants for server-side decryption, AES-GCM-256, HKDF-scoped |
+
+### Execution flow
+
+```
+Mission (title, objective, inputs, cadence)
+  → Input resolution (Dexie or DB + optional grant decryption)
+  → Planner call (system prompt + tools + inputs → JSON plan)
+  → Policy gate per step (auto → execute, propose → proposal, deny → reject)
+  → Iteration write-back (LWW sync via mana-sync)
+  → Optional: chained reasoning (tool output → next planner call)
+```
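+
+A minimal sketch of that loop in TypeScript, purely illustrative: the
+callback names and types below are stand-ins, not the actual `runner.ts` API.
+
+```typescript
+// Condenses the flow above. All names here are hypothetical.
+type PolicyVerdict = 'auto' | 'propose' | 'deny';
+interface PlannedStep { tool: string; params: unknown }
+interface Plan { steps: PlannedStep[]; continueReasoning: boolean }
+
+async function runMissionSketch(
+  plan: (inputs: unknown[]) => Promise<Plan>,      // planner call (LLM)
+  gate: (step: PlannedStep) => PolicyVerdict,      // per-step policy gate
+  execute: (step: PlannedStep) => Promise<unknown>,
+  propose: (step: PlannedStep) => Promise<void>,   // user approves later
+  maxRounds = 5                                    // MAX_REASONING_LOOP_ITERATIONS
+): Promise<void> {
+  let inputs: unknown[] = [];
+  for (let round = 0; round < maxRounds; round++) {
+    const p = await plan(inputs);
+    const outputs: unknown[] = [];
+    for (const step of p.steps) {
+      const verdict = gate(step);
+      if (verdict === 'auto') outputs.push(await execute(step));
+      else if (verdict === 'propose') await propose(step);
+      // 'deny': the step is rejected outright
+    }
+    if (!p.continueReasoning) break;
+    inputs = outputs; // chained reasoning: feed tool output back in
+  }
+}
+```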
+
+---
+
+## 2. Industry frameworks at a glance
+
+### 2.1 Google A2A (Agent-to-Agent Protocol)
+
+**What it is:** An open protocol for inter-agent communication. Linux Foundation, 150+ organizations, v1.0 since 2026.
+
+**Core concepts:**
+- **Agent Cards** (`/.well-known/a2a/agent-card`) — JSON self-description: capabilities, skills (with input/output JSON Schemas), security, protocol bindings. Signed for decentralized trust.
+- **Tasks** — the core unit of work: `SUBMITTED → WORKING → COMPLETED/FAILED`. Plus `INPUT_REQUIRED` and `AUTH_REQUIRED` for HITL flows.
+- **Messages & Parts** — modality-agnostic: text, raw (binary), url (file), data (structured JSON).
+- **Artifacts** — task outputs, separate from the conversation. Streamable (`append: true`, `lastChunk: true`).
+
+**Communication patterns:**
+- Blocking (request/response)
+- Streaming (SSE, gRPC, JSON-RPC)
+- Push notifications (webhooks for long-running tasks)
+
+**Protocol bindings:** JSON-RPC 2.0, HTTP/REST, gRPC.
+
+### 2.2 Anthropic MCP (Model Context Protocol)
+
+**What it is:** An open protocol for connecting LLM applications to external tools and data. Linux Foundation, 10,000+ servers, 97M monthly SDK downloads.
+
+**Architecture (Host → Client → Server):**
+- **Host** = the LLM application, manages security
+- **Client** = 1:1 with a server, JSON-RPC 2.0 session
+- **Server** = lightweight adapter, exposes three primitives:
+
+| Primitive | Controlled by | Purpose |
+|-----------|---------------|-------|
+| **Resources** | Application | Context data (files, DB records) |
+| **Tools** | Model | Executable functions |
+| **Prompts** | User | Predefined templates |
+
+**Key design rule:** Servers never see the full conversation or other servers. The host controls all cross-server interactions.
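+
+To make the tool primitive concrete, here is the rough JSON-RPC shape of an
+MCP tool invocation. This sketches only the wire format; the tool name and
+arguments are hypothetical, not an existing Mana tool:
+
+```typescript
+// A client discovers tools via `tools/list`, then invokes one:
+const callToolRequest = {
+  jsonrpc: '2.0' as const,
+  id: 1,
+  method: 'tools/call',
+  params: {
+    name: 'create_task',                       // hypothetical tool name
+    arguments: { title: 'Review Q2 report' },  // must match the tool's inputSchema
+  },
+};
+```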
+
+### 2.3 OpenAI Agents SDK
+
+**Core primitives:**
+- **Agent** — LLM + instructions + tools + handoffs
+- **Runner** — orchestrates the execution loop
+- **Handoffs** — first-class primitive for multi-agent delegation (no separate orchestrator needed)
+- **Guardrails** — input/output validation running in parallel with execution
+- **Sessions** — persistent memory (SQLite, Redis, encrypted)
+- **Tracing** — built-in observability (LLM calls, tool calls, handoffs, OpenTelemetry)
+
+### 2.4 LangGraph
+
+**Core architecture — StateGraph:**
+- **Nodes** = functions (agents, tools, logic)
+- **Edges** = control flow (static or conditional)
+- **State** = central TypedDict, immutable updates
+- **Checkpointing** = state persistence with time-travel debugging
+
+**Multi-agent patterns:** subagents, handoffs, router, supervisor, scatter-gather, subgraphs.
+
+### 2.5 CrewAI
+
+**Core architecture:**
+- **Agent** = role + goal + backstory + tools + LLM
+- **Task** = description + expected output format + guardrails + HITL
+- **Crew** = agents + tasks + process type (sequential, hierarchical, consensual)
+- **Flows** = event-driven orchestration for production
+- **Memory** = 4 types: short-term, long-term (embeddings), entity, contextual
+
+### 2.6 Microsoft Agent Framework
+
+**Unification of AutoGen + Semantic Kernel:**
+- **Agents** = LLM + tools + MCP servers + middleware
+- **Workflows** = graph-based with type-safe routing, checkpointing, HITL
+- **Sessions** = enterprise-grade state management
+- Multi-provider (Anthropic, Azure OpenAI, OpenAI, Ollama)
+
+---
+
+## 3. Comparison matrix
+
+| Dimension | Mana | A2A | MCP | OpenAI SDK | LangGraph | CrewAI |
+|-----------|------|-----|-----|------------|-----------|--------|
+| **Agent definition** | Agent(name, role, systemPrompt, memory, policy) | Agent Card (JSON, signed) | N/A (protocol) | Agent(instructions, tools, handoffs) | Node functions | Agent(role, goal, backstory) |
+| **Tool registration** | Hardcoded allow-list (13 tools) | Skills in Agent Card | tools/list + tools/call | @function_tool + MCP | Node context | tools= parameter |
+| **Agent↔agent** | ❌ Not available | ✅ Core purpose | ❌ Not designed for it | Handoffs | Edges/routing | Delegation + hierarchy |
+| **Agent↔tool** | Policy-gated executor | Via MCP | ✅ Core purpose | Function calls + MCP | Node calls | Direct assignment |
+| **State/memory** | LWW sync + encrypted IndexedDB | Task contextId | Stateful sessions | Sessions (SQLite/Redis) | StateGraph + checkpoints | 4 memory types |
+| **Orchestration** | Dual runtime (browser + server cron) | Task lifecycle | Host coordinates | Runner loop | DAG engine | Sequential/hierarchical |
+| **Streaming** | ❌ No streaming | SSE, gRPC, JSON-RPC | JSON-RPC notifications | Built-in | Native token stream | Log-based |
+| **Observability** | Prometheus metrics + debug logs | Agent Card metadata | Server logging | Built-in tracing (OTel) | LangSmith | Built-in logging |
+| **HITL** | Proposal system (approve/reject) | INPUT_REQUIRED state | Elicitation | Guardrails | Interrupt/resume | Task guardrails |
+| **Encryption** | ✅ AES-GCM + key grants | ❌ | ❌ | ❌ | ❌ | ❌ |
+| **Local-first** | ✅ Dexie + offline | ❌ Server-to-server | ❌ | ❌ | ❌ | ❌ |
+| **Multi-device** | ✅ LWW sync | ❌ | ❌ | ❌ | ❌ | ❌ |
+
+---
+
+## 4. What Mana does well
+
+### 4.1 Privacy-first architecture (unique in the industry)
+
+Not one of the compared frameworks has a comparable encryption concept:
+- **AES-GCM-256 at rest** for 27 tables
+- **Mission key grants** with HKDF scoping, audit trail, TTL
+- Optional **zero-knowledge mode**
+- The server never sees plaintext without an explicit grant
+
+**Assessment:** This is a genuine competitive advantage. A2A, MCP, the OpenAI SDK — all of them assume the server is allowed to see everything.
+
+### 4.2 Dual runtime with graceful degradation
+
+- The browser runner works offline and has access to all encrypted data
+- The server runner executes on schedule even when no browser is open
+- When a key grant is missing, the server degrades gracefully → the browser takes over
+
+**Assessment:** No other framework has this browser/server duality. LangGraph and CrewAI are purely server-side. The OpenAI SDK has no offline mode.
+
+### 4.3 Immutable actor attribution
+
+Every write carries a frozen actor with `kind`, `principalId`, `displayName`, `missionId`, `iterationId`, `rationale`. That is better than any of the compared frameworks:
+- The OpenAI SDK has tracing, but no write-level attribution
+- LangGraph has state checkpoints, but no per-mutation actor mapping
+- CrewAI has no per-write attribution
+
+**Assessment:** Excellent for audit, undo, and trust. Answers "who changed what, and why?" at field level.
+
+### 4.4 Proposal system (human-in-the-loop)
+
+The three-level policy system (auto/propose/deny) with the proposal lifecycle is well thought out:
+- Configurable per tool, at fine granularity
+- User feedback on rejection flows back into the planner
+- Per-agent policies (phase 3)
+
+**Comparison:** OpenAI has guardrails (but binary: pass/fail). A2A has `INPUT_REQUIRED` (but no propose/approve workflow). LangGraph has interrupt/resume (similar, but less formalized).
+
+### 4.5 Local-first + multi-device sync
+
+Field-level LWW via mana-sync is unique among AI agent frameworks. Missions, agents, and their results sync across devices automatically.
+
+---
+
+## 5. Where Mana could improve
+
+### 5.1 🔴 No agent-to-agent protocol
+
+**Problem:** Agents cannot talk to each other. Every agent is an isolated island. A "Cashflow Watcher" cannot ask a "Todo Manager" to create a task.
+
+**What the industry does:**
+- **A2A**: agents discover each other via Agent Cards and delegate tasks
+- **OpenAI SDK**: handoffs as a first-class primitive
+- **LangGraph**: edges between agent nodes
+- **CrewAI**: hierarchical delegation + shared context
+
+**Recommendation:** Introduce a lightweight internal agent-to-agent protocol (a sketch of how a delegation could ride on the existing proposal machinery follows below):
+
+```typescript
+interface AgentDelegation {
+  fromAgentId: string;
+  toAgentId: string;
+  intent: ToolCallIntent;     // What should the other agent do?
+  context: string;            // Why?
+  policy: 'auto' | 'propose'; // Does the user have to confirm?
+}
+```
+
+Long term: A2A-compatible Agent Cards for external integration (e.g. letting Mana agents communicate with Google agents).
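+
+As a sketch under stated assumptions (`ToolCallIntent`, `createProposal`, and
+`enqueueMissionStep` are hypothetical names, not existing Mana APIs), a
+delegation could reuse the proposal machinery for the `propose` case:
+
+```typescript
+type ToolCallIntent = { tool: string; params: Record<string, unknown> }; // assumed shape
+
+declare function createProposal(p: {
+  agentId: string;
+  intent: ToolCallIntent;
+  rationale: string;
+}): Promise<void>;
+declare function enqueueMissionStep(agentId: string, intent: ToolCallIntent): Promise<void>;
+
+async function delegate(d: AgentDelegation): Promise<void> {
+  if (d.policy === 'propose') {
+    // Lands in the target agent's proposal queue; the user approves
+    // before anything executes, same as any other proposed mutation.
+    await createProposal({ agentId: d.toAgentId, intent: d.intent, rationale: d.context });
+  } else {
+    // 'auto': schedule directly; the target agent picks it up on its next run.
+    await enqueueMissionStep(d.toAgentId, d.intent);
+  }
+}
+```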
+
+### 5.2 🔴 No streaming
+
+**Problem:** The current flow is request/response. The user sees nothing until the entire iteration has finished.
+
+**What the industry does:**
+- **A2A**: SSE + gRPC streaming with `TaskStatusUpdateEvent` and `TaskArtifactUpdateEvent`
+- **OpenAI SDK**: built-in token streaming
+- **LangGraph**: native token-by-token streaming
+- **MCP**: JSON-RPC notifications
+
+**Recommendation:** SSE-based streaming for the foreground runner:
+1. Stream the planner response (token by token)
+2. Show step execution live ("Creating task... ✓", "Tagging note... ⏳")
+3. Server runner: push SSE events to connected clients
+
+### 5.3 🟡 Static tool system
+
+**Problem:** 13 hardcoded tools in an allow-list. New tools require code changes in at least 3 places (server tools.ts, shared-ai proposable-tools, executor).
+
+**What the industry does:**
+- **MCP**: dynamic tool discovery via `tools/list`, hot reload via `notifications/tools/list_changed`
+- **OpenAI SDK**: `@function_tool` decorator, automatic schema generation
+- **LangGraph**: tools as node functions, registrable dynamically
+- **A2A**: skills in Agent Cards with JSON Schema
+
+**Recommendation:** Introduce an MCP-compatible tool registry:
+```typescript
+// Each module registers its tools declaratively
+// apps/mana/apps/web/src/lib/modules/todo/ai-tools.ts
+export const todoTools: AiToolDefinition[] = [
+  {
+    name: 'create_task',
+    description: 'Create a new task',
+    inputSchema: { /* JSON Schema */ },
+    module: 'todo',
+    annotations: { destructiveHint: false, idempotentHint: false }
+  }
+];
+```
+Modules register tools via the registry → the planner receives a dynamic tool list → the server synchronizes through the shared-ai package. **Bonus:** exporting an MCP server enables external tool use.
+
+### 5.4 🟡 No graph-based workflows
+
+**Problem:** The reasoning loop is linear (planner → steps → optional loop). Complex workflows (branching, parallelization, conditional logic) are not possible.
+
+**What the industry does:**
+- **LangGraph**: DAG with conditional edges, parallel nodes, subgraphs
+- **CrewAI**: sequential + hierarchical + consensual processes
+- **Microsoft Agent Framework**: type-safe graph workflows
+
+**Recommendation:** For most Mana use cases the linear loop is sufficient. But for more complex missions (e.g. "analyze my finances, produce a report, and plan tasks based on the result") a simple DAG would make sense:
+
+```typescript
+interface MissionGraph {
+  nodes: MissionStep[];        // each node = one planner call or tool execution
+  edges: MissionEdge[];        // conditional routing
+  parallelGroups?: string[][]; // steps that can run in parallel
+}
+```
+
+**Priority:** Low. The current loop covers 90% of use cases.
+
+### 5.5 🟡 No agent discovery
+
+**Problem:** Agents are known only to their creator. There is no mechanism for:
+- "Which agents exist?"
+- "Which agent can do X?"
+- Automatic delegation based on capabilities
+
+**What the industry does:**
+- **A2A**: Agent Cards with skills + JSON Schemas → automatic discovery
+- **MCP**: resource discovery via URI schemes
+
+**Recommendation:** Agent capabilities as structured metadata:
+```typescript
+interface AgentCapability {
+  module: string;       // 'todo', 'notes', 'calendar'
+  actions: string[];    // ['create', 'update', 'query']
+  inputTypes: string[]; // Which inputs can the agent process?
+}
+```
+Enables "find the agent that can create calendar events" → automatic routing (see the routing sketch below).
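+
+A minimal routing sketch over that metadata; the matching logic and the
+`AgentSummary` shape are illustrative, not an existing API:
+
+```typescript
+interface AgentSummary { id: string; capabilities: AgentCapability[] }
+
+function findAgentFor(
+  agents: readonly AgentSummary[],
+  module: string,
+  action: string
+): string | undefined {
+  return agents.find((a) =>
+    a.capabilities.some((c) => c.module === module && c.actions.includes(action))
+  )?.id;
+}
+
+// e.g. findAgentFor(allAgents, 'calendar', 'create') returns the id of the
+// first agent that declares calendar-event creation, or undefined.
+```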
+
+### 5.6 🟡 No guardrail system
+
+**Problem:** Policy (auto/propose/deny) is a gating mechanism, not a guardrail. There is no:
+- Input validation (is the prompt safe?)
+- Output validation (is the result correct/safe?)
+- Server-side budget enforcement (client-side only)
+
+**What the industry does:**
+- **OpenAI SDK**: input + output guardrails running in parallel with execution
+- **CrewAI**: task-level guardrails
+- **LangGraph**: custom validation nodes
+
+**Recommendation:** Guardrails as a separate layer (a concrete sketch follows below):
+```typescript
+interface Guardrail {
+  name: string;
+  phase: 'pre-plan' | 'post-plan' | 'pre-execute' | 'post-execute';
+  check: (context: GuardrailContext) => GuardrailResult;
+}
+// Examples:
+// - PII detection before the planner call
+// - Budget check before execution
+// - Output schema validation after a tool call
+```
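+
+One of those examples made concrete: the budget check. `GuardrailContext`
+and `GuardrailResult` are left open above, so the shapes assumed here are
+illustrative:
+
+```typescript
+type GuardrailResult = { ok: true } | { ok: false; reason: string };
+interface GuardrailContext {
+  agentId: string;
+  estimatedTokens: number;             // planner's estimate for this step
+  budget: { remainingTokens: number }; // assumed per-agent budget shape
+}
+
+const budgetGuardrail: Guardrail = {
+  name: 'token-budget',
+  phase: 'pre-execute',
+  check: (ctx) =>
+    ctx.estimatedTokens <= ctx.budget.remainingTokens
+      ? { ok: true }
+      : { ok: false, reason: `agent ${ctx.agentId} has exhausted its token budget` },
+};
+```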
+
+### 5.7 🟢 Improve tracing & debugging
+
+**Problem:** Debug info lives in `_aiDebugLog` (localStorage, never synced) plus Prometheus metrics. There is no coherent trace of a mission across browser + server.
+
+**What the industry does:**
+- **OpenAI SDK**: built-in tracing with visual DAGs, OpenTelemetry export
+- **LangGraph**: LangSmith integration, step-by-step debugging, time travel
+- **Microsoft**: Semantic Kernel telemetry
+
+**Recommendation:** OpenTelemetry spans for the entire mission lifecycle:
+```
+Mission.run (span)
+  ├── Input.resolve (span, per input)
+  ├── Planner.call (span, with prompt + response)
+  ├── Step[0].execute (span, tool name + params + result)
+  ├── Step[1].propose (span, proposal created)
+  └── Iteration.write (span, sync write)
+```
+Exportable to Grafana Tempo or similar.
+
+---
+
+## 6. Strategic recommendations (prioritized)
+
+### Short term (next 2-4 weeks)
+
+| # | Measure | Effort | Impact |
+|---|----------|---------|--------|
+| 1 | **Streaming for the foreground runner** — SSE from the planner, live step status in the UI | Medium | High — UX leap |
+| 2 | **Dynamic tool registry** — modules register tools declaratively, server synchronizes | Medium | High — scalability |
+| 3 | **Server-side budget enforcement** — per-agent token counting in tick.ts | Small | Medium — safety |
+
+### Medium term (1-3 months)
+
+| # | Measure | Effort | Impact |
+|---|----------|---------|--------|
+| 4 | **Agent-to-agent delegation** — internal protocol, one agent can task another | Large | High — multi-agent |
+| 5 | **MCP server export** — expose Mana tools as an MCP server | Medium | High — ecosystem |
+| 6 | **Guardrail layer** — pre/post-execution checks | Medium | Medium — safety |
+| 7 | **OpenTelemetry tracing** — end-to-end mission spans | Medium | Medium — debugging |
+
+### Long term (3-6 months)
+
+| # | Measure | Effort | Impact |
+|---|----------|---------|--------|
+| 8 | **A2A-compatible Agent Cards** — make Mana agents externally discoverable | Large | High — interop |
+| 9 | **Graph-based workflows** — DAG for complex missions | Large | Medium — power users |
+| 10 | **Agent memory (embeddings)** — long-term memory à la CrewAI | Large | Medium — intelligence |
+
+---
+
+## 7. Architecture diagram: current vs. target
+
+### Current state
+```
+┌──────────────────────────────────────────────────────┐
+│ Browser (Foreground Runner)                          │
+│  Mission → Planner → Policy Gate → Execute/Propose   │
+│  ↕ Dexie (encrypted)   ↕ mana-sync (LWW)             │
+└──────────────────┬───────────────────────────────────┘
+                   │ sync_changes
+┌──────────────────▼───────────────────────────────────┐
+│ mana-ai (Background Runner)                          │
+│  Tick → Due Missions → Planner → Write Iteration     │
+│  ↕ PostgreSQL (RLS)    ↕ mana-llm                    │
+└──────────────────────────────────────────────────────┘
+```
+
+### Target state (medium term)
+```
+┌──────────────────────────────────────────────────────────────┐
+│ Browser                                                      │
+│  Mission → [Guardrails] → Planner (streaming) → Policy Gate  │
+│  Agent A ←→ Agent B (delegation)                             │
+│  ↕ Dexie (encrypted)   ↕ mana-sync (LWW + SSE)               │
+└──────────────────┬───────────────────────────────────────────┘
+                   │ sync_changes + OTel spans
+┌──────────────────▼───────────────────────────────────────────┐
+│ mana-ai (Background Runner)                                  │
+│  Tick → [Budget Check] → Due Missions → Planner → Write      │
+│  Tool Registry (dynamic, MCP-compatible)                     │
+│  ↕ PostgreSQL (RLS)  ↕ mana-llm  ↕ Grafana Tempo             │
+└──────────────────┬───────────────────────────────────────────┘
+                   │ A2A Agent Cards (long term)
+┌──────────────────▼───────────────────────────────────────────┐
+│ External agents (Google ADK, OpenAI, etc.)                   │
+│  Discovery via /.well-known/a2a/agent-card                   │
+└──────────────────────────────────────────────────────────────┘
+```
+
+---
+
+## 8. Conclusion
+
+**Mana's AI agent architecture leads in three areas:**
+1. **Privacy/encryption** — no comparable framework has key grants, zero-knowledge, or at-rest encryption for agent data
+2. **Local-first + multi-device** — a unique dual runtime with offline capability
+3. **Actor attribution** — the best audit-trail system among all compared frameworks
+
+**The biggest gaps versus the industry:**
+1. **No agent-to-agent** — the most important missing capability for true multi-agent systems
+2. **No streaming** — standard in every modern framework, missing entirely
+3. **Static tool system** — does not scale as new modules need tools
+
+**The core recommendation:** Mana should not try to become a general-purpose agent framework (LangGraph/CrewAI do that better). Instead, build on the unique strengths (privacy, local-first, attribution) and selectively adopt the industry standards with the biggest UX impact: **streaming**, **dynamic tools**, and **agent delegation**.
+
+MCP compatibility as a medium-term goal is strategically right — it is the protocol that has established itself as the standard for agent↔tool (97M downloads/month). A2A is the natural counterpart for agent↔agent, but only becomes relevant once internal multi-agent communication is in place.
diff --git a/packages/shared-ai/src/planner/types.ts b/packages/shared-ai/src/planner/types.ts
index 7d2ee2bdd..21b21c126 100644
--- a/packages/shared-ai/src/planner/types.ts
+++ b/packages/shared-ai/src/planner/types.ts
@@ -30,6 +30,13 @@ export interface AiPlanInput {
   readonly mission: Mission;
   readonly resolvedInputs: readonly ResolvedInput[];
   readonly availableTools: readonly AvailableTool[];
+  /**
+   * Optional streaming callback — called with each token delta as it
+   * arrives from the LLM. The Runner uses this to show live progress
+   * during the "calling-llm" phase. Only effective when the backend
+   * supports streaming (mana-server / cloud tiers).
+ */ + readonly onToken?: (delta: string) => void; } export interface PlannedStep { diff --git a/packages/shared-llm/package.json b/packages/shared-llm/package.json index 9b14f5ebb..02a5ad560 100644 --- a/packages/shared-llm/package.json +++ b/packages/shared-llm/package.json @@ -7,7 +7,8 @@ "main": "./src/index.ts", "types": "./src/index.ts", "exports": { - ".": "./src/index.ts" + ".": "./src/index.ts", + "./sse-parser": "./src/sse-parser.ts" }, "scripts": { "type-check": "tsc --noEmit", diff --git a/packages/shared-llm/src/backends/remote.ts b/packages/shared-llm/src/backends/remote.ts index 80385de78..aaba8a91b 100644 --- a/packages/shared-llm/src/backends/remote.ts +++ b/packages/shared-llm/src/backends/remote.ts @@ -7,13 +7,12 @@ * * The endpoint is `/v1/chat/completions` and the wire format is * straight OpenAI SSE: `data: {…}\n\n` lines, terminated by - * `data: [DONE]`. The hand-rolled parser is the same shape as the - * existing playground client (apps/mana/apps/web/src/lib/modules/ - * playground/llm.ts) so the two consumers stay aligned and can be - * unified later if we want. + * `data: [DONE]`. The SSE parser lives in `../sse-parser.ts` and is + * shared with the playground client. */ import { BackendUnreachableError, ProviderBlockedError } from '../errors'; +import { consumeSSEStream } from '../sse-parser'; import type { LlmTier } from '../tiers'; import type { GenerateResult, LlmTaskRequest } from '../types'; @@ -33,31 +32,20 @@ export function resolveLlmBaseUrl(): string { /** * Send a chat completion to mana-llm and return the result. * - * Implementation notes: + * When `req.onToken` is set, uses SSE streaming (`stream: true`) so + * the caller receives per-token callbacks as they arrive — used by the + * Mission Runner to show live progress during the "calling-llm" phase. * - * - We use the NON-streaming endpoint (`stream: false`). Curl tests - * from the same hostname showed that mana-llm's streaming endpoint - * works perfectly when called from outside the browser, but the - * browser receives `totalFrames=0` (an empty response body) for - * reasons that almost certainly trace back to CORS + credentials - * + streaming-body interactions. Non-streaming is a single JSON - * response, much friendlier to the browser fetch API. + * When `req.onToken` is absent, uses the non-streaming endpoint + * (`stream: false`) which returns a single JSON response — simpler and + * sufficient for tasks that only care about the final result. * - * - We do NOT pass `credentials: 'include'`. The mana-llm service - * doesn't require user auth (the API key middleware accepts - * anonymous requests), and `credentials: 'include'` plus - * `Access-Control-Allow-Origin: *` is one of the patterns that - * silently breaks the response body in browsers. Verified by - * comparing curl-from-server (no creds, works) vs browser fetch - * (with creds, empty body). - * - * - For tasks that registered an `onToken` callback (legacy chat- - * style streaming UX), we fire it ONCE with the full content at - * the end. That's a degraded streaming experience, but no current - * shared-llm caller actually consumes the per-token stream — the - * queue + watcher model only cares about the final result. The - * playground module uses its own client (apps/.../modules/ - * playground/llm.ts) which keeps real streaming for live UX. 
+ * We do NOT pass `credentials: 'include'` — the mana-llm service + * accepts anonymous requests, and `credentials: 'include'` plus + * `Access-Control-Allow-Origin: *` silently breaks the response body + * in browsers (verified by comparing curl vs browser fetch). The + * playground module uses the same no-credentials pattern with + * `stream: true` and it works fine. * * `tier` is only used for error tagging — both 'mana-server' and * 'cloud' call the same endpoint with different model strings. @@ -69,6 +57,7 @@ export async function callManaLlmStreaming( ): Promise { const url = `${resolveLlmBaseUrl()}/v1/chat/completions`; const start = performance.now(); + const useStreaming = typeof req.onToken === 'function'; let res: Response; try { @@ -80,7 +69,7 @@ export async function callManaLlmStreaming( messages: req.messages, temperature: req.temperature ?? 0.7, max_tokens: req.maxTokens ?? 1024, - stream: false, + stream: useStreaming, }), }); } catch (err) { @@ -102,6 +91,25 @@ export async function callManaLlmStreaming( throw new BackendUnreachableError(tier, res.status, text); } + // ── Streaming path: SSE with per-token callbacks ─────────── + if (useStreaming && res.body) { + let usage = { promptTokens: 0, completionTokens: 0, totalTokens: 0 }; + const content = await consumeSSEStream(res.body, req.onToken, (u) => { + usage = { + promptTokens: u.promptTokens, + completionTokens: u.completionTokens, + totalTokens: u.promptTokens + u.completionTokens, + }; + }); + + if (!content) { + console.warn(`[shared-llm:${tier}] empty streaming content`, { model }); + } + + return { content, usage, latencyMs: Math.round(performance.now() - start) }; + } + + // ── Non-streaming path: single JSON response ────────────── let json: { choices?: Array<{ message?: { content?: string; reasoning?: string }; @@ -119,12 +127,7 @@ export async function callManaLlmStreaming( // Field ordering: prefer the canonical OpenAI `message.content` first. // If that's empty AND `message.reasoning` is set, fall back to it — // reasoning models like Gemma 4 emit their thought process there - // when given too few tokens to also produce a final answer (we hit - // this with max_tokens=10 / no system prompt: content was "" while - // reasoning had the half-finished thought). For our title task this - // rarely happens because the system prompt is directive, but the - // fallback is cheap and protects against future tasks that might - // trigger longer reasoning chains. + // when given too few tokens to also produce a final answer. const choice = json.choices?.[0]; const content = choice?.message?.content ?? choice?.message?.reasoning ?? choice?.text ?? ''; @@ -132,14 +135,6 @@ export async function callManaLlmStreaming( console.warn(`[shared-llm:${tier}] empty completion content`, { model, json }); } - // One-shot "streaming" for any caller that registered onToken: emit - // the whole content as a single chunk at the end. The current - // orchestrator + queue model never reads tokens incrementally for - // remote tiers anyway. - if (content && req.onToken) { - req.onToken(content); - } - return { content, usage: { diff --git a/packages/shared-llm/src/sse-parser.ts b/packages/shared-llm/src/sse-parser.ts new file mode 100644 index 000000000..ea262070e --- /dev/null +++ b/packages/shared-llm/src/sse-parser.ts @@ -0,0 +1,74 @@ +/** + * Shared SSE parser for OpenAI-compatible streaming responses. + * + * The wire format from mana-llm is straight OpenAI: `data: {…}\n\n` + * lines with a sentinel `data: [DONE]`. 
This ~40-line reader is simpler + * than adding a dependency and is shared between the LLM orchestrator + * (backends/remote.ts) and the playground module. + */ + +export interface SSEUsage { + readonly promptTokens: number; + readonly completionTokens: number; +} + +/** + * Consume a ReadableStream of SSE chunks from an OpenAI-compatible + * `/v1/chat/completions` endpoint. + * + * Calls `onDelta` for each content token and `onUsage` (if provided) + * when the final usage stats arrive. Returns the accumulated full + * content string once the stream is done. + */ +export async function consumeSSEStream( + body: ReadableStream, + onDelta?: (content: string) => void, + onUsage?: (usage: SSEUsage) => void +): Promise { + const reader = body.getReader(); + const decoder = new TextDecoder(); + let buffer = ''; + let content = ''; + + while (true) { + const { value, done } = await reader.read(); + if (done) break; + buffer += decoder.decode(value, { stream: true }); + + // SSE frames are separated by blank lines. Process complete frames + // and leave any partial trailing frame in the buffer. + let sep: number; + while ((sep = buffer.indexOf('\n\n')) !== -1) { + const frame = buffer.slice(0, sep); + buffer = buffer.slice(sep + 2); + + for (const line of frame.split('\n')) { + if (!line.startsWith('data:')) continue; + const data = line.slice(5).trim(); + if (!data || data === '[DONE]') continue; + try { + const json = JSON.parse(data) as { + choices?: Array<{ delta?: { content?: string } }>; + usage?: { prompt_tokens?: number; completion_tokens?: number }; + }; + const delta = json.choices?.[0]?.delta?.content; + if (delta) { + content += delta; + onDelta?.(delta); + } + if (json.usage?.prompt_tokens != null) { + onUsage?.({ + promptTokens: json.usage.prompt_tokens, + completionTokens: json.usage.completion_tokens ?? 0, + }); + } + } catch { + // Malformed frame — skip silently. mana-llm occasionally + // emits keepalive comments. + } + } + } + } + + return content; +}