mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-14 22:21:10 +02:00
feat(ai): SSE streaming for foreground Mission Runner
Enable real-time token streaming during the planner "calling-llm" phase
so the user sees live progress ("empfange Plan… 128 tokens") instead of
a static spinner. The parser still receives the full text once complete —
no partial-JSON risk.
Changes:
- Extract shared SSE parser from playground into @mana/shared-llm/sse-parser
- remote.ts: use stream:true when onToken callback is provided
- AiPlanInput: add optional onToken field (shared-ai)
- ai-plan task: pass onToken through to backend.generate()
- runner.ts: throttled (500ms) phaseDetail updates during streaming
- Playground: refactored to use shared SSE parser
Also includes: AI agent architecture comparison report (docs/reports/)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
8a0bf93699
commit
be81d11dc3
9 changed files with 633 additions and 106 deletions
|
|
@ -51,6 +51,11 @@ const RESEARCH_TRIGGER = /\b(recherchier|research|news|finde|suche|aktuelle|neue
|
|||
* 5 is generous for read-act-refine patterns ("list_notes → tag them")
|
||||
* without running the LLM bill dry on stuck missions. */
|
||||
const MAX_REASONING_LOOP_ITERATIONS = 5;
|
||||
|
||||
/** Min interval between Dexie phaseDetail writes during streaming.
|
||||
* 50 tokens/s × 500ms = ~25 tokens between writes — frequent enough
|
||||
* for the UI to feel live, infrequent enough to avoid Dexie thrashing. */
|
||||
const STREAMING_PHASE_THROTTLE_MS = 500;
|
||||
/** Singleton row id of the kontext doc — kept in sync with
|
||||
* `modules/kontext/types.ts` (KONTEXT_SINGLETON_ID). */
|
||||
const KONTEXT_SINGLETON_ID = 'singleton';
|
||||
|
|
@ -274,8 +279,32 @@ export async function runMission(
|
|||
: `Planner Runde ${loopIndex + 1}/${MAX_REASONING_LOOP_ITERATIONS}`
|
||||
);
|
||||
let plan: AiPlanOutput;
|
||||
|
||||
// Streaming: show live token progress while waiting for the
|
||||
// planner response. Throttled to avoid Dexie write floods.
|
||||
let streamTokenCount = 0;
|
||||
let lastStreamWrite = 0;
|
||||
const roundLabel = loopIndex === 0 ? '' : ` (Runde ${loopIndex + 1})`;
|
||||
const onToken = (_delta: string) => {
|
||||
streamTokenCount++;
|
||||
const now = Date.now();
|
||||
if (now - lastStreamWrite < STREAMING_PHASE_THROTTLE_MS) return;
|
||||
lastStreamWrite = now;
|
||||
void setIterationPhase(
|
||||
mission!.id,
|
||||
iterationId,
|
||||
'calling-llm',
|
||||
`empfange Plan${roundLabel}… ${streamTokenCount} tokens`
|
||||
);
|
||||
};
|
||||
|
||||
try {
|
||||
plan = await deps.plan({ mission: mission!, resolvedInputs: loopInputs, availableTools });
|
||||
plan = await deps.plan({
|
||||
mission: mission!,
|
||||
resolvedInputs: loopInputs,
|
||||
availableTools,
|
||||
onToken,
|
||||
});
|
||||
} catch (err) {
|
||||
if (isAiDebugEnabled()) {
|
||||
void recordAiDebug({
|
||||
|
|
|
|||
|
|
@ -51,6 +51,7 @@ export const aiPlanTask: LlmTask<AiPlanInput, AiPlanOutput> = {
|
|||
// (e.g. 10 notes → 10 add_tag_to_note calls). 4096 fits ~15-20
|
||||
// step objects while still fast on browser tier.
|
||||
maxTokens: 4096,
|
||||
onToken: input.onToken,
|
||||
});
|
||||
|
||||
// Always populate debug payload (cheap — strings already in memory).
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@
|
|||
type CreditOperationType,
|
||||
} from '@mana/credits';
|
||||
import { ManaEvents } from '@mana/shared-utils/analytics';
|
||||
import { authStore } from '$lib/stores/auth.svelte';
|
||||
|
||||
let balance = $state<CreditBalance | null>(null);
|
||||
let transactions = $state<CreditTransaction[]>([]);
|
||||
|
|
@ -135,21 +136,35 @@
|
|||
async function loadData() {
|
||||
loading = true;
|
||||
error = null;
|
||||
try {
|
||||
const [balanceData, transactionsData, packagesData] = await Promise.all([
|
||||
creditsService.getBalance(),
|
||||
creditsService.getTransactions(20),
|
||||
creditsService.getPackages(),
|
||||
]);
|
||||
balance = balanceData;
|
||||
transactions = transactionsData;
|
||||
packages = packagesData.filter((p) => p.active).sort((a, b) => a.sortOrder - b.sortOrder);
|
||||
} catch (e) {
|
||||
error = e instanceof Error ? e.message : $_('common.error_loading');
|
||||
console.error('Failed to load credits data:', e);
|
||||
} finally {
|
||||
|
||||
// API calls require auth — skip for guests, still show costs tab (static data)
|
||||
if (!authStore.isAuthenticated) {
|
||||
loading = false;
|
||||
return;
|
||||
}
|
||||
|
||||
// Load each independently so a single 401/failure doesn't blank the whole page
|
||||
const results = await Promise.allSettled([
|
||||
creditsService.getBalance(),
|
||||
creditsService.getTransactions(20),
|
||||
creditsService.getPackages(),
|
||||
]);
|
||||
|
||||
if (results[0].status === 'fulfilled') balance = results[0].value;
|
||||
if (results[1].status === 'fulfilled') transactions = results[1].value;
|
||||
if (results[2].status === 'fulfilled') {
|
||||
packages = results[2].value.filter((p) => p.active).sort((a, b) => a.sortOrder - b.sortOrder);
|
||||
}
|
||||
|
||||
// Only show error if ALL three failed
|
||||
const allFailed = results.every((r) => r.status === 'rejected');
|
||||
if (allFailed) {
|
||||
const firstErr = results.find((r) => r.status === 'rejected') as PromiseRejectedResult;
|
||||
error =
|
||||
firstErr.reason instanceof Error ? firstErr.reason.message : $_('common.error_loading');
|
||||
}
|
||||
|
||||
loading = false;
|
||||
}
|
||||
|
||||
// ── Helpers ────────────────────────────────────────────
|
||||
|
|
|
|||
|
|
@ -2,16 +2,12 @@
|
|||
* Playground LLM client — thin wrapper around mana-llm's OpenAI-compatible
|
||||
* `/v1/chat/completions` (streaming) and `/v1/models` endpoints.
|
||||
*
|
||||
* Lives next to the playground UI rather than in a shared package because
|
||||
* the playground is the only consumer right now. If chat / todo enrichment
|
||||
* / period insights end up calling the same surface in the future, lift
|
||||
* this into `$lib/data/llm-client.ts`.
|
||||
*
|
||||
* The chunk parser is hand-rolled rather than pulled from a library: the
|
||||
* SSE wire format from mana-llm is straight OpenAI (`data: {…}\n\n` lines
|
||||
* with a sentinel `[DONE]`), so a 30-line reader is simpler than a dep.
|
||||
* The SSE chunk parser lives in `@mana/shared-llm/sse-parser` and is shared
|
||||
* with the LLM orchestrator's remote backend (backends/remote.ts).
|
||||
*/
|
||||
|
||||
import { consumeSSEStream } from '@mana/shared-llm/sse-parser';
|
||||
|
||||
const DEFAULT_LLM_URL = 'http://localhost:3025';
|
||||
|
||||
/** Resolve the mana-llm base URL from the window-injected env, falling
|
||||
|
|
@ -91,49 +87,42 @@ export async function* streamCompletion(opts: CompletionOptions): AsyncGenerator
|
|||
throw new Error(`mana-llm: ${res.status} ${res.statusText}${text ? ` — ${text}` : ''}`);
|
||||
}
|
||||
|
||||
const reader = res.body.getReader();
|
||||
const decoder = new TextDecoder();
|
||||
let buffer = '';
|
||||
// Collect chunks via the shared SSE parser, then yield them.
|
||||
// We use a queue pattern so the async generator can yield chunks as
|
||||
// they arrive from the callback-based consumeSSEStream.
|
||||
const chunks: StreamChunk[] = [];
|
||||
let resolve: (() => void) | null = null;
|
||||
let done = false;
|
||||
|
||||
while (true) {
|
||||
const { value, done } = await reader.read();
|
||||
if (done) break;
|
||||
buffer += decoder.decode(value, { stream: true });
|
||||
const streamPromise = consumeSSEStream(
|
||||
res.body,
|
||||
(content) => {
|
||||
chunks.push({ type: 'delta', content });
|
||||
resolve?.();
|
||||
},
|
||||
(usage) => {
|
||||
chunks.push({
|
||||
type: 'usage',
|
||||
promptTokens: usage.promptTokens,
|
||||
completionTokens: usage.completionTokens,
|
||||
});
|
||||
resolve?.();
|
||||
}
|
||||
).then(() => {
|
||||
done = true;
|
||||
resolve?.();
|
||||
});
|
||||
|
||||
// SSE frames are separated by blank lines. Process complete frames
|
||||
// and leave any partial trailing frame in the buffer for the next
|
||||
// chunk.
|
||||
let sep: number;
|
||||
while ((sep = buffer.indexOf('\n\n')) !== -1) {
|
||||
const frame = buffer.slice(0, sep);
|
||||
buffer = buffer.slice(sep + 2);
|
||||
|
||||
for (const line of frame.split('\n')) {
|
||||
if (!line.startsWith('data:')) continue;
|
||||
const data = line.slice(5).trim();
|
||||
if (!data || data === '[DONE]') continue;
|
||||
try {
|
||||
const json = JSON.parse(data) as {
|
||||
choices?: Array<{ delta?: { content?: string } }>;
|
||||
usage?: { prompt_tokens?: number; completion_tokens?: number };
|
||||
};
|
||||
const delta = json.choices?.[0]?.delta?.content;
|
||||
if (delta) yield { type: 'delta', content: delta };
|
||||
|
||||
// Usage stats — typically in the final chunk
|
||||
if (json.usage?.prompt_tokens != null) {
|
||||
yield {
|
||||
type: 'usage',
|
||||
promptTokens: json.usage.prompt_tokens,
|
||||
completionTokens: json.usage.completion_tokens ?? 0,
|
||||
};
|
||||
}
|
||||
} catch {
|
||||
// Malformed frame — skip silently. mana-llm occasionally
|
||||
// emits keepalive comments and we don't want them to
|
||||
// crash the stream.
|
||||
}
|
||||
}
|
||||
while (!done || chunks.length > 0) {
|
||||
if (chunks.length > 0) {
|
||||
yield chunks.shift()!;
|
||||
} else {
|
||||
await new Promise<void>((r) => {
|
||||
resolve = r;
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure the stream promise settles (propagate any errors).
|
||||
await streamPromise;
|
||||
}
|
||||
|
|
|
|||
416
docs/reports/ai-agent-architecture-comparison.md
Normal file
416
docs/reports/ai-agent-architecture-comparison.md
Normal file
|
|
@ -0,0 +1,416 @@
|
|||
# AI-Agent-Architektur: Mana vs. Industrie-Frameworks
|
||||
|
||||
**Stand:** 2026-04-16
|
||||
**Zweck:** Vergleich der Mana AI-Workbench-Architektur mit Google A2A, MCP, OpenAI Agents SDK, LangGraph, CrewAI und Microsoft Agent Framework. Stärken identifizieren, Verbesserungspotenzial aufzeigen.
|
||||
|
||||
---
|
||||
|
||||
## 1. Zusammenfassung der Mana-Architektur
|
||||
|
||||
### Was wir haben
|
||||
|
||||
Mana implementiert ein **Dual-Runtime AI-Agent-System**:
|
||||
|
||||
| Komponente | Beschreibung |
|
||||
|------------|-------------|
|
||||
| **Foreground Runner** (Browser) | `runner.ts` — Reasoning-Loop mit bis zu 5 Planner-Iterationen, direkte Dexie-Writes, E2E-verschlüsselt |
|
||||
| **Background Runner** (mana-ai, Port 3067) | `tick.ts` — 60s Cron-Tick, scannt fällige Missions, plant via mana-llm, schreibt über sync_changes zurück |
|
||||
| **Planner** | Shared Prompt-Template (`@mana/shared-ai/src/planner/`) → OpenAI-kompatible API auf mana-llm |
|
||||
| **Tool-System** | 13 Tools (todo, calendar, places, notes, news, drink), policy-gated (auto/propose/deny) |
|
||||
| **Agents** | Named Personas mit eigener systemPrompt, memory, policy, Budget, Concurrency-Limits |
|
||||
| **Proposals** | Mutationen unter `propose`-Policy erzeugen Proposals → User muss approven |
|
||||
| **Actor-System** | Jeder Write trägt einen immutablen Actor (user/ai/system) mit frozen displayName |
|
||||
| **Encryption** | Mission Key-Grants für serverseitige Entschlüsselung, AES-GCM-256, HKDF-scoped |
|
||||
|
||||
### Execution Flow
|
||||
|
||||
```
|
||||
Mission (title, objective, inputs, cadence)
|
||||
→ Input Resolution (Dexie oder DB + optionale Grant-Entschlüsselung)
|
||||
→ Planner Call (system prompt + tools + inputs → JSON plan)
|
||||
→ Policy Gate pro Step (auto → execute, propose → Proposal, deny → reject)
|
||||
→ Iteration Write-back (LWW sync via mana-sync)
|
||||
→ Optional: Chained Reasoning (tool output → nächster Planner Call)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 2. Industrie-Frameworks im Überblick
|
||||
|
||||
### 2.1 Google A2A (Agent-to-Agent Protocol)
|
||||
|
||||
**Was es ist:** Offenes Protokoll für Inter-Agent-Kommunikation. Linux Foundation, 150+ Organisationen, v1.0 seit 2026.
|
||||
|
||||
**Kernkonzepte:**
|
||||
- **Agent Cards** (`/.well-known/a2a/agent-card`) — JSON-Selbstbeschreibung: Capabilities, Skills (mit Input/Output JSON-Schemas), Security, Protocol Bindings. Signiert für dezentrales Trust.
|
||||
- **Tasks** — Kern-Arbeitseinheit: `SUBMITTED → WORKING → COMPLETED/FAILED`. Plus `INPUT_REQUIRED` und `AUTH_REQUIRED` für HITL-Flows.
|
||||
- **Messages & Parts** — Modalitätsagnostisch: text, raw (binary), url (Datei), data (strukturiertes JSON).
|
||||
- **Artifacts** — Task-Outputs, getrennt von Konversation. Streambar (`append: true`, `lastChunk: true`).
|
||||
|
||||
**Kommunikationsmuster:**
|
||||
- Blocking (Request/Response)
|
||||
- Streaming (SSE, gRPC, JSON-RPC)
|
||||
- Push Notifications (Webhooks für long-running Tasks)
|
||||
|
||||
**Protocol Bindings:** JSON-RPC 2.0, HTTP/REST, gRPC.
|
||||
|
||||
### 2.2 Anthropic MCP (Model Context Protocol)
|
||||
|
||||
**Was es ist:** Offenes Protokoll für die Verbindung von LLM-Anwendungen mit externen Tools/Daten. Linux Foundation, 10.000+ Server, 97M monatliche SDK-Downloads.
|
||||
|
||||
**Architektur (Host → Client → Server):**
|
||||
- **Host** = LLM-Anwendung, managed Security
|
||||
- **Client** = 1:1 zu Server, JSON-RPC 2.0 Session
|
||||
- **Server** = Lightweight Adapter, exponiert drei Primitives:
|
||||
|
||||
| Primitive | Gesteuert von | Zweck |
|
||||
|-----------|---------------|-------|
|
||||
| **Resources** | Application | Kontextdaten (Dateien, DB-Einträge) |
|
||||
| **Tools** | Model | Ausführbare Funktionen |
|
||||
| **Prompts** | User | Vordefinierte Templates |
|
||||
|
||||
**Entscheidende Design-Regel:** Server sehen nie die volle Konversation oder andere Server. Host kontrolliert alle Cross-Server-Interaktionen.
|
||||
|
||||
### 2.3 OpenAI Agents SDK
|
||||
|
||||
**Kernprimitive:**
|
||||
- **Agent** — LLM + instructions + tools + handoffs
|
||||
- **Runner** — Orchestriert den Execution Loop
|
||||
- **Handoffs** — First-class Primitive für Multi-Agent-Delegation (kein separater Orchestrator nötig)
|
||||
- **Guardrails** — Input/Output-Validierung parallel zur Ausführung
|
||||
- **Sessions** — Persistente Memory (SQLite, Redis, verschlüsselt)
|
||||
- **Tracing** — Built-in Observability (LLM calls, tool calls, handoffs, OpenTelemetry)
|
||||
|
||||
### 2.4 LangGraph
|
||||
|
||||
**Kernarchitektur — StateGraph:**
|
||||
- **Nodes** = Funktionen (Agents, Tools, Logik)
|
||||
- **Edges** = Kontrollfluss (statisch oder conditional)
|
||||
- **State** = Zentraler TypedDict, immutable Updates
|
||||
- **Checkpointing** = State-Persistenz mit Time-Travel-Debugging
|
||||
|
||||
**Multi-Agent-Patterns:** Subagents, Handoffs, Router, Supervisor, Scatter-Gather, Subgraphs.
|
||||
|
||||
### 2.5 CrewAI
|
||||
|
||||
**Kernarchitektur:**
|
||||
- **Agent** = role + goal + backstory + tools + LLM
|
||||
- **Task** = Beschreibung + erwartetes Output-Format + Guardrails + HITL
|
||||
- **Crew** = Agents + Tasks + Prozesstyp (sequential, hierarchical, consensual)
|
||||
- **Flows** = Event-driven Orchestrierung für Produktion
|
||||
- **Memory** = 4 Typen: Short-term, Long-term (Embeddings), Entity, Contextual
|
||||
|
||||
### 2.6 Microsoft Agent Framework
|
||||
|
||||
**Vereinigung von AutoGen + Semantic Kernel:**
|
||||
- **Agents** = LLM + Tools + MCP-Server + Middleware
|
||||
- **Workflows** = Graph-basiert mit Type-safe Routing, Checkpointing, HITL
|
||||
- **Sessions** = Enterprise-grade State Management
|
||||
- Multi-Provider (Anthropic, Azure OpenAI, OpenAI, Ollama)
|
||||
|
||||
---
|
||||
|
||||
## 3. Vergleichsmatrix
|
||||
|
||||
| Dimension | Mana | A2A | MCP | OpenAI SDK | LangGraph | CrewAI |
|
||||
|-----------|------|-----|-----|------------|-----------|--------|
|
||||
| **Agent-Definition** | Agent(name, role, systemPrompt, memory, policy) | Agent Card (JSON, signiert) | N/A (Protokoll) | Agent(instructions, tools, handoffs) | Node-Funktionen | Agent(role, goal, backstory) |
|
||||
| **Tool-Registration** | Hardcoded Allow-List (13 Tools) | Skills in Agent Card | tools/list + tools/call | @function_tool + MCP | Node context | tools= Parameter |
|
||||
| **Agent↔Agent** | ❌ Nicht vorhanden | ✅ Kernzweck | ❌ Nicht designed dafür | Handoffs | Edges/Routing | Delegation + Hierarchie |
|
||||
| **Agent↔Tool** | Policy-gated Executor | Via MCP | ✅ Kernzweck | Function calls + MCP | Node-Aufrufe | Direkte Zuweisung |
|
||||
| **State/Memory** | LWW Sync + encrypted IndexedDB | Task contextId | Stateful Sessions | Sessions (SQLite/Redis) | StateGraph + Checkpoints | 4 Memory-Typen |
|
||||
| **Orchestrierung** | Dual-Runtime (Browser + Server Cron) | Task Lifecycle | Host koordiniert | Runner Loop | DAG Engine | Sequential/Hierarchical |
|
||||
| **Streaming** | ❌ Kein Streaming | SSE, gRPC, JSON-RPC | JSON-RPC Notifications | Built-in | Native Token-Stream | Log-basiert |
|
||||
| **Observability** | Prometheus Metrics + Debug Logs | Agent Cards Metadata | Server Logging | Built-in Tracing (OTel) | LangSmith | Built-in Logging |
|
||||
| **HITL** | Proposal-System (approve/reject) | INPUT_REQUIRED State | Elicitation | Guardrails | Interrupt/Resume | Task-Guardrails |
|
||||
| **Encryption** | ✅ AES-GCM + Key-Grants | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| **Local-First** | ✅ Dexie + Offline | ❌ Server-to-Server | ❌ | ❌ | ❌ | ❌ |
|
||||
| **Multi-Device** | ✅ LWW Sync | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
|
||||
---
|
||||
|
||||
## 4. Was Mana gut macht
|
||||
|
||||
### 4.1 Privacy-First Architektur (einzigartig in der Industrie)
|
||||
|
||||
Kein einziges der verglichenen Frameworks hat ein vergleichbares Verschlüsselungskonzept:
|
||||
- **AES-GCM-256 at rest** für 27 Tabellen
|
||||
- **Mission Key-Grants** mit HKDF-Scoping, Audit Trail, TTL
|
||||
- **Zero-Knowledge-Modus** optional
|
||||
- Server sieht nie Klartext ohne expliziten Grant
|
||||
|
||||
**Bewertung:** Das ist ein echter Wettbewerbsvorteil. A2A, MCP, OpenAI SDK — alle gehen davon aus, dass der Server alles sehen darf.
|
||||
|
||||
### 4.2 Dual-Runtime mit Graceful Degradation
|
||||
|
||||
- Browser-Runner funktioniert offline und hat Zugriff auf alle verschlüsselten Daten
|
||||
- Server-Runner läuft scheduled auch wenn kein Browser offen ist
|
||||
- Bei fehlendem Key-Grant degradiert Server gracefully → Browser übernimmt
|
||||
|
||||
**Bewertung:** Kein anderes Framework hat diese Browser/Server-Dualität. LangGraph und CrewAI sind rein serverseitig. OpenAI SDK hat keinen Offline-Modus.
|
||||
|
||||
### 4.3 Immutable Actor Attribution
|
||||
|
||||
Jeder Write trägt einen eingefrorenen Actor mit `kind`, `principalId`, `displayName`, `missionId`, `iterationId`, `rationale`. Das ist besser als jedes der verglichenen Frameworks:
|
||||
- OpenAI SDK hat Tracing, aber keine Write-Level Attribution
|
||||
- LangGraph hat State-Checkpoints, aber keine Actor-Zuordnung pro Mutation
|
||||
- CrewAI hat keine per-Write-Attribution
|
||||
|
||||
**Bewertung:** Exzellent für Audit, Undo, und Vertrauen. Ermöglicht "Wer hat was warum geändert?" auf Feldebene.
|
||||
|
||||
### 4.4 Proposal-System (Human-in-the-Loop)
|
||||
|
||||
Das dreistufige Policy-System (auto/propose/deny) mit dem Proposal-Lifecycle ist durchdacht:
|
||||
- Granular pro Tool konfigurierbar
|
||||
- User-Feedback bei Rejection fließt zurück in den Planner
|
||||
- Pro-Agent Policies (Phase 3)
|
||||
|
||||
**Vergleich:** OpenAI hat Guardrails (aber binär: pass/fail). A2A hat `INPUT_REQUIRED` (aber kein propose/approve-Workflow). LangGraph hat Interrupt/Resume (ähnlich, aber weniger formalisiert).
|
||||
|
||||
### 4.5 Local-First + Multi-Device Sync
|
||||
|
||||
Field-Level LWW über mana-sync ist einzigartig unter AI-Agent-Frameworks. Missions, Agents und ihre Ergebnisse synchen automatisch über Geräte.
|
||||
|
||||
---
|
||||
|
||||
## 5. Was Mana verbessern könnte
|
||||
|
||||
### 5.1 🔴 Kein Agent-to-Agent-Protokoll
|
||||
|
||||
**Problem:** Agents können nicht miteinander kommunizieren. Jeder Agent ist eine isolierte Insel. Ein "Cashflow Watcher" kann keinen "Todo Manager" bitten, eine Aufgabe zu erstellen.
|
||||
|
||||
**Was die Industrie macht:**
|
||||
- **A2A**: Agents discovern sich über Agent Cards und delegieren Tasks
|
||||
- **OpenAI SDK**: Handoffs als First-class Primitive
|
||||
- **LangGraph**: Edges zwischen Agent-Nodes
|
||||
- **CrewAI**: Hierarchical delegation + shared context
|
||||
|
||||
**Empfehlung:** Ein leichtgewichtiges internes Agent-to-Agent-Protokoll einführen:
|
||||
|
||||
```typescript
|
||||
interface AgentDelegation {
|
||||
fromAgentId: string;
|
||||
toAgentId: string;
|
||||
intent: ToolCallIntent; // Was soll der andere Agent tun?
|
||||
context: string; // Warum?
|
||||
policy: 'auto' | 'propose'; // User muss bestätigen?
|
||||
}
|
||||
```
|
||||
|
||||
Langfristig: A2A-kompatible Agent Cards für externe Integration (z.B. Mana-Agents mit Google Agents kommunizieren lassen).
|
||||
|
||||
### 5.2 🔴 Kein Streaming
|
||||
|
||||
**Problem:** Der aktuelle Flow ist Request/Response. User sieht nichts, bis die gesamte Iteration fertig ist.
|
||||
|
||||
**Was die Industrie macht:**
|
||||
- **A2A**: SSE + gRPC Streaming mit `TaskStatusUpdateEvent` und `TaskArtifactUpdateEvent`
|
||||
- **OpenAI SDK**: Built-in Token-Streaming
|
||||
- **LangGraph**: Native Token-by-Token Streaming
|
||||
- **MCP**: JSON-RPC Notifications
|
||||
|
||||
**Empfehlung:** SSE-basiertes Streaming für den Foreground Runner:
|
||||
1. Planner-Response streamen (Token für Token)
|
||||
2. Step-Ausführung live anzeigen ("Erstelle Task... ✓", "Tagge Note... ⏳")
|
||||
3. Server-Runner: SSE-Events an verbundene Clients pushen
|
||||
|
||||
### 5.3 🟡 Statisches Tool-System
|
||||
|
||||
**Problem:** 13 hardcoded Tools in einer Allow-List. Neue Tools erfordern Code-Änderungen an mindestens 3 Stellen (server tools.ts, shared-ai proposable-tools, executor).
|
||||
|
||||
**Was die Industrie macht:**
|
||||
- **MCP**: Dynamische Tool-Discovery via `tools/list`, Hot-Reload via `notifications/tools/list_changed`
|
||||
- **OpenAI SDK**: `@function_tool` Decorator, automatische Schema-Generierung
|
||||
- **LangGraph**: Tools als Node-Funktionen, dynamisch registrierbar
|
||||
- **A2A**: Skills in Agent Cards mit JSON-Schema
|
||||
|
||||
**Empfehlung:** MCP-kompatibles Tool-Registry einführen:
|
||||
```typescript
|
||||
// Jedes Modul registriert seine Tools deklarativ
|
||||
// apps/mana/apps/web/src/lib/modules/todo/ai-tools.ts
|
||||
export const todoTools: AiToolDefinition[] = [
|
||||
{
|
||||
name: 'create_task',
|
||||
description: 'Create a new task',
|
||||
inputSchema: { /* JSON Schema */ },
|
||||
module: 'todo',
|
||||
annotations: { destructiveHint: false, idempotentHint: false }
|
||||
}
|
||||
];
|
||||
```
|
||||
Module registrieren Tools via Registry → Planner bekommt dynamische Tool-Liste → Server synchronisiert über shared-ai Package. **Bonus:** MCP-Server-Export ermöglicht externe Tool-Nutzung.
|
||||
|
||||
### 5.4 🟡 Kein Graph-basierter Workflow
|
||||
|
||||
**Problem:** Der Reasoning Loop ist linear (Planner → Steps → optional Loop). Komplexe Workflows (Branching, Parallelisierung, Conditional Logic) sind nicht möglich.
|
||||
|
||||
**Was die Industrie macht:**
|
||||
- **LangGraph**: DAG mit Conditional Edges, Parallel Nodes, Subgraphs
|
||||
- **CrewAI**: Sequential + Hierarchical + Consensual Prozesse
|
||||
- **Microsoft Agent Framework**: Type-safe Graph Workflows
|
||||
|
||||
**Empfehlung:** Für die meisten Mana-Usecases ist der lineare Loop ausreichend. Aber für komplexere Missions (z.B. "Analysiere meine Finanzen, erstelle einen Bericht, und plane Aufgaben basierend auf dem Ergebnis") wäre ein einfacher DAG sinnvoll:
|
||||
|
||||
```typescript
|
||||
interface MissionGraph {
|
||||
nodes: MissionStep[]; // Jeder Node = ein Planner-Call oder Tool-Execution
|
||||
edges: MissionEdge[]; // Conditional routing
|
||||
parallelGroups?: string[][]; // Steps die parallel laufen können
|
||||
}
|
||||
```
|
||||
|
||||
**Priorität:** Niedrig. Der aktuelle Loop deckt 90% der Usecases ab.
|
||||
|
||||
### 5.5 🟡 Keine Agent-Discovery
|
||||
|
||||
**Problem:** Agents sind nur ihrem Ersteller bekannt. Kein Mechanismus für:
|
||||
- "Welche Agents gibt es?"
|
||||
- "Welcher Agent kann X?"
|
||||
- Automatische Delegation basierend auf Capabilities
|
||||
|
||||
**Was die Industrie macht:**
|
||||
- **A2A**: Agent Cards mit Skills + JSON-Schemas → automatische Discovery
|
||||
- **MCP**: Resource Discovery via URI-Schemes
|
||||
|
||||
**Empfehlung:** Agent Capabilities als strukturierte Metadaten:
|
||||
```typescript
|
||||
interface AgentCapability {
|
||||
module: string; // 'todo', 'notes', 'calendar'
|
||||
actions: string[]; // ['create', 'update', 'query']
|
||||
inputTypes: string[]; // Welche Inputs kann der Agent verarbeiten?
|
||||
}
|
||||
```
|
||||
Ermöglicht: "Finde den Agent, der Kalender-Events erstellen kann" → automatisches Routing.
|
||||
|
||||
### 5.6 🟡 Kein Guardrail-System
|
||||
|
||||
**Problem:** Policy (auto/propose/deny) ist ein Gating-Mechanismus, aber kein Guardrail. Es gibt keine:
|
||||
- Input-Validierung (ist der Prompt safe?)
|
||||
- Output-Validierung (ist das Ergebnis korrekt/sicher?)
|
||||
- Budget-Enforcement serverseitig (nur client-seitig)
|
||||
|
||||
**Was die Industrie macht:**
|
||||
- **OpenAI SDK**: Input + Output Guardrails parallel zur Execution
|
||||
- **CrewAI**: Task-Level Guardrails
|
||||
- **LangGraph**: Custom Validation Nodes
|
||||
|
||||
**Empfehlung:** Guardrails als separate Schicht:
|
||||
```typescript
|
||||
interface Guardrail {
|
||||
name: string;
|
||||
phase: 'pre-plan' | 'post-plan' | 'pre-execute' | 'post-execute';
|
||||
check: (context: GuardrailContext) => GuardrailResult;
|
||||
}
|
||||
// Beispiele:
|
||||
// - PII-Detection vor Planner-Call
|
||||
// - Budget-Check vor Execution
|
||||
// - Output-Schema-Validierung nach Tool-Call
|
||||
```
|
||||
|
||||
### 5.7 🟢 Tracing & Debugging verbessern
|
||||
|
||||
**Problem:** Debug-Info ist in `_aiDebugLog` (localStorage, nie gesynct) + Prometheus Metrics. Kein zusammenhängendes Tracing einer Mission über Browser + Server.
|
||||
|
||||
**Was die Industrie macht:**
|
||||
- **OpenAI SDK**: Built-in Tracing mit visuellen DAGs, OpenTelemetry Export
|
||||
- **LangGraph**: LangSmith Integration, Step-by-Step Debugging, Time-Travel
|
||||
- **Microsoft**: Semantic Kernel Telemetry
|
||||
|
||||
**Empfehlung:** OpenTelemetry-Spans für den gesamten Mission-Lifecycle:
|
||||
```
|
||||
Mission.run (span)
|
||||
├── Input.resolve (span, per input)
|
||||
├── Planner.call (span, mit prompt + response)
|
||||
├── Step[0].execute (span, tool name + params + result)
|
||||
├── Step[1].propose (span, proposal created)
|
||||
└── Iteration.write (span, sync write)
|
||||
```
|
||||
Exportierbar nach Grafana Tempo oder ähnlichem.
|
||||
|
||||
---
|
||||
|
||||
## 6. Strategische Empfehlungen (priorisiert)
|
||||
|
||||
### Kurzfristig (nächste 2-4 Wochen)
|
||||
|
||||
| # | Maßnahme | Aufwand | Impact |
|
||||
|---|----------|---------|--------|
|
||||
| 1 | **Streaming für Foreground Runner** — SSE vom Planner, live Step-Status im UI | Mittel | Hoch — UX-Sprung |
|
||||
| 2 | **Dynamisches Tool-Registry** — Module registrieren Tools deklarativ, Server synchronisiert | Mittel | Hoch — Skalierbarkeit |
|
||||
| 3 | **Budget-Enforcement serverseitig** — Token-Counting pro Agent im tick.ts | Klein | Mittel — Sicherheit |
|
||||
|
||||
### Mittelfristig (1-3 Monate)
|
||||
|
||||
| # | Maßnahme | Aufwand | Impact |
|
||||
|---|----------|---------|--------|
|
||||
| 4 | **Agent-to-Agent Delegation** — Internes Protokoll, ein Agent kann einen anderen beauftragen | Groß | Hoch — Multi-Agent |
|
||||
| 5 | **MCP-Server-Export** — Mana-Tools als MCP-Server exponieren | Mittel | Hoch — Ökosystem |
|
||||
| 6 | **Guardrail-Layer** — Pre/Post-Execution Checks | Mittel | Mittel — Sicherheit |
|
||||
| 7 | **OpenTelemetry Tracing** — End-to-End Mission Spans | Mittel | Mittel — Debugging |
|
||||
|
||||
### Langfristig (3-6 Monate)
|
||||
|
||||
| # | Maßnahme | Aufwand | Impact |
|
||||
|---|----------|---------|--------|
|
||||
| 8 | **A2A-kompatible Agent Cards** — Mana-Agents extern discoverable machen | Groß | Hoch — Interop |
|
||||
| 9 | **Graph-basierte Workflows** — DAG für komplexe Missions | Groß | Mittel — Power-User |
|
||||
| 10 | **Agent Memory (Embeddings)** — Long-term Memory à la CrewAI | Groß | Mittel — Intelligenz |
|
||||
|
||||
---
|
||||
|
||||
## 7. Architektur-Diagramm: Ist vs. Soll
|
||||
|
||||
### Ist-Zustand
|
||||
```
|
||||
┌──────────────────────────────────────────────────────┐
|
||||
│ Browser (Foreground Runner) │
|
||||
│ Mission → Planner → Policy Gate → Execute/Propose │
|
||||
│ ↕ Dexie (encrypted) ↕ mana-sync (LWW) │
|
||||
└──────────────────┬───────────────────────────────────┘
|
||||
│ sync_changes
|
||||
┌──────────────────▼───────────────────────────────────┐
|
||||
│ mana-ai (Background Runner) │
|
||||
│ Tick → Due Missions → Planner → Write Iteration │
|
||||
│ ↕ PostgreSQL (RLS) ↕ mana-llm │
|
||||
└──────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
### Soll-Zustand (mittelfristig)
|
||||
```
|
||||
┌──────────────────────────────────────────────────────────────┐
|
||||
│ Browser │
|
||||
│ Mission → [Guardrails] → Planner (streaming) → Policy Gate │
|
||||
│ Agent A ←→ Agent B (delegation) │
|
||||
│ ↕ Dexie (encrypted) ↕ mana-sync (LWW + SSE) │
|
||||
└──────────────────┬───────────────────────────────────────────┘
|
||||
│ sync_changes + OTel spans
|
||||
┌──────────────────▼───────────────────────────────────────────┐
|
||||
│ mana-ai (Background Runner) │
|
||||
│ Tick → [Budget Check] → Due Missions → Planner → Write │
|
||||
│ Tool Registry (dynamic, MCP-compatible) │
|
||||
│ ↕ PostgreSQL (RLS) ↕ mana-llm ↕ Grafana Tempo │
|
||||
└──────────────────┬───────────────────────────────────────────┘
|
||||
│ A2A Agent Cards (langfristig)
|
||||
┌──────────────────▼───────────────────────────────────────────┐
|
||||
│ Externe Agents (Google ADK, OpenAI, etc.) │
|
||||
│ Discovery via /.well-known/a2a/agent-card │
|
||||
└──────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 8. Fazit
|
||||
|
||||
**Mana's AI-Agent-Architektur ist in drei Bereichen führend:**
|
||||
1. **Privacy/Encryption** — Kein vergleichbares Framework hat Key-Grants, Zero-Knowledge, oder at-rest Encryption für Agent-Daten
|
||||
2. **Local-First + Multi-Device** — Einzigartige Dual-Runtime mit Offline-Fähigkeit
|
||||
3. **Actor Attribution** — Bestes Audit-Trail-System unter allen verglichenen Frameworks
|
||||
|
||||
**Die größten Lücken gegenüber der Industrie:**
|
||||
1. **Kein Agent-to-Agent** — Die wichtigste fehlende Capability für echte Multi-Agent-Systeme
|
||||
2. **Kein Streaming** — Standard in allen modernen Frameworks, fehlt komplett
|
||||
3. **Statisches Tool-System** — Skaliert nicht, wenn neue Module Tools brauchen
|
||||
|
||||
**Die Kernempfehlung:** Mana sollte nicht versuchen, ein General-Purpose-Agent-Framework zu werden (das machen LangGraph/CrewAI besser). Stattdessen die einzigartigen Stärken (Privacy, Local-First, Attribution) ausbauen und gezielt die Industrie-Standards adoptieren, die den größten UX-Impact haben: **Streaming**, **dynamische Tools**, und **Agent-Delegation**.
|
||||
|
||||
MCP-Kompatibilität als mittelfristiges Ziel ist strategisch richtig — es ist das Protokoll, das sich als Standard für Agent↔Tool durchgesetzt hat (97M Downloads/Monat). A2A für Agent↔Agent ist das natürliche Pendant, aber erst relevant, wenn interne Multi-Agent-Kommunikation steht.
|
||||
|
|
@ -30,6 +30,13 @@ export interface AiPlanInput {
|
|||
readonly mission: Mission;
|
||||
readonly resolvedInputs: readonly ResolvedInput[];
|
||||
readonly availableTools: readonly AvailableTool[];
|
||||
/**
|
||||
* Optional streaming callback — called with each token delta as it
|
||||
* arrives from the LLM. The Runner uses this to show live progress
|
||||
* during the "calling-llm" phase. Only effective when the backend
|
||||
* supports streaming (mana-server / cloud tiers).
|
||||
*/
|
||||
readonly onToken?: (delta: string) => void;
|
||||
}
|
||||
|
||||
export interface PlannedStep {
|
||||
|
|
|
|||
|
|
@ -7,7 +7,8 @@
|
|||
"main": "./src/index.ts",
|
||||
"types": "./src/index.ts",
|
||||
"exports": {
|
||||
".": "./src/index.ts"
|
||||
".": "./src/index.ts",
|
||||
"./sse-parser": "./src/sse-parser.ts"
|
||||
},
|
||||
"scripts": {
|
||||
"type-check": "tsc --noEmit",
|
||||
|
|
|
|||
|
|
@ -7,13 +7,12 @@
|
|||
*
|
||||
* The endpoint is `/v1/chat/completions` and the wire format is
|
||||
* straight OpenAI SSE: `data: {…}\n\n` lines, terminated by
|
||||
* `data: [DONE]`. The hand-rolled parser is the same shape as the
|
||||
* existing playground client (apps/mana/apps/web/src/lib/modules/
|
||||
* playground/llm.ts) so the two consumers stay aligned and can be
|
||||
* unified later if we want.
|
||||
* `data: [DONE]`. The SSE parser lives in `../sse-parser.ts` and is
|
||||
* shared with the playground client.
|
||||
*/
|
||||
|
||||
import { BackendUnreachableError, ProviderBlockedError } from '../errors';
|
||||
import { consumeSSEStream } from '../sse-parser';
|
||||
import type { LlmTier } from '../tiers';
|
||||
import type { GenerateResult, LlmTaskRequest } from '../types';
|
||||
|
||||
|
|
@ -33,31 +32,20 @@ export function resolveLlmBaseUrl(): string {
|
|||
/**
|
||||
* Send a chat completion to mana-llm and return the result.
|
||||
*
|
||||
* Implementation notes:
|
||||
* When `req.onToken` is set, uses SSE streaming (`stream: true`) so
|
||||
* the caller receives per-token callbacks as they arrive — used by the
|
||||
* Mission Runner to show live progress during the "calling-llm" phase.
|
||||
*
|
||||
* - We use the NON-streaming endpoint (`stream: false`). Curl tests
|
||||
* from the same hostname showed that mana-llm's streaming endpoint
|
||||
* works perfectly when called from outside the browser, but the
|
||||
* browser receives `totalFrames=0` (an empty response body) for
|
||||
* reasons that almost certainly trace back to CORS + credentials
|
||||
* + streaming-body interactions. Non-streaming is a single JSON
|
||||
* response, much friendlier to the browser fetch API.
|
||||
* When `req.onToken` is absent, uses the non-streaming endpoint
|
||||
* (`stream: false`) which returns a single JSON response — simpler and
|
||||
* sufficient for tasks that only care about the final result.
|
||||
*
|
||||
* - We do NOT pass `credentials: 'include'`. The mana-llm service
|
||||
* doesn't require user auth (the API key middleware accepts
|
||||
* anonymous requests), and `credentials: 'include'` plus
|
||||
* `Access-Control-Allow-Origin: *` is one of the patterns that
|
||||
* silently breaks the response body in browsers. Verified by
|
||||
* comparing curl-from-server (no creds, works) vs browser fetch
|
||||
* (with creds, empty body).
|
||||
*
|
||||
* - For tasks that registered an `onToken` callback (legacy chat-
|
||||
* style streaming UX), we fire it ONCE with the full content at
|
||||
* the end. That's a degraded streaming experience, but no current
|
||||
* shared-llm caller actually consumes the per-token stream — the
|
||||
* queue + watcher model only cares about the final result. The
|
||||
* playground module uses its own client (apps/.../modules/
|
||||
* playground/llm.ts) which keeps real streaming for live UX.
|
||||
* We do NOT pass `credentials: 'include'` — the mana-llm service
|
||||
* accepts anonymous requests, and `credentials: 'include'` plus
|
||||
* `Access-Control-Allow-Origin: *` silently breaks the response body
|
||||
* in browsers (verified by comparing curl vs browser fetch). The
|
||||
* playground module uses the same no-credentials pattern with
|
||||
* `stream: true` and it works fine.
|
||||
*
|
||||
* `tier` is only used for error tagging — both 'mana-server' and
|
||||
* 'cloud' call the same endpoint with different model strings.
|
||||
|
|
@ -69,6 +57,7 @@ export async function callManaLlmStreaming(
|
|||
): Promise<GenerateResult> {
|
||||
const url = `${resolveLlmBaseUrl()}/v1/chat/completions`;
|
||||
const start = performance.now();
|
||||
const useStreaming = typeof req.onToken === 'function';
|
||||
|
||||
let res: Response;
|
||||
try {
|
||||
|
|
@ -80,7 +69,7 @@ export async function callManaLlmStreaming(
|
|||
messages: req.messages,
|
||||
temperature: req.temperature ?? 0.7,
|
||||
max_tokens: req.maxTokens ?? 1024,
|
||||
stream: false,
|
||||
stream: useStreaming,
|
||||
}),
|
||||
});
|
||||
} catch (err) {
|
||||
|
|
@ -102,6 +91,25 @@ export async function callManaLlmStreaming(
|
|||
throw new BackendUnreachableError(tier, res.status, text);
|
||||
}
|
||||
|
||||
// ── Streaming path: SSE with per-token callbacks ───────────
|
||||
if (useStreaming && res.body) {
|
||||
let usage = { promptTokens: 0, completionTokens: 0, totalTokens: 0 };
|
||||
const content = await consumeSSEStream(res.body, req.onToken, (u) => {
|
||||
usage = {
|
||||
promptTokens: u.promptTokens,
|
||||
completionTokens: u.completionTokens,
|
||||
totalTokens: u.promptTokens + u.completionTokens,
|
||||
};
|
||||
});
|
||||
|
||||
if (!content) {
|
||||
console.warn(`[shared-llm:${tier}] empty streaming content`, { model });
|
||||
}
|
||||
|
||||
return { content, usage, latencyMs: Math.round(performance.now() - start) };
|
||||
}
|
||||
|
||||
// ── Non-streaming path: single JSON response ──────────────
|
||||
let json: {
|
||||
choices?: Array<{
|
||||
message?: { content?: string; reasoning?: string };
|
||||
|
|
@ -119,12 +127,7 @@ export async function callManaLlmStreaming(
|
|||
// Field ordering: prefer the canonical OpenAI `message.content` first.
|
||||
// If that's empty AND `message.reasoning` is set, fall back to it —
|
||||
// reasoning models like Gemma 4 emit their thought process there
|
||||
// when given too few tokens to also produce a final answer (we hit
|
||||
// this with max_tokens=10 / no system prompt: content was "" while
|
||||
// reasoning had the half-finished thought). For our title task this
|
||||
// rarely happens because the system prompt is directive, but the
|
||||
// fallback is cheap and protects against future tasks that might
|
||||
// trigger longer reasoning chains.
|
||||
// when given too few tokens to also produce a final answer.
|
||||
const choice = json.choices?.[0];
|
||||
const content = choice?.message?.content ?? choice?.message?.reasoning ?? choice?.text ?? '';
|
||||
|
||||
|
|
@ -132,14 +135,6 @@ export async function callManaLlmStreaming(
|
|||
console.warn(`[shared-llm:${tier}] empty completion content`, { model, json });
|
||||
}
|
||||
|
||||
// One-shot "streaming" for any caller that registered onToken: emit
|
||||
// the whole content as a single chunk at the end. The current
|
||||
// orchestrator + queue model never reads tokens incrementally for
|
||||
// remote tiers anyway.
|
||||
if (content && req.onToken) {
|
||||
req.onToken(content);
|
||||
}
|
||||
|
||||
return {
|
||||
content,
|
||||
usage: {
|
||||
|
|
|
|||
74
packages/shared-llm/src/sse-parser.ts
Normal file
74
packages/shared-llm/src/sse-parser.ts
Normal file
|
|
@ -0,0 +1,74 @@
|
|||
/**
|
||||
* Shared SSE parser for OpenAI-compatible streaming responses.
|
||||
*
|
||||
* The wire format from mana-llm is straight OpenAI: `data: {…}\n\n`
|
||||
* lines with a sentinel `data: [DONE]`. This ~40-line reader is simpler
|
||||
* than adding a dependency and is shared between the LLM orchestrator
|
||||
* (backends/remote.ts) and the playground module.
|
||||
*/
|
||||
|
||||
export interface SSEUsage {
|
||||
readonly promptTokens: number;
|
||||
readonly completionTokens: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Consume a ReadableStream of SSE chunks from an OpenAI-compatible
|
||||
* `/v1/chat/completions` endpoint.
|
||||
*
|
||||
* Calls `onDelta` for each content token and `onUsage` (if provided)
|
||||
* when the final usage stats arrive. Returns the accumulated full
|
||||
* content string once the stream is done.
|
||||
*/
|
||||
export async function consumeSSEStream(
|
||||
body: ReadableStream<Uint8Array>,
|
||||
onDelta?: (content: string) => void,
|
||||
onUsage?: (usage: SSEUsage) => void
|
||||
): Promise<string> {
|
||||
const reader = body.getReader();
|
||||
const decoder = new TextDecoder();
|
||||
let buffer = '';
|
||||
let content = '';
|
||||
|
||||
while (true) {
|
||||
const { value, done } = await reader.read();
|
||||
if (done) break;
|
||||
buffer += decoder.decode(value, { stream: true });
|
||||
|
||||
// SSE frames are separated by blank lines. Process complete frames
|
||||
// and leave any partial trailing frame in the buffer.
|
||||
let sep: number;
|
||||
while ((sep = buffer.indexOf('\n\n')) !== -1) {
|
||||
const frame = buffer.slice(0, sep);
|
||||
buffer = buffer.slice(sep + 2);
|
||||
|
||||
for (const line of frame.split('\n')) {
|
||||
if (!line.startsWith('data:')) continue;
|
||||
const data = line.slice(5).trim();
|
||||
if (!data || data === '[DONE]') continue;
|
||||
try {
|
||||
const json = JSON.parse(data) as {
|
||||
choices?: Array<{ delta?: { content?: string } }>;
|
||||
usage?: { prompt_tokens?: number; completion_tokens?: number };
|
||||
};
|
||||
const delta = json.choices?.[0]?.delta?.content;
|
||||
if (delta) {
|
||||
content += delta;
|
||||
onDelta?.(delta);
|
||||
}
|
||||
if (json.usage?.prompt_tokens != null) {
|
||||
onUsage?.({
|
||||
promptTokens: json.usage.prompt_tokens,
|
||||
completionTokens: json.usage.completion_tokens ?? 0,
|
||||
});
|
||||
}
|
||||
} catch {
|
||||
// Malformed frame — skip silently. mana-llm occasionally
|
||||
// emits keepalive comments.
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return content;
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue