mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-23 17:06:41 +02:00
Final milestone of docs/plans/llm-fallback-aliases.md. Every backend
caller now requests models via the `mana/<class>` alias system instead
of hardcoded `ollama/...` strings. mana-llm resolves aliases through
`services/mana-llm/aliases.yaml` with health-aware fallback (M3) and
emits resolved-model + fallback metrics (M4).
SSOT moved to `packages/shared-ai/src/llm-aliases.ts` so apps/api,
apps/mana/apps/web, and services/mana-ai all import the same
`MANA_LLM` constant via the existing `@mana/shared-ai` workspace
dependency. Three additional sites (memoro-server, mana-events,
mana-research) inline the alias string with a SSOT comment because
they don't pull @mana/shared-ai today.
Migrated 14 sites across 10 files:
- apps/api: writing(LONG_FORM), comic(STRUCTURED), context(FAST_TEXT),
food(VISION), plants(VISION), research orchestrator (3 tiers
collapsed to STRUCTURED+FAST_TEXT/LONG_FORM)
- apps/mana/apps/web: voice/parse-task + parse-habit (STRUCTURED)
- services/mana-ai: planner llm-client + tick.ts (REASONING)
- services/mana-events: website-extractor (STRUCTURED, inlined)
- services/mana-research: mana-llm client (FAST_TEXT, inlined)
- apps/memoro/apps/server: ai.ts (FAST_TEXT, inlined)
Legacy env-vars removed: WRITING_MODEL, COMIC_STORYBOARD_MODEL,
VISION_MODEL, MANA_LLM_DEFAULT_MODEL. The chain in aliases.yaml is
now the single tuning surface; SIGHUP reloads it without redeploys.
New `scripts/validate-llm-strings.mjs` regex-scans 2538 files for
hardcoded `<provider>/<model>` strings and fails the build if any
land outside the SSOT or the explicitly-allowed paths (image-gen
modules, model-inspector code, this validator itself, the registry).
Wired into `validate:all` next to the i18n + theme validators.
Verified: `pnpm validate:llm-strings` clean, `pnpm --filter @mana/api
type-check` clean, `pnpm --filter @mana/ai-service type-check`
clean. Web type-check has 2 pre-existing errors in
SettingsSidebar.svelte (i18n MessageFormatter type drift, last
touched in 988c17a67 — unrelated to this work).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
239 lines
7.3 KiB
TypeScript
239 lines
7.3 KiB
TypeScript
/**
|
|
* Website Extractor — LLM-based event extraction from unstructured web pages.
|
|
*
|
|
* Pipeline:
|
|
* 1. Crawl the page via mana-research POST /api/v1/extract
|
|
* 2. Feed the extracted text to mana-llm with a structured output prompt
|
|
* 3. Parse the LLM response as NormalizedEvent[]
|
|
*
|
|
* Requests the `mana/structured` alias; mana-llm resolves it to a concrete model (intended to be a cheap/fast one, e.g. gemma3:4b or haiku, to keep costs low).
|
|
* Falls back gracefully on any failure — one bad page doesn't crash the batch.
|
|
*/
|
|
|
|
import type { NormalizedEvent } from './types';
|
|
|
|
/** Abort the mana-research extract request after this many milliseconds. */
const EXTRACT_TIMEOUT_MS = 20_000;

/** Abort the mana-llm chat-completion request after this many milliseconds. */
const LLM_TIMEOUT_MS = 30_000;

/** Page text is cut to this many characters so the prompt stays within the context window. */
const MAX_CONTENT_CHARS = 15_000;
|
|
|
|
/**
 * Shape this module expects from the mana-research extract endpoint.
 *
 * The response is cast to this type, not validated against it, so every field
 * should be treated as possibly missing at runtime.
 */
interface ExtractResponse {
  /** Checked before the payload is used; a falsy value makes the caller skip the page. */
  success: boolean;
  data?: {
    /** The extracted page in the representations the service may return. */
    content: {
      title?: string;
      text?: string;
      markdown?: string;
      html?: string;
    };
  };
}
|
|
|
|
/**
 * Minimal slice of the chat-completions response body that this module reads.
 *
 * Only the message content of each choice is declared; the response is cast to
 * this type rather than validated.
 */
interface ChatCompletionResponse {
  choices: Array<{
    message: {
      /** Model output; the first choice's content is parsed as the JSON event list. */
      content: string;
    };
  }>;
}
|
|
|
|
/**
 * Extract events from a single website URL.
 *
 * Runs two steps in sequence:
 *  1. `fetchPageContent` asks mana-research for the page text.
 *  2. `llmExtractEvents` sends that text to mana-llm and parses the reply
 *     into `NormalizedEvent[]`.
 *
 * @param url - Page to extract from; also passed on as the events' source URL.
 * @param sourceName - Label of the source, forwarded to the LLM step.
 * @param manaResearchUrl - Base URL of the mana-research service.
 * @param manaLlmUrl - Base URL of the mana-llm service.
 * @returns The extracted events, or an empty array when no page content could
 *          be fetched.
 */
export async function extractEventsFromWebsite(
  url: string,
  sourceName: string,
  manaResearchUrl: string,
  manaLlmUrl: string
): Promise<NormalizedEvent[]> {
  // Step 1: page content. Nothing usable means nothing to extract.
  const content = await fetchPageContent(url, manaResearchUrl);
  if (!content) return [];

  // Step 2: LLM extraction.
  return llmExtractEvents(content, url, sourceName, manaLlmUrl);
}
|
|
|
|
/**
 * Fetch a page's text content through mana-research.
 *
 * POSTs `{ url }` to `<manaResearchUrl>/api/v1/extract` and uses the first
 * non-blank representation, preferring markdown over plain text. The `html`
 * field of the response is not used.
 *
 * Best-effort: a non-2xx status, an unsuccessful payload, a timeout or any
 * thrown error results in `null` (failures are logged with `console.warn`)
 * instead of an exception.
 *
 * @param url - Page to extract.
 * @param manaResearchUrl - Base URL of the mana-research service.
 * @returns The page text cut to `MAX_CONTENT_CHARS`, or `null` when nothing
 *          usable (fewer than 50 characters) was obtained.
 */
async function fetchPageContent(url: string, manaResearchUrl: string): Promise<string | null> {
  try {
    const res = await fetch(`${manaResearchUrl}/api/v1/extract`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ url }),
      signal: AbortSignal.timeout(EXTRACT_TIMEOUT_MS),
    });

    if (!res.ok) {
      console.warn(`[website-extractor] extract failed ${res.status}: ${url}`);
      return null;
    }

    const data = (await res.json()) as ExtractResponse;
    const content = data.success ? data.data?.content : undefined;
    if (!content) return null;

    // Prefer markdown, then plain text. Each candidate is trimmed first so a
    // whitespace-only markdown field cannot shadow usable text.
    const text = content.markdown?.trim() || content.text?.trim() || '';
    if (text.length < 50) return null; // Too short to contain events

    // Cut to stay within the LLM context window.
    return text.slice(0, MAX_CONTENT_CHARS);
  } catch (err) {
    console.warn(`[website-extractor] fetch error for ${url}:`, err);
    return null;
  }
}
|
|
|
|
/**
 * Build the (German) system prompt for event extraction.
 *
 * The prompt lists the fields expected per event, states today's date and asks
 * for a JSON object with an `events` array. The date is the UTC calendar date
 * of `now`, because it is taken from `toISOString()`.
 *
 * @param now - Reference time for "today"; defaults to the current time. Making
 *              it injectable keeps the prompt deterministic under test.
 * @returns The complete system prompt.
 * @throws RangeError if `now` is an invalid Date (raised by `toISOString()`).
 */
function buildExtractionPrompt(now: Date = new Date()): string {
  const today = now.toISOString().slice(0, 10);
  return `Du bist ein Event-Extractor. Extrahiere ALLE kommenden Veranstaltungen von der gegebenen Webseite.

Pro Event liefere:
- title (string, Pflicht) — Name der Veranstaltung
- date (string, Pflicht) — Startdatum im Format YYYY-MM-DD
- time (string, optional) — Startzeit im Format HH:MM
- endDate (string, optional) — Enddatum falls mehrtägig
- endTime (string, optional) — Endzeit
- location (string, optional) — Veranstaltungsort / Adresse
- description (string, optional) — Kurzbeschreibung, max 300 Zeichen
- category (string, optional) — Eine von: music, theater, art, tech, sport, food, family, nature, education, community, nightlife, market, other
- priceInfo (string, optional) — Preis, z.B. "Eintritt frei", "15 EUR", "VVK 12 / AK 15"

Heutiges Datum: ${today}
Ignoriere vergangene Events (vor ${today}).
Antwort als JSON-Objekt mit einem "events"-Array. Kein Markdown, nur JSON.`;
}
|
|
|
|
/**
 * Send page content to mana-llm for structured event extraction.
 *
 * POSTs a chat-completion request to `<manaLlmUrl>/v1/chat/completions` with
 * the extraction system prompt, asks for a JSON object response, and hands the
 * first choice's content to `parseExtractedEvents`.
 *
 * Best-effort: a non-2xx status, an empty completion, a timeout or any thrown
 * error yields an empty array (logged with `console.warn`), never an exception.
 *
 * @param pageContent - Page text to extract events from.
 * @param sourceUrl - URL the text came from; forwarded to the parser and
 *                    included in warnings.
 * @param sourceName - Label of the source; forwarded to the parser.
 * @param manaLlmUrl - Base URL of the mana-llm service.
 * @returns The parsed events, or an empty array on any failure.
 */
async function llmExtractEvents(
  pageContent: string,
  sourceUrl: string,
  sourceName: string,
  manaLlmUrl: string
): Promise<NormalizedEvent[]> {
  try {
    const res = await fetch(`${manaLlmUrl}/v1/chat/completions`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        // JSON event extraction → STRUCTURED alias (resolved by mana-llm).
        // SSOT: packages/shared-ai/src/llm-aliases.ts. Inlined because
        // mana-events doesn't depend on @mana/shared-ai today.
        model: 'mana/structured',
        messages: [
          { role: 'system', content: buildExtractionPrompt() },
          { role: 'user', content: `Extrahiere Events von dieser Seite:\n\n${pageContent}` },
        ],
        max_tokens: 2048,
        temperature: 0,
        response_format: { type: 'json_object' },
      }),
      signal: AbortSignal.timeout(LLM_TIMEOUT_MS),
    });

    if (!res.ok) {
      console.warn(`[website-extractor] LLM failed ${res.status}: ${sourceUrl}`);
      return [];
    }

    const completion = (await res.json()) as ChatCompletionResponse;
    const rawJson = completion.choices?.[0]?.message?.content ?? '';

    // An empty completion would otherwise surface as a misleading JSON parse error.
    if (typeof rawJson !== 'string' || rawJson.trim() === '') {
      console.warn(`[website-extractor] LLM returned no content: ${sourceUrl}`);
      return [];
    }

    return parseExtractedEvents(rawJson, sourceUrl, sourceName);
  } catch (err) {
    console.warn(`[website-extractor] LLM error for ${sourceUrl}:`, err);
    return [];
  }
}
|
|
|
|
/**
 * Coerce an optional LLM-supplied field to trimmed text.
 * Blank values and anything that is not a string or number become `null`.
 */
function toOptionalText(value: unknown, maxLength?: number): string | null {
  if (typeof value !== 'string' && typeof value !== 'number') return null;
  const text = String(value).trim();
  if (text === '') return null;
  return maxLength === undefined ? text : text.slice(0, maxLength);
}

/**
 * Parse and validate LLM JSON output into NormalizedEvents.
 *
 * Accepts either `{ "events": [...] }` or a bare top-level array, optionally
 * wrapped in a markdown code fence. Each entry is validated on its own, so one
 * malformed entry is skipped instead of discarding the whole batch.
 *
 * An entry is dropped when it is not an object, has no usable title, has no
 * string `date`, has an unparseable date, or started more than 24 hours ago.
 *
 * @param rawJson - Raw model output.
 * @param sourceUrl - Stored on every returned event as `sourceUrl`.
 * @param sourceName - Currently unused; kept so existing call sites stay valid.
 * @returns The valid events; an empty array when the payload is not parseable
 *          JSON or contains no event array.
 */
export function parseExtractedEvents(
  rawJson: string,
  sourceUrl: string,
  sourceName: string
): NormalizedEvent[] {
  let rawEvents: unknown;
  try {
    // Strip markdown fences if present
    const cleaned = rawJson.replace(/^```(?:json)?\s*\n?/m, '').replace(/\n?```\s*$/m, '');
    const parsed = JSON.parse(cleaned);
    rawEvents = parsed?.events ?? parsed;
  } catch (err) {
    console.warn(`[website-extractor] JSON parse error:`, err);
    return [];
  }

  if (!Array.isArray(rawEvents)) return [];

  // Events that started within the last 24 hours are still kept.
  const cutoff = Date.now() - 24 * 60 * 60 * 1000;
  const events: NormalizedEvent[] = [];

  for (const raw of rawEvents) {
    // The array comes from model output: entries may be null, numbers, strings…
    if (raw === null || typeof raw !== 'object') continue;

    const title = toOptionalText(raw.title, 200);
    if (title === null || typeof raw.date !== 'string') continue;

    // Only string times are passed on; anything else is treated as "no time given".
    const time = typeof raw.time === 'string' ? raw.time : undefined;

    // Parse date — LLMs sometimes return "25. April 2026" instead of ISO
    const startAt = parseFlexibleDate(raw.date, time);
    if (!startAt || isNaN(startAt.getTime())) continue;

    // Skip past events
    if (startAt.getTime() < cutoff) continue;

    const parsedEnd =
      typeof raw.endDate === 'string' && raw.endDate !== ''
        ? parseFlexibleDate(raw.endDate, typeof raw.endTime === 'string' ? raw.endTime : undefined)
        : null;
    // Never emit an Invalid Date as the end; fall back to "no end" instead.
    const endAt = parsedEnd && !isNaN(parsedEnd.getTime()) ? parsedEnd : null;

    events.push({
      title,
      description: toOptionalText(raw.description, 2000),
      location: toOptionalText(raw.location),
      startAt,
      endAt,
      allDay: !time,
      sourceUrl,
      category: raw.category ?? null,
      priceInfo: toOptionalText(raw.priceInfo),
    });
  }

  return events;
}
|
|
|
|
/** Lower-cased German month names and common abbreviations mapped to month numbers (1–12). */
const GERMAN_MONTHS: ReadonlyMap<string, number> = new Map<string, number>([
  ['januar', 1], ['jänner', 1], ['jan', 1],
  ['februar', 2], ['feber', 2], ['feb', 2],
  ['märz', 3], ['maerz', 3], ['mär', 3], ['mrz', 3],
  ['april', 4], ['apr', 4],
  ['mai', 5],
  ['juni', 6], ['jun', 6],
  ['juli', 7], ['jul', 7],
  ['august', 8], ['aug', 8],
  ['september', 9], ['sept', 9], ['sep', 9],
  ['oktober', 10], ['okt', 10],
  ['november', 11], ['nov', 11],
  ['dezember', 12], ['dez', 12],
]);

/**
 * Extract an `H:MM` / `HH:MM` clock time from free text.
 * Returns `null` for missing, non-string or out-of-range input
 * (valid: 00:00–23:59, plus 24:00).
 */
function parseClock(timeStr?: string): { hours: number; minutes: number } | null {
  // The value originates from model-produced JSON, so the static type is not guaranteed.
  if (typeof timeStr !== 'string') return null;
  // Digit guards keep "123:45" from being read as "23:45".
  const match = timeStr.match(/(?<!\d)(\d{1,2}):(\d{2})(?!\d)/);
  if (!match) return null;
  const hours = Number(match[1]);
  const minutes = Number(match[2]);
  const inRange = (hours <= 23 && minutes <= 59) || (hours === 24 && minutes === 0);
  return inRange ? { hours, minutes } : null;
}

/**
 * Build a local-time Date from calendar parts plus an optional clock time.
 * Returns `null` when the parts do not name a real calendar day (e.g. 30 February).
 */
function buildLocalDate(year: number, month: number, day: number, timeStr?: string): Date | null {
  const date = new Date(year, month - 1, day);
  // The Date constructor silently rolls over impossible dates; detect that via round-trip.
  const isRealDay =
    date.getFullYear() === year && date.getMonth() === month - 1 && date.getDate() === day;
  if (!isRealDay) return null;

  const clock = parseClock(timeStr);
  if (clock) date.setHours(clock.hours, clock.minutes, 0, 0);
  return date;
}

/**
 * Parse dates flexibly — handles ISO, German formats, and free-form dates.
 *
 * Tried in order:
 *  1. ISO prefix `YYYY-MM-DD` (anything after the date part is ignored)
 *  2. German numeric `D.M.YYYY`
 *  3. German month names, e.g. "25. April 2026", "3. Mai 2026", "24. Dez. 2026"
 *  4. Whatever the built-in `Date` parser accepts (here `timeStr` is not applied)
 *
 * For formats 1–3 the result is in local time, with `timeStr` supplying the
 * clock time and midnight used when it is missing or unparseable.
 *
 * @param dateStr - Date text; non-string or blank values yield `null`.
 * @param timeStr - Optional clock time such as "19:30".
 * @returns The parsed Date, or `null` when the text is not a valid date.
 */
function parseFlexibleDate(dateStr: string, timeStr?: string): Date | null {
  // The value originates from model-produced JSON, so the static type is not guaranteed.
  if (typeof dateStr !== 'string') return null;
  const text = dateStr.normalize('NFC').trim();
  if (text === '') return null;

  // ISO format first (YYYY-MM-DD)
  const iso = text.match(/^(\d{4})-(\d{2})-(\d{2})/);
  if (iso) {
    return buildLocalDate(Number(iso[1]), Number(iso[2]), Number(iso[3]), timeStr);
  }

  // German numeric format (DD.MM.YYYY)
  const numeric = text.match(/(\d{1,2})\.(\d{1,2})\.(\d{4})/);
  if (numeric) {
    return buildLocalDate(Number(numeric[3]), Number(numeric[2]), Number(numeric[1]), timeStr);
  }

  // German month names ("25. April 2026"); unknown words fall through to the native parser.
  const named = text.match(/(\d{1,2})\.?\s*([A-Za-zÄÖÜäöüß]+)\.?\s+(\d{4})/);
  if (named) {
    const month = GERMAN_MONTHS.get((named[2] ?? '').toLowerCase());
    if (month !== undefined) {
      return buildLocalDate(Number(named[3]), month, Number(named[1]), timeStr);
    }
  }

  // Fallback: let Date parse it
  const fallback = new Date(text);
  return isNaN(fallback.getTime()) ? null : fallback;
}

/**
 * Normalise a free-text time to `HH:MM`.
 *
 * @param timeStr - Text containing a clock time, e.g. "9:05 Uhr".
 * @returns The zero-padded time, or `'00:00'` when the input is missing,
 *          not a string, or not a valid clock time.
 */
function parseTime(timeStr?: string): string {
  const clock = parseClock(timeStr);
  if (!clock) return '00:00';
  const hh = String(clock.hours).padStart(2, '0');
  const mm = String(clock.minutes).padStart(2, '0');
  return `${hh}:${mm}`;
}
|