feat(api): POST /api/v1/context/import-url — crawler + optional LLM summary

New backend endpoint that wraps mana-crawler + mana-llm in a single
call so the Kontext "Aus URL" UI can hit one route:

- Starts a crawl job (single page or up-to-20-page deep crawl) via
  mana-crawler's /api/v1/crawl, polls status up to 90s, then fetches
  paginated results.
- When multiple pages are returned, joins them into one markdown
  document with H1-per-page section headers separated by ---.
- When summarize=true, routes the collected markdown through
  mana-llm/chat/completions with a system prompt that asks for
  "Überblick / Kernaussagen / Details" H2 structure in the source
  language. sanitizeSummary() strips the common local-LLM artefacts
  (```markdown fences, "Hier ist …:" preamble, stray leading H1)
  so the output drops cleanly into the Kontext doc. On summary
  failure the endpoint returns 502 rather than silently falling
  back to the raw crawl.
- Credits are validated + consumed via @mana/shared-hono/credits
  (1 credit crawl-only, 5 crawl+summary) under the new
  AI_CONTEXT_IMPORT_URL action.
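
Example call (sketch; request fields and response shape are taken from
the handler below, values are illustrative):

    const res = await fetch('/api/v1/context/import-url', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        url: 'https://example.com/docs', // any http(s) URL
        mode: 'deep',                    // 'single' (default) | 'deep'
        summarize: true,                 // LLM pass: 5 credits instead of 1
      }),
    });
    // 200 → { title, content, sourceUrl, crawlMode, crawledAt, pageCount }
    // 402 → { error: 'Insufficient credits', required, available }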

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

@@ -10,9 +10,189 @@ import { consumeCredits, validateCredits } from '@mana/shared-hono/credits';
import type { AuthVariables } from '@mana/shared-hono';
const LLM_URL = process.env.MANA_LLM_URL || 'http://localhost:3025';
const CRAWLER_URL = process.env.MANA_CRAWLER_URL || 'http://localhost:3023';
const DEFAULT_SUMMARY_MODEL = process.env.MANA_LLM_DEFAULT_MODEL || 'gemma3:4b';
const routes = new Hono<{ Variables: AuthVariables }>();
// ─── URL Import (crawler → optional LLM summary → document) ──
const DEEP_MAX_PAGES = 20;
const CRAWL_POLL_INTERVAL_MS = 1500;
const CRAWL_TIMEOUT_MS = 90_000;
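// 90 s budget at 1.5 s intervals ≈ 60 status checks before giving up.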
/**
 * Local LLMs love to wrap Markdown in ```markdown fences or prepend
 * a "Hier ist die Zusammenfassung:" preamble. Strip those so the
 * output renders correctly when dropped into the Kontext document.
 */
function sanitizeSummary(raw: string): string {
  let s = raw.trim();
  // Strip a leading ```markdown / ```md / ``` fence and its closing ```.
  const fenceMatch = s.match(/^```(?:markdown|md)?\s*\n([\s\S]*?)\n?```\s*$/i);
  if (fenceMatch) s = fenceMatch[1].trim();
  // Drop a single-line preamble that ends with a colon (LLM chatter).
  const lines = s.split('\n');
  if (lines.length > 2 && /^[^#\n].{0,80}:\s*$/.test(lines[0].trim())) {
    s = lines.slice(1).join('\n').trim();
  }
  // Demote a leading H1 to H2 so it doesn't clash with the section
  // header that the frontend prepends.
  s = s.replace(/^#\s+/, '## ');
  return s;
}
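// e.g. (illustrative): '```markdown\n# Überblick\n…\n```'
// → '## Überblick\n…' (fence stripped, leading H1 demoted).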
async function pollCrawlJob(jobId: string) {
  const deadline = Date.now() + CRAWL_TIMEOUT_MS;
  while (Date.now() < deadline) {
    await new Promise((r) => setTimeout(r, CRAWL_POLL_INTERVAL_MS));
    const res = await fetch(`${CRAWLER_URL}/api/v1/crawl/${jobId}`);
    if (!res.ok) throw new Error(`crawl status ${res.status}`);
    const job = (await res.json()) as { status: string; error?: string };
    if (job.status === 'completed') return;
    if (job.status === 'failed') throw new Error(job.error || 'crawl failed');
  }
  throw new Error('crawl timeout');
}
routes.post('/import-url', async (c) => {
  const userId = c.get('userId');
  const {
    url,
    mode = 'single',
    summarize = false,
  } = (await c.req.json()) as {
    url?: string;
    mode?: 'single' | 'deep';
    summarize?: boolean;
  };
  if (!url || !/^https?:\/\//i.test(url)) {
    return c.json({ error: 'valid http(s) url required' }, 400);
  }
  const creditCost = summarize ? 5 : 1;
  const validation = await validateCredits(userId, 'AI_CONTEXT_IMPORT_URL', creditCost);
  if (!validation.hasCredits) {
    return c.json(
      {
        error: 'Insufficient credits',
        required: creditCost,
        available: validation.availableCredits,
      },
      402
    );
  }
  try {
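    // Deep mode follows links to depth 3 but caps the crawl at 20
    // pages; single mode (depth 0) fetches just the start URL.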
    const crawlBody = {
      startUrl: url,
      config: {
        maxDepth: mode === 'deep' ? 3 : 0,
        maxPages: mode === 'deep' ? DEEP_MAX_PAGES : 1,
        rateLimit: 2,
        respectRobots: true,
        outputFormat: 'markdown',
      },
    };
    const startRes = await fetch(`${CRAWLER_URL}/api/v1/crawl`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(crawlBody),
    });
    if (!startRes.ok) return c.json({ error: 'crawler unreachable' }, 502);
    const { jobId } = (await startRes.json()) as { jobId: string };
    await pollCrawlJob(jobId);
    const resultsRes = await fetch(
      `${CRAWLER_URL}/api/v1/crawl/${jobId}/results?page=1&limit=${DEEP_MAX_PAGES}`
    );
    if (!resultsRes.ok) return c.json({ error: 'crawl results failed' }, 502);
    const results = (await resultsRes.json()) as {
      results: Array<{
        url: string;
        title?: string | null;
        markdown?: string | null;
        content?: string | null;
        depth: number;
      }>;
    };
    const items = (results.results || []).filter((it) => it.markdown || it.content);
    if (items.length === 0) return c.json({ error: 'no content crawled' }, 422);
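    // Shallowest result first so the start URL ends up as the root page.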
    items.sort((a, b) => a.depth - b.depth);
    const root = items[0];
    const pageTitle = root.title || new URL(url).hostname;
    let content: string;
    if (mode === 'deep' && items.length > 1) {
      content = items
        .map((it) => `# ${it.title || it.url}\n\n_${it.url}_\n\n${it.markdown || it.content}`)
        .join('\n\n---\n\n');
    } else {
      content = root.markdown || root.content || '';
    }
    if (summarize) {
      const summaryRes = await fetch(`${LLM_URL}/api/v1/chat/completions`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          model: DEFAULT_SUMMARY_MODEL,
          max_tokens: 2000,
          messages: [
            {
              role: 'system',
              content:
                'Du bist ein Assistent, der Web-Inhalte in strukturierte Kontext-Dokumente zusammenfasst. ' +
                'Antworte ausschließlich in sauberem Markdown. Gliedere in H2-Abschnitte: ' +
                '"## Überblick", "## Kernaussagen", "## Details". Nutze die Sprache der Quelle. ' +
                'Schreibe die Antwort direkt, ohne Einleitung ("Hier ist…"), ohne Schlussformel, ' +
                'und OHNE Code-Fences (```) um die Antwort.',
            },
            {
              role: 'user',
              content: `Quelle: ${url}\n\n${content.slice(0, 60_000)}`,
            },
          ],
        }),
      });
      if (!summaryRes.ok) return c.json({ error: 'summary failed' }, 502);
      const data = (await summaryRes.json()) as {
        choices?: Array<{ message?: { content?: string } }>;
      };
      const raw = data.choices?.[0]?.message?.content?.trim();
      // An empty completion counts as failure too; don't silently fall
      // back to the raw crawl when the caller paid for a summary.
      if (!raw) return c.json({ error: 'summary failed' }, 502);
      content = sanitizeSummary(raw);
    }
    await consumeCredits(
      userId,
      'AI_CONTEXT_IMPORT_URL',
      creditCost,
      `URL import (${mode}${summarize ? ' + summary' : ''})`
    );
    return c.json({
      title: pageTitle,
      content,
      sourceUrl: url,
      crawlMode: mode,
      crawledAt: new Date().toISOString(),
      pageCount: items.length,
    });
  } catch (err) {
    const message = err instanceof Error ? err.message : 'import failed';
    return c.json({ error: message }, 500);
  }
});
// ─── AI Generation (server-only: mana-llm) ──────────────────
routes.post('/ai/generate', async (c) => {