From 121a0c0a6fa51379adccfa8ad63f5661adfb2e0d Mon Sep 17 00:00:00 2001 From: Till JS Date: Wed, 15 Apr 2026 14:24:19 +0200 Subject: [PATCH] =?UTF-8?q?feat(api):=20POST=20/api/v1/context/import-url?= =?UTF-8?q?=20=E2=80=94=20crawler=20+=20optional=20LLM=20summary?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New backend endpoint that wraps mana-crawler + mana-llm in a single call so the Kontext "Aus URL" UI can hit one route: - Starts a crawl job (single page or up-to-20-page deep crawl) via mana-crawler's /api/v1/crawl, polls status up to 90s, then fetches paginated results. - When multiple pages are returned, joins them into one markdown document with H1-per-page section headers separated by ---. - When summarize=true, routes the collected markdown through mana-llm/chat/completions with a system prompt that asks for "Überblick / Kernaussagen / Details" H2 structure in the source language. sanitizeSummary() strips the common local-LLM artefacts (```markdown fences, "Hier ist …:" preamble, stray leading H1) so the output drops cleanly into the Kontext doc. On summary failure the endpoint returns 502 rather than silently falling back to the raw crawl. - Credits are validated + consumed via @mana/shared-hono/credits (1 credit crawl-only, 5 crawl+summary) under the new AI_CONTEXT_IMPORT_URL action. Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/api/src/modules/context/routes.ts | 180 +++++++++++++++++++++++++ 1 file changed, 180 insertions(+) diff --git a/apps/api/src/modules/context/routes.ts b/apps/api/src/modules/context/routes.ts index 8f3bdd2fd..f5cd59414 100644 --- a/apps/api/src/modules/context/routes.ts +++ b/apps/api/src/modules/context/routes.ts @@ -10,9 +10,189 @@ import { consumeCredits, validateCredits } from '@mana/shared-hono/credits'; import type { AuthVariables } from '@mana/shared-hono'; const LLM_URL = process.env.MANA_LLM_URL || 'http://localhost:3025'; +const CRAWLER_URL = process.env.MANA_CRAWLER_URL || 'http://localhost:3023'; +const DEFAULT_SUMMARY_MODEL = process.env.MANA_LLM_DEFAULT_MODEL || 'gemma3:4b'; const routes = new Hono<{ Variables: AuthVariables }>(); +// ─── URL Import (crawler → optional LLM summary → document) ── + +const DEEP_MAX_PAGES = 20; +const CRAWL_POLL_INTERVAL_MS = 1500; +const CRAWL_TIMEOUT_MS = 90_000; + +/** + * Local LLMs love to wrap Markdown in ```markdown fences or prepend + * a "Hier ist die Zusammenfassung:" preamble. Strip those so the + * output renders correctly when dropped into the Kontext document. + */ +function sanitizeSummary(raw: string): string { + let s = raw.trim(); + // Strip a leading ```markdown / ```md / ``` fence and its closing ```. + const fenceMatch = s.match(/^```(?:markdown|md)?\s*\n([\s\S]*?)\n?```\s*$/i); + if (fenceMatch) s = fenceMatch[1].trim(); + // Drop a single-line preamble that ends with a colon (LLM chatter). + const lines = s.split('\n'); + if (lines.length > 2 && /^[^#\n].{0,80}:\s*$/.test(lines[0].trim())) { + s = lines.slice(1).join('\n').trim(); + } + // Demote a solitary leading H1 to H2 so it doesn't clash with our + // section header that the frontend prepends. + s = s.replace(/^#\s+/, '## '); + return s; +} + +async function pollCrawlJob(jobId: string) { + const deadline = Date.now() + CRAWL_TIMEOUT_MS; + while (Date.now() < deadline) { + await new Promise((r) => setTimeout(r, CRAWL_POLL_INTERVAL_MS)); + const res = await fetch(`${CRAWLER_URL}/api/v1/crawl/${jobId}`); + if (!res.ok) throw new Error(`crawl status ${res.status}`); + const job = (await res.json()) as { status: string; error?: string }; + if (job.status === 'completed') return; + if (job.status === 'failed') throw new Error(job.error || 'crawl failed'); + } + throw new Error('crawl timeout'); +} + +routes.post('/import-url', async (c) => { + const userId = c.get('userId'); + const { + url, + mode = 'single', + summarize = false, + } = (await c.req.json()) as { + url?: string; + mode?: 'single' | 'deep'; + summarize?: boolean; + }; + + if (!url || !/^https?:\/\//i.test(url)) { + return c.json({ error: 'valid http(s) url required' }, 400); + } + + const creditCost = summarize ? 5 : 1; + const validation = await validateCredits(userId, 'AI_CONTEXT_IMPORT_URL', creditCost); + if (!validation.hasCredits) { + return c.json( + { + error: 'Insufficient credits', + required: creditCost, + available: validation.availableCredits, + }, + 402 + ); + } + + try { + const crawlBody = { + startUrl: url, + config: { + maxDepth: mode === 'deep' ? 3 : 0, + maxPages: mode === 'deep' ? DEEP_MAX_PAGES : 1, + rateLimit: 2, + respectRobots: true, + outputFormat: 'markdown', + }, + }; + + const startRes = await fetch(`${CRAWLER_URL}/api/v1/crawl`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify(crawlBody), + }); + if (!startRes.ok) return c.json({ error: 'crawler unreachable' }, 502); + const { jobId } = (await startRes.json()) as { jobId: string }; + + await pollCrawlJob(jobId); + + const resultsRes = await fetch( + `${CRAWLER_URL}/api/v1/crawl/${jobId}/results?page=1&limit=${DEEP_MAX_PAGES}` + ); + if (!resultsRes.ok) return c.json({ error: 'crawl results failed' }, 502); + const results = (await resultsRes.json()) as { + results: Array<{ + url: string; + title?: string | null; + markdown?: string | null; + content?: string | null; + depth: number; + }>; + }; + const items = (results.results || []).filter((it) => it.markdown || it.content); + if (items.length === 0) return c.json({ error: 'no content crawled' }, 422); + + items.sort((a, b) => a.depth - b.depth); + const root = items[0]; + const pageTitle = root.title || new URL(url).hostname; + + let content: string; + if (mode === 'deep' && items.length > 1) { + content = items + .map((it) => `# ${it.title || it.url}\n\n_${it.url}_\n\n${it.markdown || it.content}`) + .join('\n\n---\n\n'); + } else { + content = root.markdown || root.content || ''; + } + + if (summarize) { + const summaryRes = await fetch(`${LLM_URL}/api/v1/chat/completions`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + model: DEFAULT_SUMMARY_MODEL, + max_tokens: 2000, + messages: [ + { + role: 'system', + content: + 'Du bist ein Assistent, der Web-Inhalte in strukturierte Kontext-Dokumente zusammenfasst. ' + + 'Antworte ausschließlich in sauberem Markdown. Gliedere in H2-Abschnitte: ' + + '"## Überblick", "## Kernaussagen", "## Details". Nutze die Sprache der Quelle. ' + + 'Schreibe die Antwort direkt, ohne Einleitung ("Hier ist…"), ohne Schlussformel, ' + + 'und OHNE Code-Fences (```) um die Antwort.', + }, + { + role: 'user', + content: `Quelle: ${url}\n\n${content.slice(0, 60_000)}`, + }, + ], + }), + }); + if (summaryRes.ok) { + const data = (await summaryRes.json()) as { + choices?: Array<{ message?: { content?: string } }>; + }; + const raw = data.choices?.[0]?.message?.content?.trim(); + if (raw) { + content = sanitizeSummary(raw); + } + } else { + return c.json({ error: 'summary failed' }, 502); + } + } + + await consumeCredits( + userId, + 'AI_CONTEXT_IMPORT_URL', + creditCost, + `URL import (${mode}${summarize ? ' + summary' : ''})` + ); + + return c.json({ + title: pageTitle, + content, + sourceUrl: url, + crawlMode: mode, + crawledAt: new Date().toISOString(), + pageCount: items.length, + }); + } catch (err) { + const message = err instanceof Error ? err.message : 'import failed'; + return c.json({ error: message }, 500); + } +}); + // ─── AI Generation (server-only: mana-llm) ────────────────── routes.post('/ai/generate', async (c) => {