diff --git a/apps/api/src/modules/articles/routes.ts b/apps/api/src/modules/articles/routes.ts index 79263e4f8..8772a2f39 100644 --- a/apps/api/src/modules/articles/routes.ts +++ b/apps/api/src/modules/articles/routes.ts @@ -1,35 +1,72 @@ /** * Articles module — server-side URL extraction. * - * Thin wrapper around `@mana/shared-rss`'s Readability pipeline. The - * extracted payload is returned to the client which then encrypts + - * stores it locally (and syncs via mana-sync). The server keeps no - * per-user article state — all reading-list data lives in the unified - * Mana app's IndexedDB. + * Two endpoints, both thin wrappers around `@mana/shared-rss`: * - * One endpoint (`POST /extract`), not two. News has a `preview` + `save` - * split for legacy reasons; here both UI paths (AddUrlForm preview + the - * direct saveFromUrl path) use the same payload. The client caches the - * response when the user confirms, avoiding a double server fetch. + * POST /extract ← server fetches the URL itself, then runs + * Readability on the HTML it got back. Works + * for simple sites but fails on anything behind + * a cookie-consent wall or a paywall — the + * server has no user session. + * POST /extract/html ← client already has the rendered HTML (from a + * browser bookmarklet running in the user's + * own tab with all their cookies applied). + * Server just runs Readability on that. This + * is how we bypass Golem / Spiegel / Zeit / + * Heise-style consent dialogs: use the user's + * already-consented session, not the server's + * anonymous fetch. + * + * Consent-wall heuristic: when /extract returns a suspiciously short + * payload that contains consent-dialog vocabulary we still hand the + * extracted text back but flag it with `warning: 'probable_consent_wall'` + * so the client can offer the bookmarklet-v2 path instead of pretending + * a 4-line "Cookies zustimmen" blob is the article. */ import { Hono } from 'hono'; -import { extractFromUrl } from '@mana/shared-rss'; +import { extractFromUrl, extractFromHtml } from '@mana/shared-rss'; const routes = new Hono(); +const CONSENT_KEYWORDS = [ + 'cookies zustimmen', + 'cookie consent', + 'zustimmung', + 'accept all cookies', + 'consent to the use', + 'enable javascript', + 'javascript is disabled', + 'please enable', + 'privacy center', + 'datenschutz­einstellungen', + 'datenschutzeinstellungen', +]; +const CONSENT_WORDCOUNT_THRESHOLD = 300; + +function looksLikeConsentWall(content: string, wordCount: number): boolean { + if (wordCount >= CONSENT_WORDCOUNT_THRESHOLD) return false; + const haystack = content.toLowerCase(); + return CONSENT_KEYWORDS.some((needle) => haystack.includes(needle)); +} + +function isValidHttpUrl(url: string): boolean { + try { + const u = new URL(url); + return u.protocol === 'http:' || u.protocol === 'https:'; + } catch { + return false; + } +} + +// POST /extract — server fetches the URL + extracts. Legacy path. routes.post('/extract', async (c) => { const body = await c.req.json<{ url?: string }>().catch(() => ({}) as { url?: string }); const url = body.url; if (!url || typeof url !== 'string') { return c.json({ error: 'URL is required' }, 400); } - - // Minimal URL shape check — extractFromUrl will no-op on a bad URL but - // the caller deserves a clear 400 vs a generic 502. - try { - new URL(url); - } catch { + if (!isValidHttpUrl(url)) { return c.json({ error: 'Invalid URL' }, 400); } @@ -38,6 +75,10 @@ routes.post('/extract', async (c) => { return c.json({ error: 'Extraction failed' }, 502); } + const warning = looksLikeConsentWall(extracted.content, extracted.wordCount) + ? 'probable_consent_wall' + : undefined; + return c.json({ originalUrl: url, title: extracted.title, @@ -48,6 +89,59 @@ routes.post('/extract', async (c) => { siteName: extracted.siteName, wordCount: extracted.wordCount, readingTimeMinutes: extracted.readingTimeMinutes, + ...(warning && { warning }), + }); +}); + +// POST /extract/html — client supplies HTML (from the user's browser +// tab, where cookies + JS rendering already happened). We only run +// Readability on it. Cap payload to 10 MiB so a pathological site +// can't exhaust server memory via the bookmarklet — typical rendered +// article HTML is 200-800 KB. +const MAX_HTML_BYTES = 10 * 1024 * 1024; + +routes.post('/extract/html', async (c) => { + const body = await c.req + .json<{ url?: string; html?: string }>() + .catch(() => ({}) as { url?: string; html?: string }); + const url = body.url; + const html = body.html; + if (!url || typeof url !== 'string') { + return c.json({ error: 'URL is required' }, 400); + } + if (!html || typeof html !== 'string') { + return c.json({ error: 'HTML is required' }, 400); + } + if (!isValidHttpUrl(url)) { + return c.json({ error: 'Invalid URL' }, 400); + } + if (html.length > MAX_HTML_BYTES) { + return c.json({ error: 'HTML payload too large' }, 413); + } + + const extracted = await extractFromHtml(html, url); + if (!extracted) { + return c.json({ error: 'Extraction failed' }, 502); + } + + // The consent-wall heuristic still applies here — a rare case is + // that the user bookmarklet-fires BEFORE the consent dialog is + // dismissed. Flag it so the client doesn't silently persist garbage. + const warning = looksLikeConsentWall(extracted.content, extracted.wordCount) + ? 'probable_consent_wall' + : undefined; + + return c.json({ + originalUrl: url, + title: extracted.title, + excerpt: extracted.excerpt, + content: extracted.content, + htmlContent: extracted.htmlContent, + author: extracted.byline, + siteName: extracted.siteName, + wordCount: extracted.wordCount, + readingTimeMinutes: extracted.readingTimeMinutes, + ...(warning && { warning }), }); }); diff --git a/apps/mana/apps/web/src/lib/modules/articles/api.ts b/apps/mana/apps/web/src/lib/modules/articles/api.ts index b49b9b083..1a3fd41a2 100644 --- a/apps/mana/apps/web/src/lib/modules/articles/api.ts +++ b/apps/mana/apps/web/src/lib/modules/articles/api.ts @@ -28,23 +28,104 @@ export interface ExtractedArticle { siteName: string | null; wordCount: number; readingTimeMinutes: number; + /** + * Server-side quality flag. Today only `'probable_consent_wall'` is + * emitted: the extracted text was suspiciously short AND contained + * consent-dialog vocabulary, which typically means the server's + * anonymous fetch hit a GDPR interstitial instead of the article. + * The client uses this to offer the bookmarklet-v2 (browser-HTML) + * path without silently persisting garbage. + */ + warning?: 'probable_consent_wall'; } +/** + * Hard client-side timeout for the extract roundtrip. The server's + * own Readability fetch has a 15s timeout + a few seconds of JSDOM + * parse overhead; anything past 25s on the wire is almost certainly a + * dead server or a stuck network path, not a slow article. Without + * this, AddUrlForm's loader just sat there forever when the API was + * unreachable — hence the bookmarklet-lands-on-loader bug. + */ +const EXTRACT_TIMEOUT_MS = 25_000; + export async function extractArticle( url: string, fetchImpl: typeof fetch = fetch ): Promise { - const response = await fetchImpl(`${getManaApiUrl()}/api/v1/articles/extract`, { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - ...(await authHeader()), - }, - body: JSON.stringify({ url }), - }); + let response: Response; + try { + response = await fetchImpl(`${getManaApiUrl()}/api/v1/articles/extract`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + ...(await authHeader()), + }, + body: JSON.stringify({ url }), + signal: AbortSignal.timeout(EXTRACT_TIMEOUT_MS), + }); + } catch (err) { + if (err instanceof DOMException && err.name === 'TimeoutError') { + throw new Error( + `Server antwortet nicht (nach ${EXTRACT_TIMEOUT_MS / 1000}s). Läuft apps/api?` + ); + } + if (err instanceof TypeError) { + // Network-layer failure (connection refused, DNS, offline). + throw new Error( + `Server nicht erreichbar. Prüf dass apps/api läuft — pnpm run mana:dev startet beides.` + ); + } + throw err; + } if (!response.ok) { const text = await response.text(); throw new Error(`extractArticle failed: ${response.status} ${text}`); } return (await response.json()) as ExtractedArticle; } + +/** + * Extract from a HTML payload the browser already has. Used by the + * bookmarklet-v2 flow — the user's browser already dealt with the + * cookie-consent wall, so we skip the server-side fetch entirely. + * + * The HTML cap is 10 MiB on the server; the browser sends + * `document.documentElement.outerHTML` which for typical article + * pages is 200-800 KB, well under the limit. + */ +export async function extractFromHtml( + url: string, + html: string, + fetchImpl: typeof fetch = fetch +): Promise { + let response: Response; + try { + response = await fetchImpl(`${getManaApiUrl()}/api/v1/articles/extract/html`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + ...(await authHeader()), + }, + body: JSON.stringify({ url, html }), + signal: AbortSignal.timeout(EXTRACT_TIMEOUT_MS), + }); + } catch (err) { + if (err instanceof DOMException && err.name === 'TimeoutError') { + throw new Error( + `Server antwortet nicht (nach ${EXTRACT_TIMEOUT_MS / 1000}s). Läuft apps/api?` + ); + } + if (err instanceof TypeError) { + throw new Error( + `Server nicht erreichbar. Prüf dass apps/api läuft — pnpm run mana:dev startet beides.` + ); + } + throw err; + } + if (!response.ok) { + const text = await response.text(); + throw new Error(`extractFromHtml failed: ${response.status} ${text}`); + } + return (await response.json()) as ExtractedArticle; +} diff --git a/apps/mana/apps/web/src/lib/modules/articles/components/AddUrlForm.svelte b/apps/mana/apps/web/src/lib/modules/articles/components/AddUrlForm.svelte index 62fe91ee7..959454e0f 100644 --- a/apps/mana/apps/web/src/lib/modules/articles/components/AddUrlForm.svelte +++ b/apps/mana/apps/web/src/lib/modules/articles/components/AddUrlForm.svelte @@ -1,28 +1,25 @@

Artikel speichern

-

URL einfügen, Vorschau prüfen, speichern.

+

URL einfügen — Mana extrahiert + speichert direkt.

@@ -122,16 +246,33 @@ class="url-input" bind:value={url} placeholder="https://…" + disabled={loading || saving} onkeydown={(e) => { - if (e.key === 'Enter') handlePreview(); + if (e.key === 'Enter') handleSubmit(); }} use:focusOnMount /> -
+ {#if (loading || saving) && !error && !preview && !duplicate} +
+ +
+

+ {saving ? 'Speichere in deine Leseliste…' : 'Server extrahiert den Artikel…'} +

+

+ {saving + ? 'Gleich weiter zum Reader.' + : 'Dauert normalerweise 2–5 Sekunden. Nach 25 Sekunden geben wir auf.'} +

+
+
+ {/if} + {#if error}

{error}

{/if} @@ -150,6 +291,25 @@ {/if} {#if preview} + +

{preview.title}

@@ -162,8 +322,8 @@

{preview.excerpt}

{/if}
-
Quellcode anzeigen -
{bookmarklet}
+
{bookmarkletV2}

- Funktioniert in jedem Desktop-Browser. In Safari: Lesezeichen anlegen mit einer beliebigen - URL, dann nachträglich die URL durch das Snippet ersetzen. + Öffnet einen neuen Tab mit Mana, der Mana-Tab bekommt das HTML per + postMessage von deinem Artikel-Tab. Braucht erlaubte Popups für diese Domain (Browser + fragt beim ersten Mal). +

+ + +
+

URL-Bookmarklet (klassisch)

+

+ Schickt nur die URL an Mana, der Server lädt + extrahiert dann selbst. Schnell auf einfachen + Blogs / Wikis; scheitert auf Seiten hinter DSGVO-Zustimmungs-Dialogen. +

+
+ {#if bookmarkletV1} + e.preventDefault()} + > + + In Mana speichern (URL) + + {:else} + Bookmarklet wird geladen… + {/if} + +
+
+ Quellcode anzeigen +
{bookmarkletV1}
+
+

+ Funktioniert in jedem Desktop-Browser. In Safari: Lesezeichen mit beliebiger URL anlegen und + die URL dann durch das Snippet ersetzen.

@@ -98,7 +168,9 @@ auswählen → Artikel wird direkt in der Leseliste vorgeschlagen.

- iOS-Safari unterstützt die Web-Share-Target-API derzeit nicht — nutze dort das Bookmarklet. + Benutzt dieselbe URL-Route wie das klassische Bookmarklet oben — für cookie-gewalled Seiten + lieber das HTML-Bookmarklet verwenden. iOS-Safari unterstützt die Web-Share-Target-API derzeit + nicht.

@@ -126,6 +198,32 @@ border: 1px solid var(--color-border, rgba(0, 0, 0, 0.1)); border-radius: 0.75rem; background: var(--color-surface, transparent); + position: relative; + } + .card-recommended { + border-color: color-mix(in srgb, #f97316 50%, transparent); + background: color-mix(in srgb, #f97316 4%, transparent); + } + .badge { + position: absolute; + top: -0.55rem; + left: 1rem; + padding: 0.15rem 0.55rem; + border-radius: 999px; + background: #f97316; + color: white; + font-size: 0.72rem; + font-weight: 600; + letter-spacing: 0.03em; + text-transform: uppercase; + } + .bookmarklet-secondary { + background: transparent; + color: #f97316; + border: 1px solid #f97316; + } + .bookmarklet-secondary:hover { + background: color-mix(in srgb, #f97316 10%, transparent); } .card h2 { margin: 0 0 0.5rem 0;