managarten/apps/api/src/modules/articles/routes.ts
Till JS b297f68ee4 fix(articles, mana-ai): rollout-block hardening for sync_changes projections
Four cross-cutting fixes that make the bulk-import worker safe to run
under real production load. All four were called out as live-rollout
risks in the post-ship review of docs/plans/articles-bulk-import.md.

#1 — Same fieldMetaTime bug fixed in mana-ai
   The articles fix in 054b9e5be hoists the helper to its own file
   `apps/api/src/modules/articles/field-meta.ts`. The same naive
   `rowFM[k] >= localTime` LWW comparison existed in three more
   projections under services/mana-ai (missions-projection,
   snapshot-refresh, agents-projection). Once any F3 stamp lands
   beside a legacy-string stamp, the comparison evaluates
   `'[object Object]' >= 'ISO-…'` (false) and the older value wins.
   New `services/mana-ai/src/db/field-meta.ts` — same helper,
   deliberately duplicated (each service treats sync_changes as a
   read-only event log; sharing infra across services is out of
   scope here). All 61 mana-ai bun tests still pass.

#2 — Stale 'extracting' items recycle
   If the worker dies mid-fetch (OOM, pod restart), items stay in
   state='extracting' forever and the job never completes. New sweep
   at the start of `processOneJob`: items whose lastAttemptAt is
   older than 5 minutes get bounced back to 'pending' so the next
   tick re-claims them. STALE_EXTRACTING_MS tuned for the 15s
   shared-rss fetch + JSDOM-parse worst case.

#3 — Pickup-row GC
   Every 30 ticks (~once per minute) the worker hard-deletes
   articleExtractPickup rows older than 24h. Without this a stuck
   pickup-consumer (all tabs closed, Web-Lock mismatch) would let
   sync_changes accumulate without bound. Logs the row count when
   non-zero so we can spot stuck consumers in the wild.

#4 — DRY consent-wall heuristic
   Identical CONSENT_KEYWORDS + threshold lived in routes.ts AND
   import-extractor.ts. Hoisted to
   `apps/api/src/modules/articles/consent-wall.ts`; both call sites
   now share one heuristic.

Plan: docs/plans/articles-bulk-import.md.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 00:53:39 +02:00

128 lines
4.4 KiB
TypeScript

/**
* Articles module — server-side URL extraction.
*
* Two endpoints, both thin wrappers around `@mana/shared-rss`:
*
* POST /extract ← server fetches the URL itself, then runs
* Readability on the HTML it got back. Works
* for simple sites but fails on anything behind
* a cookie-consent wall or a paywall — the
* server has no user session.
* POST /extract/html ← client already has the rendered HTML (from a
* browser bookmarklet running in the user's
* own tab with all their cookies applied).
* Server just runs Readability on that. This
* is how we bypass Golem / Spiegel / Zeit /
* Heise-style consent dialogs: use the user's
* already-consented session, not the server's
* anonymous fetch.
*
* Consent-wall heuristic: when /extract returns a suspiciously short
* payload that contains consent-dialog vocabulary we still hand the
* extracted text back but flag it with `warning: 'probable_consent_wall'`
* so the client can offer the bookmarklet-v2 path instead of pretending
* a 4-line "Cookies zustimmen" blob is the article.
*/
import { Hono } from 'hono';
import { extractFromUrl, extractFromHtml } from '@mana/shared-rss';
import { looksLikeConsentWall } from './consent-wall';
// Hono app instance for this module; the POST handlers below are
// registered on it and it is exported at the bottom of the file.
const routes = new Hono();
/**
 * Reports whether `url` parses as an absolute URL with an `http:` or
 * `https:` scheme.
 *
 * @param url - Candidate URL string taken from the request body.
 * @returns `true` for parseable http(s) URLs; `false` for any other scheme
 *          or for input the `URL` constructor rejects.
 */
function isValidHttpUrl(url: string): boolean {
	let parsed: URL;
	try {
		parsed = new URL(url);
	} catch {
		// `new URL` throws on anything it cannot parse as an absolute URL.
		return false;
	}
	return ['http:', 'https:'].includes(parsed.protocol);
}
// POST /extract — server fetches the URL + extracts. Legacy path.
//
// Request body: JSON object with a string `url`.
// Responses:
//   400 — body is missing / not a JSON object with a non-empty string
//         `url`, or the URL is not http(s).
//   502 — `extractFromUrl` returned a falsy result.
//   otherwise — the extracted article fields, plus
//         `warning: 'probable_consent_wall'` when `looksLikeConsentWall`
//         flags the extracted text.
routes.post('/extract', async (c) => {
	// A body can be valid JSON and still be `null`; reading `.url` off
	// `null` would throw and surface as a 500 instead of a 400. Treat both
	// unparseable and non-object bodies as "no url supplied".
	const payload: unknown = await c.req.json<unknown>().catch(() => null);
	const url =
		typeof payload === 'object' && payload !== null
			? (payload as { url?: unknown }).url
			: undefined;

	if (typeof url !== 'string' || url === '') {
		return c.json({ error: 'URL is required' }, 400);
	}
	if (!isValidHttpUrl(url)) {
		return c.json({ error: 'Invalid URL' }, 400);
	}

	const extracted = await extractFromUrl(url);
	if (!extracted) {
		return c.json({ error: 'Extraction failed' }, 502);
	}

	// Still return the extracted text when it looks like a consent wall,
	// but flag it so the client can react.
	const warning = looksLikeConsentWall(extracted.content, extracted.wordCount)
		? 'probable_consent_wall'
		: undefined;

	return c.json({
		originalUrl: url,
		title: extracted.title,
		excerpt: extracted.excerpt,
		content: extracted.content,
		htmlContent: extracted.htmlContent,
		author: extracted.byline,
		siteName: extracted.siteName,
		wordCount: extracted.wordCount,
		readingTimeMinutes: extracted.readingTimeMinutes,
		// Only include the key when there is something to report.
		...(warning && { warning }),
	});
});
// POST /extract/html — client supplies HTML (from the user's browser
// tab, where cookies + JS rendering already happened). We only run
// Readability on it. Cap the HTML to 10 MiB (measured as UTF-8 bytes) so
// a pathological site can't exhaust server memory via the bookmarklet —
// typical rendered article HTML is 200-800 KB.
//
// NOTE(review): this cap runs after `c.req.json()` has already parsed the
// whole body, so it bounds what is handed to the extractor, not what is
// read off the wire — consider a transport-level body limit as well.
const MAX_HTML_BYTES = 10 * 1024 * 1024;

// Request body: JSON object with string `url` and string `html`.
// Responses:
//   400 — `url` or `html` missing / not a non-empty string, or the URL is
//         not http(s).
//   413 — `html` exceeds MAX_HTML_BYTES.
//   502 — `extractFromHtml` returned a falsy result.
//   otherwise — the extracted article fields, plus
//         `warning: 'probable_consent_wall'` when flagged.
routes.post('/extract/html', async (c) => {
	// A body can be valid JSON and still be `null`; reading properties off
	// `null` would throw and surface as a 500 instead of a 400. Treat both
	// unparseable and non-object bodies as an empty object.
	const payload: unknown = await c.req.json<unknown>().catch(() => null);
	const fields: { url?: unknown; html?: unknown } =
		typeof payload === 'object' && payload !== null ? payload : {};
	const url = fields.url;
	const html = fields.html;

	if (typeof url !== 'string' || url === '') {
		return c.json({ error: 'URL is required' }, 400);
	}
	if (typeof html !== 'string' || html === '') {
		return c.json({ error: 'HTML is required' }, 400);
	}
	if (!isValidHttpUrl(url)) {
		return c.json({ error: 'Invalid URL' }, 400);
	}
	// `html.length` counts UTF-16 code units, which under-reports the size
	// of non-ASCII markup; compare the encoded byte length against the
	// byte-denominated cap instead.
	if (Buffer.byteLength(html, 'utf8') > MAX_HTML_BYTES) {
		return c.json({ error: 'HTML payload too large' }, 413);
	}

	const extracted = await extractFromHtml(html, url);
	if (!extracted) {
		return c.json({ error: 'Extraction failed' }, 502);
	}

	// The consent-wall heuristic still applies here — a rare case is
	// that the user fires the bookmarklet BEFORE the consent dialog is
	// dismissed. Flag it so the client doesn't silently persist garbage.
	const warning = looksLikeConsentWall(extracted.content, extracted.wordCount)
		? 'probable_consent_wall'
		: undefined;

	return c.json({
		originalUrl: url,
		title: extracted.title,
		excerpt: extracted.excerpt,
		content: extracted.content,
		htmlContent: extracted.htmlContent,
		author: extracted.byline,
		siteName: extracted.siteName,
		wordCount: extracted.wordCount,
		readingTimeMinutes: extracted.readingTimeMinutes,
		// Only include the key when there is something to report.
		...(warning && { warning }),
	});
});
// Public name of this module's router: the local `routes` instance is
// exported as `articlesRoutes`.
export { routes as articlesRoutes };