mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-23 10:26:43 +02:00
Four cross-cutting fixes that make the bulk-import worker safe to run
under real production load. All four were called out as live-rollout
risks in the post-ship review of docs/plans/articles-bulk-import.md.
#1 — Same fieldMetaTime bug fixed in mana-ai
The articles fix in 054b9e5be hoists the helper to its own file
`apps/api/src/modules/articles/field-meta.ts`. The same naive
`rowFM[k] >= localTime` LWW comparison existed in three more
projections under services/mana-ai (missions-projection,
snapshot-refresh, agents-projection). Once any F3 stamp lands
beside a legacy-string stamp, the comparison evaluates
`'[object Object]' >= 'ISO-…'` (false) and the older value wins.
New `services/mana-ai/src/db/field-meta.ts` — same helper,
deliberately duplicated (each service treats sync_changes as a
read-only event log; sharing infra across services is out of
scope here). All 61 mana-ai bun tests still pass.
#2 — Stale 'extracting' items recycle
If the worker dies mid-fetch (OOM, pod restart), items stay in
state='extracting' forever and the job never completes. New sweep
at the start of `processOneJob`: items whose lastAttemptAt is
older than 5 minutes get bounced back to 'pending' so the next
tick re-claims them. STALE_EXTRACTING_MS tuned for the 15s
shared-rss fetch + JSDOM-parse worst case.
#3 — Pickup-row GC
Every 30 ticks (~once per minute) the worker hard-deletes
articleExtractPickup rows older than 24h. Without this a stuck
pickup-consumer (all tabs closed, Web-Lock mismatch) would let
sync_changes accumulate without bound. Logs the row count when
non-zero so we can spot stuck consumers in the wild.
#4 — DRY consent-wall heuristic
Identical CONSENT_KEYWORDS + threshold lived in routes.ts AND
import-extractor.ts. Hoisted to
`apps/api/src/modules/articles/consent-wall.ts`; both call sites
now share one heuristic.
Plan: docs/plans/articles-bulk-import.md.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
128 lines
4.4 KiB
TypeScript
128 lines
4.4 KiB
TypeScript
/**
 * Articles module — server-side URL extraction.
 *
 * Two endpoints, both thin wrappers around `@mana/shared-rss`:
 *
 *   POST /extract       ← server fetches the URL itself, then runs
 *                         Readability on the HTML it got back. Works
 *                         for simple sites but fails on anything behind
 *                         a cookie-consent wall or a paywall — the
 *                         server has no user session.
 *   POST /extract/html  ← client already has the rendered HTML (from a
 *                         browser bookmarklet running in the user's
 *                         own tab with all their cookies applied).
 *                         Server just runs Readability on that. This
 *                         is how we bypass Golem / Spiegel / Zeit /
 *                         Heise-style consent dialogs: use the user's
 *                         already-consented session, not the server's
 *                         anonymous fetch.
 *
 * Consent-wall heuristic: when /extract returns a suspiciously short
 * payload that contains consent-dialog vocabulary we still hand the
 * extracted text back but flag it with `warning: 'probable_consent_wall'`
 * so the client can offer the bookmarklet-v2 path instead of pretending
 * a 4-line "Cookies zustimmen" blob is the article.
 */
|
|
|
|
import { Hono } from 'hono';
|
|
import { extractFromUrl, extractFromHtml } from '@mana/shared-rss';
|
|
import { looksLikeConsentWall } from './consent-wall';
|
|
|
|
// Router for the article-extraction endpoints. The handlers below are
// registered on it and it is exported at the bottom of the file as
// `articlesRoutes`.
const routes = new Hono();
|
|
|
|
/**
 * Reports whether `url` parses as an absolute URL with an `http:` or
 * `https:` scheme.
 *
 * @param url - Candidate URL string taken from a request body.
 * @returns `true` only when the `URL` constructor accepts the string and
 *   its protocol is `http:` or `https:`; `false` for unparsable input or
 *   any other scheme.
 */
function isValidHttpUrl(url: string): boolean {
	try {
		const { protocol } = new URL(url);
		return protocol === 'http:' || protocol === 'https:';
	} catch {
		// `new URL` throws on relative or malformed input — treat as invalid.
		return false;
	}
}
|
|
|
|
// POST /extract — server fetches the URL + extracts. Legacy path.
//
// Request body: JSON `{ url: string }`.
// Responses:
//   400 — `url` missing, empty, not a string, or not an http(s) URL
//   502 — `extractFromUrl` returned a falsy result
//   200 — the extracted article fields, plus
//         `warning: 'probable_consent_wall'` when `looksLikeConsentWall`
//         flags the extracted content
//
// NOTE(review): the URL is fetched server-side as supplied and only its
// scheme is validated here. Confirm that `extractFromUrl` (or the network
// layer) rejects loopback/private targets — otherwise this is an SSRF
// vector.
routes.post('/extract', async (c) => {
	// Unparsable JSON falls back to `null`. Valid JSON that is not an
	// object (notably a literal `null`) must also end in a 400, so narrow
	// before touching `.url` instead of trusting the declared body shape.
	const body: unknown = await c.req.json().catch(() => null);
	const url =
		typeof body === 'object' && body !== null ? (body as { url?: unknown }).url : undefined;

	if (typeof url !== 'string' || url === '') {
		return c.json({ error: 'URL is required' }, 400);
	}
	if (!isValidHttpUrl(url)) {
		return c.json({ error: 'Invalid URL' }, 400);
	}

	const extracted = await extractFromUrl(url);
	if (!extracted) {
		return c.json({ error: 'Extraction failed' }, 502);
	}

	// Still return the text, but flag probable consent-wall payloads so
	// the client can offer the bookmarklet path instead.
	const warning = looksLikeConsentWall(extracted.content, extracted.wordCount)
		? 'probable_consent_wall'
		: undefined;

	return c.json({
		originalUrl: url,
		title: extracted.title,
		excerpt: extracted.excerpt,
		content: extracted.content,
		htmlContent: extracted.htmlContent,
		author: extracted.byline,
		siteName: extracted.siteName,
		wordCount: extracted.wordCount,
		readingTimeMinutes: extracted.readingTimeMinutes,
		// Only include the key when a warning was raised.
		...(warning && { warning }),
	});
});
|
|
|
|
// POST /extract/html — client supplies HTML (from the user's browser
// tab, where cookies + JS rendering already happened). We only run
// Readability on it. Cap payload to 10 MiB so a pathological site
// can't exhaust server memory via the bookmarklet — typical rendered
// article HTML is 200-800 KB.
//
// Note: the cap is checked after the JSON body has been parsed, so it
// bounds what is handed to extraction, not the size of the raw request.
const MAX_HTML_BYTES = 10 * 1024 * 1024;

/**
 * Reports whether `text` needs more than `maxBytes` bytes when encoded as
 * UTF-8.
 *
 * `String.prototype.length` counts UTF-16 code units, not bytes, so it
 * under-reports any non-ASCII content. Each code unit encodes to between
 * 1 and 3 UTF-8 bytes (a surrogate pair is 2 units → 4 bytes), which lets
 * the two cheap bounds below settle most inputs without allocating.
 *
 * @param text - String to measure.
 * @param maxBytes - Inclusive upper bound on the encoded size.
 * @returns `true` when the UTF-8 encoding is strictly larger than `maxBytes`.
 */
function exceedsUtf8ByteLimit(text: string, maxBytes: number): boolean {
	// Lower bound: at least one byte per code unit.
	if (text.length > maxBytes) return true;
	// Upper bound: at most three bytes per code unit.
	if (text.length * 3 <= maxBytes) return false;
	// Ambiguous range — measure exactly.
	return new TextEncoder().encode(text).byteLength > maxBytes;
}

// Request body: JSON `{ url: string, html: string }`.
// Responses:
//   400 — `url` or `html` missing/empty/not a string, or `url` not http(s)
//   413 — `html` larger than MAX_HTML_BYTES when encoded as UTF-8
//   502 — `extractFromHtml` returned a falsy result
//   200 — the extracted article fields, plus
//         `warning: 'probable_consent_wall'` when flagged
routes.post('/extract/html', async (c) => {
	// Unparsable JSON falls back to `null`. Valid JSON that is not an
	// object (notably a literal `null`) must also end in a 400, so narrow
	// before reading properties instead of trusting the declared shape.
	const body: unknown = await c.req.json().catch(() => null);
	const fields =
		typeof body === 'object' && body !== null ? (body as { url?: unknown; html?: unknown }) : {};
	const url = fields.url;
	const html = fields.html;

	if (typeof url !== 'string' || url === '') {
		return c.json({ error: 'URL is required' }, 400);
	}
	if (typeof html !== 'string' || html === '') {
		return c.json({ error: 'HTML is required' }, 400);
	}
	if (!isValidHttpUrl(url)) {
		return c.json({ error: 'Invalid URL' }, 400);
	}
	if (exceedsUtf8ByteLimit(html, MAX_HTML_BYTES)) {
		return c.json({ error: 'HTML payload too large' }, 413);
	}

	const extracted = await extractFromHtml(html, url);
	if (!extracted) {
		return c.json({ error: 'Extraction failed' }, 502);
	}

	// The consent-wall heuristic still applies here — a rare case is
	// that the user bookmarklet-fires BEFORE the consent dialog is
	// dismissed. Flag it so the client doesn't silently persist garbage.
	const warning = looksLikeConsentWall(extracted.content, extracted.wordCount)
		? 'probable_consent_wall'
		: undefined;

	return c.json({
		originalUrl: url,
		title: extracted.title,
		excerpt: extracted.excerpt,
		content: extracted.content,
		htmlContent: extracted.htmlContent,
		author: extracted.byline,
		siteName: extracted.siteName,
		wordCount: extracted.wordCount,
		readingTimeMinutes: extracted.readingTimeMinutes,
		// Only include the key when a warning was raised.
		...(warning && { warning }),
	});
});
|
|
|
|
// Expose the router under a module-specific name.
export { routes as articlesRoutes };
|