From 52159ee07a570c9a62faa05fb2b4f815ad27103c Mon Sep 17 00:00:00 2001
From: Till JS
Date: Thu, 9 Apr 2026 16:21:09 +0200
Subject: [PATCH] fix(news-ingester): disable Readability fallback to break
 crash loop
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

JSDOM throws CSS / parser errors from detached parse5 callbacks that
escape every try/catch in the call stack and even bun's
process.on('uncaughtException') handlers — leaving the daemon stuck
crash-looping past the first bad page in source #4 (heise) without
ever making forward progress.

Set FULL_TEXT_THRESHOLD_WORDS = 0 so we never call into Readability.
Sources that ship full RSS bodies (Tagesschau, Spiegel, BBC, …) are
unaffected. Title-only sources (Hacker News) keep the row with an
empty content field; the reader already falls back to
"Original öffnen ↗" in that case. Re-enabling extraction in a worker
thread is left for a follow-up.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 services/news-ingester/src/ingest.ts | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/services/news-ingester/src/ingest.ts b/services/news-ingester/src/ingest.ts
index 0245abd4d..ceed677ee 100644
--- a/services/news-ingester/src/ingest.ts
+++ b/services/news-ingester/src/ingest.ts
@@ -25,8 +25,22 @@ import { fetchAndExtract } from './parsers/readability';
 
 const RETENTION_DAYS = 30;
 
-/** Min word count to consider an RSS body "full enough" to skip Readability. */
-const FULL_TEXT_THRESHOLD_WORDS = 200;
+/**
+ * Min word count to consider an RSS body "full enough" to skip Readability.
+ *
+ * Set to 0 to disable Readability entirely. The fallback was useful for
+ * sources like Hacker News where the RSS only ships titles, but JSDOM's
+ * CSS parser throws on a meaningful fraction of real-world pages and
+ * those throws happen in detached parse5 callbacks that escape every
+ * try/catch frame *and* bun's process-level handlers — leaving us with
+ * a crash-restart loop that never makes forward progress past the
+ * first bad page in source #4. Storing the RSS excerpt and letting the
+ * client reader offer "Open original ↗" is the robust fallback.
+ *
+ * If we want full text back later: run Readability in a worker thread
+ * or out-of-process so the parent stays alive when JSDOM blows up.
+ */
+const FULL_TEXT_THRESHOLD_WORDS = 0;
 function hashUrl(url: string): string {
   return createHash('sha256').update(url).digest('hex');
 }
@@ -63,7 +77,7 @@ async function buildRow(
   let imageUrl = item.imageUrl;
 
   const initialWords = wordCountOf(content);
-  if (initialWords < FULL_TEXT_THRESHOLD_WORDS) {
+  if (FULL_TEXT_THRESHOLD_WORDS > 0 && initialWords < FULL_TEXT_THRESHOLD_WORDS) {
     const extracted = await fetchAndExtract(item.url);
     if (extracted) {
       content = extracted.content;
@@ -75,8 +89,12 @@
     }
   }
 
+  // If the RSS gave nothing usable (Hacker News only ships titles +
+  // urls) keep the row anyway so the title is searchable / clickable
+  // and the reader can fall back to "Original öffnen ↗". The empty
+  // `content` is the signal the reader uses to skip prose rendering.
+  if (!content) content = excerpt ?? '';
   const words = wordCountOf(content);
-  if (words === 0) return null; // nothing usable, skip
 
   return {
     urlHash: hashUrl(item.url),