From dad174a63160eb32519c0db69fe87b92bd5fe3ff Mon Sep 17 00:00:00 2001 From: Till JS Date: Thu, 9 Apr 2026 16:15:46 +0200 Subject: [PATCH] fix(news-ingester): silence JSDOM CSS errors + add process-level safety net MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit JSDOM's CSS parser throws on plenty of real-world pages and the error escapes every try/catch in the buildRow → ingestSource chain because it fires from a parse5 callback that runs after JSDOM has returned. In the prod container this killed the process on the first bad page, docker restarted it, and it crash-looped on the same first source forever — no progress past tech. Two-layer fix: a silent VirtualConsole on every JSDOM instance to swallow CSS / resource errors at the source, plus process-level uncaughtException + unhandledRejection handlers that log and continue so any future async escape can't kill the daemon either. Co-Authored-By: Claude Opus 4.6 (1M context) --- services/news-ingester/src/index.ts | 20 +++++++++++++++++++ .../news-ingester/src/parsers/readability.ts | 12 +++++++++-- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/services/news-ingester/src/index.ts b/services/news-ingester/src/index.ts index 48954581c..6deb51e94 100644 --- a/services/news-ingester/src/index.ts +++ b/services/news-ingester/src/index.ts @@ -17,6 +17,26 @@ import { loadConfig } from './config'; import { getDb } from './db/connection'; import { runIngestTick, type TickResult } from './ingest'; +// JSDOM (used by the Readability fallback parser) likes to throw async +// CSS / parser errors that escape every try/catch frame in the call +// stack and reach the process top-level. In a long-running daemon +// container we'd rather log + continue than crash + restart, because +// crash-restart loops never make forward progress through the source +// list (each restart begins again at source #0). Catching here is the +// failure-mode boundary for "we attempted to ingest one bad page". +process.on('uncaughtException', (err) => { + console.warn( + '[news-ingester] uncaughtException swallowed:', + err instanceof Error ? err.message : err + ); +}); +process.on('unhandledRejection', (reason) => { + console.warn( + '[news-ingester] unhandledRejection swallowed:', + reason instanceof Error ? reason.message : reason + ); +}); + const config = loadConfig(); const db = getDb(config.databaseUrl); diff --git a/services/news-ingester/src/parsers/readability.ts b/services/news-ingester/src/parsers/readability.ts index 57dbe1544..a9679a995 100644 --- a/services/news-ingester/src/parsers/readability.ts +++ b/services/news-ingester/src/parsers/readability.ts @@ -8,7 +8,15 @@ */ import { Readability } from '@mozilla/readability'; -import { JSDOM } from 'jsdom'; +import { JSDOM, VirtualConsole } from 'jsdom'; + +// JSDOM emits CSS parse errors and resource-loading warnings via its +// virtualConsole. The default console rethrows some of those as +// uncaughtException, which in a long-running ingester loop kills the +// whole process. A bare VirtualConsole with no listeners swallows +// everything quietly — exactly what we want for a "best-effort article +// extractor" that doesn't care about CSS or sub-resources. +const silentConsole = new VirtualConsole(); export interface ExtractedArticle { title: string | null; @@ -37,7 +45,7 @@ export async function fetchAndExtract(url: string): Promise