mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-15 01:41:08 +02:00
fix(news-ingester): silence JSDOM CSS errors + add process-level safety net
JSDOM's CSS parser throws on plenty of real-world pages, and the error escapes every try/catch in the buildRow → ingestSource chain because it fires from a parse5 callback that runs after JSDOM has returned. In the prod container this killed the process on the first bad page, docker restarted it, and it crash-looped on the same first source forever — it never made progress past "tech". Two-layer fix: a silent VirtualConsole on every JSDOM instance to swallow CSS / resource errors at the source, plus process-level uncaughtException + unhandledRejection handlers that log and continue, so that any future async escape can't kill the daemon either. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
bd748b0a14
commit
dad174a631
2 changed files with 30 additions and 2 deletions
|
|
@ -17,6 +17,26 @@ import { loadConfig } from './config';
|
|||
import { getDb } from './db/connection';
|
||||
import { runIngestTick, type TickResult } from './ingest';
|
||||
|
||||
// JSDOM (used by the Readability fallback parser) likes to throw async
|
||||
// CSS / parser errors that escape every try/catch frame in the call
|
||||
// stack and reach the process top-level. In a long-running daemon
|
||||
// container we'd rather log + continue than crash + restart, because
|
||||
// crash-restart loops never make forward progress through the source
|
||||
// list (each restart begins again at source #0). Catching here is the
|
||||
// failure-mode boundary for "we attempted to ingest one bad page".
|
||||
// Process-level safety net: log any exception that reached the top level and
// keep the daemon running instead of letting Node terminate the process.
// NOTE(review): Node's documentation warns that continuing after an
// 'uncaughtException' can leave the process in an undefined state — confirm
// that log-and-continue is acceptable for every error class that can land here.
process.on('uncaughtException', (err: unknown) => {
	// Log the stack rather than only the message: this handler swallows the
	// error, so the log line is the only place its origin is recorded.
	// Non-Error values are logged as-is.
	const detail = err instanceof Error ? (err.stack ?? err.message) : err;
	console.warn('[news-ingester] uncaughtException swallowed:', detail);
});
|
||||
// Process-level safety net for promise rejections that nothing handled:
// log the reason and carry on rather than letting the rejection end the process.
process.on('unhandledRejection', (reason: unknown) => {
	// Rejection reasons are not guaranteed to be Error instances. When one is,
	// log its stack (falling back to the message) so the swallowed failure can
	// still be traced; anything else is logged as-is.
	const detail = reason instanceof Error ? (reason.stack ?? reason.message) : reason;
	console.warn('[news-ingester] unhandledRejection swallowed:', detail);
});
|
||||
|
||||
// Load the ingester configuration at module start-up, after the process-level
// error handlers above have been registered.
const config = loadConfig();
|
||||
// Open the database connection using the URL taken from the loaded config.
const db = getDb(config.databaseUrl);
|
||||
|
||||
|
|
|
|||
|
|
@ -8,7 +8,15 @@
|
|||
*/
|
||||
|
||||
import { Readability } from '@mozilla/readability';
|
||||
import { JSDOM } from 'jsdom';
|
||||
import { JSDOM, VirtualConsole } from 'jsdom';
|
||||
|
||||
// JSDOM emits CSS parse errors and resource-loading warnings via its
|
||||
// virtualConsole. The default console rethrows some of those as
|
||||
// uncaughtException, which in a long-running ingester loop kills the
|
||||
// whole process. A bare VirtualConsole with no listeners swallows
|
||||
// everything quietly — exactly what we want for a "best-effort article
|
||||
// extractor" that doesn't care about CSS or sub-resources.
|
||||
// Module-level instance with no listeners attached; it is passed as the
// `virtualConsole` option when fetchAndExtract constructs a JSDOM.
const silentConsole = new VirtualConsole();
|
||||
|
||||
export interface ExtractedArticle {
|
||||
title: string | null;
|
||||
|
|
@ -37,7 +45,7 @@ export async function fetchAndExtract(url: string): Promise<ExtractedArticle | n
|
|||
}
|
||||
|
||||
try {
|
||||
const dom = new JSDOM(html, { url });
|
||||
const dom = new JSDOM(html, { url, virtualConsole: silentConsole });
|
||||
const reader = new Readability(dom.window.document);
|
||||
const article = reader.parse();
|
||||
if (!article || !article.textContent) return null;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue