managarten/services/news-ingester/src/index.ts
Till JS dad174a631 fix(news-ingester): silence JSDOM CSS errors + add process-level safety net
JSDOM's CSS parser throws on plenty of real-world pages and the error
escapes every try/catch in the buildRow → ingestSource chain because
it fires from a parse5 callback that runs after JSDOM has returned.
In the prod container this killed the process on the first bad page,
docker restarted it, and it crash-looped on the same first source
forever — no progress past tech.

Two-layer fix: a silent VirtualConsole on every JSDOM instance to
swallow CSS / resource errors at the source, plus process-level
uncaughtException + unhandledRejection handlers that log and continue
so any future async escape can't kill the daemon either.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 16:15:46 +02:00

104 lines
3.3 KiB
TypeScript

/**
* news-ingester — pulls public RSS / JSON feeds into news.curated_articles
* on a fixed interval. Exposes a tiny Hono server for health + manual
* trigger so the container can be probed and re-kicked without a restart.
*
* Why a long-running container instead of a host cron:
* - logs land in the same docker stack as everything else
* - restarts on crash via docker
* - health endpoint for the docker-compose healthcheck
* - lets us POST to /ingest/run from a shell to debug new sources without
* waiting for the next scheduled tick (config.tickIntervalMs, e.g. 15 minutes)
*/
import { Hono } from 'hono';
import { sql } from 'drizzle-orm';
import { loadConfig } from './config';
import { getDb } from './db/connection';
import { runIngestTick, type TickResult } from './ingest';
// Process-level safety net.
//
// JSDOM (used by the Readability fallback parser) can throw async CSS /
// parser errors that escape every try/catch frame in the call stack and
// reach the process top-level. In a long-running daemon container we'd
// rather log + continue than crash + restart, because crash-restart loops
// never make forward progress through the source list (each restart
// begins again at source #0). Catching here is the failure-mode boundary
// for "we attempted to ingest one bad page".
//
// Because these handlers swallow *everything* that reaches the top level,
// they log the full stack trace rather than only the message: a bare
// message gives no way to locate an unexpected bug that was kept alive.

/**
 * Picks the most informative loggable form of a swallowed error.
 *
 * @param reason - whatever was thrown or rejected with; not necessarily an Error.
 * @returns the stack trace for Error instances (the message if no stack is
 *   available), otherwise the value itself, unchanged.
 */
function describeSwallowed(reason: unknown): unknown {
	if (reason instanceof Error) {
		return reason.stack ?? reason.message;
	}
	return reason;
}

process.on('uncaughtException', (err) => {
	console.warn('[news-ingester] uncaughtException swallowed:', describeSwallowed(err));
});

process.on('unhandledRejection', (reason) => {
	console.warn('[news-ingester] unhandledRejection swallowed:', describeSwallowed(reason));
});
// ─── Runtime state ─────────────────────────────────────────
// Configuration and the database handle are resolved once, at module load.
const config = loadConfig();
const db = getDb(config.databaseUrl);

// Result of the most recent tick that finished without throwing; stays
// null until one does.
let lastTick: TickResult | null = null;

// True while a tick is in flight. Acts as a re-entrancy guard so the
// interval timer and manual triggers never overlap.
let running = false;

/**
 * Runs a single ingest pass unless one is already in progress.
 *
 * A failing pass is logged and swallowed, so the returned promise never
 * rejects and callers may fire-and-forget. `lastTick` is only replaced
 * when the pass succeeds; `running` is always reset afterwards.
 */
async function tick(): Promise<void> {
	if (!running) {
		// Claim the guard synchronously, before the first await.
		running = true;
		try {
			const result = await runIngestTick(db);
			lastTick = result;
		} catch (failure) {
			console.error('[news-ingester] tick failed:', failure);
		} finally {
			running = false;
		}
		return;
	}

	console.log('[news-ingester] previous tick still running, skipping');
}
// ─── Hono app ──────────────────────────────────────────────
const app = new Hono();

// GET /health — 503 "degraded" when the database probe fails, otherwise a
// summary of the last successful tick plus the in-flight flag.
app.get('/health', async (c) => {
	let dbReachable = true;
	try {
		// Cheap connectivity check — don't claim healthy if Postgres is down.
		await db.execute(sql`SELECT 1`);
	} catch {
		dbReachable = false;
	}

	if (!dbReachable) {
		return c.json({ status: 'degraded', service: 'news-ingester' }, 503);
	}

	const startedAt = lastTick?.startedAt ?? null;
	const inserted = lastTick?.totalInserted ?? null;
	return c.json({
		status: 'ok',
		service: 'news-ingester',
		lastTickStartedAt: startedAt,
		lastTickInserted: inserted,
		running,
	});
});

// GET /status — the full result of the last successful tick, or a
// placeholder message when none has completed yet.
app.get('/status', (c) => {
	const payload = lastTick ?? { message: 'no tick yet' };
	return c.json(payload);
});

// POST /ingest/run — manual trigger. Answers 409 while a tick is in
// flight; otherwise starts one in the background and returns immediately.
app.post('/ingest/run', async (c) => {
	if (!running) {
		// Fire-and-forget; client polls /status.
		void tick();
		return c.json({ status: 'started' });
	}
	return c.json({ status: 'busy' }, 409);
});
// ─── Bootstrap ─────────────────────────────────────────────
console.log(
	`[news-ingester] starting on port ${config.port}, tick every ${config.tickIntervalMs}ms`
);

/** Timer callback: kicks off a tick and deliberately discards its promise. */
const fireTick = (): void => {
	void tick();
};

if (config.runOnStartup) {
	// Hold the first tick back a few seconds so the HTTP server is already
	// listening (healthchecks pass while we ingest).
	setTimeout(fireTick, 5_000);
}

// Recurring schedule; the re-entrancy guard in tick() skips a run when the
// previous one is still going.
setInterval(fireTick, config.tickIntervalMs);

// Default export of { port, fetch } — presumably picked up by the hosting
// runtime's server convention (NOTE(review): confirm which runtime serves this).
const server = { port: config.port, fetch: app.fetch };
export default server;