mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-18 15:49:42 +02:00
JSDOM's CSS parser throws on plenty of real-world pages and the error escapes every try/catch in the buildRow → ingestSource chain because it fires from a parse5 callback that runs after JSDOM has returned. In the prod container this killed the process on the first bad page, docker restarted it, and it crash-looped on the same first source forever — no progress past tech. Two-layer fix: a silent VirtualConsole on every JSDOM instance to swallow CSS / resource errors at the source, plus process-level uncaughtException + unhandledRejection handlers that log and continue so any future async escape can't kill the daemon either. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
104 lines
3.3 KiB
TypeScript
104 lines
3.3 KiB
TypeScript
/**
 * news-ingester — pulls public RSS / JSON feeds into news.curated_articles
 * on a fixed interval. Exposes a tiny Hono server for health + manual
 * trigger so the container can be probed and re-kicked without a restart.
 *
 * Why a long-running container instead of a host cron:
 *   - logs land in the same docker stack as everything else
 *   - restarts on crash via docker
 *   - health endpoint for the docker-compose healthcheck
 *   - lets us hit /ingest/run from a shell to debug new sources without
 *     waiting 15 minutes
 */
|
|
|
|
import { Hono } from 'hono';
|
|
import { sql } from 'drizzle-orm';
|
|
import { loadConfig } from './config';
|
|
import { getDb } from './db/connection';
|
|
import { runIngestTick, type TickResult } from './ingest';
|
|
|
|
// JSDOM (used by the Readability fallback parser) likes to throw async
// CSS / parser errors that escape every try/catch frame in the call
// stack and reach the process top-level. In a long-running daemon
// container we'd rather log + continue than crash + restart, because
// crash-restart loops never make forward progress through the source
// list (each restart begins again at source #0). Catching here is the
// failure-mode boundary for "we attempted to ingest one bad page".
|
|
// Last-resort guard: any exception that reaches the process top level is
// reduced to its message (when it is an Error), logged as a warning, and
// otherwise ignored so the daemon keeps running.
process.on('uncaughtException', function onUncaughtException(err) {
	const detail = err instanceof Error ? err.message : err;
	console.warn('[news-ingester] uncaughtException swallowed:', detail);
});
|
|
// Companion guard for promise rejections nobody handled: log the rejection
// reason (its message when it is an Error, the raw value otherwise) as a
// warning and carry on.
process.on('unhandledRejection', function onUnhandledRejection(reason) {
	const detail = reason instanceof Error ? reason.message : reason;
	console.warn('[news-ingester] unhandledRejection swallowed:', detail);
});
|
|
|
|
// ─── Module state ──────────────────────────────────────────

// True while an ingest tick is in flight; checked before starting another.
let running = false;

// Result of the most recent tick that completed without throwing
// (null until the first one finishes).
let lastTick: TickResult | null = null;

// Configuration is read once at module load; the database handle is
// derived from its `databaseUrl`.
const config = loadConfig();
const db = getDb(config.databaseUrl);
|
|
|
|
/**
 * Runs one ingest pass unless another pass is already in flight.
 *
 * When a pass is running, the call only logs a "skipping" line and returns.
 * Otherwise the `running` flag is raised for the duration of the pass, the
 * result of `runIngestTick` is stored in `lastTick`, and any error is logged
 * rather than propagated. The flag is always lowered again, success or not.
 */
async function tick(): Promise<void> {
	if (!running) {
		running = true;
		try {
			const result = await runIngestTick(db);
			lastTick = result;
		} catch (err) {
			console.error('[news-ingester] tick failed:', err);
		} finally {
			running = false;
		}
		return;
	}

	console.log('[news-ingester] previous tick still running, skipping');
}
|
|
|
|
// ─── Hono app ──────────────────────────────────────────────
|
|
|
|
const app = new Hono();

/**
 * GET /health — liveness + database connectivity probe.
 *
 * Issues a trivial `SELECT 1` against the database. If that fails the
 * endpoint answers 503 with `status: 'degraded'` and logs the cause, so an
 * unhealthy container can be diagnosed from its logs. Otherwise it answers
 * with `status: 'ok'` plus a summary of the last tick and whether a tick is
 * currently running.
 */
app.get('/health', async (c) => {
	try {
		// Cheap connectivity check — don't claim healthy if Postgres is down.
		await db.execute(sql`SELECT 1`);
	} catch (err) {
		// Surface the reason instead of dropping it; the response body stays
		// unchanged so existing healthchecks keep working.
		console.warn(
			'[news-ingester] health check failed:',
			err instanceof Error ? err.message : err
		);
		return c.json({ status: 'degraded', service: 'news-ingester' }, 503);
	}

	return c.json({
		status: 'ok',
		service: 'news-ingester',
		lastTickStartedAt: lastTick?.startedAt ?? null,
		lastTickInserted: lastTick?.totalInserted ?? null,
		running,
	});
});
|
|
|
|
// GET /status — returns the stored result of the last tick, or a
// placeholder message while no tick result has been recorded yet.
app.get('/status', (c) => {
	const payload = lastTick ?? { message: 'no tick yet' };
	return c.json(payload);
});
|
|
|
|
// POST /ingest/run — manual trigger. Answers 409 `busy` while a tick is in
// flight; otherwise starts a tick in the background and answers `started`
// immediately.
app.post('/ingest/run', async (c) => {
	if (!running) {
		// Fire-and-forget; client polls /status.
		void tick();
		return c.json({ status: 'started' });
	}

	return c.json({ status: 'busy' }, 409);
});
|
|
|
|
// ─── Bootstrap ─────────────────────────────────────────────
|
|
|
|
// Delay before the optional startup tick, in milliseconds.
const startupDelayMs = 5_000;

// Starts a tick without awaiting it; shared by the startup timer and the
// recurring interval.
const triggerTick = (): void => {
	void tick();
};

console.log(
	`[news-ingester] starting on port ${config.port}, tick every ${config.tickIntervalMs}ms`
);

if (config.runOnStartup) {
	// Defer one tick so the HTTP server is up first (healthchecks pass
	// while we ingest).
	setTimeout(triggerTick, startupDelayMs);
}

// Recurring schedule: one tick attempt every `tickIntervalMs`.
setInterval(triggerTick, config.tickIntervalMs);

// Port + fetch handler pair exported as the module default — presumably
// picked up by the hosting runtime to start the HTTP server; confirm
// against the container's start command.
export default { port: config.port, fetch: app.fetch };
|