managarten/packages/shared-rss/src/extract.ts
Till JS b768a0ffce refactor(shared-rss): extract RSS parsing + Readability into one package
news-ingester and apps/api both shipped their own copy of rss-parser
+ jsdom + Readability glue. Single source now in packages/shared-rss.
Adds discoverFeeds (rel=alternate + common-paths probe) and validateFeed
which News Research will use. JSDOM virtualConsole is silenced once,
in the package, instead of in two parallel call sites.

- packages/shared-rss: parse, extract, discover, validate
- services/news-ingester: drop local parsers, depend on @mana/shared-rss
- apps/api: drop @mozilla/readability + jsdom direct deps, use shared

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-15 22:30:44 +02:00

47 lines
1.5 KiB
TypeScript

import { Readability } from '@mozilla/readability';
import { JSDOM, VirtualConsole } from 'jsdom';
import { DEFAULT_USER_AGENT, type ExtractedArticle } from './types';
// JSDOM's default virtualConsole rethrows CSS parse errors as
// uncaughtException, which kills long-running services. A bare
// VirtualConsole with no listeners swallows everything.
// One shared instance is deliberately reused across every parse —
// it is stateless, so per-call construction would be pure overhead.
const silentConsole = new VirtualConsole();
export async function extractFromHtml(html: string, url: string): Promise<ExtractedArticle | null> {
try {
const dom = new JSDOM(html, { url, virtualConsole: silentConsole });
const article = new Readability(dom.window.document).parse();
if (!article || !article.textContent) return null;
const wordCount = article.textContent.split(/\s+/).filter(Boolean).length;
const readingTimeMinutes = Math.max(1, Math.ceil(wordCount / 200));
return {
title: article.title ?? null,
content: article.textContent,
htmlContent: article.content ?? '',
excerpt: article.excerpt || article.textContent.slice(0, 240),
byline: article.byline ?? null,
siteName: article.siteName ?? null,
wordCount,
readingTimeMinutes,
};
} catch {
return null;
}
}
/**
 * Fetch a URL (15 s timeout, package user agent) and run article
 * extraction on its HTML body.
 *
 * @param url - Page to fetch and extract.
 * @returns The extracted article, or `null` on network failure, timeout,
 *          non-2xx status, or when no article can be extracted.
 */
export async function extractFromUrl(url: string): Promise<ExtractedArticle | null> {
  // Network errors and timeouts collapse to null; a failed fetch is an
  // expected outcome here, not an exception.
  const response = await fetch(url, {
    headers: { 'User-Agent': DEFAULT_USER_AGENT },
    signal: AbortSignal.timeout(15_000),
  }).catch(() => null);
  if (response === null || !response.ok) return null;

  // Body read can also abort mid-stream (same timeout signal).
  const body = await response.text().catch(() => null);
  if (body === null) return null;

  return extractFromHtml(body, url);
}