Mirror of https://github.com/Memo-2023/mana-monorepo.git (synced 2026-05-17 05:59:39 +02:00)
refactor(shared-rss): extract RSS parsing + Readability into one package
news-ingester and apps/api both shipped their own copy of the rss-parser + jsdom + Readability glue. Single source now in packages/shared-rss. Adds discoverFeeds (rel=alternate scan plus a common-paths probe) and validateFeed, which News Research will use. The JSDOM virtualConsole is silenced once, in the package, instead of at two parallel call sites.

- packages/shared-rss: parse, extract, discover, validate
- services/news-ingester: drop local parsers, depend on @mana/shared-rss
- apps/api: drop @mozilla/readability + jsdom direct deps, use shared

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
parent 5ae7f99fe1
commit b768a0ffce

16 changed files with 414 additions and 252 deletions
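For the consuming services the change is a single dependency swap. A minimal sketch of what an ingest step might look like against the new package, assuming an ESM context; ingestSource and its fallback policy are hypothetical, only the @mana/shared-rss API is from this commit:

import { parseFeedUrl, extractFromUrl } from '@mana/shared-rss';

// hypothetical ingest step: parse the feed, then fill in missing bodies
// with a full-page Readability pass
async function ingestSource(feedUrl: string) {
  const items = await parseFeedUrl(feedUrl);
  for (const item of items) {
    if (!item.content && item.url) {
      const article = await extractFromUrl(item.url);
      if (article) item.content = article.content;
    }
  }
  return items;
}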
packages/shared-rss/src/discover.ts (new file, 128 lines)
@@ -0,0 +1,128 @@
import { JSDOM, VirtualConsole } from 'jsdom';
import { DEFAULT_USER_AGENT, type DiscoveredFeed } from './types';

const silentConsole = new VirtualConsole();

const FEED_TYPES: Record<string, DiscoveredFeed['type']> = {
  'application/rss+xml': 'rss',
  'application/atom+xml': 'atom',
  'application/feed+json': 'unknown',
  'application/json': 'unknown',
};

const COMMON_FEED_PATHS = [
  '/feed',
  '/feed/',
  '/rss',
  '/rss.xml',
  '/atom.xml',
  '/index.xml',
  '/feed.xml',
];

function absolutize(href: string, base: string): string | null {
  try {
    return new URL(href, base).toString();
  } catch {
    return null;
  }
}

/**
 * Discover RSS/Atom feeds linked from a site URL.
 *
 * Strategy: fetch HTML, look for <link rel="alternate"> tags with a
 * feed mime type. That covers ~90% of well-behaved sites. For the rest,
 * the caller can fall back to `probeCommonPaths`.
 */
export async function discoverFeedsFromSite(siteUrl: string): Promise<DiscoveredFeed[]> {
  let html: string;
  try {
    const response = await fetch(siteUrl, {
      headers: { 'User-Agent': DEFAULT_USER_AGENT },
      signal: AbortSignal.timeout(15_000),
      redirect: 'follow',
    });
    if (!response.ok) return [];
    html = await response.text();
  } catch {
    return [];
  }

  const dom = new JSDOM(html, { url: siteUrl, virtualConsole: silentConsole });
  const links = dom.window.document.querySelectorAll(
    'link[rel="alternate"], link[rel~="alternate"]'
  );

  const found = new Map<string, DiscoveredFeed>();
  for (const link of Array.from(links)) {
    const type = (link.getAttribute('type') || '').toLowerCase();
    const href = link.getAttribute('href');
    if (!href || !(type in FEED_TYPES)) continue;

    const abs = absolutize(href, siteUrl);
    if (!abs) continue;

    if (!found.has(abs)) {
      found.set(abs, {
        url: abs,
        title: link.getAttribute('title'),
        type: FEED_TYPES[type] ?? 'unknown',
        siteUrl,
      });
    }
  }

  return Array.from(found.values());
}

/**
 * Probe a handful of common feed paths on a domain. Cheap fallback when
 * discoverFeedsFromSite returns nothing.
 */
export async function probeCommonPaths(siteUrl: string): Promise<DiscoveredFeed[]> {
  const base = (() => {
    try {
      return new URL(siteUrl).origin;
    } catch {
      return null;
    }
  })();
  if (!base) return [];

  const probes = await Promise.all(
    COMMON_FEED_PATHS.map(async (path) => {
      const url = base + path;
      try {
        const res = await fetch(url, {
          method: 'GET',
          headers: {
            'User-Agent': DEFAULT_USER_AGENT,
            Accept: 'application/rss+xml, application/atom+xml, application/xml;q=0.9, */*;q=0.1',
          },
          signal: AbortSignal.timeout(8_000),
          redirect: 'follow',
        });
        if (!res.ok) return null;
        const ct = res.headers.get('content-type') || '';
        if (!/xml|rss|atom/i.test(ct)) return null;
        return {
          url,
          title: null,
          type: ct.includes('atom') ? 'atom' : 'rss',
          siteUrl: base,
        } as DiscoveredFeed;
      } catch {
        return null;
      }
    })
  );

  return probes.filter((p): p is DiscoveredFeed => p !== null);
}

export async function discoverFeeds(siteUrl: string): Promise<DiscoveredFeed[]> {
  const viaLinks = await discoverFeedsFromSite(siteUrl);
  if (viaLinks.length > 0) return viaLinks;
  return probeCommonPaths(siteUrl);
}
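Usage collapses to one call. A minimal sketch, assuming an ESM context with top-level await; the example.com site and the sample <link> tag are illustrative:

// A well-behaved site advertises its feed in <head>, e.g.:
//   <link rel="alternate" type="application/rss+xml"
//         title="Example Blog" href="/feed.xml">
import { discoverFeeds } from '@mana/shared-rss';

const feeds = await discoverFeeds('https://example.com');
// -> [{ url: 'https://example.com/feed.xml', title: 'Example Blog',
//       type: 'rss', siteUrl: 'https://example.com' }]
// If no <link> matched, the same call silently fell back to probing
// /feed, /rss.xml, /atom.xml, etc. on the origin.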
packages/shared-rss/src/extract.ts (new file, 47 lines)
@@ -0,0 +1,47 @@
import { Readability } from '@mozilla/readability';
import { JSDOM, VirtualConsole } from 'jsdom';
import { DEFAULT_USER_AGENT, type ExtractedArticle } from './types';

// JSDOM's default virtualConsole rethrows CSS parse errors as
// uncaughtException, which kills long-running services. A bare
// VirtualConsole with no listeners swallows everything.
const silentConsole = new VirtualConsole();

export async function extractFromHtml(html: string, url: string): Promise<ExtractedArticle | null> {
  try {
    const dom = new JSDOM(html, { url, virtualConsole: silentConsole });
    const article = new Readability(dom.window.document).parse();
    if (!article || !article.textContent) return null;

    const wordCount = article.textContent.split(/\s+/).filter(Boolean).length;
    const readingTimeMinutes = Math.max(1, Math.ceil(wordCount / 200));

    return {
      title: article.title ?? null,
      content: article.textContent,
      htmlContent: article.content ?? '',
      excerpt: article.excerpt || article.textContent.slice(0, 240),
      byline: article.byline ?? null,
      siteName: article.siteName ?? null,
      wordCount,
      readingTimeMinutes,
    };
  } catch {
    return null;
  }
}

export async function extractFromUrl(url: string): Promise<ExtractedArticle | null> {
  let html: string;
  try {
    const response = await fetch(url, {
      headers: { 'User-Agent': DEFAULT_USER_AGENT },
      signal: AbortSignal.timeout(15_000),
    });
    if (!response.ok) return null;
    html = await response.text();
  } catch {
    return null;
  }
  return extractFromHtml(html, url);
}
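The derived fields come straight from the arithmetic above: wordCount is a whitespace-token count, and readingTimeMinutes assumes 200 words per minute with a floor of one minute. A minimal usage sketch; the URL is illustrative:

import { extractFromUrl } from '@mana/shared-rss';

const article = await extractFromUrl('https://example.com/post/hello');
if (article) {
  // 1,000 words -> Math.max(1, Math.ceil(1000 / 200)) = 5 minutes
  console.log(article.title, article.wordCount, article.readingTimeMinutes);
}
// null means the fetch failed, the response was non-2xx, or Readability
// found no article body.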
packages/shared-rss/src/index.ts (new file, 6 lines)
@@ -0,0 +1,6 @@
export type { NormalizedFeedItem, ExtractedArticle, DiscoveredFeed, FeedValidation } from './types';
export { DEFAULT_USER_AGENT } from './types';
export { parseFeedUrl, parseFeedXml, parseFeedMeta, type ParsedFeed } from './parse';
export { extractFromUrl, extractFromHtml } from './extract';
export { discoverFeeds, discoverFeedsFromSite, probeCommonPaths } from './discover';
export { validateFeed } from './validate';
packages/shared-rss/src/parse.ts (new file, 65 lines)
@@ -0,0 +1,65 @@
import Parser from 'rss-parser';
import { DEFAULT_USER_AGENT, type NormalizedFeedItem } from './types';

type CustomItem = {
  'media:content'?: { $: { url: string } };
  'media:thumbnail'?: { $: { url: string } };
  enclosure?: { url?: string };
};

const parser: Parser<unknown, CustomItem> = new Parser({
  timeout: 15_000,
  headers: { 'User-Agent': DEFAULT_USER_AGENT },
  customFields: {
    item: ['media:content', 'media:thumbnail', 'enclosure'],
  },
});

function mapItem(item: unknown): NormalizedFeedItem {
  const i = item as CustomItem & {
    link?: string;
    title?: string;
    content?: string;
    contentSnippet?: string;
    creator?: string;
    author?: string;
    isoDate?: string;
  };

  const imageUrl =
    i['media:content']?.$?.url ?? i['media:thumbnail']?.$?.url ?? i.enclosure?.url ?? null;

  return {
    url: i.link ?? '',
    title: i.title ?? '',
    excerpt: i.contentSnippet ?? null,
    content: i.contentSnippet ?? null,
    htmlContent: i.content ?? null,
    author: i.creator ?? i.author ?? null,
    imageUrl,
    publishedAt: i.isoDate ? new Date(i.isoDate) : null,
  };
}

export async function parseFeedUrl(url: string): Promise<NormalizedFeedItem[]> {
  const feed = await parser.parseURL(url);
  return (feed.items ?? []).map(mapItem);
}

export async function parseFeedXml(xml: string): Promise<NormalizedFeedItem[]> {
  const feed = await parser.parseString(xml);
  return (feed.items ?? []).map(mapItem);
}

export interface ParsedFeed {
  title: string | null;
  items: NormalizedFeedItem[];
}

export async function parseFeedMeta(url: string): Promise<ParsedFeed> {
  const feed = await parser.parseURL(url);
  return {
    title: feed.title ?? null,
    items: (feed.items ?? []).map(mapItem),
  };
}
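To make the media fallback concrete, a sketch of one item flowing through parseFeedXml, assuming an ESM context with top-level await; the XML is illustrative:

import { parseFeedXml } from '@mana/shared-rss';

const xml = `<?xml version="1.0"?>
<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">
  <channel><title>Example</title>
    <item>
      <title>Hello</title>
      <link>https://example.com/hello</link>
      <media:thumbnail url="https://example.com/t.jpg"/>
      <pubDate>Mon, 01 Jan 2024 00:00:00 GMT</pubDate>
    </item>
  </channel>
</rss>`;

const [item] = await parseFeedXml(xml);
// item.imageUrl === 'https://example.com/t.jpg': media:content wins over
// media:thumbnail, which wins over enclosure. item.publishedAt is a Date
// built from the isoDate that rss-parser derives from pubDate.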
packages/shared-rss/src/types.ts (new file, 38 lines)
@@ -0,0 +1,38 @@
export interface NormalizedFeedItem {
  url: string;
  title: string;
  excerpt: string | null;
  content: string | null;
  htmlContent: string | null;
  author: string | null;
  imageUrl: string | null;
  publishedAt: Date | null;
}

export interface ExtractedArticle {
  title: string | null;
  content: string;
  htmlContent: string;
  excerpt: string;
  byline: string | null;
  siteName: string | null;
  wordCount: number;
  readingTimeMinutes: number;
}

export interface DiscoveredFeed {
  url: string;
  title: string | null;
  type: 'rss' | 'atom' | 'unknown';
  siteUrl: string | null;
}

export interface FeedValidation {
  ok: boolean;
  itemCount: number;
  title: string | null;
  sample: NormalizedFeedItem[];
  error?: string;
}

export const DEFAULT_USER_AGENT = 'Mozilla/5.0 (compatible; ManaRSS/1.0; +https://mana.how)';
packages/shared-rss/src/validate.ts (new file, 24 lines)
@@ -0,0 +1,24 @@
import { parseFeedMeta } from './parse';
import type { FeedValidation } from './types';

const SAMPLE_LIMIT = 5;

export async function validateFeed(url: string): Promise<FeedValidation> {
  try {
    const parsed = await parseFeedMeta(url);
    return {
      ok: parsed.items.length > 0,
      itemCount: parsed.items.length,
      title: parsed.title,
      sample: parsed.items.slice(0, SAMPLE_LIMIT),
    };
  } catch (err) {
    return {
      ok: false,
      itemCount: 0,
      title: null,
      sample: [],
      error: err instanceof Error ? err.message : String(err),
    };
  }
}
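The shape fits the News Research flow named in the commit message: validate before persisting a source. A hedged sketch; the candidate URL and the rejection handling are hypothetical:

import { validateFeed } from '@mana/shared-rss';

const candidateUrl = 'https://example.com/feed.xml'; // illustrative
const check = await validateFeed(candidateUrl);
if (!check.ok) {
  // error is set on parse failure; ok is also false for an empty feed
  throw new Error(`feed rejected: ${check.error ?? 'no items'}`);
}
// check.sample carries up to SAMPLE_LIMIT (5) normalized items for preview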