refactor(shared-rss): extract RSS parsing + Readability into one package

news-ingester and apps/api both shipped their own copy of the rss-parser + jsdom + Readability glue. The single source now lives in packages/shared-rss. Adds discoverFeeds (rel=alternate + common-paths probe) and validateFeed, which News Research will use. The JSDOM virtualConsole is silenced once, in the package, instead of in two parallel call sites.

- packages/shared-rss: parse, extract, discover, validate
- services/news-ingester: drop local parsers, depend on @mana/shared-rss
- apps/api: drop @mozilla/readability + jsdom direct deps, use shared

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-18 20:01:22 +02:00 · 2026-04-15 22:30:44 +02:00 · 2026-04-15 22:30:44 +02:00 · b768a0ffce
commit b768a0ffce
parent 5ae7f99fe1
16 changed files with 414 additions and 252 deletions
--- a/services/news-ingester/package.json
+++ b/services/news-ingester/package.json
@@ -11,15 +11,12 @@
 		"db:studio": "drizzle-kit studio"
 	},
 	"dependencies": {
-		"@mozilla/readability": "^0.5.0",
+		"@mana/shared-rss": "workspace:*",
 		"drizzle-orm": "^0.38.3",
 		"hono": "^4.7.0",
-		"jsdom": "^25.0.1",
-		"postgres": "^3.4.5",
-		"rss-parser": "^3.13.0"
+		"postgres": "^3.4.5"
 	},
 	"devDependencies": {
-		"@types/jsdom": "^21.1.7",
 		"drizzle-kit": "^0.30.4",
 		"typescript": "^5.9.3"
 	}
--- a/services/news-ingester/src/ingest.ts
+++ b/services/news-ingester/src/ingest.ts
@@ -19,9 +19,8 @@ import { sql } from 'drizzle-orm';
 import type { Database } from './db/connection';
 import { curatedArticles, type NewCuratedArticle } from './db/schema';
 import { SOURCES, type NewsSource } from './sources';
-import { fetchFeed, type NormalizedFeedItem } from './parsers/rss';
+import { parseFeedUrl, extractFromUrl, type NormalizedFeedItem } from '@mana/shared-rss';
 import { fetchHackerNews } from './parsers/hn';
-import { fetchAndExtract } from './parsers/readability';

 const RETENTION_DAYS = 30;

@@ -57,7 +56,7 @@ function readingMinutes(words: number): number {

 async function fetchSourceItems(source: NewsSource): Promise<NormalizedFeedItem[]> {
 	if (source.type === 'hn') return fetchHackerNews(source.url);
-	return fetchFeed(source.url);
+	return parseFeedUrl(source.url);
 }

 /**
@@ -78,7 +77,7 @@ async function buildRow(

 	const initialWords = wordCountOf(content);
 	if (FULL_TEXT_THRESHOLD_WORDS > 0 && initialWords < FULL_TEXT_THRESHOLD_WORDS) {
-		const extracted = await fetchAndExtract(item.url);
+		const extracted = await extractFromUrl(item.url);
 		if (extracted) {
 			content = extracted.content;
 			htmlContent = extracted.htmlContent || htmlContent;
--- a/services/news-ingester/src/parsers/hn.ts
+++ b/services/news-ingester/src/parsers/hn.ts
@@ -8,7 +8,7 @@
 * struggles with and which isn't the user's expectation for a news feed.
 */

-import type { NormalizedFeedItem } from './rss';
+import type { NormalizedFeedItem } from '@mana/shared-rss';

 interface HnItem {
 	id: number;
--- a/services/news-ingester/src/parsers/readability.ts
+++ b/services/news-ingester/src/parsers/readability.ts
@@ -1,69 +0,0 @@
-/**
- * Mozilla Readability fallback. Used when an RSS item only ships an
- * excerpt, so we fetch the original page and extract the article body.
- *
- * Kept dependency-local to the ingester so this service is the canonical
- * "content acquisition" boundary — apps/api never has to call out to a
- * crawler.
- */
-
-import { Readability } from '@mozilla/readability';
-import { JSDOM, VirtualConsole } from 'jsdom';
-
-// JSDOM emits CSS parse errors and resource-loading warnings via its
-// virtualConsole. The default console rethrows some of those as
-// uncaughtException, which in a long-running ingester loop kills the
-// whole process. A bare VirtualConsole with no listeners swallows
-// everything quietly — exactly what we want for a "best-effort article
-// extractor" that doesn't care about CSS or sub-resources.
-const silentConsole = new VirtualConsole();
-
-export interface ExtractedArticle {
-	title: string | null;
-	content: string;
-	htmlContent: string;
-	excerpt: string;
-	byline: string | null;
-	siteName: string | null;
-	wordCount: number;
-	readingTimeMinutes: number;
-}
-
-const USER_AGENT = 'Mozilla/5.0 (compatible; ManaNewsIngester/1.0; +https://mana.how/news)';
-
-export async function fetchAndExtract(url: string): Promise<ExtractedArticle | null> {
-	let html: string;
-	try {
-		const response = await fetch(url, {
-			headers: { 'User-Agent': USER_AGENT },
-			signal: AbortSignal.timeout(15_000),
-		});
-		if (!response.ok) return null;
-		html = await response.text();
-	} catch {
-		return null;
-	}
-
-	try {
-		const dom = new JSDOM(html, { url, virtualConsole: silentConsole });
-		const reader = new Readability(dom.window.document);
-		const article = reader.parse();
-		if (!article || !article.textContent) return null;
-
-		const wordCount = article.textContent.split(/\s+/).filter(Boolean).length;
-		const readingTimeMinutes = Math.max(1, Math.ceil(wordCount / 200));
-
-		return {
-			title: article.title ?? null,
-			content: article.textContent,
-			htmlContent: article.content ?? '',
-			excerpt: article.excerpt || article.textContent.slice(0, 240),
-			byline: article.byline ?? null,
-			siteName: article.siteName ?? null,
-			wordCount,
-			readingTimeMinutes,
-		};
-	} catch {
-		return null;
-	}
-}
--- a/services/news-ingester/src/parsers/rss.ts
+++ b/services/news-ingester/src/parsers/rss.ts
@@ -1,72 +0,0 @@
-/**
- * RSS/Atom parser wrapper. Returns a normalized shape so the ingest loop
- * doesn't have to know about feed-format quirks.
- *
- * `rss-parser` handles both RSS 2.0 and Atom transparently. We pull out
- * the bits we need (link, title, content/snippet, image, date) and let
- * the ingester decide whether to call Readability for full-text.
- */
-
-import Parser from 'rss-parser';
-
-export interface NormalizedFeedItem {
-	url: string;
-	title: string;
-	excerpt: string | null;
-	content: string | null;
-	htmlContent: string | null;
-	author: string | null;
-	imageUrl: string | null;
-	publishedAt: Date | null;
-}
-
-type CustomItem = {
-	'media:content'?: { $: { url: string } };
-	'media:thumbnail'?: { $: { url: string } };
-	enclosure?: { url?: string };
-};
-
-const parser: Parser<unknown, CustomItem> = new Parser({
-	timeout: 15_000,
-	headers: {
-		'User-Agent': 'Mozilla/5.0 (compatible; ManaNewsIngester/1.0; +https://mana.how/news)',
-	},
-	customFields: {
-		item: ['media:content', 'media:thumbnail', 'enclosure'],
-	},
-});
-
-export async function fetchFeed(url: string): Promise<NormalizedFeedItem[]> {
-	const feed = await parser.parseURL(url);
-
-	return (feed.items ?? []).map((item) => {
-		// rss-parser stuffs full HTML in `content` for Atom, plain in `contentSnippet`.
-		const html = (item as { content?: string }).content ?? null;
-		const text = (item as { contentSnippet?: string }).contentSnippet ?? null;
-
-		// Image: try a few common locations.
-		const mediaContent = item['media:content']?.$?.url;
-		const mediaThumb = item['media:thumbnail']?.$?.url;
-		const enclosureUrl = item.enclosure?.url;
-		const imageUrl = mediaContent ?? mediaThumb ?? enclosureUrl ?? null;
-
-		const link = (item as { link?: string }).link ?? '';
-		const title = (item as { title?: string }).title ?? '';
-		const author =
-			(item as { creator?: string; author?: string }).creator ??
-			(item as { author?: string }).author ??
-			null;
-		const isoDate = (item as { isoDate?: string }).isoDate ?? null;
-
-		return {
-			url: link,
-			title,
-			excerpt: text,
-			content: text,
-			htmlContent: html,
-			author,
-			imageUrl,
-			publishedAt: isoDate ? new Date(isoDate) : null,
-		};
-	});
-}