refactor(shared-rss): extract RSS parsing + Readability into one package

news-ingester and apps/api both shipped their own copy of the rss-parser
+ jsdom + Readability glue. That code now lives in a single source,
packages/shared-rss. Adds discoverFeeds (a rel=alternate scan plus a
common-paths probe) and validateFeed, which News Research will use. The
JSDOM virtualConsole is silenced once, in the package, instead of at two
parallel call sites.

- packages/shared-rss: parse, extract, discover, validate
- services/news-ingester: drop local parsers, depend on @mana/shared-rss
- apps/api: drop @mozilla/readability + jsdom direct deps, use shared
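
A minimal sketch of the new surface, using the names exported from
packages/shared-rss/src/index.ts (the site URL is illustrative):

import { discoverFeeds, validateFeed, parseFeedUrl } from '@mana/shared-rss';

const [feed] = await discoverFeeds('https://blog.example.com');
if (feed) {
  const check = await validateFeed(feed.url);
  if (check.ok) {
    // check.sample already holds up to five normalized items for preview.
    const items = await parseFeedUrl(feed.url);
    console.log(check.title, items.length);
  }
}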

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Till JS 2026-04-15 22:30:44 +02:00
commit b768a0ffce (parent 5ae7f99fe1)
16 changed files with 414 additions and 252 deletions


@@ -16,20 +16,18 @@
"@ai-sdk/openai-compatible": "^2.0.41",
"@mana/media-client": "workspace:*",
"@mana/shared-hono": "workspace:*",
"@mana/shared-rss": "workspace:*",
"@mana/shared-storage": "workspace:*",
"@mana/shared-types": "workspace:^",
"@mozilla/readability": "^0.5.0",
"ai": "^6.0.154",
"drizzle-orm": "^0.38.0",
"hono": "^4.7.0",
"jsdom": "^25.0.0",
"postgres": "^3.4.0",
"rrule": "^2.8.1",
"zod": "^3.23.0"
},
"devDependencies": {
"@types/bun": "latest",
"@types/jsdom": "^21.1.0",
"drizzle-kit": "^0.30.0",
"typescript": "^5.8.0"
}


@@ -11,8 +11,7 @@
*/
import { Hono } from 'hono';
import { Readability } from '@mozilla/readability';
import { JSDOM } from 'jsdom';
import { extractFromUrl } from '@mana/shared-rss';
import { drizzle } from 'drizzle-orm/postgres-js';
import { sql } from 'drizzle-orm';
import { getConnection } from '../../lib/db';
@@ -21,54 +20,6 @@ import { getConnection } from '../../lib/db';
const db = drizzle(getConnection());
// ─── Extract Service (Readability fallback for ad-hoc URLs) ─
interface ExtractedArticle {
title: string;
content: string;
htmlContent: string;
excerpt: string;
byline: string | null;
siteName: string | null;
wordCount: number;
readingTimeMinutes: number;
}
async function extractFromUrl(url: string): Promise<ExtractedArticle> {
const response = await fetch(url, {
headers: {
'User-Agent': 'Mozilla/5.0 (compatible; ManaNews/1.0; +https://mana.how)',
},
});
if (!response.ok) {
throw new Error(`Failed to fetch URL: ${response.status}`);
}
const html = await response.text();
const dom = new JSDOM(html, { url });
const reader = new Readability(dom.window.document);
const article = reader.parse();
if (!article) {
throw new Error('Could not extract article content');
}
const wordCount = article.textContent.split(/\s+/).filter(Boolean).length;
const readingTimeMinutes = Math.max(1, Math.ceil(wordCount / 200));
return {
title: article.title,
content: article.textContent,
htmlContent: article.content,
excerpt: article.excerpt || article.textContent.slice(0, 200),
byline: article.byline || null,
siteName: article.siteName || null,
wordCount,
readingTimeMinutes,
};
}
// ─── Routes ─────────────────────────────────────────────────
const routes = new Hono();
@@ -150,40 +101,33 @@ routes.post('/extract/preview', async (c) => {
const { url } = await c.req.json<{ url: string }>();
if (!url) return c.json({ error: 'URL is required' }, 400);
try {
const article = await extractFromUrl(url);
return c.json(article);
} catch (err) {
return c.json({ error: err instanceof Error ? err.message : 'Extraction failed' }, 500);
}
const article = await extractFromUrl(url);
if (!article) return c.json({ error: 'Extraction failed' }, 502);
return c.json(article);
});
routes.post('/extract/save', async (c) => {
const { url } = await c.req.json<{ url: string }>();
if (!url) return c.json({ error: 'URL is required' }, 400);
try {
const extracted = await extractFromUrl(url);
const extracted = await extractFromUrl(url);
if (!extracted) return c.json({ error: 'Extraction failed' }, 502);
// Return extracted data — client saves to local-first store.
return c.json({
id: crypto.randomUUID(),
type: 'saved',
sourceOrigin: 'user_saved',
originalUrl: url,
title: extracted.title,
content: extracted.content,
htmlContent: extracted.htmlContent,
excerpt: extracted.excerpt,
author: extracted.byline,
siteName: extracted.siteName,
wordCount: extracted.wordCount,
readingTimeMinutes: extracted.readingTimeMinutes,
isArchived: false,
});
} catch (err) {
return c.json({ error: err instanceof Error ? err.message : 'Extraction failed' }, 500);
}
return c.json({
id: crypto.randomUUID(),
type: 'saved',
sourceOrigin: 'user_saved',
originalUrl: url,
title: extracted.title,
content: extracted.content,
htmlContent: extracted.htmlContent,
excerpt: extracted.excerpt,
author: extracted.byline,
siteName: extracted.siteName,
wordCount: extracted.wordCount,
readingTimeMinutes: extracted.readingTimeMinutes,
isArchived: false,
});
});
export { routes as newsRoutes };
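
A hedged client-side sketch of the preview route's new contract (the
/api/news mount prefix and the article URL are assumptions; the request
body and the 400/502 error shape come from the handler above):

const res = await fetch('/api/news/extract/preview', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({ url: 'https://example.com/some-article' }),
});
if (!res.ok) {
  // 400 when url is missing, 502 when extraction returned null.
  const { error } = await res.json();
  throw new Error(error);
}
const article = await res.json(); // ExtractedArticle from @mana/shared-rss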


@@ -0,0 +1,32 @@
{
"name": "@mana/shared-rss",
"version": "0.1.0",
"private": true,
"sideEffects": false,
"description": "RSS/Atom parsing, article extraction, and feed discovery primitives.",
"main": "./src/index.ts",
"types": "./src/index.ts",
"exports": {
".": "./src/index.ts",
"./parse": "./src/parse.ts",
"./extract": "./src/extract.ts",
"./discover": "./src/discover.ts",
"./validate": "./src/validate.ts",
"./types": "./src/types.ts"
},
"scripts": {
"type-check": "tsc --noEmit",
"clean": "rm -rf dist",
"lint": "eslint ."
},
"dependencies": {
"@mozilla/readability": "^0.5.0",
"jsdom": "^25.0.1",
"rss-parser": "^3.13.0"
},
"devDependencies": {
"@types/jsdom": "^21.1.7",
"@types/node": "^24.10.1",
"typescript": "^5.9.3"
}
}
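
The exports map lets consumers deep-import a single module instead of the
package root, for example (illustrative):

import { extractFromUrl } from '@mana/shared-rss/extract';
import type { DiscoveredFeed } from '@mana/shared-rss/types';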


@@ -0,0 +1,128 @@
import { JSDOM, VirtualConsole } from 'jsdom';
import { DEFAULT_USER_AGENT, type DiscoveredFeed } from './types';
const silentConsole = new VirtualConsole();
const FEED_TYPES: Record<string, DiscoveredFeed['type']> = {
'application/rss+xml': 'rss',
'application/atom+xml': 'atom',
'application/feed+json': 'unknown',
'application/json': 'unknown',
};
const COMMON_FEED_PATHS = [
'/feed',
'/feed/',
'/rss',
'/rss.xml',
'/atom.xml',
'/index.xml',
'/feed.xml',
];
function absolutize(href: string, base: string): string | null {
try {
return new URL(href, base).toString();
} catch {
return null;
}
}
/**
* Discover RSS/Atom feeds linked from a site URL.
*
* Strategy: fetch HTML, look for <link rel="alternate"> tags with a
* feed mime type. That covers ~90% of well-behaved sites. For the rest,
* the caller can fall back to `probeCommonPaths`.
*/
export async function discoverFeedsFromSite(siteUrl: string): Promise<DiscoveredFeed[]> {
let html: string;
try {
const response = await fetch(siteUrl, {
headers: { 'User-Agent': DEFAULT_USER_AGENT },
signal: AbortSignal.timeout(15_000),
redirect: 'follow',
});
if (!response.ok) return [];
html = await response.text();
} catch {
return [];
}
const dom = new JSDOM(html, { url: siteUrl, virtualConsole: silentConsole });
const links = dom.window.document.querySelectorAll(
'link[rel="alternate"], link[rel~="alternate"]'
);
const found = new Map<string, DiscoveredFeed>();
for (const link of Array.from(links)) {
const type = (link.getAttribute('type') || '').toLowerCase();
const href = link.getAttribute('href');
if (!href || !(type in FEED_TYPES)) continue;
const abs = absolutize(href, siteUrl);
if (!abs) continue;
if (!found.has(abs)) {
found.set(abs, {
url: abs,
title: link.getAttribute('title'),
type: FEED_TYPES[type] ?? 'unknown',
siteUrl,
});
}
}
return Array.from(found.values());
}
/**
* Probe a handful of common feed paths on a domain. Cheap fallback when
* discoverFeedsFromSite returns nothing.
*/
export async function probeCommonPaths(siteUrl: string): Promise<DiscoveredFeed[]> {
const base = (() => {
try {
return new URL(siteUrl).origin;
} catch {
return null;
}
})();
if (!base) return [];
const probes = await Promise.all(
COMMON_FEED_PATHS.map(async (path) => {
const url = base + path;
try {
const res = await fetch(url, {
method: 'GET',
headers: {
'User-Agent': DEFAULT_USER_AGENT,
Accept: 'application/rss+xml, application/atom+xml, application/xml;q=0.9, */*;q=0.1',
},
signal: AbortSignal.timeout(8_000),
redirect: 'follow',
});
if (!res.ok) return null;
const ct = res.headers.get('content-type') || '';
if (!/xml|rss|atom/i.test(ct)) return null;
return {
url,
title: null,
type: ct.includes('atom') ? 'atom' : 'rss',
siteUrl: base,
} as DiscoveredFeed;
} catch {
return null;
}
})
);
return probes.filter((p): p is DiscoveredFeed => p !== null);
}
export async function discoverFeeds(siteUrl: string): Promise<DiscoveredFeed[]> {
const viaLinks = await discoverFeedsFromSite(siteUrl);
if (viaLinks.length > 0) return viaLinks;
return probeCommonPaths(siteUrl);
}
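
A usage sketch for the discovery flow above; the site URL and the
commented result are illustrative, and discoverFeeds only falls through to
the path probe when the page declares no feed links:

import { discoverFeeds } from '@mana/shared-rss';

const feeds = await discoverFeeds('https://blog.example.com');
// e.g. [{ url: 'https://blog.example.com/feed.xml', title: 'Example Blog',
//         type: 'rss', siteUrl: 'https://blog.example.com' }]
for (const feed of feeds) {
  console.log(feed.type, feed.url);
}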


@@ -0,0 +1,47 @@
import { Readability } from '@mozilla/readability';
import { JSDOM, VirtualConsole } from 'jsdom';
import { DEFAULT_USER_AGENT, type ExtractedArticle } from './types';
// JSDOM's default virtualConsole rethrows CSS parse errors as
// uncaughtException, which kills long-running services. A bare
// VirtualConsole with no listeners swallows everything.
const silentConsole = new VirtualConsole();
export async function extractFromHtml(html: string, url: string): Promise<ExtractedArticle | null> {
try {
const dom = new JSDOM(html, { url, virtualConsole: silentConsole });
const article = new Readability(dom.window.document).parse();
if (!article || !article.textContent) return null;
const wordCount = article.textContent.split(/\s+/).filter(Boolean).length;
const readingTimeMinutes = Math.max(1, Math.ceil(wordCount / 200));
return {
title: article.title ?? null,
content: article.textContent,
htmlContent: article.content ?? '',
excerpt: article.excerpt || article.textContent.slice(0, 240),
byline: article.byline ?? null,
siteName: article.siteName ?? null,
wordCount,
readingTimeMinutes,
};
} catch {
return null;
}
}
export async function extractFromUrl(url: string): Promise<ExtractedArticle | null> {
let html: string;
try {
const response = await fetch(url, {
headers: { 'User-Agent': DEFAULT_USER_AGENT },
signal: AbortSignal.timeout(15_000),
});
if (!response.ok) return null;
html = await response.text();
} catch {
return null;
}
return extractFromHtml(html, url);
}
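
Both extractors return null rather than throwing, so callers branch on the
result instead of wrapping the call in try/catch. A small sketch (the URL
is illustrative):

import { extractFromUrl } from '@mana/shared-rss';

const article = await extractFromUrl('https://example.com/post');
if (!article) {
  // Covers fetch failures, non-2xx responses, and Readability finding no body.
  console.warn('extraction failed');
} else {
  console.log(article.title, article.wordCount, article.readingTimeMinutes);
}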


@@ -0,0 +1,6 @@
export type { NormalizedFeedItem, ExtractedArticle, DiscoveredFeed, FeedValidation } from './types';
export { DEFAULT_USER_AGENT } from './types';
export { parseFeedUrl, parseFeedXml, parseFeedMeta, type ParsedFeed } from './parse';
export { extractFromUrl, extractFromHtml } from './extract';
export { discoverFeeds, discoverFeedsFromSite, probeCommonPaths } from './discover';
export { validateFeed } from './validate';


@@ -0,0 +1,65 @@
import Parser from 'rss-parser';
import { DEFAULT_USER_AGENT, type NormalizedFeedItem } from './types';
type CustomItem = {
'media:content'?: { $: { url: string } };
'media:thumbnail'?: { $: { url: string } };
enclosure?: { url?: string };
};
const parser: Parser<unknown, CustomItem> = new Parser({
timeout: 15_000,
headers: { 'User-Agent': DEFAULT_USER_AGENT },
customFields: {
item: ['media:content', 'media:thumbnail', 'enclosure'],
},
});
function mapItem(item: unknown): NormalizedFeedItem {
const i = item as CustomItem & {
link?: string;
title?: string;
content?: string;
contentSnippet?: string;
creator?: string;
author?: string;
isoDate?: string;
};
const imageUrl =
i['media:content']?.$?.url ?? i['media:thumbnail']?.$?.url ?? i.enclosure?.url ?? null;
return {
url: i.link ?? '',
title: i.title ?? '',
excerpt: i.contentSnippet ?? null,
content: i.contentSnippet ?? null,
htmlContent: i.content ?? null,
author: i.creator ?? i.author ?? null,
imageUrl,
publishedAt: i.isoDate ? new Date(i.isoDate) : null,
};
}
export async function parseFeedUrl(url: string): Promise<NormalizedFeedItem[]> {
const feed = await parser.parseURL(url);
return (feed.items ?? []).map(mapItem);
}
export async function parseFeedXml(xml: string): Promise<NormalizedFeedItem[]> {
const feed = await parser.parseString(xml);
return (feed.items ?? []).map(mapItem);
}
export interface ParsedFeed {
title: string | null;
items: NormalizedFeedItem[];
}
export async function parseFeedMeta(url: string): Promise<ParsedFeed> {
const feed = await parser.parseURL(url);
return {
title: feed.title ?? null,
items: (feed.items ?? []).map(mapItem),
};
}
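
Unlike the extractors, the parse functions let rss-parser's fetch and
parse errors propagate, which is why validateFeed wraps them. A quick
sketch of parseFeedMeta (the feed URL is illustrative):

import { parseFeedMeta } from '@mana/shared-rss';

const { title, items } = await parseFeedMeta('https://example.com/atom.xml');
console.log(title ?? 'untitled feed', items.length);
console.log(items[0]?.publishedAt?.toISOString() ?? 'no publish date');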


@@ -0,0 +1,38 @@
export interface NormalizedFeedItem {
url: string;
title: string;
excerpt: string | null;
content: string | null;
htmlContent: string | null;
author: string | null;
imageUrl: string | null;
publishedAt: Date | null;
}
export interface ExtractedArticle {
title: string | null;
content: string;
htmlContent: string;
excerpt: string;
byline: string | null;
siteName: string | null;
wordCount: number;
readingTimeMinutes: number;
}
export interface DiscoveredFeed {
url: string;
title: string | null;
type: 'rss' | 'atom' | 'unknown';
siteUrl: string | null;
}
export interface FeedValidation {
ok: boolean;
itemCount: number;
title: string | null;
sample: NormalizedFeedItem[];
error?: string;
}
export const DEFAULT_USER_AGENT = 'Mozilla/5.0 (compatible; ManaRSS/1.0; +https://mana.how)';


@@ -0,0 +1,24 @@
import { parseFeedMeta } from './parse';
import type { FeedValidation } from './types';
const SAMPLE_LIMIT = 5;
export async function validateFeed(url: string): Promise<FeedValidation> {
try {
const parsed = await parseFeedMeta(url);
return {
ok: parsed.items.length > 0,
itemCount: parsed.items.length,
title: parsed.title,
sample: parsed.items.slice(0, SAMPLE_LIMIT),
};
} catch (err) {
return {
ok: false,
itemCount: 0,
title: null,
sample: [],
error: err instanceof Error ? err.message : String(err),
};
}
}
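
A sketch of how News Research might use this when a user adds a feed (the
handling around it is an assumption; the FeedValidation shape comes from
the code above):

import { validateFeed } from '@mana/shared-rss';

const result = await validateFeed('https://example.com/rss.xml');
if (!result.ok) {
  console.error('not a usable feed:', result.error ?? 'feed has no items');
} else {
  // Show the feed title plus up to five sample items before saving.
  console.log(result.title, result.itemCount);
  for (const item of result.sample) console.log('-', item.title);
}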


@@ -0,0 +1,18 @@
{
"compilerOptions": {
"target": "ES2022",
"module": "ESNext",
"moduleResolution": "bundler",
"lib": ["ES2022", "DOM"],
"types": ["node"],
"strict": true,
"esModuleInterop": true,
"skipLibCheck": true,
"forceConsistentCasingInFileNames": true,
"isolatedModules": true,
"verbatimModuleSyntax": true,
"noEmit": true
},
"include": ["src/**/*"],
"exclude": ["node_modules"]
}

pnpm-lock.yaml (generated)

@@ -72,15 +72,15 @@ importers:
'@mana/shared-hono':
specifier: workspace:*
version: link:../../packages/shared-hono
'@mana/shared-rss':
specifier: workspace:*
version: link:../../packages/shared-rss
'@mana/shared-storage':
specifier: workspace:*
version: link:../../packages/shared-storage
'@mana/shared-types':
specifier: workspace:^
version: link:../../packages/shared-types
'@mozilla/readability':
specifier: ^0.5.0
version: 0.5.0
ai:
specifier: ^6.0.154
version: 6.0.154(zod@3.25.76)
@@ -90,9 +90,6 @@ importers:
hono:
specifier: ^4.7.0
version: 4.12.12
jsdom:
specifier: ^25.0.0
version: 25.0.1
postgres:
specifier: ^3.4.0
version: 3.4.9
@@ -106,9 +103,6 @@ importers:
'@types/bun':
specifier: latest
version: 1.3.12
'@types/jsdom':
specifier: ^21.1.0
version: 21.1.7
drizzle-kit:
specifier: ^0.30.0
version: 0.30.6
@@ -2996,6 +2990,28 @@ importers:
specifier: ^7.0.0
version: 7.4.0(@types/babel__core@7.20.5)
packages/shared-rss:
dependencies:
'@mozilla/readability':
specifier: ^0.5.0
version: 0.5.0
jsdom:
specifier: ^25.0.1
version: 25.0.1
rss-parser:
specifier: ^3.13.0
version: 3.13.0
devDependencies:
'@types/jsdom':
specifier: ^21.1.7
version: 21.1.7
'@types/node':
specifier: ^24.10.1
version: 24.12.2
typescript:
specifier: ^5.9.3
version: 5.9.3
packages/shared-storage:
dependencies:
'@aws-sdk/client-s3':
@@ -3665,28 +3681,19 @@ importers:
services/news-ingester:
dependencies:
'@mozilla/readability':
specifier: ^0.5.0
version: 0.5.0
'@mana/shared-rss':
specifier: workspace:*
version: link:../../packages/shared-rss
drizzle-orm:
specifier: ^0.38.3
version: 0.38.4(@opentelemetry/api@1.9.1)(@types/pg@8.6.1)(@types/react@19.2.14)(bun-types@1.3.12)(kysely@0.28.15)(postgres@3.4.9)(react@19.2.0)
hono:
specifier: ^4.7.0
version: 4.12.12
jsdom:
specifier: ^25.0.1
version: 25.0.1
postgres:
specifier: ^3.4.5
version: 3.4.9
rss-parser:
specifier: ^3.13.0
version: 3.13.0
devDependencies:
'@types/jsdom':
specifier: ^21.1.7
version: 21.1.7
drizzle-kit:
specifier: ^0.30.4
version: 0.30.6


@@ -11,15 +11,12 @@
"db:studio": "drizzle-kit studio"
},
"dependencies": {
"@mozilla/readability": "^0.5.0",
"@mana/shared-rss": "workspace:*",
"drizzle-orm": "^0.38.3",
"hono": "^4.7.0",
"jsdom": "^25.0.1",
"postgres": "^3.4.5",
"rss-parser": "^3.13.0"
"postgres": "^3.4.5"
},
"devDependencies": {
"@types/jsdom": "^21.1.7",
"drizzle-kit": "^0.30.4",
"typescript": "^5.9.3"
}


@@ -19,9 +19,8 @@ import { sql } from 'drizzle-orm';
import type { Database } from './db/connection';
import { curatedArticles, type NewCuratedArticle } from './db/schema';
import { SOURCES, type NewsSource } from './sources';
import { fetchFeed, type NormalizedFeedItem } from './parsers/rss';
import { parseFeedUrl, extractFromUrl, type NormalizedFeedItem } from '@mana/shared-rss';
import { fetchHackerNews } from './parsers/hn';
import { fetchAndExtract } from './parsers/readability';
const RETENTION_DAYS = 30;
@@ -57,7 +56,7 @@ function readingMinutes(words: number): number {
async function fetchSourceItems(source: NewsSource): Promise<NormalizedFeedItem[]> {
if (source.type === 'hn') return fetchHackerNews(source.url);
return fetchFeed(source.url);
return parseFeedUrl(source.url);
}
/**
@@ -78,7 +77,7 @@ async function buildRow(
const initialWords = wordCountOf(content);
if (FULL_TEXT_THRESHOLD_WORDS > 0 && initialWords < FULL_TEXT_THRESHOLD_WORDS) {
const extracted = await fetchAndExtract(item.url);
const extracted = await extractFromUrl(item.url);
if (extracted) {
content = extracted.content;
htmlContent = extracted.htmlContent || htmlContent;
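
The full-text fallback in buildRow, reduced to a self-contained sketch;
the helper name and threshold value are illustrative, while the
extractFromUrl call and the content/htmlContent replacement mirror the
hunk above:

import { extractFromUrl, type NormalizedFeedItem } from '@mana/shared-rss';

const FULL_TEXT_THRESHOLD_WORDS = 150; // illustrative value

async function resolveContent(item: NormalizedFeedItem) {
  let content = item.content ?? '';
  let htmlContent = item.htmlContent ?? '';
  const words = content.split(/\s+/).filter(Boolean).length;
  if (FULL_TEXT_THRESHOLD_WORDS > 0 && words < FULL_TEXT_THRESHOLD_WORDS) {
    // Excerpt-only item: fetch the page and run Readability for the full body.
    const extracted = await extractFromUrl(item.url);
    if (extracted) {
      content = extracted.content;
      htmlContent = extracted.htmlContent || htmlContent;
    }
  }
  return { content, htmlContent };
}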


@@ -8,7 +8,7 @@
* struggles with and which isn't the user's expectation for a news feed.
*/
import type { NormalizedFeedItem } from './rss';
import type { NormalizedFeedItem } from '@mana/shared-rss';
interface HnItem {
id: number;


@@ -1,69 +0,0 @@
/**
* Mozilla Readability fallback. Used when an RSS item only ships an
* excerpt, so we fetch the original page and extract the article body.
*
* Kept dependency-local to the ingester so this service is the canonical
* "content acquisition" boundary apps/api never has to call out to a
* crawler.
*/
import { Readability } from '@mozilla/readability';
import { JSDOM, VirtualConsole } from 'jsdom';
// JSDOM emits CSS parse errors and resource-loading warnings via its
// virtualConsole. The default console rethrows some of those as
// uncaughtException, which in a long-running ingester loop kills the
// whole process. A bare VirtualConsole with no listeners swallows
// everything quietly — exactly what we want for a "best-effort article
// extractor" that doesn't care about CSS or sub-resources.
const silentConsole = new VirtualConsole();
export interface ExtractedArticle {
title: string | null;
content: string;
htmlContent: string;
excerpt: string;
byline: string | null;
siteName: string | null;
wordCount: number;
readingTimeMinutes: number;
}
const USER_AGENT = 'Mozilla/5.0 (compatible; ManaNewsIngester/1.0; +https://mana.how/news)';
export async function fetchAndExtract(url: string): Promise<ExtractedArticle | null> {
let html: string;
try {
const response = await fetch(url, {
headers: { 'User-Agent': USER_AGENT },
signal: AbortSignal.timeout(15_000),
});
if (!response.ok) return null;
html = await response.text();
} catch {
return null;
}
try {
const dom = new JSDOM(html, { url, virtualConsole: silentConsole });
const reader = new Readability(dom.window.document);
const article = reader.parse();
if (!article || !article.textContent) return null;
const wordCount = article.textContent.split(/\s+/).filter(Boolean).length;
const readingTimeMinutes = Math.max(1, Math.ceil(wordCount / 200));
return {
title: article.title ?? null,
content: article.textContent,
htmlContent: article.content ?? '',
excerpt: article.excerpt || article.textContent.slice(0, 240),
byline: article.byline ?? null,
siteName: article.siteName ?? null,
wordCount,
readingTimeMinutes,
};
} catch {
return null;
}
}


@@ -1,72 +0,0 @@
/**
* RSS/Atom parser wrapper. Returns a normalized shape so the ingest loop
* doesn't have to know about feed-format quirks.
*
* `rss-parser` handles both RSS 2.0 and Atom transparently. We pull out
* the bits we need (link, title, content/snippet, image, date) and let
* the ingester decide whether to call Readability for full-text.
*/
import Parser from 'rss-parser';
export interface NormalizedFeedItem {
url: string;
title: string;
excerpt: string | null;
content: string | null;
htmlContent: string | null;
author: string | null;
imageUrl: string | null;
publishedAt: Date | null;
}
type CustomItem = {
'media:content'?: { $: { url: string } };
'media:thumbnail'?: { $: { url: string } };
enclosure?: { url?: string };
};
const parser: Parser<unknown, CustomItem> = new Parser({
timeout: 15_000,
headers: {
'User-Agent': 'Mozilla/5.0 (compatible; ManaNewsIngester/1.0; +https://mana.how/news)',
},
customFields: {
item: ['media:content', 'media:thumbnail', 'enclosure'],
},
});
export async function fetchFeed(url: string): Promise<NormalizedFeedItem[]> {
const feed = await parser.parseURL(url);
return (feed.items ?? []).map((item) => {
// rss-parser stuffs full HTML in `content` for Atom, plain in `contentSnippet`.
const html = (item as { content?: string }).content ?? null;
const text = (item as { contentSnippet?: string }).contentSnippet ?? null;
// Image: try a few common locations.
const mediaContent = item['media:content']?.$?.url;
const mediaThumb = item['media:thumbnail']?.$?.url;
const enclosureUrl = item.enclosure?.url;
const imageUrl = mediaContent ?? mediaThumb ?? enclosureUrl ?? null;
const link = (item as { link?: string }).link ?? '';
const title = (item as { title?: string }).title ?? '';
const author =
(item as { creator?: string; author?: string }).creator ??
(item as { author?: string }).author ??
null;
const isoDate = (item as { isoDate?: string }).isoDate ?? null;
return {
url: link,
title,
excerpt: text,
content: text,
htmlContent: html,
author,
imageUrl,
publishedAt: isoDate ? new Date(isoDate) : null,
};
});
}