refactor(shared-rss): extract RSS parsing + Readability into one package

news-ingester and apps/api both shipped their own copy of rss-parser
+ jsdom + Readability glue. Single source now in packages/shared-rss.
Adds discoverFeeds (rel=alternate + common-paths probe) and validateFeed,
which News Research will use. JSDOM virtualConsole is silenced once,
in the package, instead of in two parallel call sites.

- packages/shared-rss: parse, extract, discover, validate
- services/news-ingester: drop local parsers, depend on @mana/shared-rss
- apps/api: drop @mozilla/readability + jsdom direct deps, use shared

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Till JS 2026-04-15 22:30:44 +02:00
parent 5ae7f99fe1
commit b768a0ffce
16 changed files with 414 additions and 252 deletions

View file

@ -0,0 +1,128 @@
import { JSDOM, VirtualConsole } from 'jsdom';
import { DEFAULT_USER_AGENT, type DiscoveredFeed } from './types';
// VirtualConsole with no listeners attached. It is passed to the JSDOM
// instance created in this module so that jsdom's console output and error
// reports are not forwarded to the process console.
const silentConsole = new VirtualConsole();
// Maps the lower-cased `type` attribute of a <link rel="alternate"> tag to the
// feed kind reported in DiscoveredFeed. discoverFeedsFromSite skips links
// whose type is not a key of this table.
// NOTE(review): `application/json` is a generic mime type, so alternates that
// point at non-feed JSON documents are also reported — confirm this is intended.
const FEED_TYPES: Record<string, DiscoveredFeed['type']> = {
'application/rss+xml': 'rss',
'application/atom+xml': 'atom',
'application/feed+json': 'unknown',
'application/json': 'unknown',
};
// Paths that probeCommonPaths appends to the site origin, one request each.
const COMMON_FEED_PATHS = [
'/feed',
'/feed/',
'/rss',
'/rss.xml',
'/atom.xml',
'/index.xml',
'/feed.xml',
];
/**
 * Resolve `href` against `base` and serialise the result.
 *
 * @param href - Absolute or relative reference.
 * @param base - URL the reference is resolved against.
 * @returns The absolute URL string, or null when the pair cannot be parsed.
 */
function absolutize(href: string, base: string): string | null {
  let resolved: URL;
  try {
    resolved = new URL(href, base);
  } catch {
    // Unparsable reference or base: signal "no usable URL" to the caller.
    return null;
  }
  return resolved.toString();
}
/**
 * Discover RSS/Atom feeds linked from a site URL.
 *
 * Strategy: fetch the HTML and collect <link rel="alternate"> tags whose
 * `type` is one of the FEED_TYPES mime types. That covers ~90% of
 * well-behaved sites. For the rest, the caller can fall back to
 * `probeCommonPaths`.
 *
 * @param siteUrl - Page to fetch and scan.
 * @returns One entry per distinct absolute feed URL, in document order.
 *   An empty array is returned when the request fails, times out (15 s),
 *   answers with a non-2xx status, or the document cannot be parsed.
 */
export async function discoverFeedsFromSite(siteUrl: string): Promise<DiscoveredFeed[]> {
  let html: string;
  // URL the document was actually served from; relative hrefs must be
  // resolved against it, not against the pre-redirect request URL.
  let documentUrl = siteUrl;
  try {
    const response = await fetch(siteUrl, {
      headers: { 'User-Agent': DEFAULT_USER_AGENT },
      signal: AbortSignal.timeout(15_000),
      redirect: 'follow',
    });
    if (!response.ok) {
      // Release the connection instead of leaving the body unread.
      void response.body?.cancel().catch(() => undefined);
      return [];
    }
    if (response.url) documentUrl = response.url;
    html = await response.text();
  } catch {
    return [];
  }

  let dom: JSDOM;
  try {
    dom = new JSDOM(html, { url: documentUrl, virtualConsole: silentConsole });
  } catch {
    // Keep the "best effort, never throws" contract of the fetch path above.
    return [];
  }

  try {
    const document = dom.window.document;
    // baseURI also honours a <base href> element in the page.
    const base = document.baseURI || documentUrl;
    const found = new Map<string, DiscoveredFeed>();

    // `~=` matches rel="alternate" as well as multi-token values.
    for (const link of Array.from(document.querySelectorAll('link[rel~="alternate"]'))) {
      const href = link.getAttribute('href');
      if (!href) continue;

      // Drop mime parameters (`; charset=…`) and stray whitespace before lookup.
      const rawType = link.getAttribute('type') ?? '';
      const type = (rawType.split(';', 1)[0] ?? '').trim().toLowerCase();
      // Own-property check: `in` would also accept inherited keys such as
      // "constructor" and produce a non-string feed type.
      if (!Object.prototype.hasOwnProperty.call(FEED_TYPES, type)) continue;

      const abs = absolutize(href, base);
      if (!abs || found.has(abs)) continue;

      found.set(abs, {
        url: abs,
        title: link.getAttribute('title'),
        type: FEED_TYPES[type] ?? 'unknown',
        siteUrl,
      });
    }
    return Array.from(found.values());
  } finally {
    // Tear the window down so a long-running service does not retain it.
    dom.window.close();
  }
}
/**
 * Probe a handful of common feed paths on a domain. Cheap fallback when
 * discoverFeedsFromSite returns nothing.
 *
 * Every path in COMMON_FEED_PATHS is requested in parallel against the origin
 * of `siteUrl` (8 s timeout each). A probe counts as a feed when the response
 * is 2xx and its Content-Type mentions xml, rss or atom.
 *
 * @param siteUrl - Any URL on the site; only its origin is used.
 * @returns The matching probes in COMMON_FEED_PATHS order. Probes that end
 *   up at the same final URL after redirects (e.g. `/feed` and `/feed/`) are
 *   reported once. Returns an empty array for an unparsable URL or an
 *   opaque origin.
 */
export async function probeCommonPaths(siteUrl: string): Promise<DiscoveredFeed[]> {
  let origin: string;
  try {
    origin = new URL(siteUrl).origin;
  } catch {
    return [];
  }
  // Opaque origins (file:, data:, …) serialise to the literal string "null".
  if (!origin || origin === 'null') return [];

  type Probe = { feed: DiscoveredFeed; finalUrl: string };

  const probes = await Promise.all(
    COMMON_FEED_PATHS.map(async (path): Promise<Probe | null> => {
      const url = origin + path;
      try {
        const res = await fetch(url, {
          method: 'GET',
          headers: {
            'User-Agent': DEFAULT_USER_AGENT,
            Accept: 'application/rss+xml, application/atom+xml, application/xml;q=0.9, */*;q=0.1',
          },
          signal: AbortSignal.timeout(8_000),
          redirect: 'follow',
        });
        // Only status and headers are inspected; cancel the body so the
        // connection is released instead of waiting for garbage collection.
        void res.body?.cancel().catch(() => undefined);
        if (!res.ok) return null;

        // Header values are case-insensitive; normalise once for both checks.
        const contentType = (res.headers.get('content-type') ?? '').toLowerCase();
        if (!/xml|rss|atom/.test(contentType)) return null;

        return {
          feed: {
            url,
            title: null,
            type: contentType.includes('atom') ? 'atom' : 'rss',
            siteUrl: origin,
          },
          finalUrl: res.url || url,
        };
      } catch {
        return null;
      }
    })
  );

  // Several probe paths commonly redirect to one resource; keep the first.
  const byFinalUrl = new Map<string, DiscoveredFeed>();
  for (const probe of probes) {
    if (probe && !byFinalUrl.has(probe.finalUrl)) byFinalUrl.set(probe.finalUrl, probe.feed);
  }
  return Array.from(byFinalUrl.values());
}
/**
 * Find feeds for a site: first via <link rel="alternate"> discovery, and only
 * when that yields nothing, by probing the common feed paths.
 *
 * @param siteUrl - Site or page URL to inspect.
 * @returns The feeds found by whichever strategy produced results first.
 */
export async function discoverFeeds(siteUrl: string): Promise<DiscoveredFeed[]> {
  const linked = await discoverFeedsFromSite(siteUrl);
  return linked.length === 0 ? probeCommonPaths(siteUrl) : linked;
}

View file

@ -0,0 +1,47 @@
import { Readability } from '@mozilla/readability';
import { JSDOM, VirtualConsole } from 'jsdom';
import { DEFAULT_USER_AGENT, type ExtractedArticle } from './types';
// JSDOM's default virtualConsole rethrows CSS parse errors as
// uncaughtException, which kills long-running services. A bare
// VirtualConsole with no listeners swallows everything.
// This instance is passed to the JSDOM document created by extractFromHtml.
const silentConsole = new VirtualConsole();
/**
 * Run Readability over an HTML document and return the extracted article.
 *
 * @param html - Raw HTML of the page.
 * @param url - URL the HTML was served from; used as the document URL.
 * @returns The extracted article, or null when Readability finds no text
 *   content or parsing throws.
 */
export async function extractFromHtml(html: string, url: string): Promise<ExtractedArticle | null> {
  let dom: JSDOM | undefined;
  try {
    dom = new JSDOM(html, { url, virtualConsole: silentConsole });
    const article = new Readability(dom.window.document).parse();
    if (!article || !article.textContent) return null;

    // Whitespace-separated tokens; reading time assumes 200 words per minute
    // and never reports less than one minute.
    const wordCount = article.textContent.split(/\s+/).filter(Boolean).length;
    const readingTimeMinutes = Math.max(1, Math.ceil(wordCount / 200));

    return {
      title: article.title ?? null,
      content: article.textContent,
      htmlContent: article.content ?? '',
      // Fall back to the first 240 characters when no excerpt was extracted.
      excerpt: article.excerpt || article.textContent.slice(0, 240),
      byline: article.byline ?? null,
      siteName: article.siteName ?? null,
      wordCount,
      readingTimeMinutes,
    };
  } catch {
    return null;
  } finally {
    // All returned fields are plain strings/numbers, so the window can be
    // torn down here; otherwise each call leaves a live window behind.
    dom?.window.close();
  }
}
/**
 * Fetch a page and extract its article content.
 *
 * @param url - Page to download (15 s timeout).
 * @returns The extracted article, or null when the request fails, answers
 *   with a non-2xx status, or no article can be extracted.
 */
export async function extractFromUrl(url: string): Promise<ExtractedArticle | null> {
  let html: string;
  // The document must be parsed with the URL it was actually served from,
  // otherwise relative links are resolved against the pre-redirect address.
  let documentUrl = url;
  try {
    const response = await fetch(url, {
      headers: { 'User-Agent': DEFAULT_USER_AGENT },
      signal: AbortSignal.timeout(15_000),
    });
    if (!response.ok) {
      // Release the connection instead of leaving the body unread.
      void response.body?.cancel().catch(() => undefined);
      return null;
    }
    if (response.url) documentUrl = response.url;
    html = await response.text();
  } catch {
    return null;
  }
  return extractFromHtml(html, documentUrl);
}

View file

@ -0,0 +1,6 @@
export type { NormalizedFeedItem, ExtractedArticle, DiscoveredFeed, FeedValidation } from './types';
export { DEFAULT_USER_AGENT } from './types';
export { parseFeedUrl, parseFeedXml, parseFeedMeta, type ParsedFeed } from './parse';
export { extractFromUrl, extractFromHtml } from './extract';
export { discoverFeeds, discoverFeedsFromSite, probeCommonPaths } from './discover';
export { validateFeed } from './validate';

View file

@ -0,0 +1,65 @@
import Parser from 'rss-parser';
import { DEFAULT_USER_AGENT, type NormalizedFeedItem } from './types';
// Extra per-item fields requested from rss-parser via `customFields`.
// mapItem reads the `url` of each of them when looking for an item image.
type CustomItem = {
'media:content'?: { $: { url: string } };
'media:thumbnail'?: { $: { url: string } };
enclosure?: { url?: string };
};
// Shared rss-parser instance used by every parse* function in this module.
// It sends DEFAULT_USER_AGENT, uses a timeout of 15_000 (milliseconds,
// per rss-parser's option) and additionally captures the CustomItem fields.
const parser: Parser<unknown, CustomItem> = new Parser({
timeout: 15_000,
headers: { 'User-Agent': DEFAULT_USER_AGENT },
customFields: {
item: ['media:content', 'media:thumbnail', 'enclosure'],
},
});
/**
 * Convert one raw rss-parser item into a NormalizedFeedItem.
 *
 * Image selection order: `media:content`, then `media:thumbnail`, then
 * `enclosure`. `media:content` and `enclosure` are skipped when they declare
 * a non-image medium or mime type (e.g. podcast audio), so a media file is
 * never reported as the item image. Entries without any type information are
 * still accepted.
 *
 * @param item - Raw item as produced by rss-parser; null/undefined is
 *   treated as an empty item.
 * @returns The normalised item. Missing link/title become empty strings,
 *   other missing fields become null, and an unparsable `isoDate` yields
 *   `publishedAt: null` rather than an Invalid Date.
 */
function mapItem(item: unknown): NormalizedFeedItem {
  const raw = (item ?? {}) as CustomItem & {
    'media:content'?: { $?: { url?: string; type?: string; medium?: string } };
    enclosure?: { url?: string; type?: string };
    link?: string;
    title?: string;
    content?: string;
    contentSnippet?: string;
    creator?: string;
    author?: string;
    isoDate?: string;
  };

  // An explicit medium wins; otherwise fall back to the mime type, and treat
  // entries with neither as images.
  const looksLikeImage = (mime?: string, medium?: string): boolean => {
    if (medium) return medium.toLowerCase() === 'image';
    return !mime || mime.toLowerCase().startsWith('image/');
  };

  const media = raw['media:content']?.$;
  const enclosure = raw.enclosure;
  const imageUrl =
    (media && looksLikeImage(media.type, media.medium) ? media.url : undefined) ??
    raw['media:thumbnail']?.$?.url ??
    (enclosure && looksLikeImage(enclosure.type) ? enclosure.url : undefined) ??
    null;

  const parsedDate = raw.isoDate ? new Date(raw.isoDate) : null;
  const publishedAt = parsedDate && !Number.isNaN(parsedDate.getTime()) ? parsedDate : null;

  return {
    url: raw.link ?? '',
    title: raw.title ?? '',
    excerpt: raw.contentSnippet ?? null,
    content: raw.contentSnippet ?? null,
    htmlContent: raw.content ?? null,
    author: raw.creator ?? raw.author ?? null,
    imageUrl,
    publishedAt,
  };
}
/**
 * Download the feed at `url` and return its items in normalised form.
 *
 * @param url - Feed URL handed to rss-parser.
 * @returns The normalised items; empty when the feed has none.
 *   Errors raised by rss-parser propagate to the caller.
 */
export async function parseFeedUrl(url: string): Promise<NormalizedFeedItem[]> {
  const { items } = await parser.parseURL(url);
  const entries = items ?? [];
  return entries.map((entry) => mapItem(entry));
}
/**
 * Parse an already-downloaded feed document and return its items in
 * normalised form.
 *
 * @param xml - Feed document text handed to rss-parser.
 * @returns The normalised items; empty when the feed has none.
 *   Errors raised by rss-parser propagate to the caller.
 */
export async function parseFeedXml(xml: string): Promise<NormalizedFeedItem[]> {
  const parsed = await parser.parseString(xml);
  const entries = parsed.items ?? [];
  return entries.map((entry) => mapItem(entry));
}
/** Result of parseFeedMeta: the feed's title together with its items. */
export interface ParsedFeed {
/** Feed title; null when the parsed feed has none. */
title: string | null;
/** Feed items in normalised form. */
items: NormalizedFeedItem[];
}
/**
 * Download the feed at `url` and return its title along with the normalised
 * items.
 *
 * @param url - Feed URL handed to rss-parser.
 * @returns The feed title (null when absent) and its items.
 *   Errors raised by rss-parser propagate to the caller.
 */
export async function parseFeedMeta(url: string): Promise<ParsedFeed> {
  const { title, items } = await parser.parseURL(url);
  const normalized = (items ?? []).map((entry) => mapItem(entry));
  return { title: title ?? null, items: normalized };
}

View file

@ -0,0 +1,38 @@
/** A single feed entry in the shape produced by the package's feed parser. */
export interface NormalizedFeedItem {
/** Item link; the parser uses an empty string when the item has none. */
url: string;
/** Item title; the parser uses an empty string when the item has none. */
title: string;
/** Plain-text snippet of the item, or null. */
excerpt: string | null;
/** Plain-text content; the parser fills it from the same snippet as `excerpt`. */
content: string | null;
/** Item content as delivered by the feed (typically HTML), or null. */
htmlContent: string | null;
/** Creator or author of the item, or null. */
author: string | null;
/** URL taken from media:content, media:thumbnail or enclosure, or null. */
imageUrl: string | null;
/** Publication date built from the item's ISO date, or null when absent. */
publishedAt: Date | null;
}
/** Article content extracted from an HTML page with Readability. */
export interface ExtractedArticle {
/** Article title, or null when none was extracted. */
title: string | null;
/** Plain-text content of the article. */
content: string;
/** Extracted article markup; empty string when none was produced. */
htmlContent: string;
/** Extracted excerpt, or the first 240 characters of `content` as a fallback. */
excerpt: string;
/** Author line, or null when none was extracted. */
byline: string | null;
/** Site name, or null when none was extracted. */
siteName: string | null;
/** Number of whitespace-separated words in `content`. */
wordCount: number;
/** Estimated reading time at 200 words per minute, at least 1. */
readingTimeMinutes: number;
}
/** A feed candidate found by the discovery functions. */
export interface DiscoveredFeed {
/** Absolute URL of the feed. */
url: string;
/** Title from the <link> tag; null when absent or when found by path probing. */
title: string | null;
/** Feed kind derived from the advertised mime type or response content type. */
type: 'rss' | 'atom' | 'unknown';
/** Site the feed was found on: the scanned page URL, or the origin for path probes. */
siteUrl: string | null;
}
/** Outcome of validateFeed for a single feed URL. */
export interface FeedValidation {
/** True when the feed parsed and contains at least one item. */
ok: boolean;
/** Number of items in the parsed feed; 0 when parsing failed. */
itemCount: number;
/** Feed title, or null when absent or when parsing failed. */
title: string | null;
/** The first few items of the feed; empty when parsing failed. */
sample: NormalizedFeedItem[];
/** Error message, set only when parsing threw. */
error?: string;
}
/** User-Agent header value sent with the package's fetch calls and by its rss-parser instance. */
export const DEFAULT_USER_AGENT = 'Mozilla/5.0 (compatible; ManaRSS/1.0; +https://mana.how)';

View file

@ -0,0 +1,24 @@
import { parseFeedMeta } from './parse';
import type { FeedValidation } from './types';
// Maximum number of items validateFeed returns in `sample`.
const SAMPLE_LIMIT = 5;
/**
 * Check whether `url` points at a parsable feed that has content.
 *
 * Never throws: a parsing failure is reported through `ok: false` together
 * with the error message.
 *
 * @param url - Feed URL to download and parse.
 * @returns The validation summary, including up to SAMPLE_LIMIT leading items.
 */
export async function validateFeed(url: string): Promise<FeedValidation> {
  try {
    const { title, items } = await parseFeedMeta(url);
    const itemCount = items.length;
    // A feed that parses but is empty is reported as not ok, without an error.
    return { ok: itemCount > 0, itemCount, title, sample: items.slice(0, SAMPLE_LIMIT) };
  } catch (cause) {
    const error = cause instanceof Error ? cause.message : String(cause);
    return { ok: false, itemCount: 0, title: null, sample: [], error };
  }
}