refactor(shared-rss): extract RSS parsing + Readability into one package

news-ingester and apps/api both shipped their own copy of rss-parser
+ jsdom + Readability glue. Single source now in packages/shared-rss.
Adds discoverFeeds (rel=alternate + common-paths probe) and validateFeed
which News Research will use. JSDOM virtualConsole is silenced once,
in the package, instead of in two parallel call sites.

- packages/shared-rss: parse, extract, discover, validate
- services/news-ingester: drop local parsers, depend on @mana/shared-rss
- apps/api: drop @mozilla/readability + jsdom direct deps, use shared

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Till JS 2026-04-15 22:30:44 +02:00
parent 5ae7f99fe1
commit b768a0ffce
16 changed files with 414 additions and 252 deletions

View file

@ -11,15 +11,12 @@
"db:studio": "drizzle-kit studio"
},
"dependencies": {
"@mozilla/readability": "^0.5.0",
"@mana/shared-rss": "workspace:*",
"drizzle-orm": "^0.38.3",
"hono": "^4.7.0",
"jsdom": "^25.0.1",
"postgres": "^3.4.5",
"rss-parser": "^3.13.0"
"postgres": "^3.4.5"
},
"devDependencies": {
"@types/jsdom": "^21.1.7",
"drizzle-kit": "^0.30.4",
"typescript": "^5.9.3"
}

View file

@ -19,9 +19,8 @@ import { sql } from 'drizzle-orm';
import type { Database } from './db/connection';
import { curatedArticles, type NewCuratedArticle } from './db/schema';
import { SOURCES, type NewsSource } from './sources';
import { fetchFeed, type NormalizedFeedItem } from './parsers/rss';
import { parseFeedUrl, extractFromUrl, type NormalizedFeedItem } from '@mana/shared-rss';
import { fetchHackerNews } from './parsers/hn';
import { fetchAndExtract } from './parsers/readability';
const RETENTION_DAYS = 30;
@ -57,7 +56,7 @@ function readingMinutes(words: number): number {
async function fetchSourceItems(source: NewsSource): Promise<NormalizedFeedItem[]> {
if (source.type === 'hn') return fetchHackerNews(source.url);
return fetchFeed(source.url);
return parseFeedUrl(source.url);
}
/**
@ -78,7 +77,7 @@ async function buildRow(
const initialWords = wordCountOf(content);
if (FULL_TEXT_THRESHOLD_WORDS > 0 && initialWords < FULL_TEXT_THRESHOLD_WORDS) {
const extracted = await fetchAndExtract(item.url);
const extracted = await extractFromUrl(item.url);
if (extracted) {
content = extracted.content;
htmlContent = extracted.htmlContent || htmlContent;

View file

@ -8,7 +8,7 @@
* struggles with and which isn't the user's expectation for a news feed.
*/
import type { NormalizedFeedItem } from './rss';
import type { NormalizedFeedItem } from '@mana/shared-rss';
interface HnItem {
id: number;

View file

@ -1,69 +0,0 @@
/**
* Mozilla Readability fallback. Used when an RSS item only ships an
* excerpt, so we fetch the original page and extract the article body.
*
* Kept dependency-local to the ingester so that this service is the canonical
* "content acquisition" boundary: apps/api never has to call out to a
* crawler.
*/
import { Readability } from '@mozilla/readability';
import { JSDOM, VirtualConsole } from 'jsdom';
// JSDOM emits CSS parse errors and resource-loading warnings via its
// virtualConsole. The default console rethrows some of those as
// uncaughtException, which in a long-running ingester loop kills the
// whole process. A bare VirtualConsole with no listeners swallows
// everything quietly — exactly what we want for a "best-effort article
// extractor" that doesn't care about CSS or sub-resources.
//
// Created once at module scope and handed to the JSDOM instance built in
// fetchAndExtract via the `virtualConsole` option.
const silentConsole = new VirtualConsole();
/**
 * Result of a successful Readability extraction, as produced by
 * `fetchAndExtract`.
 */
export interface ExtractedArticle {
/** Title reported by Readability, or `null` when it reports none. */
title: string | null;
/** Plain-text article body (Readability's `textContent`). */
content: string;
/** HTML article body (Readability's `content`); empty string when absent. */
htmlContent: string;
/** Readability's excerpt, falling back to the first 240 characters of `content`. */
excerpt: string;
/** Byline reported by Readability, or `null` when it reports none. */
byline: string | null;
/** Site name reported by Readability, or `null` when it reports none. */
siteName: string | null;
/** Number of whitespace-separated tokens in `content`. */
wordCount: number;
/** `wordCount / 200`, rounded up, with a minimum of 1. */
readingTimeMinutes: number;
}
/** Value of the `User-Agent` request header sent by `fetchAndExtract` when downloading a page. */
const USER_AGENT = 'Mozilla/5.0 (compatible; ManaNewsIngester/1.0; +https://mana.how/news)';
/**
 * Downloads the page at `url` and runs Mozilla Readability over its HTML.
 *
 * Strictly best-effort: every failure mode collapses to `null` rather than
 * throwing, so a single bad page cannot interrupt the caller.
 *
 * @param url - Address of the article page. It is also given to JSDOM as the
 *   document URL.
 * @returns The extracted article, or `null` when the request throws or is
 *   aborted by the 15-second timeout signal, when the response status is not
 *   OK, when Readability produces no text, or when DOM construction or
 *   parsing throws.
 */
export async function fetchAndExtract(url: string): Promise<ExtractedArticle | null> {
  // Stage 1: download the raw page source.
  let pageSource: string;
  try {
    const res = await fetch(url, {
      headers: { 'User-Agent': USER_AGENT },
      signal: AbortSignal.timeout(15_000),
    });
    if (!res.ok) {
      return null;
    }
    pageSource = await res.text();
  } catch {
    return null;
  }

  // Stage 2: build a DOM and let Readability pick out the article body.
  try {
    const { document: doc } = new JSDOM(pageSource, {
      url,
      virtualConsole: silentConsole,
    }).window;
    const parsed = new Readability(doc).parse();
    if (!parsed?.textContent) {
      return null;
    }

    const text = parsed.textContent;
    const wordCount = text.split(/\s+/).filter(Boolean).length;

    return {
      title: parsed.title ?? null,
      content: text,
      htmlContent: parsed.content ?? '',
      excerpt: parsed.excerpt || text.slice(0, 240),
      byline: parsed.byline ?? null,
      siteName: parsed.siteName ?? null,
      wordCount,
      // 200 words per minute, never less than one minute.
      readingTimeMinutes: Math.max(1, Math.ceil(wordCount / 200)),
    };
  } catch {
    return null;
  }
}

View file

@ -1,72 +0,0 @@
/**
* RSS/Atom parser wrapper. Returns a normalized shape so the ingest loop
* doesn't have to know about feed-format quirks.
*
* `rss-parser` handles both RSS 2.0 and Atom transparently. We pull out
* the bits we need (link, title, content/snippet, image, date) and let
* the ingester decide whether to call Readability for full-text.
*/
import Parser from 'rss-parser';
/**
 * Feed-format-independent shape of a single feed entry.
 *
 * The per-field notes describe how `fetchFeed` fills each field.
 */
export interface NormalizedFeedItem {
/** The entry's `link`; empty string when the entry has none. */
url: string;
/** The entry's `title`; empty string when the entry has none. */
title: string;
/** rss-parser's `contentSnippet`, or `null` when absent. */
excerpt: string | null;
/** Same value as `excerpt` (`contentSnippet`), or `null` when absent. */
content: string | null;
/** rss-parser's `content` field, or `null` when absent. */
htmlContent: string | null;
/** The entry's `creator`, else its `author`, else `null`. */
author: string | null;
/** First URL found among `media:content`, `media:thumbnail` and `enclosure`, else `null`. */
imageUrl: string | null;
/** `Date` built from the entry's `isoDate`, or `null` when it is missing or empty. */
publishedAt: Date | null;
}
/**
 * Extra per-item fields requested from rss-parser through `customFields`.
 * `fetchFeed` reads these to find an image URL for the entry.
 */
type CustomItem = {
// URL is read from the `$` object as `$.url`.
'media:content'?: { $: { url: string } };
// URL is read from the `$` object as `$.url`.
'media:thumbnail'?: { $: { url: string } };
// URL is read directly from `url`.
enclosure?: { url?: string };
};
/**
 * Module-level rss-parser instance used by `fetchFeed`.
 */
const parser: Parser<unknown, CustomItem> = new Parser({
// NOTE(review): presumably milliseconds (15 s) — confirm against rss-parser docs.
timeout: 15_000,
headers: {
'User-Agent': 'Mozilla/5.0 (compatible; ManaNewsIngester/1.0; +https://mana.how/news)',
},
customFields: {
// The item-level fields declared in CustomItem; fetchFeed reads them for the image URL.
item: ['media:content', 'media:thumbnail', 'enclosure'],
},
});
/**
 * Loads the feed at `url` through the shared `parser` and converts every
 * entry into a `NormalizedFeedItem`.
 *
 * Errors from `parser.parseURL` are not caught here and propagate to the
 * caller.
 *
 * @param url - Address of the feed to fetch.
 * @returns One normalized item per feed entry, in feed order; an empty array
 *   when the parsed feed has no `items`.
 */
export async function fetchFeed(url: string): Promise<NormalizedFeedItem[]> {
  const { items } = await parser.parseURL(url);
  const normalized: NormalizedFeedItem[] = [];

  for (const entry of items ?? []) {
    // View of the standard fields this function reads from each entry.
    const fields = entry as {
      content?: string;
      contentSnippet?: string;
      link?: string;
      title?: string;
      creator?: string;
      author?: string;
      isoDate?: string;
    };

    // `contentSnippet` is the plain-text form; it feeds both excerpt and content.
    const snippet = fields.contentSnippet ?? null;

    // Image lookup order: media:content, then media:thumbnail, then enclosure.
    const picture =
      entry['media:content']?.$?.url ??
      entry['media:thumbnail']?.$?.url ??
      entry.enclosure?.url ??
      null;

    normalized.push({
      url: fields.link ?? '',
      title: fields.title ?? '',
      excerpt: snippet,
      content: snippet,
      // `content` carries the HTML form of the entry body.
      htmlContent: fields.content ?? null,
      author: fields.creator ?? fields.author ?? null,
      imageUrl: picture,
      publishedAt: fields.isoDate ? new Date(fields.isoDate) : null,
    });
  }

  return normalized;
}