mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-19 06:41:23 +02:00
refactor(shared-rss): extract RSS parsing + Readability into one package
news-ingester and apps/api each shipped their own copy of the rss-parser + jsdom + Readability glue. There is now a single source in packages/shared-rss. This also adds discoverFeeds (rel=alternate + common-paths probe) and validateFeed, which News Research will use. The JSDOM virtualConsole is silenced once, in the package, instead of in two parallel call sites.

- packages/shared-rss: parse, extract, discover, validate
- services/news-ingester: drop local parsers, depend on @mana/shared-rss
- apps/api: drop @mozilla/readability + jsdom direct deps, use the shared package

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
5ae7f99fe1
commit
b768a0ffce
16 changed files with 414 additions and 252 deletions
47
packages/shared-rss/src/extract.ts
Normal file
47
packages/shared-rss/src/extract.ts
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
import { Readability } from '@mozilla/readability';
|
||||
import { JSDOM, VirtualConsole } from 'jsdom';
|
||||
import { DEFAULT_USER_AGENT, type ExtractedArticle } from './types';
|
||||
|
||||
/**
 * Listener-less virtual console that is passed to JSDOM when parsing pages.
 *
 * Rationale recorded when this was introduced: with JSDOM's default
 * virtualConsole, CSS parse errors are rethrown as uncaughtException, which
 * kills long-running services. A bare VirtualConsole that has no listeners
 * attached swallows everything it receives instead.
 */
const silentConsole: VirtualConsole = new VirtualConsole();
|
||||
|
||||
/**
 * Runs Mozilla Readability over an HTML string and returns the main article.
 *
 * Never throws: any failure while building the DOM or parsing it is reported
 * as `null`.
 *
 * @param html - Raw HTML markup of the page.
 * @param url - URL given to JSDOM as the document URL.
 * @returns The extracted article together with a word count and a
 *   reading-time estimate, or `null` when Readability produces no article,
 *   the article has no text, or parsing fails.
 */
export async function extractFromHtml(html: string, url: string): Promise<ExtractedArticle | null> {
	try {
		const dom = new JSDOM(html, { url, virtualConsole: silentConsole });
		try {
			const article = new Readability(dom.window.document).parse();
			if (!article || !article.textContent) return null;

			// Count whitespace-separated tokens; filter(Boolean) drops the empty
			// strings produced by leading/trailing whitespace.
			const wordCount = article.textContent.split(/\s+/).filter(Boolean).length;
			// Estimate at 200 words per minute, never reporting less than 1 minute.
			const readingTimeMinutes = Math.max(1, Math.ceil(wordCount / 200));

			return {
				title: article.title ?? null,
				content: article.textContent,
				htmlContent: article.content ?? '',
				// `||` on purpose: an empty excerpt also falls back to the first
				// 240 characters of the article text.
				excerpt: article.excerpt || article.textContent.slice(0, 240),
				byline: article.byline ?? null,
				siteName: article.siteName ?? null,
				wordCount,
				readingTimeMinutes,
			};
		} finally {
			// Everything returned above is plain strings/numbers, so the window
			// is no longer needed. Close it explicitly instead of waiting for GC;
			// this runs on every exit path, including the early `return null`.
			dom.window.close();
		}
	} catch {
		// Best-effort extraction: a page that cannot be parsed yields no article.
		return null;
	}
}
|
||||
|
||||
/**
 * Downloads a page and extracts its main article via `extractFromHtml`.
 *
 * The request is sent with `DEFAULT_USER_AGENT` and is aborted after
 * 15 seconds.
 *
 * @param url - Address of the page to fetch.
 * @returns The extracted article, or `null` when the request fails, times
 *   out, returns a non-2xx status, or no article can be extracted.
 */
export async function extractFromUrl(url: string): Promise<ExtractedArticle | null> {
	let html: string;
	let documentUrl: string;
	try {
		const response = await fetch(url, {
			headers: { 'User-Agent': DEFAULT_USER_AGENT },
			signal: AbortSignal.timeout(15_000),
		});
		if (!response.ok) {
			// Release the unread body so the underlying connection is freed
			// right away instead of lingering until garbage collection.
			await response.body?.cancel();
			return null;
		}
		html = await response.text();
		// After redirects the markup belongs to the final URL, so relative
		// links must be resolved against it. Fall back to the requested URL
		// if the response does not report one.
		documentUrl = response.url || url;
	} catch {
		// Network error, timeout/abort, or body read failure.
		return null;
	}
	return extractFromHtml(html, documentUrl);
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue