mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-17 23:09:39 +02:00
news-ingester and apps/api both shipped their own copy of the rss-parser + jsdom + Readability glue. There is now a single source in packages/shared-rss.

Adds discoverFeeds (rel=alternate + common-paths probe) and validateFeed, which News Research will use. The JSDOM virtualConsole is silenced once, in the package, instead of at two parallel call sites.

- packages/shared-rss: parse, extract, discover, validate
- services/news-ingester: drop local parsers, depend on @mana/shared-rss
- apps/api: drop @mozilla/readability + jsdom direct deps, use shared

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
128 lines
3.2 KiB
TypeScript
128 lines
3.2 KiB
TypeScript
import { JSDOM, VirtualConsole } from 'jsdom';
|
|
import { DEFAULT_USER_AGENT, type DiscoveredFeed } from './types';
|
|
|
|
// Single VirtualConsole instance shared by the JSDOM documents created in this
// module. No listeners are attached to it here, so messages jsdom routes
// through it are not forwarded to the process console.
const silentConsole = new VirtualConsole();
|
|
|
|
/**
 * Lookup from the `type` attribute of a `<link rel="alternate">` tag to the
 * feed kind reported for it. Keys are lower-case MIME types; a `type` that is
 * not a key here is not treated as a feed link.
 */
const FEED_TYPES: Record<string, DiscoveredFeed['type']> = {
	// XML syndication formats: the kind is known from the MIME type alone.
	'application/rss+xml': 'rss',
	'application/atom+xml': 'atom',

	// JSON types: accepted as candidates, but neither RSS nor Atom.
	'application/feed+json': 'unknown',
	'application/json': 'unknown',
};
|
|
|
|
/**
 * Origin-relative paths requested by `probeCommonPaths`. Hits are returned in
 * the order the paths are listed here.
 */
const COMMON_FEED_PATHS = [
	// Extension-less endpoints
	'/feed',
	'/feed/',
	'/rss',
	// Explicit XML documents
	'/rss.xml',
	'/atom.xml',
	'/index.xml',
	'/feed.xml',
];
|
|
|
|
/**
 * Resolve `href` against `base` into an absolute URL string.
 *
 * @param href - Absolute or relative reference, e.g. the `href` of a `<link>` tag.
 * @param base - URL that relative references are resolved against.
 * @returns The serialized absolute URL, or `null` when the `URL` constructor
 *   rejects the combination (for example when `base` is not a valid URL).
 */
function absolutize(href: string, base: string): string | null {
	let resolved: URL;
	try {
		resolved = new URL(href, base);
	} catch {
		return null;
	}
	return resolved.href;
}
|
|
|
|
/**
 * Discover RSS/Atom feeds linked from a site URL.
 *
 * Strategy: fetch the HTML and collect every `<link rel="alternate">` tag
 * whose `type` is one of the MIME types in `FEED_TYPES`. Sites that do not
 * advertise their feeds this way yield an empty list; the caller can then
 * fall back to `probeCommonPaths`.
 *
 * Relative `href`s are resolved against the document's base URL, i.e. the URL
 * the response was finally served from (after redirects) or a `<base href>`
 * if the page declares one.
 *
 * @param siteUrl - Page to inspect. Copied verbatim into each result's `siteUrl`.
 * @returns One entry per distinct absolute feed URL, in document order. Empty
 *   when the request fails, times out (15 s) or answers with a non-2xx status.
 */
export async function discoverFeedsFromSite(siteUrl: string): Promise<DiscoveredFeed[]> {
	let html: string;
	let pageUrl: string;
	try {
		const response = await fetch(siteUrl, {
			headers: { 'User-Agent': DEFAULT_USER_AGENT },
			signal: AbortSignal.timeout(15_000),
			redirect: 'follow',
		});
		if (!response.ok) {
			// The body is not needed; cancel it so the connection is released
			// promptly instead of waiting for garbage collection.
			void response.body?.cancel().catch(() => undefined);
			return [];
		}
		html = await response.text();
		// `response.url` is the post-redirect URL; fall back to the input if it is empty.
		pageUrl = response.url || siteUrl;
	} catch {
		// Best-effort discovery: network errors and timeouts mean "nothing found".
		return [];
	}

	const dom = new JSDOM(html, { url: pageUrl, virtualConsole: silentConsole });
	try {
		const { document } = dom.window;
		// Honours both the final response URL and any <base href> in the page.
		const baseUrl = document.baseURI;
		// `~=` matches rel="alternate" as well as multi-token values like rel="alternate feed".
		const links = document.querySelectorAll('link[rel~="alternate"]');

		const found = new Map<string, DiscoveredFeed>();
		for (const link of Array.from(links)) {
			// Strip MIME parameters ("application/rss+xml; charset=utf-8") before the lookup.
			const [mime = ''] = (link.getAttribute('type') ?? '').split(';');
			const type = mime.trim().toLowerCase();
			const href = link.getAttribute('href');
			// Own-property check: a bare `in` would also accept inherited keys
			// such as "constructor" and leak a non-feed value into `type`.
			if (!href || !Object.prototype.hasOwnProperty.call(FEED_TYPES, type)) continue;

			const abs = absolutize(href, baseUrl);
			if (!abs) continue;

			// First occurrence wins when the same feed is linked more than once.
			if (!found.has(abs)) {
				found.set(abs, {
					url: abs,
					title: link.getAttribute('title'),
					type: FEED_TYPES[type] ?? 'unknown',
					siteUrl,
				});
			}
		}

		return Array.from(found.values());
	} finally {
		// Tear the window down explicitly so the parsed document can be reclaimed.
		dom.window.close();
	}
}
|
|
|
|
/**
 * Probe a handful of common feed paths on a domain. Cheap fallback when
 * `discoverFeedsFromSite` returns nothing.
 *
 * Every path in `COMMON_FEED_PATHS` is requested concurrently against the
 * origin of `siteUrl`. A path counts as a feed when it answers 2xx with a
 * `Content-Type` mentioning xml, rss or atom (XHTML pages excluded). Only
 * status and headers are inspected; the body is discarded unread.
 *
 * @param siteUrl - Any http(s) URL on the site; only its origin is used.
 * @returns Hits in `COMMON_FEED_PATHS` order with `title: null`. `type` is
 *   'atom' when the content type mentions atom, otherwise 'rss'. Paths that
 *   redirect to the same final URL (e.g. `/feed` → `/feed/`) are reported
 *   once, under the first probed path. Empty when `siteUrl` is not a valid
 *   http(s) URL or nothing matched.
 */
export async function probeCommonPaths(siteUrl: string): Promise<DiscoveredFeed[]> {
	type ProbeHit = { finalUrl: string; feed: DiscoveredFeed };

	let base: string;
	try {
		const parsed = new URL(siteUrl);
		// Other schemes have no fetchable origin (some serialize to the string "null").
		if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') return [];
		base = parsed.origin;
	} catch {
		return [];
	}

	const probes = await Promise.all(
		COMMON_FEED_PATHS.map(async (path): Promise<ProbeHit | null> => {
			const url = base + path;
			try {
				const res = await fetch(url, {
					method: 'GET',
					headers: {
						'User-Agent': DEFAULT_USER_AGENT,
						Accept: 'application/rss+xml, application/atom+xml, application/xml;q=0.9, */*;q=0.1',
					},
					signal: AbortSignal.timeout(8_000),
					redirect: 'follow',
				});
				// Only status and headers matter; cancel the body so the
				// connection is released without downloading the document.
				void res.body?.cancel().catch(() => undefined);
				if (!res.ok) return null;

				// Lower-case once so both checks below are case-insensitive.
				const ct = (res.headers.get('content-type') ?? '').toLowerCase();
				// "application/xhtml+xml" contains "xml" but is a web page, not a feed.
				if (!/xml|rss|atom/.test(ct) || ct.includes('xhtml')) return null;

				const feed: DiscoveredFeed = {
					url,
					title: null,
					type: ct.includes('atom') ? 'atom' : 'rss',
					siteUrl: base,
				};
				return { finalUrl: res.url || url, feed };
			} catch {
				// Timeouts and network errors simply mean "no feed at this path".
				return null;
			}
		})
	);

	// Several probed paths often redirect to one feed; keep the first hit per final URL.
	const seen = new Set<string>();
	const feeds: DiscoveredFeed[] = [];
	for (const hit of probes) {
		if (hit === null || seen.has(hit.finalUrl)) continue;
		seen.add(hit.finalUrl);
		feeds.push(hit.feed);
	}
	return feeds;
}
|
|
|
|
/**
 * Find the feeds of a site.
 *
 * Feeds advertised through `<link rel="alternate">` tags are preferred; the
 * common feed paths are probed only when the page advertises none.
 *
 * @param siteUrl - URL of the site to inspect.
 * @returns The advertised feeds, or the result of the path probe when there are none.
 */
export async function discoverFeeds(siteUrl: string): Promise<DiscoveredFeed[]> {
	const advertised = await discoverFeedsFromSite(siteUrl);
	return advertised.length === 0 ? probeCommonPaths(siteUrl) : advertised;
}
|