From b768a0ffce9717209ffd9e83ca1d24a76d07529c Mon Sep 17 00:00:00 2001 From: Till JS Date: Wed, 15 Apr 2026 22:30:44 +0200 Subject: [PATCH] refactor(shared-rss): extract RSS parsing + Readability into one package news-ingester and apps/api both shipped their own copy of rss-parser + jsdom + Readability glue. Single source now in packages/shared-rss. Adds discoverFeeds (rel=alternate + common-paths probe) and validateFeed which News Research will use. JSDOM virtualConsole is silenced once, in the package, instead of in two parallel call sites. - packages/shared-rss: parse, extract, discover, validate - services/news-ingester: drop local parsers, depend on @mana/shared-rss - apps/api: drop @mozilla/readability + jsdom direct deps, use shared Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/api/package.json | 4 +- apps/api/src/modules/news/routes.ts | 98 +++----------- packages/shared-rss/package.json | 32 +++++ packages/shared-rss/src/discover.ts | 128 ++++++++++++++++++ packages/shared-rss/src/extract.ts | 47 +++++++ packages/shared-rss/src/index.ts | 6 + packages/shared-rss/src/parse.ts | 65 +++++++++ packages/shared-rss/src/types.ts | 38 ++++++ packages/shared-rss/src/validate.ts | 24 ++++ packages/shared-rss/tsconfig.json | 18 +++ pnpm-lock.yaml | 49 ++++--- services/news-ingester/package.json | 7 +- services/news-ingester/src/ingest.ts | 7 +- services/news-ingester/src/parsers/hn.ts | 2 +- .../news-ingester/src/parsers/readability.ts | 69 ---------- services/news-ingester/src/parsers/rss.ts | 72 ---------- 16 files changed, 414 insertions(+), 252 deletions(-) create mode 100644 packages/shared-rss/package.json create mode 100644 packages/shared-rss/src/discover.ts create mode 100644 packages/shared-rss/src/extract.ts create mode 100644 packages/shared-rss/src/index.ts create mode 100644 packages/shared-rss/src/parse.ts create mode 100644 packages/shared-rss/src/types.ts create mode 100644 packages/shared-rss/src/validate.ts create mode 100644 
packages/shared-rss/tsconfig.json delete mode 100644 services/news-ingester/src/parsers/readability.ts delete mode 100644 services/news-ingester/src/parsers/rss.ts diff --git a/apps/api/package.json b/apps/api/package.json index 714b2e365..9c6fae060 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -16,20 +16,18 @@ "@ai-sdk/openai-compatible": "^2.0.41", "@mana/media-client": "workspace:*", "@mana/shared-hono": "workspace:*", + "@mana/shared-rss": "workspace:*", "@mana/shared-storage": "workspace:*", "@mana/shared-types": "workspace:^", - "@mozilla/readability": "^0.5.0", "ai": "^6.0.154", "drizzle-orm": "^0.38.0", "hono": "^4.7.0", - "jsdom": "^25.0.0", "postgres": "^3.4.0", "rrule": "^2.8.1", "zod": "^3.23.0" }, "devDependencies": { "@types/bun": "latest", - "@types/jsdom": "^21.1.0", "drizzle-kit": "^0.30.0", "typescript": "^5.8.0" } diff --git a/apps/api/src/modules/news/routes.ts b/apps/api/src/modules/news/routes.ts index 82879799f..a3fdd9ce9 100644 --- a/apps/api/src/modules/news/routes.ts +++ b/apps/api/src/modules/news/routes.ts @@ -11,8 +11,7 @@ */ import { Hono } from 'hono'; -import { Readability } from '@mozilla/readability'; -import { JSDOM } from 'jsdom'; +import { extractFromUrl } from '@mana/shared-rss'; import { drizzle } from 'drizzle-orm/postgres-js'; import { sql } from 'drizzle-orm'; import { getConnection } from '../../lib/db'; @@ -21,54 +20,6 @@ import { getConnection } from '../../lib/db'; const db = drizzle(getConnection()); -// ─── Extract Service (Readability fallback for ad-hoc URLs) ─ - -interface ExtractedArticle { - title: string; - content: string; - htmlContent: string; - excerpt: string; - byline: string | null; - siteName: string | null; - wordCount: number; - readingTimeMinutes: number; -} - -async function extractFromUrl(url: string): Promise<ExtractedArticle> { - const response = await fetch(url, { - headers: { - 'User-Agent': 'Mozilla/5.0 (compatible; ManaNews/1.0; +https://mana.how)', - }, - }); - - if (!response.ok) { - throw
new Error(`Failed to fetch URL: ${response.status}`); - } - - const html = await response.text(); - const dom = new JSDOM(html, { url }); - const reader = new Readability(dom.window.document); - const article = reader.parse(); - - if (!article) { - throw new Error('Could not extract article content'); - } - - const wordCount = article.textContent.split(/\s+/).filter(Boolean).length; - const readingTimeMinutes = Math.max(1, Math.ceil(wordCount / 200)); - - return { - title: article.title, - content: article.textContent, - htmlContent: article.content, - excerpt: article.excerpt || article.textContent.slice(0, 200), - byline: article.byline || null, - siteName: article.siteName || null, - wordCount, - readingTimeMinutes, - }; -} - // ─── Routes ───────────────────────────────────────────────── const routes = new Hono(); @@ -150,40 +101,33 @@ routes.post('/extract/preview', async (c) => { const { url } = await c.req.json<{ url: string }>(); if (!url) return c.json({ error: 'URL is required' }, 400); - try { - const article = await extractFromUrl(url); - return c.json(article); - } catch (err) { - return c.json({ error: err instanceof Error ? err.message : 'Extraction failed' }, 500); - } + const article = await extractFromUrl(url); + if (!article) return c.json({ error: 'Extraction failed' }, 502); + return c.json(article); }); routes.post('/extract/save', async (c) => { const { url } = await c.req.json<{ url: string }>(); if (!url) return c.json({ error: 'URL is required' }, 400); - try { - const extracted = await extractFromUrl(url); + const extracted = await extractFromUrl(url); + if (!extracted) return c.json({ error: 'Extraction failed' }, 502); - // Return extracted data — client saves to local-first store. 
- return c.json({ - id: crypto.randomUUID(), - type: 'saved', - sourceOrigin: 'user_saved', - originalUrl: url, - title: extracted.title, - content: extracted.content, - htmlContent: extracted.htmlContent, - excerpt: extracted.excerpt, - author: extracted.byline, - siteName: extracted.siteName, - wordCount: extracted.wordCount, - readingTimeMinutes: extracted.readingTimeMinutes, - isArchived: false, - }); - } catch (err) { - return c.json({ error: err instanceof Error ? err.message : 'Extraction failed' }, 500); - } + return c.json({ + id: crypto.randomUUID(), + type: 'saved', + sourceOrigin: 'user_saved', + originalUrl: url, + title: extracted.title, + content: extracted.content, + htmlContent: extracted.htmlContent, + excerpt: extracted.excerpt, + author: extracted.byline, + siteName: extracted.siteName, + wordCount: extracted.wordCount, + readingTimeMinutes: extracted.readingTimeMinutes, + isArchived: false, + }); }); export { routes as newsRoutes }; diff --git a/packages/shared-rss/package.json b/packages/shared-rss/package.json new file mode 100644 index 000000000..83f11d222 --- /dev/null +++ b/packages/shared-rss/package.json @@ -0,0 +1,32 @@ +{ + "name": "@mana/shared-rss", + "version": "0.1.0", + "private": true, + "sideEffects": false, + "description": "RSS/Atom parsing, article extraction, and feed discovery primitives.", + "main": "./src/index.ts", + "types": "./src/index.ts", + "exports": { + ".": "./src/index.ts", + "./parse": "./src/parse.ts", + "./extract": "./src/extract.ts", + "./discover": "./src/discover.ts", + "./validate": "./src/validate.ts", + "./types": "./src/types.ts" + }, + "scripts": { + "type-check": "tsc --noEmit", + "clean": "rm -rf dist", + "lint": "eslint ." 
+ }, + "dependencies": { + "@mozilla/readability": "^0.5.0", + "jsdom": "^25.0.1", + "rss-parser": "^3.13.0" + }, + "devDependencies": { + "@types/jsdom": "^21.1.7", + "@types/node": "^24.10.1", + "typescript": "^5.9.3" + } +} diff --git a/packages/shared-rss/src/discover.ts b/packages/shared-rss/src/discover.ts new file mode 100644 index 000000000..4799a0750 --- /dev/null +++ b/packages/shared-rss/src/discover.ts @@ -0,0 +1,128 @@ +import { JSDOM, VirtualConsole } from 'jsdom'; +import { DEFAULT_USER_AGENT, type DiscoveredFeed } from './types'; + +const silentConsole = new VirtualConsole(); + +const FEED_TYPES: Record<string, DiscoveredFeed['type']> = { + 'application/rss+xml': 'rss', + 'application/atom+xml': 'atom', + 'application/feed+json': 'unknown', + 'application/json': 'unknown', +}; + +const COMMON_FEED_PATHS = [ + '/feed', + '/feed/', + '/rss', + '/rss.xml', + '/atom.xml', + '/index.xml', + '/feed.xml', +]; + +function absolutize(href: string, base: string): string | null { + try { + return new URL(href, base).toString(); + } catch { + return null; + } +} + +/** + * Discover RSS/Atom feeds linked from a site URL. + * + * Strategy: fetch HTML, look for <link rel="alternate"> tags with a + * feed mime type. That covers ~90% of well-behaved sites. For the rest, + * the caller can fall back to `probeCommonPaths`.
+ */ +export async function discoverFeedsFromSite(siteUrl: string): Promise<DiscoveredFeed[]> { + let html: string; + try { + const response = await fetch(siteUrl, { + headers: { 'User-Agent': DEFAULT_USER_AGENT }, + signal: AbortSignal.timeout(15_000), + redirect: 'follow', + }); + if (!response.ok) return []; + html = await response.text(); + } catch { + return []; + } + + const dom = new JSDOM(html, { url: siteUrl, virtualConsole: silentConsole }); + const links = dom.window.document.querySelectorAll( + 'link[rel="alternate"], link[rel~="alternate"]' + ); + + const found = new Map(); + for (const link of Array.from(links)) { + const type = (link.getAttribute('type') || '').toLowerCase(); + const href = link.getAttribute('href'); + if (!href || !(type in FEED_TYPES)) continue; + + const abs = absolutize(href, siteUrl); + if (!abs) continue; + + if (!found.has(abs)) { + found.set(abs, { + url: abs, + title: link.getAttribute('title'), + type: FEED_TYPES[type] ?? 'unknown', + siteUrl, + }); + } + } + + return Array.from(found.values()); +} + +/** + * Probe a handful of common feed paths on a domain. Cheap fallback when + * discoverFeedsFromSite returns nothing. + */ +export async function probeCommonPaths(siteUrl: string): Promise<DiscoveredFeed[]> { + const base = (() => { + try { + return new URL(siteUrl).origin; + } catch { + return null; + } + })(); + if (!base) return []; + + const probes = await Promise.all( + COMMON_FEED_PATHS.map(async (path) => { + const url = base + path; + try { + const res = await fetch(url, { + method: 'GET', + headers: { + 'User-Agent': DEFAULT_USER_AGENT, + Accept: 'application/rss+xml, application/atom+xml, application/xml;q=0.9, */*;q=0.1', + }, + signal: AbortSignal.timeout(8_000), + redirect: 'follow', + }); + if (!res.ok) return null; + const ct = res.headers.get('content-type') || ''; + if (!/xml|rss|atom/i.test(ct)) return null; + return { + url, + title: null, + type: ct.includes('atom') ?
'atom' : 'rss', + siteUrl: base, + } as DiscoveredFeed; + } catch { + return null; + } + }) + ); + + return probes.filter((p): p is DiscoveredFeed => p !== null); +} + +export async function discoverFeeds(siteUrl: string): Promise<DiscoveredFeed[]> { + const viaLinks = await discoverFeedsFromSite(siteUrl); + if (viaLinks.length > 0) return viaLinks; + return probeCommonPaths(siteUrl); +} diff --git a/packages/shared-rss/src/extract.ts b/packages/shared-rss/src/extract.ts new file mode 100644 index 000000000..5b6f971e3 --- /dev/null +++ b/packages/shared-rss/src/extract.ts @@ -0,0 +1,47 @@ +import { Readability } from '@mozilla/readability'; +import { JSDOM, VirtualConsole } from 'jsdom'; +import { DEFAULT_USER_AGENT, type ExtractedArticle } from './types'; + +// JSDOM's default virtualConsole rethrows CSS parse errors as +// uncaughtException, which kills long-running services. A bare +// VirtualConsole with no listeners swallows everything. +const silentConsole = new VirtualConsole(); + +export async function extractFromHtml(html: string, url: string): Promise<ExtractedArticle | null> { + try { + const dom = new JSDOM(html, { url, virtualConsole: silentConsole }); + const article = new Readability(dom.window.document).parse(); + if (!article || !article.textContent) return null; + + const wordCount = article.textContent.split(/\s+/).filter(Boolean).length; + const readingTimeMinutes = Math.max(1, Math.ceil(wordCount / 200)); + + return { + title: article.title ?? null, + content: article.textContent, + htmlContent: article.content ?? '', + excerpt: article.excerpt || article.textContent.slice(0, 240), + byline: article.byline ?? null, + siteName: article.siteName ??
null, + wordCount, + readingTimeMinutes, + }; + } catch { + return null; + } +} + +export async function extractFromUrl(url: string): Promise<ExtractedArticle | null> { + let html: string; + try { + const response = await fetch(url, { + headers: { 'User-Agent': DEFAULT_USER_AGENT }, + signal: AbortSignal.timeout(15_000), + }); + if (!response.ok) return null; + html = await response.text(); + } catch { + return null; + } + return extractFromHtml(html, url); +} diff --git a/packages/shared-rss/src/index.ts b/packages/shared-rss/src/index.ts new file mode 100644 index 000000000..0e5394892 --- /dev/null +++ b/packages/shared-rss/src/index.ts @@ -0,0 +1,6 @@ +export type { NormalizedFeedItem, ExtractedArticle, DiscoveredFeed, FeedValidation } from './types'; +export { DEFAULT_USER_AGENT } from './types'; +export { parseFeedUrl, parseFeedXml, parseFeedMeta, type ParsedFeed } from './parse'; +export { extractFromUrl, extractFromHtml } from './extract'; +export { discoverFeeds, discoverFeedsFromSite, probeCommonPaths } from './discover'; +export { validateFeed } from './validate'; diff --git a/packages/shared-rss/src/parse.ts b/packages/shared-rss/src/parse.ts new file mode 100644 index 000000000..32c2ceaab --- /dev/null +++ b/packages/shared-rss/src/parse.ts @@ -0,0 +1,65 @@ +import Parser from 'rss-parser'; +import { DEFAULT_USER_AGENT, type NormalizedFeedItem } from './types'; + +type CustomItem = { + 'media:content'?: { $: { url: string } }; + 'media:thumbnail'?: { $: { url: string } }; + enclosure?: { url?: string }; +}; + +const parser: Parser = new Parser({ + timeout: 15_000, + headers: { 'User-Agent': DEFAULT_USER_AGENT }, + customFields: { + item: ['media:content', 'media:thumbnail', 'enclosure'], + }, +}); + +function mapItem(item: unknown): NormalizedFeedItem { + const i = item as CustomItem & { + link?: string; + title?: string; + content?: string; + contentSnippet?: string; + creator?: string; + author?: string; + isoDate?: string; + }; + + const imageUrl = +
i['media:content']?.$?.url ?? i['media:thumbnail']?.$?.url ?? i.enclosure?.url ?? null; + + return { + url: i.link ?? '', + title: i.title ?? '', + excerpt: i.contentSnippet ?? null, + content: i.contentSnippet ?? null, + htmlContent: i.content ?? null, + author: i.creator ?? i.author ?? null, + imageUrl, + publishedAt: i.isoDate ? new Date(i.isoDate) : null, + }; +} + +export async function parseFeedUrl(url: string): Promise<NormalizedFeedItem[]> { + const feed = await parser.parseURL(url); + return (feed.items ?? []).map(mapItem); +} + +export async function parseFeedXml(xml: string): Promise<NormalizedFeedItem[]> { + const feed = await parser.parseString(xml); + return (feed.items ?? []).map(mapItem); +} + +export interface ParsedFeed { + title: string | null; + items: NormalizedFeedItem[]; +} + +export async function parseFeedMeta(url: string): Promise<ParsedFeed> { + const feed = await parser.parseURL(url); + return { + title: feed.title ?? null, + items: (feed.items ?? []).map(mapItem), + }; +} diff --git a/packages/shared-rss/src/types.ts b/packages/shared-rss/src/types.ts new file mode 100644 index 000000000..042f5ff8d --- /dev/null +++ b/packages/shared-rss/src/types.ts @@ -0,0 +1,38 @@ +export interface NormalizedFeedItem { + url: string; + title: string; + excerpt: string | null; + content: string | null; + htmlContent: string | null; + author: string | null; + imageUrl: string | null; + publishedAt: Date | null; +} + +export interface ExtractedArticle { + title: string | null; + content: string; + htmlContent: string; + excerpt: string; + byline: string | null; + siteName: string | null; + wordCount: number; + readingTimeMinutes: number; +} + +export interface DiscoveredFeed { + url: string; + title: string | null; + type: 'rss' | 'atom' | 'unknown'; + siteUrl: string | null; +} + +export interface FeedValidation { + ok: boolean; + itemCount: number; + title: string | null; + sample: NormalizedFeedItem[]; + error?: string; +} + +export const DEFAULT_USER_AGENT = 'Mozilla/5.0 (compatible; ManaRSS/1.0; 
+https://mana.how)'; diff --git a/packages/shared-rss/src/validate.ts b/packages/shared-rss/src/validate.ts new file mode 100644 index 000000000..168d68b43 --- /dev/null +++ b/packages/shared-rss/src/validate.ts @@ -0,0 +1,24 @@ +import { parseFeedMeta } from './parse'; +import type { FeedValidation } from './types'; + +const SAMPLE_LIMIT = 5; + +export async function validateFeed(url: string): Promise<FeedValidation> { + try { + const parsed = await parseFeedMeta(url); + return { + ok: parsed.items.length > 0, + itemCount: parsed.items.length, + title: parsed.title, + sample: parsed.items.slice(0, SAMPLE_LIMIT), + }; + } catch (err) { + return { + ok: false, + itemCount: 0, + title: null, + sample: [], + error: err instanceof Error ? err.message : String(err), + }; + } +} diff --git a/packages/shared-rss/tsconfig.json b/packages/shared-rss/tsconfig.json new file mode 100644 index 000000000..47aedf8fe --- /dev/null +++ b/packages/shared-rss/tsconfig.json @@ -0,0 +1,18 @@ +{ + "compilerOptions": { + "target": "ES2022", + "module": "ESNext", + "moduleResolution": "bundler", + "lib": ["ES2022", "DOM"], + "types": ["node"], + "strict": true, + "esModuleInterop": true, + "skipLibCheck": true, + "forceConsistentCasingInFileNames": true, + "isolatedModules": true, + "verbatimModuleSyntax": true, + "noEmit": true + }, + "include": ["src/**/*"], + "exclude": ["node_modules"] +} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 4316bf3e3..3c5a02217 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -72,15 +72,15 @@ importers: '@mana/shared-hono': specifier: workspace:* version: link:../../packages/shared-hono + '@mana/shared-rss': + specifier: workspace:* + version: link:../../packages/shared-rss '@mana/shared-storage': specifier: workspace:* version: link:../../packages/shared-storage '@mana/shared-types': specifier: workspace:^ version: link:../../packages/shared-types - '@mozilla/readability': - specifier: ^0.5.0 - version: 0.5.0 ai: specifier: ^6.0.154 version: 6.0.154(zod@3.25.76) @@ 
-90,9 +90,6 @@ importers: hono: specifier: ^4.7.0 version: 4.12.12 - jsdom: - specifier: ^25.0.0 - version: 25.0.1 postgres: specifier: ^3.4.0 version: 3.4.9 @@ -106,9 +103,6 @@ importers: '@types/bun': specifier: latest version: 1.3.12 - '@types/jsdom': - specifier: ^21.1.0 - version: 21.1.7 drizzle-kit: specifier: ^0.30.0 version: 0.30.6 @@ -2996,6 +2990,28 @@ importers: specifier: ^7.0.0 version: 7.4.0(@types/babel__core@7.20.5) + packages/shared-rss: + dependencies: + '@mozilla/readability': + specifier: ^0.5.0 + version: 0.5.0 + jsdom: + specifier: ^25.0.1 + version: 25.0.1 + rss-parser: + specifier: ^3.13.0 + version: 3.13.0 + devDependencies: + '@types/jsdom': + specifier: ^21.1.7 + version: 21.1.7 + '@types/node': + specifier: ^24.10.1 + version: 24.12.2 + typescript: + specifier: ^5.9.3 + version: 5.9.3 + packages/shared-storage: dependencies: '@aws-sdk/client-s3': @@ -3665,28 +3681,19 @@ importers: services/news-ingester: dependencies: - '@mozilla/readability': - specifier: ^0.5.0 - version: 0.5.0 + '@mana/shared-rss': + specifier: workspace:* + version: link:../../packages/shared-rss drizzle-orm: specifier: ^0.38.3 version: 0.38.4(@opentelemetry/api@1.9.1)(@types/pg@8.6.1)(@types/react@19.2.14)(bun-types@1.3.12)(kysely@0.28.15)(postgres@3.4.9)(react@19.2.0) hono: specifier: ^4.7.0 version: 4.12.12 - jsdom: - specifier: ^25.0.1 - version: 25.0.1 postgres: specifier: ^3.4.5 version: 3.4.9 - rss-parser: - specifier: ^3.13.0 - version: 3.13.0 devDependencies: - '@types/jsdom': - specifier: ^21.1.7 - version: 21.1.7 drizzle-kit: specifier: ^0.30.4 version: 0.30.6 diff --git a/services/news-ingester/package.json b/services/news-ingester/package.json index ae071e68d..8207b83a6 100644 --- a/services/news-ingester/package.json +++ b/services/news-ingester/package.json @@ -11,15 +11,12 @@ "db:studio": "drizzle-kit studio" }, "dependencies": { - "@mozilla/readability": "^0.5.0", + "@mana/shared-rss": "workspace:*", "drizzle-orm": "^0.38.3", "hono": "^4.7.0", - 
"jsdom": "^25.0.1", - "postgres": "^3.4.5", - "rss-parser": "^3.13.0" + "postgres": "^3.4.5" }, "devDependencies": { - "@types/jsdom": "^21.1.7", "drizzle-kit": "^0.30.4", "typescript": "^5.9.3" } diff --git a/services/news-ingester/src/ingest.ts b/services/news-ingester/src/ingest.ts index ceed677ee..c986c4dbc 100644 --- a/services/news-ingester/src/ingest.ts +++ b/services/news-ingester/src/ingest.ts @@ -19,9 +19,8 @@ import { sql } from 'drizzle-orm'; import type { Database } from './db/connection'; import { curatedArticles, type NewCuratedArticle } from './db/schema'; import { SOURCES, type NewsSource } from './sources'; -import { fetchFeed, type NormalizedFeedItem } from './parsers/rss'; +import { parseFeedUrl, extractFromUrl, type NormalizedFeedItem } from '@mana/shared-rss'; import { fetchHackerNews } from './parsers/hn'; -import { fetchAndExtract } from './parsers/readability'; const RETENTION_DAYS = 30; @@ -57,7 +56,7 @@ function readingMinutes(words: number): number { async function fetchSourceItems(source: NewsSource): Promise<NormalizedFeedItem[]> { if (source.type === 'hn') return fetchHackerNews(source.url); - return fetchFeed(source.url); + return parseFeedUrl(source.url); } /** @@ -78,7 +77,7 @@ async function buildRow( const initialWords = wordCountOf(content); if (FULL_TEXT_THRESHOLD_WORDS > 0 && initialWords < FULL_TEXT_THRESHOLD_WORDS) { - const extracted = await fetchAndExtract(item.url); + const extracted = await extractFromUrl(item.url); if (extracted) { content = extracted.content; htmlContent = extracted.htmlContent || htmlContent; diff --git a/services/news-ingester/src/parsers/hn.ts b/services/news-ingester/src/parsers/hn.ts index 7eddcb0f1..b1d62a08b 100644 --- a/services/news-ingester/src/parsers/hn.ts +++ b/services/news-ingester/src/parsers/hn.ts @@ -8,7 +8,7 @@ * struggles with and which isn't the user's expectation for a news feed.
*/ -import type { NormalizedFeedItem } from './rss'; +import type { NormalizedFeedItem } from '@mana/shared-rss'; interface HnItem { id: number; diff --git a/services/news-ingester/src/parsers/readability.ts b/services/news-ingester/src/parsers/readability.ts deleted file mode 100644 index a9679a995..000000000 --- a/services/news-ingester/src/parsers/readability.ts +++ /dev/null @@ -1,69 +0,0 @@ -/** - * Mozilla Readability fallback. Used when an RSS item only ships an - * excerpt, so we fetch the original page and extract the article body. - * - * Kept dependency-local to the ingester so this service is the canonical - * "content acquisition" boundary — apps/api never has to call out to a - * crawler. - */ - -import { Readability } from '@mozilla/readability'; -import { JSDOM, VirtualConsole } from 'jsdom'; - -// JSDOM emits CSS parse errors and resource-loading warnings via its -// virtualConsole. The default console rethrows some of those as -// uncaughtException, which in a long-running ingester loop kills the -// whole process. A bare VirtualConsole with no listeners swallows -// everything quietly — exactly what we want for a "best-effort article -// extractor" that doesn't care about CSS or sub-resources. 
-const silentConsole = new VirtualConsole(); - -export interface ExtractedArticle { - title: string | null; - content: string; - htmlContent: string; - excerpt: string; - byline: string | null; - siteName: string | null; - wordCount: number; - readingTimeMinutes: number; -} - -const USER_AGENT = 'Mozilla/5.0 (compatible; ManaNewsIngester/1.0; +https://mana.how/news)'; - -export async function fetchAndExtract(url: string): Promise<ExtractedArticle | null> { - let html: string; - try { - const response = await fetch(url, { - headers: { 'User-Agent': USER_AGENT }, - signal: AbortSignal.timeout(15_000), - }); - if (!response.ok) return null; - html = await response.text(); - } catch { - return null; - } - - try { - const dom = new JSDOM(html, { url, virtualConsole: silentConsole }); - const reader = new Readability(dom.window.document); - const article = reader.parse(); - if (!article || !article.textContent) return null; - - const wordCount = article.textContent.split(/\s+/).filter(Boolean).length; - const readingTimeMinutes = Math.max(1, Math.ceil(wordCount / 200)); - - return { - title: article.title ?? null, - content: article.textContent, - htmlContent: article.content ?? '', - excerpt: article.excerpt || article.textContent.slice(0, 240), - byline: article.byline ?? null, - siteName: article.siteName ?? null, - wordCount, - readingTimeMinutes, - }; - } catch { - return null; - } -} diff --git a/services/news-ingester/src/parsers/rss.ts b/services/news-ingester/src/parsers/rss.ts deleted file mode 100644 index 703400e83..000000000 --- a/services/news-ingester/src/parsers/rss.ts +++ /dev/null @@ -1,72 +0,0 @@ -/** - * RSS/Atom parser wrapper. Returns a normalized shape so the ingest loop - * doesn't have to know about feed-format quirks. - * - * `rss-parser` handles both RSS 2.0 and Atom transparently. We pull out - * the bits we need (link, title, content/snippet, image, date) and let - * the ingester decide whether to call Readability for full-text.
- */ - -import Parser from 'rss-parser'; - -export interface NormalizedFeedItem { - url: string; - title: string; - excerpt: string | null; - content: string | null; - htmlContent: string | null; - author: string | null; - imageUrl: string | null; - publishedAt: Date | null; -} - -type CustomItem = { - 'media:content'?: { $: { url: string } }; - 'media:thumbnail'?: { $: { url: string } }; - enclosure?: { url?: string }; -}; - -const parser: Parser = new Parser({ - timeout: 15_000, - headers: { - 'User-Agent': 'Mozilla/5.0 (compatible; ManaNewsIngester/1.0; +https://mana.how/news)', - }, - customFields: { - item: ['media:content', 'media:thumbnail', 'enclosure'], - }, -}); - -export async function fetchFeed(url: string): Promise<NormalizedFeedItem[]> { - const feed = await parser.parseURL(url); - - return (feed.items ?? []).map((item) => { - // rss-parser stuffs full HTML in `content` for Atom, plain in `contentSnippet`. - const html = (item as { content?: string }).content ?? null; - const text = (item as { contentSnippet?: string }).contentSnippet ?? null; - - // Image: try a few common locations. - const mediaContent = item['media:content']?.$?.url; - const mediaThumb = item['media:thumbnail']?.$?.url; - const enclosureUrl = item.enclosure?.url; - const imageUrl = mediaContent ?? mediaThumb ?? enclosureUrl ?? null; - - const link = (item as { link?: string }).link ?? ''; - const title = (item as { title?: string }).title ?? ''; - const author = - (item as { creator?: string; author?: string }).creator ?? - (item as { author?: string }).author ?? - null; - const isoDate = (item as { isoDate?: string }).isoDate ?? null; - - return { - url: link, - title, - excerpt: text, - content: text, - htmlContent: html, - author, - imageUrl, - publishedAt: isoDate ? new Date(isoDate) : null, - }; - }); -}