refactor(shared-rss): extract RSS parsing + Readability into one package

news-ingester and apps/api both shipped their own copy of rss-parser
+ jsdom + Readability glue. Single source now in packages/shared-rss.
Adds discoverFeeds (rel=alternate + common-paths probe) and validateFeed
which News Research will use. JSDOM virtualConsole is silenced once,
in the package, instead of in two parallel call sites.

- packages/shared-rss: parse, extract, discover, validate
- services/news-ingester: drop local parsers, depend on @mana/shared-rss
- apps/api: drop @mozilla/readability + jsdom direct deps, use shared

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Till JS 2026-04-15 22:30:44 +02:00
parent 5ae7f99fe1
commit b768a0ffce
16 changed files with 414 additions and 252 deletions

View file

@ -11,8 +11,7 @@
*/
import { Hono } from 'hono';
import { Readability } from '@mozilla/readability';
import { JSDOM } from 'jsdom';
import { extractFromUrl } from '@mana/shared-rss';
import { drizzle } from 'drizzle-orm/postgres-js';
import { sql } from 'drizzle-orm';
import { getConnection } from '../../lib/db';
@ -21,54 +20,6 @@ import { getConnection } from '../../lib/db';
const db = drizzle(getConnection());
// ─── Extract Service (Readability fallback for ad-hoc URLs) ─
interface ExtractedArticle {
title: string;
content: string;
htmlContent: string;
excerpt: string;
byline: string | null;
siteName: string | null;
wordCount: number;
readingTimeMinutes: number;
}
async function extractFromUrl(url: string): Promise<ExtractedArticle> {
const response = await fetch(url, {
headers: {
'User-Agent': 'Mozilla/5.0 (compatible; ManaNews/1.0; +https://mana.how)',
},
});
if (!response.ok) {
throw new Error(`Failed to fetch URL: ${response.status}`);
}
const html = await response.text();
const dom = new JSDOM(html, { url });
const reader = new Readability(dom.window.document);
const article = reader.parse();
if (!article) {
throw new Error('Could not extract article content');
}
const wordCount = article.textContent.split(/\s+/).filter(Boolean).length;
const readingTimeMinutes = Math.max(1, Math.ceil(wordCount / 200));
return {
title: article.title,
content: article.textContent,
htmlContent: article.content,
excerpt: article.excerpt || article.textContent.slice(0, 200),
byline: article.byline || null,
siteName: article.siteName || null,
wordCount,
readingTimeMinutes,
};
}
// ─── Routes ─────────────────────────────────────────────────
const routes = new Hono();
@ -150,40 +101,33 @@ routes.post('/extract/preview', async (c) => {
const { url } = await c.req.json<{ url: string }>();
if (!url) return c.json({ error: 'URL is required' }, 400);
try {
const article = await extractFromUrl(url);
return c.json(article);
} catch (err) {
return c.json({ error: err instanceof Error ? err.message : 'Extraction failed' }, 500);
}
const article = await extractFromUrl(url);
if (!article) return c.json({ error: 'Extraction failed' }, 502);
return c.json(article);
});
routes.post('/extract/save', async (c) => {
const { url } = await c.req.json<{ url: string }>();
if (!url) return c.json({ error: 'URL is required' }, 400);
try {
const extracted = await extractFromUrl(url);
const extracted = await extractFromUrl(url);
if (!extracted) return c.json({ error: 'Extraction failed' }, 502);
// Return extracted data — client saves to local-first store.
return c.json({
id: crypto.randomUUID(),
type: 'saved',
sourceOrigin: 'user_saved',
originalUrl: url,
title: extracted.title,
content: extracted.content,
htmlContent: extracted.htmlContent,
excerpt: extracted.excerpt,
author: extracted.byline,
siteName: extracted.siteName,
wordCount: extracted.wordCount,
readingTimeMinutes: extracted.readingTimeMinutes,
isArchived: false,
});
} catch (err) {
return c.json({ error: err instanceof Error ? err.message : 'Extraction failed' }, 500);
}
return c.json({
id: crypto.randomUUID(),
type: 'saved',
sourceOrigin: 'user_saved',
originalUrl: url,
title: extracted.title,
content: extracted.content,
htmlContent: extracted.htmlContent,
excerpt: extracted.excerpt,
author: extracted.byline,
siteName: extracted.siteName,
wordCount: extracted.wordCount,
readingTimeMinutes: extracted.readingTimeMinutes,
isArchived: false,
});
});
export { routes as newsRoutes };