mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-14 16:41:08 +02:00
refactor(shared-rss): extract RSS parsing + Readability into one package
news-ingester and apps/api both shipped their own copy of the rss-parser + jsdom + Readability glue. There is now a single source in packages/shared-rss. It adds discoverFeeds (rel=alternate + common-paths probe) and validateFeed, which News Research will use. The JSDOM virtualConsole is silenced once, in the package, instead of in two parallel call sites.
- packages/shared-rss: parse, extract, discover, validate
- services/news-ingester: drop local parsers, depend on @mana/shared-rss
- apps/api: drop @mozilla/readability + jsdom direct deps, use shared
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
5ae7f99fe1
commit
b768a0ffce
16 changed files with 414 additions and 252 deletions
|
|
@ -16,20 +16,18 @@
|
|||
"@ai-sdk/openai-compatible": "^2.0.41",
|
||||
"@mana/media-client": "workspace:*",
|
||||
"@mana/shared-hono": "workspace:*",
|
||||
"@mana/shared-rss": "workspace:*",
|
||||
"@mana/shared-storage": "workspace:*",
|
||||
"@mana/shared-types": "workspace:^",
|
||||
"@mozilla/readability": "^0.5.0",
|
||||
"ai": "^6.0.154",
|
||||
"drizzle-orm": "^0.38.0",
|
||||
"hono": "^4.7.0",
|
||||
"jsdom": "^25.0.0",
|
||||
"postgres": "^3.4.0",
|
||||
"rrule": "^2.8.1",
|
||||
"zod": "^3.23.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/bun": "latest",
|
||||
"@types/jsdom": "^21.1.0",
|
||||
"drizzle-kit": "^0.30.0",
|
||||
"typescript": "^5.8.0"
|
||||
}
|
||||
|
|
|
|||
|
|
@ -11,8 +11,7 @@
|
|||
*/
|
||||
|
||||
import { Hono } from 'hono';
|
||||
import { Readability } from '@mozilla/readability';
|
||||
import { JSDOM } from 'jsdom';
|
||||
import { extractFromUrl } from '@mana/shared-rss';
|
||||
import { drizzle } from 'drizzle-orm/postgres-js';
|
||||
import { sql } from 'drizzle-orm';
|
||||
import { getConnection } from '../../lib/db';
|
||||
|
|
@ -21,54 +20,6 @@ import { getConnection } from '../../lib/db';
|
|||
|
||||
const db = drizzle(getConnection());
|
||||
|
||||
// ─── Extract Service (Readability fallback for ad-hoc URLs) ─
|
||||
|
||||
interface ExtractedArticle {
|
||||
title: string;
|
||||
content: string;
|
||||
htmlContent: string;
|
||||
excerpt: string;
|
||||
byline: string | null;
|
||||
siteName: string | null;
|
||||
wordCount: number;
|
||||
readingTimeMinutes: number;
|
||||
}
|
||||
|
||||
async function extractFromUrl(url: string): Promise<ExtractedArticle> {
|
||||
const response = await fetch(url, {
|
||||
headers: {
|
||||
'User-Agent': 'Mozilla/5.0 (compatible; ManaNews/1.0; +https://mana.how)',
|
||||
},
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`Failed to fetch URL: ${response.status}`);
|
||||
}
|
||||
|
||||
const html = await response.text();
|
||||
const dom = new JSDOM(html, { url });
|
||||
const reader = new Readability(dom.window.document);
|
||||
const article = reader.parse();
|
||||
|
||||
if (!article) {
|
||||
throw new Error('Could not extract article content');
|
||||
}
|
||||
|
||||
const wordCount = article.textContent.split(/\s+/).filter(Boolean).length;
|
||||
const readingTimeMinutes = Math.max(1, Math.ceil(wordCount / 200));
|
||||
|
||||
return {
|
||||
title: article.title,
|
||||
content: article.textContent,
|
||||
htmlContent: article.content,
|
||||
excerpt: article.excerpt || article.textContent.slice(0, 200),
|
||||
byline: article.byline || null,
|
||||
siteName: article.siteName || null,
|
||||
wordCount,
|
||||
readingTimeMinutes,
|
||||
};
|
||||
}
|
||||
|
||||
// ─── Routes ─────────────────────────────────────────────────
|
||||
|
||||
const routes = new Hono();
|
||||
|
|
@ -150,40 +101,33 @@ routes.post('/extract/preview', async (c) => {
|
|||
const { url } = await c.req.json<{ url: string }>();
|
||||
if (!url) return c.json({ error: 'URL is required' }, 400);
|
||||
|
||||
try {
|
||||
const article = await extractFromUrl(url);
|
||||
return c.json(article);
|
||||
} catch (err) {
|
||||
return c.json({ error: err instanceof Error ? err.message : 'Extraction failed' }, 500);
|
||||
}
|
||||
const article = await extractFromUrl(url);
|
||||
if (!article) return c.json({ error: 'Extraction failed' }, 502);
|
||||
return c.json(article);
|
||||
});
|
||||
|
||||
routes.post('/extract/save', async (c) => {
|
||||
const { url } = await c.req.json<{ url: string }>();
|
||||
if (!url) return c.json({ error: 'URL is required' }, 400);
|
||||
|
||||
try {
|
||||
const extracted = await extractFromUrl(url);
|
||||
const extracted = await extractFromUrl(url);
|
||||
if (!extracted) return c.json({ error: 'Extraction failed' }, 502);
|
||||
|
||||
// Return extracted data — client saves to local-first store.
|
||||
return c.json({
|
||||
id: crypto.randomUUID(),
|
||||
type: 'saved',
|
||||
sourceOrigin: 'user_saved',
|
||||
originalUrl: url,
|
||||
title: extracted.title,
|
||||
content: extracted.content,
|
||||
htmlContent: extracted.htmlContent,
|
||||
excerpt: extracted.excerpt,
|
||||
author: extracted.byline,
|
||||
siteName: extracted.siteName,
|
||||
wordCount: extracted.wordCount,
|
||||
readingTimeMinutes: extracted.readingTimeMinutes,
|
||||
isArchived: false,
|
||||
});
|
||||
} catch (err) {
|
||||
return c.json({ error: err instanceof Error ? err.message : 'Extraction failed' }, 500);
|
||||
}
|
||||
return c.json({
|
||||
id: crypto.randomUUID(),
|
||||
type: 'saved',
|
||||
sourceOrigin: 'user_saved',
|
||||
originalUrl: url,
|
||||
title: extracted.title,
|
||||
content: extracted.content,
|
||||
htmlContent: extracted.htmlContent,
|
||||
excerpt: extracted.excerpt,
|
||||
author: extracted.byline,
|
||||
siteName: extracted.siteName,
|
||||
wordCount: extracted.wordCount,
|
||||
readingTimeMinutes: extracted.readingTimeMinutes,
|
||||
isArchived: false,
|
||||
});
|
||||
});
|
||||
|
||||
export { routes as newsRoutes };
|
||||
|
|
|
|||
32
packages/shared-rss/package.json
Normal file
32
packages/shared-rss/package.json
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
{
|
||||
"name": "@mana/shared-rss",
|
||||
"version": "0.1.0",
|
||||
"private": true,
|
||||
"sideEffects": false,
|
||||
"description": "RSS/Atom parsing, article extraction, and feed discovery primitives.",
|
||||
"main": "./src/index.ts",
|
||||
"types": "./src/index.ts",
|
||||
"exports": {
|
||||
".": "./src/index.ts",
|
||||
"./parse": "./src/parse.ts",
|
||||
"./extract": "./src/extract.ts",
|
||||
"./discover": "./src/discover.ts",
|
||||
"./validate": "./src/validate.ts",
|
||||
"./types": "./src/types.ts"
|
||||
},
|
||||
"scripts": {
|
||||
"type-check": "tsc --noEmit",
|
||||
"clean": "rm -rf dist",
|
||||
"lint": "eslint ."
|
||||
},
|
||||
"dependencies": {
|
||||
"@mozilla/readability": "^0.5.0",
|
||||
"jsdom": "^25.0.1",
|
||||
"rss-parser": "^3.13.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/jsdom": "^21.1.7",
|
||||
"@types/node": "^24.10.1",
|
||||
"typescript": "^5.9.3"
|
||||
}
|
||||
}
|
||||
128
packages/shared-rss/src/discover.ts
Normal file
128
packages/shared-rss/src/discover.ts
Normal file
|
|
@ -0,0 +1,128 @@
|
|||
import { JSDOM, VirtualConsole } from 'jsdom';
|
||||
import { DEFAULT_USER_AGENT, type DiscoveredFeed } from './types';
|
||||
|
||||
// A VirtualConsole with no listeners attached. It is handed to every JSDOM
// instance created in this module so that output produced while parsing
// third-party HTML is not forwarded anywhere.
const silentConsole = new VirtualConsole();
|
||||
|
||||
// Maps the `type` attribute of a <link rel="alternate"> tag to the feed kind
// reported in DiscoveredFeed. Only MIME types listed here are treated as feed
// links; the two JSON entries have no dedicated kind and map to 'unknown'.
// NOTE(review): 'application/json' is a very generic MIME type — confirm it
// does not pick up non-feed alternates (e.g. API links) on real sites.
const FEED_TYPES: Record<string, DiscoveredFeed['type']> = {
	'application/rss+xml': 'rss',
	'application/atom+xml': 'atom',
	'application/feed+json': 'unknown',
	'application/json': 'unknown',
};
|
||||
|
||||
// Paths appended to a site's origin by probeCommonPaths when looking for a
// feed that is not advertised via <link rel="alternate">. All entries are
// probed; order only affects the order of the returned results.
const COMMON_FEED_PATHS = [
	'/feed',
	'/feed/',
	'/rss',
	'/rss.xml',
	'/atom.xml',
	'/index.xml',
	'/feed.xml',
];
|
||||
|
||||
/**
 * Resolve a possibly-relative `href` against `base`.
 *
 * @param href - Raw attribute value taken from the page.
 * @param base - Absolute URL the href is relative to.
 * @returns The absolute URL as a string, or `null` when the pair cannot be
 *   parsed as a URL or the result is not an http(s) URL.
 */
function absolutize(href: string, base: string): string | null {
	let resolved: URL;
	try {
		resolved = new URL(href, base);
	} catch {
		return null;
	}
	// Feeds are fetched over HTTP(S); javascript:, mailto:, data: and similar
	// schemes can never be a usable feed URL, so drop them here.
	if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') return null;
	return resolved.toString();
}
|
||||
|
||||
/**
 * Discover RSS/Atom feeds linked from a site URL.
 *
 * Strategy: fetch the HTML and look for <link rel="alternate"> tags whose
 * `type` is one of the MIME types in FEED_TYPES. Sites that do not advertise
 * their feed this way yield an empty list; the caller can then fall back to
 * `probeCommonPaths`.
 *
 * @param siteUrl - Page to inspect. It is echoed back as `siteUrl` on every result.
 * @returns Feeds de-duplicated by absolute URL, in document order. Network
 *   errors, non-2xx responses and unparsable HTML all produce `[]`; this
 *   function does not throw.
 */
export async function discoverFeedsFromSite(siteUrl: string): Promise<DiscoveredFeed[]> {
	let html: string;
	// Redirects are followed, so the document may be served from a different
	// URL than the one requested. Relative hrefs must resolve against that one.
	let documentUrl = siteUrl;
	try {
		const response = await fetch(siteUrl, {
			headers: { 'User-Agent': DEFAULT_USER_AGENT },
			signal: AbortSignal.timeout(15_000),
			redirect: 'follow',
		});
		if (!response.ok) return [];
		if (response.url) documentUrl = response.url;
		html = await response.text();
	} catch {
		return [];
	}

	let dom: JSDOM;
	try {
		dom = new JSDOM(html, { url: documentUrl, virtualConsole: silentConsole });
	} catch {
		return [];
	}

	try {
		const doc = dom.window.document;
		// baseURI also honours a <base href> element when the page declares one.
		const base = doc.baseURI || documentUrl;
		const found = new Map<string, DiscoveredFeed>();

		// `~=` matches rel="alternate" as well as multi-token values such as
		// rel="alternate feed", so a single selector is enough.
		for (const link of Array.from(doc.querySelectorAll('link[rel~="alternate"]'))) {
			const href = link.getAttribute('href');
			if (!href) continue;

			// Strip MIME parameters ("application/rss+xml; charset=utf-8").
			const mime = ((link.getAttribute('type') ?? '').split(';')[0] ?? '').trim().toLowerCase();
			// Own-property check: a plain `in` would also accept inherited keys
			// such as "constructor" or "toString".
			if (!Object.prototype.hasOwnProperty.call(FEED_TYPES, mime)) continue;

			const abs = absolutize(href, base);
			if (!abs || found.has(abs)) continue;

			found.set(abs, {
				url: abs,
				title: link.getAttribute('title'),
				type: FEED_TYPES[mime] ?? 'unknown',
				siteUrl,
			});
		}

		return Array.from(found.values());
	} finally {
		// Release the window so long-running callers do not accumulate documents.
		dom.window.close();
	}
}
|
||||
|
||||
/**
 * Probe a handful of common feed paths on a domain. Cheap fallback when
 * discoverFeedsFromSite returns nothing.
 *
 * Every path in COMMON_FEED_PATHS is requested in parallel against the
 * origin of `siteUrl`. A probe counts as a feed when the response is 2xx and
 * its Content-Type looks like XML/RSS/Atom (XHTML pages are excluded).
 *
 * @param siteUrl - Any URL on the site; only its origin is used.
 * @returns Feeds de-duplicated by their final (post-redirect) URL. Returns
 *   `[]` for an unparsable or non-http(s) `siteUrl`; individual probe
 *   failures are ignored.
 */
export async function probeCommonPaths(siteUrl: string): Promise<DiscoveredFeed[]> {
	let origin: string;
	try {
		origin = new URL(siteUrl).origin;
	} catch {
		return [];
	}
	// Non-http(s) URLs have an opaque origin that serialises as "null".
	if (!/^https?:\/\//.test(origin)) return [];

	const probes = await Promise.all(
		COMMON_FEED_PATHS.map(async (path): Promise<DiscoveredFeed | null> => {
			const url = origin + path;
			try {
				const res = await fetch(url, {
					method: 'GET',
					headers: {
						'User-Agent': DEFAULT_USER_AGENT,
						Accept: 'application/rss+xml, application/atom+xml, application/xml;q=0.9, */*;q=0.1',
					},
					signal: AbortSignal.timeout(8_000),
					redirect: 'follow',
				});
				// Only the status and headers are needed. Cancel the body so the
				// connection is released instead of waiting on an unread stream.
				void res.body?.cancel().catch(() => {});

				if (!res.ok) return null;
				const contentType = (res.headers.get('content-type') ?? '').toLowerCase();
				// "application/xhtml+xml" contains "xml" but is a web page, not a feed.
				if (!/xml|rss|atom/.test(contentType) || contentType.includes('xhtml')) return null;

				return {
					// Report where the feed actually lives, so that /feed and /feed/
					// redirecting to the same resource collapse into one entry.
					url: res.url || url,
					title: null,
					type: contentType.includes('atom') ? 'atom' : 'rss',
					siteUrl: origin,
				};
			} catch {
				return null;
			}
		})
	);

	const unique = new Map<string, DiscoveredFeed>();
	for (const feed of probes) {
		if (feed && !unique.has(feed.url)) unique.set(feed.url, feed);
	}
	return Array.from(unique.values());
}
|
||||
|
||||
/**
 * Find feeds for a site: prefer feeds advertised in the page's
 * <link rel="alternate"> tags and only probe well-known paths when the page
 * advertises none.
 */
export async function discoverFeeds(siteUrl: string): Promise<DiscoveredFeed[]> {
	const advertised = await discoverFeedsFromSite(siteUrl);
	return advertised.length === 0 ? probeCommonPaths(siteUrl) : advertised;
}
|
||||
47
packages/shared-rss/src/extract.ts
Normal file
47
packages/shared-rss/src/extract.ts
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
import { Readability } from '@mozilla/readability';
|
||||
import { JSDOM, VirtualConsole } from 'jsdom';
|
||||
import { DEFAULT_USER_AGENT, type ExtractedArticle } from './types';
|
||||
|
||||
// JSDOM's default virtualConsole rethrows CSS parse errors as
|
||||
// uncaughtException, which kills long-running services. A bare
|
||||
// VirtualConsole with no listeners swallows everything.
|
||||
const silentConsole = new VirtualConsole();
|
||||
|
||||
/**
 * Run Mozilla Readability over an HTML string and return the article body.
 *
 * @param html - Raw HTML of the page.
 * @param url - URL the HTML was served from; used as the document URL so
 *   relative links inside the page resolve correctly.
 * @returns The extracted article, or `null` when Readability finds no
 *   article, the article has no visible text, or parsing throws. This
 *   function never rejects.
 */
export async function extractFromHtml(html: string, url: string): Promise<ExtractedArticle | null> {
	let dom: JSDOM | undefined;
	try {
		dom = new JSDOM(html, { url, virtualConsole: silentConsole });
		const article = new Readability(dom.window.document).parse();

		const rawText = article?.textContent ?? '';
		const text = rawText.trim();
		// Whitespace-only output is not an article.
		if (!article || !text) return null;

		const wordCount = text.split(/\s+/).filter(Boolean).length;
		// Assumes 200 words per minute, with a floor of one minute.
		const readingTimeMinutes = Math.max(1, Math.ceil(wordCount / 200));

		return {
			title: article.title ?? null,
			content: rawText,
			htmlContent: article.content ?? '',
			// Fall back to the start of the trimmed text so the excerpt never
			// begins with layout whitespace.
			excerpt: article.excerpt || text.slice(0, 240),
			byline: article.byline ?? null,
			siteName: article.siteName ?? null,
			wordCount,
			readingTimeMinutes,
		};
	} catch {
		return null;
	} finally {
		// Release the window so long-running callers do not accumulate documents.
		dom?.window.close();
	}
}
|
||||
|
||||
/**
 * Fetch a page and extract its article body with `extractFromHtml`.
 *
 * @param url - Absolute http(s) URL of the page.
 * @returns The extracted article, or `null` when the URL is invalid or not
 *   http(s), the request fails or times out (15 s), the response is not 2xx,
 *   the response is clearly not a text/HTML document, or no article could be
 *   extracted. This function never rejects.
 *
 * NOTE(review): this performs no private-network filtering. Callers that pass
 * user-supplied URLs should confirm whether SSRF protection is needed upstream.
 */
export async function extractFromUrl(url: string): Promise<ExtractedArticle | null> {
	let target: URL;
	try {
		target = new URL(url);
	} catch {
		return null;
	}
	if (target.protocol !== 'http:' && target.protocol !== 'https:') return null;

	let html: string;
	// Redirects are followed by default; resolve relative links against the
	// URL the document was actually served from.
	let documentUrl = url;
	try {
		const response = await fetch(url, {
			headers: { 'User-Agent': DEFAULT_USER_AGENT },
			signal: AbortSignal.timeout(15_000),
		});

		const contentType = response.headers.get('content-type');
		// A missing Content-Type is tolerated; a declared binary type
		// (PDF, image, archive, ...) is not worth handing to JSDOM.
		const looksTextual = !contentType || /html|xml|^text\//i.test(contentType);
		if (!response.ok || !looksTextual) {
			// Drop the unread body so the connection is released.
			void response.body?.cancel().catch(() => {});
			return null;
		}

		if (response.url) documentUrl = response.url;
		html = await response.text();
	} catch {
		return null;
	}
	return extractFromHtml(html, documentUrl);
}
|
||||
6
packages/shared-rss/src/index.ts
Normal file
6
packages/shared-rss/src/index.ts
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
export type { NormalizedFeedItem, ExtractedArticle, DiscoveredFeed, FeedValidation } from './types';
|
||||
export { DEFAULT_USER_AGENT } from './types';
|
||||
export { parseFeedUrl, parseFeedXml, parseFeedMeta, type ParsedFeed } from './parse';
|
||||
export { extractFromUrl, extractFromHtml } from './extract';
|
||||
export { discoverFeeds, discoverFeedsFromSite, probeCommonPaths } from './discover';
|
||||
export { validateFeed } from './validate';
|
||||
65
packages/shared-rss/src/parse.ts
Normal file
65
packages/shared-rss/src/parse.ts
Normal file
|
|
@ -0,0 +1,65 @@
|
|||
import Parser from 'rss-parser';
|
||||
import { DEFAULT_USER_AGENT, type NormalizedFeedItem } from './types';
|
||||
|
||||
// Extra per-item fields requested from rss-parser via `customFields` below.
// mapItem reads them, in this order, as candidate image URLs.
type CustomItem = {
	// The URL lives in the element's attribute bag (`$`).
	'media:content'?: { $: { url: string } };
	'media:thumbnail'?: { $: { url: string } };
	enclosure?: { url?: string };
};
|
||||
|
||||
// Single rss-parser instance shared by every parse* function in this module.
// It sends the package-wide User-Agent, uses a 15_000 timeout, and asks
// rss-parser to surface the media/enclosure fields declared in CustomItem.
const parser: Parser<unknown, CustomItem> = new Parser({
	timeout: 15_000,
	headers: { 'User-Agent': DEFAULT_USER_AGENT },
	customFields: {
		item: ['media:content', 'media:thumbnail', 'enclosure'],
	},
});
|
||||
|
||||
/**
 * Convert one rss-parser item into the package's NormalizedFeedItem shape.
 *
 * - `url` and `title` default to an empty string when the feed omits them.
 * - `excerpt` and `content` both carry the plain-text snippet; `htmlContent`
 *   carries the item's `content` field.
 * - `imageUrl` is taken from media:content, then media:thumbnail, then the
 *   enclosure. An enclosure that declares a non-image MIME type (for example
 *   podcast audio) is ignored.
 * - `publishedAt` is `null` when the item has no date or the date cannot be
 *   parsed.
 *
 * @param item - A single entry from `feed.items` as produced by rss-parser.
 */
function mapItem(item: unknown): NormalizedFeedItem {
	const i = item as CustomItem & {
		link?: string;
		title?: string;
		content?: string;
		contentSnippet?: string;
		creator?: string;
		author?: string;
		isoDate?: string;
		enclosure?: { url?: string; type?: string };
	};

	// Enclosures are generic attachments. Only use one as an image when its
	// type says image/* or when no type is declared at all.
	const enclosureType = i.enclosure?.type?.toLowerCase();
	const enclosureImage =
		!enclosureType || enclosureType.startsWith('image/') ? i.enclosure?.url : undefined;

	const imageUrl =
		i['media:content']?.$?.url ?? i['media:thumbnail']?.$?.url ?? enclosureImage ?? null;

	// Guard against an unparsable date string turning into an Invalid Date.
	const parsedDate = i.isoDate ? new Date(i.isoDate) : null;
	const publishedAt = parsedDate && !Number.isNaN(parsedDate.getTime()) ? parsedDate : null;

	return {
		url: i.link ?? '',
		title: i.title ?? '',
		excerpt: i.contentSnippet ?? null,
		content: i.contentSnippet ?? null,
		htmlContent: i.content ?? null,
		author: i.creator ?? i.author ?? null,
		imageUrl,
		publishedAt,
	};
}
|
||||
|
||||
/**
 * Download the feed at `url` and return its items in normalized form.
 * Rejects with whatever error the underlying parser raises for network or
 * parse failures.
 */
export async function parseFeedUrl(url: string): Promise<NormalizedFeedItem[]> {
	const { items } = await parser.parseURL(url);
	const entries = items ?? [];
	return entries.map((entry) => mapItem(entry));
}
|
||||
|
||||
/**
 * Parse an already-downloaded feed document and return its items in
 * normalized form. Rejects when the underlying parser cannot parse `xml`.
 */
export async function parseFeedXml(xml: string): Promise<NormalizedFeedItem[]> {
	const { items } = await parser.parseString(xml);
	const entries = items ?? [];
	return entries.map((entry) => mapItem(entry));
}
|
||||
|
||||
/** Result of parseFeedMeta: feed-level metadata together with the normalized items. */
export interface ParsedFeed {
	/** Feed title, or `null` when the feed does not declare one. */
	title: string | null;
	/** All items of the feed, normalized. */
	items: NormalizedFeedItem[];
}
|
||||
|
||||
/**
 * Download the feed at `url` and return its title along with the normalized
 * items. Rejects with whatever error the underlying parser raises.
 */
export async function parseFeedMeta(url: string): Promise<ParsedFeed> {
	const { title, items } = await parser.parseURL(url);
	const entries = items ?? [];
	const result: ParsedFeed = {
		title: title ?? null,
		items: entries.map((entry) => mapItem(entry)),
	};
	return result;
}
|
||||
38
packages/shared-rss/src/types.ts
Normal file
38
packages/shared-rss/src/types.ts
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
/**
 * Format-independent shape of a single feed entry. The field notes describe
 * how the RSS/Atom parser in this package fills them; other producers may differ.
 */
export interface NormalizedFeedItem {
	/** Item link; empty string when the feed entry has none. */
	url: string;
	/** Item title; empty string when the feed entry has none. */
	title: string;
	/** Plain-text snippet of the entry, if any. */
	excerpt: string | null;
	/** Plain-text body; the feed parser fills this with the same snippet as `excerpt`. */
	content: string | null;
	/** Entry content as shipped by the feed, if any. */
	htmlContent: string | null;
	/** Creator or author of the entry, if declared. */
	author: string | null;
	/** Image taken from media:content, media:thumbnail or the enclosure, if any. */
	imageUrl: string | null;
	/** Publication date parsed from the entry's ISO date, if any. */
	publishedAt: Date | null;
}
|
||||
|
||||
/** Article body extracted from a web page by Readability (see extract.ts). */
export interface ExtractedArticle {
	/** Article title as reported by Readability; may be `null`. */
	title: string | null;
	/** Plain-text content of the article. */
	content: string;
	/** Article content as HTML; empty string when Readability returns none. */
	htmlContent: string;
	/** Readability's excerpt, or the first 240 characters of the text as a fallback. */
	excerpt: string;
	/** Author line, if Readability found one. */
	byline: string | null;
	/** Name of the publishing site, if Readability found one. */
	siteName: string | null;
	/** Number of whitespace-separated words in `content`. */
	wordCount: number;
	/** `wordCount / 200`, rounded up, with a minimum of 1. */
	readingTimeMinutes: number;
}
|
||||
|
||||
/** A feed candidate found by the discovery helpers in discover.ts. */
export interface DiscoveredFeed {
	/** Absolute URL of the feed. */
	url: string;
	/** `title` attribute of the advertising <link> tag; `null` for probed paths. */
	title: string | null;
	/** Feed format; 'unknown' when the advertised MIME type has no dedicated kind. */
	type: 'rss' | 'atom' | 'unknown';
	/** Site the feed was discovered on: the inspected page URL, or the origin for probed paths. */
	siteUrl: string | null;
}
|
||||
|
||||
/** Outcome of validateFeed for a single feed URL. */
export interface FeedValidation {
	/** `true` when the feed could be parsed and contains at least one item. */
	ok: boolean;
	/** Total number of items in the feed; 0 on failure. */
	itemCount: number;
	/** Feed title, or `null` when unknown. */
	title: string | null;
	/** The first few items of the feed, for previewing; empty on failure. */
	sample: NormalizedFeedItem[];
	/** Human-readable reason when `ok` is false; may be absent. */
	error?: string;
}
|
||||
|
||||
/** User-Agent header sent with every outbound request made by this package (feed parsing, article extraction, feed discovery). */
export const DEFAULT_USER_AGENT = 'Mozilla/5.0 (compatible; ManaRSS/1.0; +https://mana.how)';
|
||||
24
packages/shared-rss/src/validate.ts
Normal file
24
packages/shared-rss/src/validate.ts
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
import { parseFeedMeta } from './parse';
|
||||
import type { FeedValidation } from './types';
|
||||
|
||||
const SAMPLE_LIMIT = 5;
|
||||
|
||||
/**
 * Check whether `url` points at a usable feed.
 *
 * The feed is downloaded and parsed once. It is considered valid when parsing
 * succeeds and at least one item is present.
 *
 * @param url - Feed URL to check.
 * @returns A FeedValidation describing the outcome. Never rejects: parser and
 *   network errors are reported through `ok: false` and `error`. A feed that
 *   parses but has no items is also `ok: false`, with an explanatory `error`.
 */
export async function validateFeed(url: string): Promise<FeedValidation> {
	try {
		const parsed = await parseFeedMeta(url);
		const itemCount = parsed.items.length;
		const hasItems = itemCount > 0;
		return {
			ok: hasItems,
			itemCount,
			title: parsed.title,
			sample: parsed.items.slice(0, SAMPLE_LIMIT),
			// Give callers a reason for the failure instead of a bare `ok: false`.
			...(hasItems ? {} : { error: 'Feed parsed but contains no items' }),
		};
	} catch (err) {
		return {
			ok: false,
			itemCount: 0,
			title: null,
			sample: [],
			error: err instanceof Error ? err.message : String(err),
		};
	}
}
|
||||
18
packages/shared-rss/tsconfig.json
Normal file
18
packages/shared-rss/tsconfig.json
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
{
|
||||
"compilerOptions": {
|
||||
"target": "ES2022",
|
||||
"module": "ESNext",
|
||||
"moduleResolution": "bundler",
|
||||
"lib": ["ES2022", "DOM"],
|
||||
"types": ["node"],
|
||||
"strict": true,
|
||||
"esModuleInterop": true,
|
||||
"skipLibCheck": true,
|
||||
"forceConsistentCasingInFileNames": true,
|
||||
"isolatedModules": true,
|
||||
"verbatimModuleSyntax": true,
|
||||
"noEmit": true
|
||||
},
|
||||
"include": ["src/**/*"],
|
||||
"exclude": ["node_modules"]
|
||||
}
|
||||
49
pnpm-lock.yaml
generated
49
pnpm-lock.yaml
generated
|
|
@ -72,15 +72,15 @@ importers:
|
|||
'@mana/shared-hono':
|
||||
specifier: workspace:*
|
||||
version: link:../../packages/shared-hono
|
||||
'@mana/shared-rss':
|
||||
specifier: workspace:*
|
||||
version: link:../../packages/shared-rss
|
||||
'@mana/shared-storage':
|
||||
specifier: workspace:*
|
||||
version: link:../../packages/shared-storage
|
||||
'@mana/shared-types':
|
||||
specifier: workspace:^
|
||||
version: link:../../packages/shared-types
|
||||
'@mozilla/readability':
|
||||
specifier: ^0.5.0
|
||||
version: 0.5.0
|
||||
ai:
|
||||
specifier: ^6.0.154
|
||||
version: 6.0.154(zod@3.25.76)
|
||||
|
|
@ -90,9 +90,6 @@ importers:
|
|||
hono:
|
||||
specifier: ^4.7.0
|
||||
version: 4.12.12
|
||||
jsdom:
|
||||
specifier: ^25.0.0
|
||||
version: 25.0.1
|
||||
postgres:
|
||||
specifier: ^3.4.0
|
||||
version: 3.4.9
|
||||
|
|
@ -106,9 +103,6 @@ importers:
|
|||
'@types/bun':
|
||||
specifier: latest
|
||||
version: 1.3.12
|
||||
'@types/jsdom':
|
||||
specifier: ^21.1.0
|
||||
version: 21.1.7
|
||||
drizzle-kit:
|
||||
specifier: ^0.30.0
|
||||
version: 0.30.6
|
||||
|
|
@ -2996,6 +2990,28 @@ importers:
|
|||
specifier: ^7.0.0
|
||||
version: 7.4.0(@types/babel__core@7.20.5)
|
||||
|
||||
packages/shared-rss:
|
||||
dependencies:
|
||||
'@mozilla/readability':
|
||||
specifier: ^0.5.0
|
||||
version: 0.5.0
|
||||
jsdom:
|
||||
specifier: ^25.0.1
|
||||
version: 25.0.1
|
||||
rss-parser:
|
||||
specifier: ^3.13.0
|
||||
version: 3.13.0
|
||||
devDependencies:
|
||||
'@types/jsdom':
|
||||
specifier: ^21.1.7
|
||||
version: 21.1.7
|
||||
'@types/node':
|
||||
specifier: ^24.10.1
|
||||
version: 24.12.2
|
||||
typescript:
|
||||
specifier: ^5.9.3
|
||||
version: 5.9.3
|
||||
|
||||
packages/shared-storage:
|
||||
dependencies:
|
||||
'@aws-sdk/client-s3':
|
||||
|
|
@ -3665,28 +3681,19 @@ importers:
|
|||
|
||||
services/news-ingester:
|
||||
dependencies:
|
||||
'@mozilla/readability':
|
||||
specifier: ^0.5.0
|
||||
version: 0.5.0
|
||||
'@mana/shared-rss':
|
||||
specifier: workspace:*
|
||||
version: link:../../packages/shared-rss
|
||||
drizzle-orm:
|
||||
specifier: ^0.38.3
|
||||
version: 0.38.4(@opentelemetry/api@1.9.1)(@types/pg@8.6.1)(@types/react@19.2.14)(bun-types@1.3.12)(kysely@0.28.15)(postgres@3.4.9)(react@19.2.0)
|
||||
hono:
|
||||
specifier: ^4.7.0
|
||||
version: 4.12.12
|
||||
jsdom:
|
||||
specifier: ^25.0.1
|
||||
version: 25.0.1
|
||||
postgres:
|
||||
specifier: ^3.4.5
|
||||
version: 3.4.9
|
||||
rss-parser:
|
||||
specifier: ^3.13.0
|
||||
version: 3.13.0
|
||||
devDependencies:
|
||||
'@types/jsdom':
|
||||
specifier: ^21.1.7
|
||||
version: 21.1.7
|
||||
drizzle-kit:
|
||||
specifier: ^0.30.4
|
||||
version: 0.30.6
|
||||
|
|
|
|||
|
|
@ -11,15 +11,12 @@
|
|||
"db:studio": "drizzle-kit studio"
|
||||
},
|
||||
"dependencies": {
|
||||
"@mozilla/readability": "^0.5.0",
|
||||
"@mana/shared-rss": "workspace:*",
|
||||
"drizzle-orm": "^0.38.3",
|
||||
"hono": "^4.7.0",
|
||||
"jsdom": "^25.0.1",
|
||||
"postgres": "^3.4.5",
|
||||
"rss-parser": "^3.13.0"
|
||||
"postgres": "^3.4.5"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/jsdom": "^21.1.7",
|
||||
"drizzle-kit": "^0.30.4",
|
||||
"typescript": "^5.9.3"
|
||||
}
|
||||
|
|
|
|||
|
|
@ -19,9 +19,8 @@ import { sql } from 'drizzle-orm';
|
|||
import type { Database } from './db/connection';
|
||||
import { curatedArticles, type NewCuratedArticle } from './db/schema';
|
||||
import { SOURCES, type NewsSource } from './sources';
|
||||
import { fetchFeed, type NormalizedFeedItem } from './parsers/rss';
|
||||
import { parseFeedUrl, extractFromUrl, type NormalizedFeedItem } from '@mana/shared-rss';
|
||||
import { fetchHackerNews } from './parsers/hn';
|
||||
import { fetchAndExtract } from './parsers/readability';
|
||||
|
||||
const RETENTION_DAYS = 30;
|
||||
|
||||
|
|
@ -57,7 +56,7 @@ function readingMinutes(words: number): number {
|
|||
|
||||
async function fetchSourceItems(source: NewsSource): Promise<NormalizedFeedItem[]> {
|
||||
if (source.type === 'hn') return fetchHackerNews(source.url);
|
||||
return fetchFeed(source.url);
|
||||
return parseFeedUrl(source.url);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -78,7 +77,7 @@ async function buildRow(
|
|||
|
||||
const initialWords = wordCountOf(content);
|
||||
if (FULL_TEXT_THRESHOLD_WORDS > 0 && initialWords < FULL_TEXT_THRESHOLD_WORDS) {
|
||||
const extracted = await fetchAndExtract(item.url);
|
||||
const extracted = await extractFromUrl(item.url);
|
||||
if (extracted) {
|
||||
content = extracted.content;
|
||||
htmlContent = extracted.htmlContent || htmlContent;
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@
|
|||
* struggles with and which isn't the user's expectation for a news feed.
|
||||
*/
|
||||
|
||||
import type { NormalizedFeedItem } from './rss';
|
||||
import type { NormalizedFeedItem } from '@mana/shared-rss';
|
||||
|
||||
interface HnItem {
|
||||
id: number;
|
||||
|
|
|
|||
|
|
@ -1,69 +0,0 @@
|
|||
/**
|
||||
* Mozilla Readability fallback. Used when an RSS item only ships an
|
||||
* excerpt, so we fetch the original page and extract the article body.
|
||||
*
|
||||
* Kept dependency-local to the ingester so this service is the canonical
|
||||
* "content acquisition" boundary — apps/api never has to call out to a
|
||||
* crawler.
|
||||
*/
|
||||
|
||||
import { Readability } from '@mozilla/readability';
|
||||
import { JSDOM, VirtualConsole } from 'jsdom';
|
||||
|
||||
// JSDOM emits CSS parse errors and resource-loading warnings via its
|
||||
// virtualConsole. The default console rethrows some of those as
|
||||
// uncaughtException, which in a long-running ingester loop kills the
|
||||
// whole process. A bare VirtualConsole with no listeners swallows
|
||||
// everything quietly — exactly what we want for a "best-effort article
|
||||
// extractor" that doesn't care about CSS or sub-resources.
|
||||
const silentConsole = new VirtualConsole();
|
||||
|
||||
export interface ExtractedArticle {
|
||||
title: string | null;
|
||||
content: string;
|
||||
htmlContent: string;
|
||||
excerpt: string;
|
||||
byline: string | null;
|
||||
siteName: string | null;
|
||||
wordCount: number;
|
||||
readingTimeMinutes: number;
|
||||
}
|
||||
|
||||
const USER_AGENT = 'Mozilla/5.0 (compatible; ManaNewsIngester/1.0; +https://mana.how/news)';
|
||||
|
||||
export async function fetchAndExtract(url: string): Promise<ExtractedArticle | null> {
|
||||
let html: string;
|
||||
try {
|
||||
const response = await fetch(url, {
|
||||
headers: { 'User-Agent': USER_AGENT },
|
||||
signal: AbortSignal.timeout(15_000),
|
||||
});
|
||||
if (!response.ok) return null;
|
||||
html = await response.text();
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
|
||||
try {
|
||||
const dom = new JSDOM(html, { url, virtualConsole: silentConsole });
|
||||
const reader = new Readability(dom.window.document);
|
||||
const article = reader.parse();
|
||||
if (!article || !article.textContent) return null;
|
||||
|
||||
const wordCount = article.textContent.split(/\s+/).filter(Boolean).length;
|
||||
const readingTimeMinutes = Math.max(1, Math.ceil(wordCount / 200));
|
||||
|
||||
return {
|
||||
title: article.title ?? null,
|
||||
content: article.textContent,
|
||||
htmlContent: article.content ?? '',
|
||||
excerpt: article.excerpt || article.textContent.slice(0, 240),
|
||||
byline: article.byline ?? null,
|
||||
siteName: article.siteName ?? null,
|
||||
wordCount,
|
||||
readingTimeMinutes,
|
||||
};
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,72 +0,0 @@
|
|||
/**
|
||||
* RSS/Atom parser wrapper. Returns a normalized shape so the ingest loop
|
||||
* doesn't have to know about feed-format quirks.
|
||||
*
|
||||
* `rss-parser` handles both RSS 2.0 and Atom transparently. We pull out
|
||||
* the bits we need (link, title, content/snippet, image, date) and let
|
||||
* the ingester decide whether to call Readability for full-text.
|
||||
*/
|
||||
|
||||
import Parser from 'rss-parser';
|
||||
|
||||
export interface NormalizedFeedItem {
|
||||
url: string;
|
||||
title: string;
|
||||
excerpt: string | null;
|
||||
content: string | null;
|
||||
htmlContent: string | null;
|
||||
author: string | null;
|
||||
imageUrl: string | null;
|
||||
publishedAt: Date | null;
|
||||
}
|
||||
|
||||
type CustomItem = {
|
||||
'media:content'?: { $: { url: string } };
|
||||
'media:thumbnail'?: { $: { url: string } };
|
||||
enclosure?: { url?: string };
|
||||
};
|
||||
|
||||
const parser: Parser<unknown, CustomItem> = new Parser({
|
||||
timeout: 15_000,
|
||||
headers: {
|
||||
'User-Agent': 'Mozilla/5.0 (compatible; ManaNewsIngester/1.0; +https://mana.how/news)',
|
||||
},
|
||||
customFields: {
|
||||
item: ['media:content', 'media:thumbnail', 'enclosure'],
|
||||
},
|
||||
});
|
||||
|
||||
export async function fetchFeed(url: string): Promise<NormalizedFeedItem[]> {
|
||||
const feed = await parser.parseURL(url);
|
||||
|
||||
return (feed.items ?? []).map((item) => {
|
||||
// rss-parser stuffs full HTML in `content` for Atom, plain in `contentSnippet`.
|
||||
const html = (item as { content?: string }).content ?? null;
|
||||
const text = (item as { contentSnippet?: string }).contentSnippet ?? null;
|
||||
|
||||
// Image: try a few common locations.
|
||||
const mediaContent = item['media:content']?.$?.url;
|
||||
const mediaThumb = item['media:thumbnail']?.$?.url;
|
||||
const enclosureUrl = item.enclosure?.url;
|
||||
const imageUrl = mediaContent ?? mediaThumb ?? enclosureUrl ?? null;
|
||||
|
||||
const link = (item as { link?: string }).link ?? '';
|
||||
const title = (item as { title?: string }).title ?? '';
|
||||
const author =
|
||||
(item as { creator?: string; author?: string }).creator ??
|
||||
(item as { author?: string }).author ??
|
||||
null;
|
||||
const isoDate = (item as { isoDate?: string }).isoDate ?? null;
|
||||
|
||||
return {
|
||||
url: link,
|
||||
title,
|
||||
excerpt: text,
|
||||
content: text,
|
||||
htmlContent: html,
|
||||
author,
|
||||
imageUrl,
|
||||
publishedAt: isoDate ? new Date(isoDate) : null,
|
||||
};
|
||||
});
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue