managarten/services/news-ingester/src/sources.ts
Till JS 9ef97a1877 feat(news): backend ingester service + curated feed API
Adds the services/news-ingester Bun service that pulls 25 public RSS/JSON
feeds into news.curated_articles every 15 min, with Mozilla Readability
fallback for thin RSS bodies and 30-day retention. apps/api /feed is
rewritten to read from the new pool table directly instead of the
sync_changes hack, with topics/lang/since/limit/offset query params.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 15:53:26 +02:00

260 lines
6.6 KiB
TypeScript

/**
* Curated source list — single source of truth for the news ingester.
*
* Each source declares its parser type (`rss` or `hn`), the topic it
* belongs to, and the language. The `slug` MUST be stable across deploys
* because user blocklists reference it from client-side storage.
*
* Adding a source = append a row here, redeploy. The unified Mana app
* mirrors a sanitized subset (slug + name + topic + language) in
* `apps/mana/apps/web/src/lib/modules/news/sources-meta.ts` for the
* onboarding picker — keep both files in sync when editing.
*/
/**
 * Which parser a source is handled by.
 *
 * - `rss` — used by every feed entry in `SOURCES` except Hacker News.
 * - `hn`  — used by the Hacker News entry, whose URL points at the
 *   `topstories.json` endpoint instead of a feed document.
 */
export type SourceParserType = 'rss' | 'hn';
/**
 * Editorial topic a source is filed under. Every source belongs to exactly
 * one topic (see `NewsSource.topic`).
 *
 * The identifiers are German: wissenschaft = science, weltgeschehen =
 * world affairs, wirtschaft = business/economy, kultur = culture,
 * gesundheit = health, politik = politics.
 */
export type Topic =
| 'tech'
| 'wissenschaft'
| 'weltgeschehen'
| 'wirtschaft'
| 'kultur'
| 'gesundheit'
| 'politik';
/** One curated feed the ingester pulls from. */
export interface NewsSource {
/**
 * Stable identifier for the source. Must not change across deploys:
 * user blocklists reference it from client-side storage.
 */
slug: string;
/** Human-readable display name. */
name: string;
/** Parser used to read `url`. */
type: SourceParserType;
/** Absolute URL of the feed (or, for `hn`, the JSON endpoint). */
url: string;
/** The single topic this source is filed under. */
topic: Topic;
/** Language of the source's content. */
language: 'de' | 'en';
}
/**
 * The curated feed list, in display/ingest order.
 *
 * Rows are written as positional tuples to keep the table scannable:
 *
 *   [slug, name, parser type, url, topic, language]
 *
 * and are expanded into `NewsSource` objects below. To add a source,
 * append a row to the matching topic section.
 */
export const SOURCES: NewsSource[] = (
	[
		// ─── Tech ──────────────────────────────────────────────────
		['hacker-news', 'Hacker News', 'hn', 'https://hacker-news.firebaseio.com/v0/topstories.json', 'tech', 'en'],
		['arstechnica', 'Ars Technica', 'rss', 'https://feeds.arstechnica.com/arstechnica/index', 'tech', 'en'],
		['theverge', 'The Verge', 'rss', 'https://www.theverge.com/rss/index.xml', 'tech', 'en'],
		['heise', 'heise online', 'rss', 'https://www.heise.de/rss/heise-atom.xml', 'tech', 'de'],

		// ─── Wissenschaft ──────────────────────────────────────────
		['quanta-magazine', 'Quanta Magazine', 'rss', 'https://api.quantamagazine.org/feed/', 'wissenschaft', 'en'],
		['spektrum', 'Spektrum', 'rss', 'https://www.spektrum.de/alias/rss/spektrum-de-rss-feed/996406', 'wissenschaft', 'de'],
		['nature-news', 'Nature News', 'rss', 'https://www.nature.com/nature.rss', 'wissenschaft', 'en'],
		['phys-org', 'Phys.org', 'rss', 'https://phys.org/rss-feed/', 'wissenschaft', 'en'],

		// ─── Weltgeschehen ─────────────────────────────────────────
		// Reuters and AP are intentionally absent: as of 2026-04 both
		// block automated feed fetchers (Reuters answers 406, AP refuses
		// the connection). Al Jazeera and DW publish open RSS and take
		// their place.
		['tagesschau', 'Tagesschau', 'rss', 'https://www.tagesschau.de/xml/rss2/', 'weltgeschehen', 'de'],
		['bbc-world', 'BBC World', 'rss', 'https://feeds.bbci.co.uk/news/world/rss.xml', 'weltgeschehen', 'en'],
		['aljazeera', 'Al Jazeera', 'rss', 'https://www.aljazeera.com/xml/rss/all.xml', 'weltgeschehen', 'en'],
		['dw-top', 'Deutsche Welle', 'rss', 'https://rss.dw.com/rdf/rss-en-top', 'weltgeschehen', 'en'],

		// ─── Wirtschaft ────────────────────────────────────────────
		['handelsblatt', 'Handelsblatt', 'rss', 'https://www.handelsblatt.com/contentexport/feed/schlagzeilen', 'wirtschaft', 'de'],
		['ft-world', 'Financial Times', 'rss', 'https://www.ft.com/world?format=rss', 'wirtschaft', 'en'],
		['bloomberg-markets', 'Bloomberg Markets', 'rss', 'https://feeds.bloomberg.com/markets/news.rss', 'wirtschaft', 'en'],
		['economist-finance', 'The Economist — Finance', 'rss', 'https://www.economist.com/finance-and-economics/rss.xml', 'wirtschaft', 'en'],

		// ─── Kultur ────────────────────────────────────────────────
		// Perlentaucher and ZEIT Kultur returned 404 in testing
		// (2026-04); NPR Arts and Guardian Books are the stable
		// replacements.
		['guardian-culture', 'The Guardian Culture', 'rss', 'https://www.theguardian.com/culture/rss', 'kultur', 'en'],
		['guardian-books', 'The Guardian Books', 'rss', 'https://www.theguardian.com/books/rss', 'kultur', 'en'],
		['npr-arts', 'NPR Arts', 'rss', 'https://feeds.npr.org/1008/rss.xml', 'kultur', 'en'],

		// ─── Gesundheit ────────────────────────────────────────────
		// Ärzteblatt and NIH returned 404; STAT News still works. BBC
		// Health and ScienceDaily were added as reliable replacements.
		['stat-news', 'STAT News', 'rss', 'https://www.statnews.com/feed/', 'gesundheit', 'en'],
		['bbc-health', 'BBC Health', 'rss', 'https://feeds.bbci.co.uk/news/health/rss.xml', 'gesundheit', 'en'],
		['sciencedaily-health', 'ScienceDaily Health', 'rss', 'https://www.sciencedaily.com/rss/health_medicine.xml', 'gesundheit', 'en'],

		// ─── Politik ───────────────────────────────────────────────
		['spiegel-politik', 'Spiegel Politik', 'rss', 'https://www.spiegel.de/politik/index.rss', 'politik', 'de'],
		['politico-eu', 'Politico EU', 'rss', 'https://www.politico.eu/feed/', 'politik', 'en'],
		['atlantic-politics', 'The Atlantic — Politics', 'rss', 'https://www.theatlantic.com/feed/channel/politics/', 'politik', 'en'],
	] as const
).map(([slug, name, type, url, topic, language]) => ({
	// Property order matches the NewsSource declaration so serialized
	// output is unchanged.
	slug,
	name,
	type,
	url,
	topic,
	language,
}));
/**
 * Lookup table from slug to its source, built once at module load.
 *
 * Slugs are the stable identity of a source, so they must be unique.
 * `Object.fromEntries` keeps only the last entry for a repeated key, which
 * would silently drop a source from this table; a duplicate is therefore
 * reported as an error at startup instead.
 *
 * @throws Error at module load if two entries in `SOURCES` share a slug.
 */
export const SOURCE_BY_SLUG: Record<string, NewsSource> = (() => {
	const seenSlugs = new Set<string>();
	for (const source of SOURCES) {
		if (seenSlugs.has(source.slug)) {
			throw new Error(`Duplicate news source slug: ${source.slug}`);
		}
		seenSlugs.add(source.slug);
	}
	return Object.fromEntries(SOURCES.map((s) => [s.slug, s]));
})();