// ─── Feed (reads from news.curated_articles) ───────────────
//
// Query params:
//   topics — comma-separated topic slugs (tech,wissenschaft,…). If
//            omitted, all topics are returned.
//   lang   — 'de' | 'en' | 'all' (default 'all'; any other value is
//            treated like 'all')
//   since  — ISO timestamp; only articles published after this.
//            Unparseable values are rejected with HTTP 400.
//   limit  — default 50, clamped to 0..200
//   offset — default 0, negative values are treated as 0
//
// Returns the full article body so the client can render the reader
// without a second round-trip.

const FEED_DEFAULT_LIMIT = 50;
const FEED_MAX_LIMIT = 200;

/**
 * Parses an integer query parameter and clamps it into `[min, max]`.
 *
 * @param raw      Raw query-string value (may be absent).
 * @param fallback Value used when `raw` is missing or not numeric.
 * @returns A finite integer — NaN never reaches the SQL LIMIT/OFFSET.
 */
function parseClampedInt(
	raw: string | undefined,
	fallback: number,
	min: number,
	max: number
): number {
	const parsed = Number.parseInt(raw ?? '', 10);
	if (Number.isNaN(parsed)) return fallback;
	return Math.min(Math.max(parsed, min), max);
}

routes.get('/feed', async (c) => {
	const topicsParam = c.req.query('topics');
	const lang = c.req.query('lang') ?? 'all';
	const since = c.req.query('since');
	const limit = parseClampedInt(c.req.query('limit'), FEED_DEFAULT_LIMIT, 0, FEED_MAX_LIMIT);
	const offset = parseClampedInt(c.req.query('offset'), 0, 0, Number.MAX_SAFE_INTEGER);

	const conditions: ReturnType<typeof sql>[] = [];

	if (topicsParam) {
		const topics = topicsParam
			.split(',')
			.map((t) => t.trim())
			.filter(Boolean);
		if (topics.length > 0) {
			// Bind every slug as its own parameter. Interpolating the JS array
			// directly (`ANY(${topics})`) does not bind a Postgres array:
			// drizzle's `sql` template expands arrays into `($1, $2, …)`.
			conditions.push(
				sql`topic IN (${sql.join(
					topics.map((t) => sql`${t}`),
					sql`, `
				)})`
			);
		}
	}
	if (lang === 'de' || lang === 'en') {
		conditions.push(sql`language = ${lang}`);
	}
	if (since) {
		// Validate up front so a malformed timestamp is a client error (400)
		// instead of a Postgres parse failure surfacing as a 500.
		const sinceDate = new Date(since);
		if (Number.isNaN(sinceDate.getTime())) {
			return c.json({ error: 'Invalid `since` timestamp' }, 400);
		}
		conditions.push(sql`published_at > ${sinceDate.toISOString()}`);
	}

	const whereClause =
		conditions.length > 0 ? sql`WHERE ${sql.join(conditions, sql` AND `)}` : sql``;

	const result = await db.execute(sql`
		SELECT
			id,
			original_url AS "originalUrl",
			title,
			excerpt,
			content,
			html_content AS "htmlContent",
			author,
			site_name AS "siteName",
			source_slug AS "sourceSlug",
			image_url AS "imageUrl",
			topic,
			language,
			word_count AS "wordCount",
			reading_time_minutes AS "readingTimeMinutes",
			published_at AS "publishedAt",
			ingested_at AS "ingestedAt"
		FROM news.curated_articles
		${whereClause}
		ORDER BY published_at DESC NULLS LAST, ingested_at DESC
		LIMIT ${limit} OFFSET ${offset}
	`);

	return c.json(result as unknown as Record<string, unknown>[]);
});
return c.json({ error: 'Article not found' }, 404); - return c.json(rows[0]); -}); - -// ─── Extract (content extraction) ─────────────────────────── +// ─── Extract (content extraction for user-pasted URLs) ───── routes.post('/extract/preview', async (c) => { const { url } = await c.req.json<{ url: string }>(); @@ -162,7 +169,7 @@ routes.post('/extract/save', async (c) => { try { const extracted = await extractFromUrl(url); - // Return extracted data -- client saves to local-first store + // Return extracted data — client saves to local-first store. return c.json({ id: crypto.randomUUID(), type: 'saved', diff --git a/scripts/mac-mini/push-schemas.sh b/scripts/mac-mini/push-schemas.sh index d91075c4b..0ba404d70 100755 --- a/scripts/mac-mini/push-schemas.sh +++ b/scripts/mac-mini/push-schemas.sh @@ -34,6 +34,7 @@ push_schema "mana-credits" "services/mana-credits" push_schema "mana-user" "services/mana-user" push_schema "mana-subscriptions" "services/mana-subscriptions" push_schema "mana-analytics" "services/mana-analytics" +push_schema "news-ingester" "services/news-ingester" echo "" echo "Done. mana-sync creates its schema automatically on startup." 
diff --git a/scripts/setup-databases.sh b/scripts/setup-databases.sh index 159961da5..78a780e8f 100755 --- a/scripts/setup-databases.sh +++ b/scripts/setup-databases.sh @@ -77,6 +77,7 @@ PLATFORM_SCHEMAS=( "uload" "cards" "events" + "news" ) # Check if specific service requested @@ -122,9 +123,12 @@ setup_service() { events|mana-events) push_schema "@mana/events" "mana-events" ;; + news|news-ingester) + push_schema "@mana/news-ingester" "news-ingester" + ;; *) echo -e "${RED}Unknown service: $service${NC}" - echo "Available services: auth, credits, user, subscriptions, analytics, media, todo, traces, presi, uload, cards, events" + echo "Available services: auth, credits, user, subscriptions, analytics, media, todo, traces, presi, uload, cards, events, news" exit 1 ;; esac @@ -154,7 +158,7 @@ done echo -e "\n${GREEN}Step 3: Pushing schemas${NC}" echo "--------------------------------------" -for service in auth credits user subscriptions analytics media todo traces presi uload cards events; do +for service in auth credits user subscriptions analytics media todo traces presi uload cards events news; do setup_service "$service" 2>/dev/null || true done diff --git a/services/news-ingester/CLAUDE.md b/services/news-ingester/CLAUDE.md new file mode 100644 index 000000000..5fa8def5d --- /dev/null +++ b/services/news-ingester/CLAUDE.md @@ -0,0 +1,100 @@ +# news-ingester + +Pulls public RSS/JSON feeds into `news.curated_articles` for the News Hub +module in the unified Mana app. The unified `mana-api` reads from the +same table to serve `GET /api/v1/news/feed`. 
+ +## Tech Stack + +| Layer | Technology | +|-------|------------| +| Runtime | Bun | +| Framework | Hono (only for health/status/manual trigger) | +| Database | PostgreSQL + Drizzle ORM (schema `news` in `mana_platform`) | +| Parsing | `rss-parser` for RSS/Atom, `@mozilla/readability` + `jsdom` for full-text fallback | + +## Port: 3066 + +## What it does + +On startup and every `TICK_INTERVAL_MS` (default 15 min): + +1. For each source in `src/sources.ts`, fetch the feed (RSS or HN JSON). +2. Normalize items and dedupe by `sha256(originalUrl)` against the + `url_hash` unique index — re-runs are safe. +3. If the feed body has fewer than 200 words, fall back to Mozilla + Readability against the original URL to get the full article text. +4. Insert into `news.curated_articles` with topic + source slug from the + source definition. Topic classification is **static** (per-source); + we do not run any content classifier. +5. Prune rows older than 30 days at the end of each tick. + +## API + +| Method | Path | Description | +|--------|------|-------------| +| GET | `/health` | Healthcheck — returns 503 if Postgres unreachable | +| GET | `/status` | Last tick result (sources, counts, duration) | +| POST | `/ingest/run` | Trigger an ingest tick now (returns immediately) | + +No auth — service is internal-only behind the docker network. + +## Adding a source + +1. Append to `SOURCES` in `src/sources.ts` with a stable `slug`, type + (`rss` or `hn`), URL, topic, and language. +2. Mirror the slug + name into the unified web app's onboarding picker + at `apps/mana/apps/web/src/lib/modules/news/sources-meta.ts` so users + can opt out of it. **Slugs must match** — user blocklists reference + them. +3. Restart container and `curl -X POST http://localhost:3066/ingest/run` + to populate immediately. + +## Topics + +The seven shipped topics are: `tech`, `wissenschaft`, `weltgeschehen`, +`wirtschaft`, `kultur`, `gesundheit`, `politik`. 
Adding a new topic +means updating the `Topic` union in `src/sources.ts` AND the matching +type in the unified web app's `news/types.ts`. + +## Database + +Schema: `news` in `mana_platform`. Single table `curated_articles`, +indexed on `(topic, published_at)`, `(language, published_at)`, +`source_slug`, and `ingested_at`. + +`bun run db:push` pushes the schema. The schema is intentionally NOT +referenced from `apps/api` — `apps/api/src/modules/news/routes.ts` +queries the table via raw SQL to keep the API service free of a Drizzle +schema dependency on this service. + +## Environment Variables + +```env +PORT=3066 +DATABASE_URL=postgresql://mana:devpassword@localhost:5432/mana_platform +TICK_INTERVAL_MS=900000 # 15 minutes +RUN_ON_STARTUP=true +``` + +## Local Dev + +```bash +cd services/news-ingester +bun install +bun run db:push # creates news.curated_articles +bun run dev # starts on :3066, first tick ~5 s after start +curl -X POST http://localhost:3066/ingest/run +curl http://localhost:3066/status | jq +``` + +## Privacy / Legal + +Only public RSS feeds intended for syndication are ingested. The +`User-Agent` is `Mozilla/5.0 (compatible; ManaNewsIngester/1.0; +https://mana.how/news)` so site +owners can identify and contact us. Per-source rate limit is implicit +(15 min interval × ~30 items/source = ~2 req/min/source). + +User reading behavior is **not** tracked here. Personalization happens +client-side in the unified Mana app's local IndexedDB; the ingester +only knows what was published, not what was read. 
diff --git a/services/news-ingester/Dockerfile b/services/news-ingester/Dockerfile new file mode 100644 index 000000000..6f4b20444 --- /dev/null +++ b/services/news-ingester/Dockerfile @@ -0,0 +1,18 @@ +FROM oven/bun:1 AS production + +WORKDIR /app + +# Copy package files and install +COPY package.json bun.lock* ./ +RUN bun install --frozen-lockfile 2>/dev/null || bun install + +# Copy source +COPY src ./src +COPY tsconfig.json drizzle.config.ts ./ + +EXPOSE 3066 + +HEALTHCHECK --interval=60s --timeout=10s --start-period=30s --retries=3 \ + CMD bun -e "fetch('http://localhost:3066/health').then(r => process.exit(r.ok ? 0 : 1)).catch(() => process.exit(1))" + +CMD ["bun", "run", "src/index.ts"] diff --git a/services/news-ingester/drizzle.config.ts b/services/news-ingester/drizzle.config.ts new file mode 100644 index 000000000..da555f9e6 --- /dev/null +++ b/services/news-ingester/drizzle.config.ts @@ -0,0 +1,11 @@ +import { defineConfig } from 'drizzle-kit'; + +export default defineConfig({ + schema: './src/db/schema.ts', + out: './drizzle', + dialect: 'postgresql', + dbCredentials: { + url: process.env.DATABASE_URL || 'postgresql://mana:devpassword@localhost:5432/mana_platform', + }, + schemaFilter: ['news'], +}); diff --git a/services/news-ingester/package.json b/services/news-ingester/package.json new file mode 100644 index 000000000..14ed5ade7 --- /dev/null +++ b/services/news-ingester/package.json @@ -0,0 +1,27 @@ +{ + "name": "@mana/news-ingester", + "version": "0.1.0", + "private": true, + "type": "module", + "scripts": { + "dev": "bun run --watch src/index.ts", + "start": "bun run src/index.ts", + "db:push": "drizzle-kit push", + "db:generate": "drizzle-kit generate", + "db:studio": "drizzle-kit studio" + }, + "dependencies": { + "@mana/shared-hono": "workspace:*", + "@mozilla/readability": "^0.5.0", + "drizzle-orm": "^0.38.3", + "hono": "^4.7.0", + "jsdom": "^25.0.1", + "postgres": "^3.4.5", + "rss-parser": "^3.13.0" + }, + "devDependencies": { + 
/**
 * Environment-driven config. Defaults match the local dev setup.
 */
export interface Config {
	/** TCP port the HTTP server listens on. */
	port: number;
	/** Postgres connection string. */
	databaseUrl: string;
	/** Delay between ingest ticks, in milliseconds. */
	tickIntervalMs: number;
	/** Whether an ingest tick is scheduled right after boot. */
	runOnStartup: boolean;
}

/**
 * Reads an integer from an environment value.
 *
 * @returns the parsed integer, or `fallback` when the value is missing,
 *          not numeric, or outside `[min, max]`.
 */
function intFromEnv(raw: string | undefined, fallback: number, min: number, max: number): number {
	const parsed = Number.parseInt(raw ?? '', 10);
	if (Number.isNaN(parsed) || parsed < min || parsed > max) return fallback;
	return parsed;
}

/**
 * Builds the service config from `process.env`.
 *
 * Numeric settings are validated: an unusable `TICK_INTERVAL_MS` (NaN, < 1,
 * or above the 32-bit timer limit) would otherwise make `setInterval` fall
 * back to a 1 ms delay and hammer every source, so it reverts to 15 minutes.
 */
export function loadConfig(): Config {
	return {
		port: intFromEnv(process.env.PORT, 3066, 0, 65_535),
		databaseUrl:
			process.env.DATABASE_URL || 'postgresql://mana:devpassword@localhost:5432/mana_platform',
		tickIntervalMs: intFromEnv(process.env.TICK_INTERVAL_MS, 900_000, 1, 2_147_483_647), // 15 min
		// Anything other than the literal string 'false' keeps startup ingest on.
		runOnStartup: (process.env.RUN_ON_STARTUP || 'true') !== 'false',
	};
}
+ * + * This is the *shared* article pool that the ingester writes into and the + * unified mana-api reads from for the News Hub feed. It is intentionally + * not user-scoped: the same article row is visible to every user. Per-user + * personalization (interests, blocklist, reactions) lives client-side in + * the unified Mana app's IndexedDB, not here. + * + * Articles older than 30 days are pruned by the ingester. Saving an + * article into a user's reading list copies the row into their local + * encrypted `newsArticles` table — the pool is fire-and-forget. + */ + +import { pgSchema, uuid, integer, text, timestamp, index } from 'drizzle-orm/pg-core'; + +export const newsSchema = pgSchema('news'); + +/** + * Pool of curated articles ingested from public RSS/JSON feeds. + * + * `urlHash` (sha256 of originalUrl) is the dedupe key — if the same URL + * shows up in two feeds, only the first wins. `topic` is assigned by the + * ingester from a static source→topic mapping; we do not classify content. 
+ */ +export const curatedArticles = newsSchema.table( + 'curated_articles', + { + id: uuid('id').primaryKey().defaultRandom(), + urlHash: text('url_hash').notNull().unique(), + originalUrl: text('original_url').notNull(), + title: text('title').notNull(), + excerpt: text('excerpt'), + content: text('content'), + htmlContent: text('html_content'), + author: text('author'), + siteName: text('site_name').notNull(), + sourceSlug: text('source_slug').notNull(), + imageUrl: text('image_url'), + topic: text('topic').notNull(), + language: text('language').notNull(), + wordCount: integer('word_count'), + readingTimeMinutes: integer('reading_time_minutes'), + publishedAt: timestamp('published_at', { withTimezone: true }), + ingestedAt: timestamp('ingested_at', { withTimezone: true }).defaultNow().notNull(), + }, + (table) => ({ + topicPublishedIdx: index('curated_topic_published_idx').on(table.topic, table.publishedAt), + langPublishedIdx: index('curated_lang_published_idx').on(table.language, table.publishedAt), + sourceIdx: index('curated_source_idx').on(table.sourceSlug), + ingestedAtIdx: index('curated_ingested_at_idx').on(table.ingestedAt), + }) +); + +export type CuratedArticle = typeof curatedArticles.$inferSelect; +export type NewCuratedArticle = typeof curatedArticles.$inferInsert; diff --git a/services/news-ingester/src/index.ts b/services/news-ingester/src/index.ts new file mode 100644 index 000000000..48954581c --- /dev/null +++ b/services/news-ingester/src/index.ts @@ -0,0 +1,84 @@ +/** + * news-ingester — pulls public RSS / JSON feeds into news.curated_articles + * on a fixed interval. Exposes a tiny Hono server for health + manual + * trigger so the container can be probed and re-kicked without a restart. 
+ * + * Why a long-running container instead of a host cron: + * - logs land in the same docker stack as everything else + * - restarts on crash via docker + * - health endpoint for the docker-compose healthcheck + * - lets us hit /ingest/run from a shell to debug new sources without + * waiting 15 minutes + */ + +import { Hono } from 'hono'; +import { sql } from 'drizzle-orm'; +import { loadConfig } from './config'; +import { getDb } from './db/connection'; +import { runIngestTick, type TickResult } from './ingest'; + +const config = loadConfig(); +const db = getDb(config.databaseUrl); + +let lastTick: TickResult | null = null; +let running = false; + +async function tick() { + if (running) { + console.log('[news-ingester] previous tick still running, skipping'); + return; + } + running = true; + try { + lastTick = await runIngestTick(db); + } catch (err) { + console.error('[news-ingester] tick failed:', err); + } finally { + running = false; + } +} + +// ─── Hono app ────────────────────────────────────────────── + +const app = new Hono(); + +app.get('/health', async (c) => { + try { + // Cheap connectivity check — don't claim healthy if Postgres is down. + await db.execute(sql`SELECT 1`); + } catch { + return c.json({ status: 'degraded', service: 'news-ingester' }, 503); + } + return c.json({ + status: 'ok', + service: 'news-ingester', + lastTickStartedAt: lastTick?.startedAt ?? null, + lastTickInserted: lastTick?.totalInserted ?? null, + running, + }); +}); + +app.get('/status', (c) => c.json(lastTick ?? { message: 'no tick yet' })); + +app.post('/ingest/run', async (c) => { + if (running) return c.json({ status: 'busy' }, 409); + // Fire-and-forget; client polls /status. 
+ void tick(); + return c.json({ status: 'started' }); +}); + +// ─── Bootstrap ───────────────────────────────────────────── + +console.log( + `[news-ingester] starting on port ${config.port}, tick every ${config.tickIntervalMs}ms` +); + +if (config.runOnStartup) { + // Defer one tick so the HTTP server is up first (healthchecks pass + // while we ingest). + setTimeout(() => void tick(), 5_000); +} + +setInterval(() => void tick(), config.tickIntervalMs); + +export default { port: config.port, fetch: app.fetch }; diff --git a/services/news-ingester/src/ingest.ts b/services/news-ingester/src/ingest.ts new file mode 100644 index 000000000..0245abd4d --- /dev/null +++ b/services/news-ingester/src/ingest.ts @@ -0,0 +1,179 @@ +/** + * Ingest loop — for each source, fetch the feed, normalize, dedupe by + * url-hash, optionally fall back to Readability for full text, and + * insert into `news.curated_articles`. + * + * Designed to be safe under repeated runs: + * - duplicate urls are caught by the unique index on `url_hash` and + * silently skipped via `ON CONFLICT DO NOTHING`. + * - one bad source must not poison the whole tick: every source is + * wrapped in its own try/catch. + * + * Retention: anything older than RETENTION_DAYS is pruned at the end of + * each tick. Saved articles already live in users' encrypted IndexedDB + * by then, so the pool is purely a discovery surface. + */ + +import { createHash } from 'node:crypto'; +import { sql } from 'drizzle-orm'; +import type { Database } from './db/connection'; +import { curatedArticles, type NewCuratedArticle } from './db/schema'; +import { SOURCES, type NewsSource } from './sources'; +import { fetchFeed, type NormalizedFeedItem } from './parsers/rss'; +import { fetchHackerNews } from './parsers/hn'; +import { fetchAndExtract } from './parsers/readability'; + +const RETENTION_DAYS = 30; + +/** Min word count to consider an RSS body "full enough" to skip Readability. 
// Min word count to consider an RSS body "full enough" to skip Readability.
const FULL_TEXT_THRESHOLD_WORDS = 200;

/** Dedupe key: hex sha256 of the article URL exactly as given (no normalization). */
function hashUrl(url: string): string {
	return createHash('sha256').update(url).digest('hex');
}

/** Counts whitespace-separated tokens; null/undefined/empty input counts as 0. */
function wordCountOf(text: string | null | undefined): number {
	if (!text) return 0;
	return text.split(/\s+/).filter(Boolean).length;
}

/** Reading-time estimate at 200 words per minute, never less than 1 minute. */
function readingMinutes(words: number): number {
	return Math.max(1, Math.ceil(words / 200));
}

/** Dispatches to the parser matching the source's declared type. */
async function fetchSourceItems(source: NewsSource): Promise<NormalizedFeedItem[]> {
	if (source.type === 'hn') return fetchHackerNews(source.url);
	return fetchFeed(source.url);
}

/**
 * Convert a normalized feed item into a `NewCuratedArticle` row,
 * optionally enriching with Readability if the feed body is too thin.
 *
 * @returns the row to insert, or `null` when the item has no URL/title or
 *          no usable text even after the Readability fallback.
 */
async function buildRow(
	item: NormalizedFeedItem,
	source: NewsSource
): Promise<NewCuratedArticle | null> {
	if (!item.url || !item.title) return null;

	let content = item.content;
	let htmlContent = item.htmlContent;
	let excerpt = item.excerpt;
	let author = item.author;

	if (wordCountOf(content) < FULL_TEXT_THRESHOLD_WORDS) {
		const extracted = await fetchAndExtract(item.url);
		if (extracted) {
			content = extracted.content;
			htmlContent = extracted.htmlContent || htmlContent;
			// Feed-provided metadata wins; Readability only fills gaps.
			excerpt = excerpt || extracted.excerpt;
			author = author || extracted.byline;
		}
	}

	const words = wordCountOf(content);
	if (words === 0) return null; // nothing usable, skip

	// A feed date that failed to parse is an Invalid Date object, which `??`
	// would let through to the timestamp column. Treat it like a missing date.
	const publishedAt =
		item.publishedAt && !Number.isNaN(item.publishedAt.getTime()) ? item.publishedAt : new Date();

	return {
		urlHash: hashUrl(item.url),
		originalUrl: item.url,
		title: item.title,
		excerpt: excerpt ?? null,
		content,
		htmlContent: htmlContent ?? null,
		author: author ?? null,
		siteName: source.name,
		sourceSlug: source.slug,
		// Only the feed supplies an image; the Readability result is not consulted.
		imageUrl: item.imageUrl ?? null,
		topic: source.topic,
		language: source.language,
		wordCount: words,
		readingTimeMinutes: readingMinutes(words),
		publishedAt,
	};
}

interface SourceResult {
	slug: string;
	fetched: number;
	inserted: number;
	error?: string;
}

/**
 * Returns the subset of `hashes` that already exist in
 * `news.curated_articles`, using a single query.
 */
async function findKnownHashes(db: Database, hashes: string[]): Promise<Set<string>> {
	if (hashes.length === 0) return new Set();
	const rows = await db.execute(
		sql`SELECT url_hash FROM news.curated_articles WHERE url_hash IN (${sql.join(
			hashes.map((h) => sql`${h}`),
			sql`, `
		)})`
	);
	return new Set((rows as unknown as { url_hash: string }[]).map((r) => r.url_hash));
}

/**
 * Ingests one source: fetch, skip already-stored URLs, build rows, insert.
 *
 * Never throws — fetch or lookup failures are reported via `result.error`,
 * per-item failures are logged and skipped so one bad item cannot poison
 * the rest of the source.
 */
async function ingestSource(db: Database, source: NewsSource): Promise<SourceResult> {
	const result: SourceResult = { slug: source.slug, fetched: 0, inserted: 0 };

	let items: NormalizedFeedItem[];
	try {
		items = await fetchSourceItems(source);
	} catch (err) {
		result.error = err instanceof Error ? err.message : String(err);
		return result;
	}
	result.fetched = items.length;

	// Dedupe BEFORE building rows: buildRow may fetch the article page for
	// the Readability fallback, and without this check every thin item
	// (e.g. all Hacker News links) would be re-crawled on every tick.
	const candidates = items.filter((item) => item.url && item.title);
	let known: Set<string>;
	try {
		known = await findKnownHashes(db, [...new Set(candidates.map((item) => hashUrl(item.url)))]);
	} catch (err) {
		result.error = err instanceof Error ? err.message : String(err);
		return result;
	}

	for (const item of candidates) {
		if (known.has(hashUrl(item.url))) continue;
		try {
			const row = await buildRow(item, source);
			if (!row) continue;
			// ON CONFLICT still guards against duplicates within one feed and
			// against rows inserted since the lookup above.
			const inserted = await db
				.insert(curatedArticles)
				.values(row)
				.onConflictDoNothing({ target: curatedArticles.urlHash })
				.returning({ id: curatedArticles.id });
			if (inserted.length > 0) result.inserted += 1;
		} catch (err) {
			console.warn(
				`[ingest] ${source.slug}: failed to insert "${item.title?.slice(0, 60) ?? '?'}":`,
				err instanceof Error ? err.message : err
			);
		}
	}

	return result;
}
/** Subset of the Hacker News item payload that this parser reads. */
interface HnItem {
	id: number;
	type?: string;
	by?: string;
	time?: number;
	title?: string;
	url?: string;
	text?: string;
}

/** An HN story that links to an external article. */
type HnLinkStory = HnItem & { url: string; title: string };

const TOP_LIMIT = 30;

// Same identification string the RSS and Readability fetchers send.
const HN_USER_AGENT = 'Mozilla/5.0 (compatible; ManaNewsIngester/1.0; +https://mana.how/news)';

/** Keeps only external-link stories; Ask-HN / job posts have no `url`. */
function isLinkStory(it: HnItem | null): it is HnLinkStory {
	return !!it && it.type === 'story' && !!it.url && !!it.title;
}

/** Fetches one item; any failure yields `null` so a single bad item is skipped. */
async function fetchHnItem(itemUrl: string): Promise<HnItem | null> {
	try {
		const r = await fetch(itemUrl, {
			headers: { 'User-Agent': HN_USER_AGENT },
			signal: AbortSignal.timeout(10_000),
		});
		if (!r.ok) return null;
		return (await r.json()) as HnItem | null;
	} catch {
		return null;
	}
}

/**
 * Fetches the first `TOP_LIMIT` top stories and returns the external-link
 * ones in normalized form (no body text — callers enrich via Readability).
 *
 * @param topStoriesUrl URL of the top-stories ID list. Item URLs are resolved
 *                      relative to it (`…/v0/topstories.json` → `…/v0/item/<id>.json`).
 * @throws Error when the ID list cannot be loaded or is not a JSON array, so
 *         the caller can record the failure instead of seeing "0 fetched".
 */
export async function fetchHackerNews(topStoriesUrl: string): Promise<NormalizedFeedItem[]> {
	const idsResp = await fetch(topStoriesUrl, {
		headers: { 'User-Agent': HN_USER_AGENT },
		signal: AbortSignal.timeout(15_000),
	});
	if (!idsResp.ok) {
		throw new Error(`HN top stories request failed: HTTP ${idsResp.status}`);
	}
	const payload: unknown = await idsResp.json();
	if (!Array.isArray(payload)) {
		throw new Error('HN top stories response is not a JSON array');
	}
	const ids = payload
		.filter((id): id is number => typeof id === 'number' && Number.isInteger(id))
		.slice(0, TOP_LIMIT);

	const items = await Promise.all(
		ids.map((id) => fetchHnItem(new URL(`item/${id}.json`, topStoriesUrl).toString()))
	);

	return items.filter(isLinkStory).map((it) => ({
		url: it.url,
		title: it.title,
		excerpt: null,
		content: null,
		htmlContent: null,
		author: it.by ?? null,
		imageUrl: null,
		// HN timestamps are Unix seconds.
		publishedAt: it.time ? new Date(it.time * 1000) : null,
	}));
}
/** Feed-format-independent item shape consumed by the ingest loop. */
export interface NormalizedFeedItem {
	url: string;
	title: string;
	excerpt: string | null;
	content: string | null;
	htmlContent: string | null;
	author: string | null;
	imageUrl: string | null;
	publishedAt: Date | null;
}

/** Extra per-item fields requested from rss-parser via `customFields`. */
type CustomItem = {
	'media:content'?: { $: { url: string } };
	'media:thumbnail'?: { $: { url: string } };
	enclosure?: { url?: string; type?: string };
};

/** The item fields this module reads, typed once instead of cast per access. */
type RawFeedItem = CustomItem & {
	link?: string;
	title?: string;
	content?: string;
	contentSnippet?: string;
	creator?: string;
	author?: string;
	isoDate?: string;
};

const parser = new Parser<Record<string, unknown>, CustomItem>({
	timeout: 15_000,
	headers: {
		'User-Agent': 'Mozilla/5.0 (compatible; ManaNewsIngester/1.0; +https://mana.how/news)',
	},
	customFields: {
		item: ['media:content', 'media:thumbnail', 'enclosure'],
	},
});

/** Parses a date string; returns `null` instead of an Invalid Date. */
function toValidDate(raw: string | null | undefined): Date | null {
	if (!raw) return null;
	const parsed = new Date(raw);
	return Number.isNaN(parsed.getTime()) ? null : parsed;
}

/** Picks an image URL from the common feed locations, or `null`. */
function pickImageUrl(item: RawFeedItem): string | null {
	const mediaContent = item['media:content']?.$?.url;
	const mediaThumb = item['media:thumbnail']?.$?.url;
	// Enclosures also carry audio/video (podcasts); only accept one that is
	// declared as an image or declares no type at all.
	const enclosure = item.enclosure;
	const enclosureUrl =
		enclosure?.url && (!enclosure.type || enclosure.type.startsWith('image/'))
			? enclosure.url
			: undefined;
	return mediaContent ?? mediaThumb ?? enclosureUrl ?? null;
}

/** Maps one parsed feed item onto the normalized shape. */
function normalizeItem(item: RawFeedItem): NormalizedFeedItem {
	// `content` carries the HTML body; `contentSnippet` is the plain-text
	// version, used both as body text and as excerpt.
	const text = item.contentSnippet ?? null;
	return {
		// Empty url/title are kept as '' — the ingest loop drops such items.
		url: item.link ?? '',
		title: item.title ?? '',
		excerpt: text,
		content: text,
		htmlContent: item.content ?? null,
		author: item.creator ?? item.author ?? null,
		imageUrl: pickImageUrl(item),
		publishedAt: toValidDate(item.isoDate),
	};
}

/**
 * Fetches and parses an RSS/Atom feed.
 *
 * @param url Feed URL.
 * @returns One normalized item per feed entry, in feed order.
 * @throws Whatever `parser.parseURL` rejects with (network error, timeout, bad XML).
 */
export async function fetchFeed(url: string): Promise<NormalizedFeedItem[]> {
	const feed = await parser.parseURL(url);
	return (feed.items ?? []).map((item) => normalizeItem(item as RawFeedItem));
}
+ */
+
+export type SourceParserType = 'rss' | 'hn'; // parser type a source declares
+
+export type Topic =
+	| 'tech'
+	| 'wissenschaft'
+	| 'weltgeschehen'
+	| 'wirtschaft'
+	| 'kultur'
+	| 'gesundheit'
+	| 'politik';
+
+export interface NewsSource {
+	slug: string; // stable identifier; user blocklists reference it, so it must not change
+	name: string; // display name of the source
+	type: SourceParserType; // which parser reads `url`
+	url: string; // feed or API endpoint to fetch
+	topic: Topic; // the topic this source belongs to
+	language: 'de' | 'en'; // language declared for the source
+}
+
+export const SOURCES: NewsSource[] = [
+	// ─── Tech ──────────────────────────────────────────────────
+	{
+		slug: 'hacker-news',
+		name: 'Hacker News',
+		type: 'hn',
+		url: 'https://hacker-news.firebaseio.com/v0/topstories.json',
+		topic: 'tech',
+		language: 'en',
+	},
+	{
+		slug: 'arstechnica',
+		name: 'Ars Technica',
+		type: 'rss',
+		url: 'https://feeds.arstechnica.com/arstechnica/index',
+		topic: 'tech',
+		language: 'en',
+	},
+	{
+		slug: 'theverge',
+		name: 'The Verge',
+		type: 'rss',
+		url: 'https://www.theverge.com/rss/index.xml',
+		topic: 'tech',
+		language: 'en',
+	},
+	{
+		slug: 'heise',
+		name: 'heise online',
+		type: 'rss',
+		url: 'https://www.heise.de/rss/heise-atom.xml',
+		topic: 'tech',
+		language: 'de',
+	},
+
+	// ─── Wissenschaft ──────────────────────────────────────────
+	{
+		slug: 'quanta-magazine',
+		name: 'Quanta Magazine',
+		type: 'rss',
+		url: 'https://api.quantamagazine.org/feed/',
+		topic: 'wissenschaft',
+		language: 'en',
+	},
+	{
+		slug: 'spektrum',
+		name: 'Spektrum',
+		type: 'rss',
+		url: 'https://www.spektrum.de/alias/rss/spektrum-de-rss-feed/996406',
+		topic: 'wissenschaft',
+		language: 'de',
+	},
+	{
+		slug: 'nature-news',
+		name: 'Nature News',
+		type: 'rss',
+		url: 'https://www.nature.com/nature.rss',
+		topic: 'wissenschaft',
+		language: 'en',
+	},
+	{
+		slug: 'phys-org',
+		name: 'Phys.org',
+		type: 'rss',
+		url: 'https://phys.org/rss-feed/',
+		topic: 'wissenschaft',
+		language: 'en',
+	},
+
+	// ─── Weltgeschehen ─────────────────────────────────────────
+	// Note: Reuters and AP both block automated feed fetchers as of
+	// 2026-04 (Reuters returns 406, AP refuses connection). Replaced
+	// with Al Jazeera and DW which both publish open RSS.
+	{
+		slug: 'tagesschau',
+		name: 'Tagesschau',
+		type: 'rss',
+		url: 'https://www.tagesschau.de/xml/rss2/',
+		topic: 'weltgeschehen',
+		language: 'de',
+	},
+	{
+		slug: 'bbc-world',
+		name: 'BBC World',
+		type: 'rss',
+		url: 'https://feeds.bbci.co.uk/news/world/rss.xml',
+		topic: 'weltgeschehen',
+		language: 'en',
+	},
+	{
+		slug: 'aljazeera',
+		name: 'Al Jazeera',
+		type: 'rss',
+		url: 'https://www.aljazeera.com/xml/rss/all.xml',
+		topic: 'weltgeschehen',
+		language: 'en',
+	},
+	{
+		slug: 'dw-top',
+		name: 'Deutsche Welle',
+		type: 'rss',
+		url: 'https://rss.dw.com/rdf/rss-en-top',
+		topic: 'weltgeschehen',
+		language: 'en',
+	},
+
+	// ─── Wirtschaft ────────────────────────────────────────────
+	{
+		slug: 'handelsblatt',
+		name: 'Handelsblatt',
+		type: 'rss',
+		url: 'https://www.handelsblatt.com/contentexport/feed/schlagzeilen',
+		topic: 'wirtschaft',
+		language: 'de',
+	},
+	{
+		slug: 'ft-world',
+		name: 'Financial Times',
+		type: 'rss',
+		url: 'https://www.ft.com/world?format=rss',
+		topic: 'wirtschaft',
+		language: 'en',
+	},
+	{
+		slug: 'bloomberg-markets',
+		name: 'Bloomberg Markets',
+		type: 'rss',
+		url: 'https://feeds.bloomberg.com/markets/news.rss',
+		topic: 'wirtschaft',
+		language: 'en',
+	},
+	{
+		slug: 'economist-finance',
+		name: 'The Economist — Finance',
+		type: 'rss',
+		url: 'https://www.economist.com/finance-and-economics/rss.xml',
+		topic: 'wirtschaft',
+		language: 'en',
+	},
+
+	// ─── Kultur ────────────────────────────────────────────────
+	// Perlentaucher and ZEIT Kultur both 404'd in testing (2026-04);
+	// replaced with NPR Arts and Guardian Books which are stable.
+	{
+		slug: 'guardian-culture',
+		name: 'The Guardian Culture',
+		type: 'rss',
+		url: 'https://www.theguardian.com/culture/rss',
+		topic: 'kultur',
+		language: 'en',
+	},
+	{
+		slug: 'guardian-books',
+		name: 'The Guardian Books',
+		type: 'rss',
+		url: 'https://www.theguardian.com/books/rss',
+		topic: 'kultur',
+		language: 'en',
+	},
+	{
+		slug: 'npr-arts',
+		name: 'NPR Arts',
+		type: 'rss',
+		url: 'https://feeds.npr.org/1008/rss.xml',
+		topic: 'kultur',
+		language: 'en',
+	},
+
+	// ─── Gesundheit ────────────────────────────────────────────
+	// Ärzteblatt and NIH both 404'd; STAT News still works. Added
+	// BBC Health and ScienceDaily as reliable replacements.
+	{
+		slug: 'stat-news',
+		name: 'STAT News',
+		type: 'rss',
+		url: 'https://www.statnews.com/feed/',
+		topic: 'gesundheit',
+		language: 'en',
+	},
+	{
+		slug: 'bbc-health',
+		name: 'BBC Health',
+		type: 'rss',
+		url: 'https://feeds.bbci.co.uk/news/health/rss.xml',
+		topic: 'gesundheit',
+		language: 'en',
+	},
+	{
+		slug: 'sciencedaily-health',
+		name: 'ScienceDaily Health',
+		type: 'rss',
+		url: 'https://www.sciencedaily.com/rss/health_medicine.xml',
+		topic: 'gesundheit',
+		language: 'en',
+	},
+
+	// ─── Politik ───────────────────────────────────────────────
+	{
+		slug: 'spiegel-politik',
+		name: 'Spiegel Politik',
+		type: 'rss',
+		url: 'https://www.spiegel.de/politik/index.rss',
+		topic: 'politik',
+		language: 'de',
+	},
+	{
+		slug: 'politico-eu',
+		name: 'Politico EU',
+		type: 'rss',
+		url: 'https://www.politico.eu/feed/',
+		topic: 'politik',
+		language: 'en',
+	},
+	{
+		slug: 'atlantic-politics',
+		name: 'The Atlantic — Politics',
+		type: 'rss',
+		url: 'https://www.theatlantic.com/feed/channel/politics/',
+		topic: 'politik',
+		language: 'en',
+	},
+];
+
+/** Build a quick lookup by slug. 
*/ +export const SOURCE_BY_SLUG: Record = Object.fromEntries( + SOURCES.map((s) => [s.slug, s]) +); diff --git a/services/news-ingester/tsconfig.json b/services/news-ingester/tsconfig.json new file mode 100644 index 000000000..354a2c2dd --- /dev/null +++ b/services/news-ingester/tsconfig.json @@ -0,0 +1,17 @@ +{ + "compilerOptions": { + "target": "ESNext", + "module": "ESNext", + "moduleResolution": "bundler", + "strict": true, + "esModuleInterop": true, + "skipLibCheck": true, + "outDir": "dist", + "rootDir": "src", + "declaration": true, + "paths": { + "@/*": ["./src/*"] + } + }, + "include": ["src/**/*.ts"] +}