feat(news): backend ingester service + curated feed API

Adds the services/news-ingester Bun service, which pulls 25 public
RSS/JSON feeds into news.curated_articles every 15 minutes, with a
Mozilla Readability fallback for thin RSS bodies and 30-day retention.
The apps/api /feed endpoint is rewritten to read directly from the new
pool table instead of the sync_changes hack and gains
topics/lang/since/limit/offset query params.
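
For illustration only (not part of this commit): a minimal TypeScript sketch of
how a client might query the rewritten /feed endpoint. The base URL and the
/api/news mount path are assumptions, and the CuratedArticle interface covers
only a subset of the SELECT columns shown in the diff below.

// Hypothetical client-side call against the curated feed API.
// Field names mirror the SELECT aliases in this commit; the subset and
// nullability chosen here are illustrative, not the actual schema.
interface CuratedArticle {
  id: string;
  originalUrl: string;
  title: string;
  excerpt: string | null;
  topic: string;
  language: string;
  wordCount: number | null;
  readingTimeMinutes: number | null;
  publishedAt: string | null;
  ingestedAt: string;
}

async function fetchCuratedFeed(baseUrl: string): Promise<CuratedArticle[]> {
  // Query params as documented in the route comment below.
  const params = new URLSearchParams({
    topics: 'tech,wissenschaft',   // comma-separated topic slugs; omit for all topics
    lang: 'de',                    // 'de' | 'en' | 'all' (default 'all')
    since: '2026-04-01T00:00:00Z', // only articles published after this timestamp
    limit: '50',                   // default 50, max 200
    offset: '0',
  });
  const res = await fetch(`${baseUrl}/api/news/feed?${params}`);
  if (!res.ok) throw new Error(`feed request failed: ${res.status}`);
  return (await res.json()) as CuratedArticle[];
}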

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Till JS 2026-04-09 15:53:26 +02:00
parent 45790ffbb8
commit 9ef97a1877
17 changed files with 1058 additions and 64 deletions


@@ -1,9 +1,13 @@
/**
- * News module — Article extraction + AI feed
- * Ported from apps/news/apps/server
+ * News module — Reads the curated article pool + extracts ad-hoc URLs.
*
- * Saved articles handled by local-first + mana-sync.
- * This module handles content extraction (Mozilla Readability) and feed from sync_changes.
+ * Pool population: handled by the standalone `services/news-ingester`
+ * Bun service, which writes into `news.curated_articles` on a 15 min
+ * loop. This route file just reads from that table.
+ *
+ * Saved articles (the user's personal reading list) live entirely in
+ * the unified Mana app's local-first IndexedDB and sync via mana-sync;
+ * this module never sees them.
*/
import { Hono } from 'hono';
@@ -13,15 +17,15 @@ import { drizzle } from 'drizzle-orm/postgres-js';
import postgres from 'postgres';
import { sql } from 'drizzle-orm';
- // ─── DB Connection (reads from sync_changes for feed) ───────
+ // ─── DB Connection (reads from news.curated_articles) ──────
const DATABASE_URL =
- process.env.DATABASE_URL ?? 'postgresql://mana:devpassword@localhost:5432/mana_sync';
+ process.env.DATABASE_URL ?? 'postgresql://mana:devpassword@localhost:5432/mana_platform';
const connection = postgres(DATABASE_URL, { max: 10 });
const db = drizzle(connection);
- // ─── Extract Service ───────────────────────────────────────
+ // ─── Extract Service (Readability fallback for ad-hoc URLs)
interface ExtractedArticle {
title: string;
@@ -73,75 +77,78 @@ async function extractFromUrl(url: string): Promise<ExtractedArticle> {
const routes = new Hono();
- // ─── Feed (public, reads from sync_changes) ─────────────────
+ // ─── Feed (reads from news.curated_articles) ───────────────
+ //
+ // Query params:
+ // topics — comma-separated topic slugs (tech,wissenschaft,…). If
+ //          omitted, all topics are returned.
+ // lang — 'de' | 'en' | 'all' (default 'all')
+ // since — ISO timestamp; only articles published after this
+ // limit — default 50, max 200
+ // offset — default 0
+ //
+ // Returns the full article body so the client can render the reader
+ // without a second round-trip. Curated articles are small (≤30 KB
+ // each) and the client caches them locally for offline reading.
routes.get('/feed', async (c) => {
- const type = c.req.query('type');
- const categoryId = c.req.query('categoryId');
- const limit = parseInt(c.req.query('limit') || '20', 10);
+ const topicsParam = c.req.query('topics');
+ const lang = c.req.query('lang') ?? 'all';
+ const since = c.req.query('since');
+ const limit = Math.min(parseInt(c.req.query('limit') || '50', 10), 200);
+ const offset = parseInt(c.req.query('offset') || '0', 10);
- let whereClause = sql`app_id = 'news' AND table_name = 'articles' AND op != 'delete'`;
+ const conditions: ReturnType<typeof sql>[] = [];
- if (type) {
- whereClause = sql`${whereClause} AND data->>'type' = ${type}`;
+ if (topicsParam) {
+ const topics = topicsParam
+ .split(',')
+ .map((t) => t.trim())
+ .filter(Boolean);
+ if (topics.length > 0) {
+ conditions.push(sql`topic = ANY(${topics})`);
+ }
}
- if (categoryId) {
- whereClause = sql`${whereClause} AND data->>'categoryId' = ${categoryId}`;
+ if (lang === 'de' || lang === 'en') {
+ conditions.push(sql`language = ${lang}`);
}
+ if (since) {
+ conditions.push(sql`published_at > ${since}`);
+ }
+ const whereClause =
+ conditions.length > 0
+ ? sql.join([sql`WHERE`, sql.join(conditions, sql` AND `)], sql` `)
+ : sql``;
const result = await db.execute(sql`
- SELECT DISTINCT ON (record_id)
- record_id as id,
- data->>'title' as title,
- data->>'excerpt' as excerpt,
- data->>'author' as author,
- data->>'imageUrl' as "imageUrl",
- data->>'type' as type,
- data->>'categoryId' as "categoryId",
- (data->>'wordCount')::int as "wordCount",
- (data->>'readingTimeMinutes')::int as "readingTimeMinutes",
- data->>'publishedAt' as "publishedAt",
- created_at as "createdAt"
- FROM sync_changes
- WHERE ${whereClause}
- ORDER BY record_id, created_at DESC
+ SELECT
+ id,
+ original_url AS "originalUrl",
+ title,
+ excerpt,
+ content,
+ html_content AS "htmlContent",
+ author,
+ site_name AS "siteName",
+ source_slug AS "sourceSlug",
+ image_url AS "imageUrl",
+ topic,
+ language,
+ word_count AS "wordCount",
+ reading_time_minutes AS "readingTimeMinutes",
+ published_at AS "publishedAt",
+ ingested_at AS "ingestedAt"
+ FROM news.curated_articles
+ ${whereClause}
+ ORDER BY published_at DESC NULLS LAST, ingested_at DESC
+ LIMIT ${limit} OFFSET ${offset}
`);
return c.json(result as unknown as Record<string, unknown>[]);
});
- routes.get('/feed/:id', async (c) => {
- const id = c.req.param('id');
- const result = await db.execute(sql`
- SELECT DISTINCT ON (record_id)
- record_id as id,
- data->>'title' as title,
- data->>'content' as content,
- data->>'htmlContent' as "htmlContent",
- data->>'excerpt' as excerpt,
- data->>'author' as author,
- data->>'imageUrl' as "imageUrl",
- data->>'originalUrl' as "originalUrl",
- data->>'type' as type,
- (data->>'wordCount')::int as "wordCount",
- (data->>'readingTimeMinutes')::int as "readingTimeMinutes",
- data->>'publishedAt' as "publishedAt",
- created_at as "createdAt"
- FROM sync_changes
- WHERE app_id = 'news' AND table_name = 'articles' AND record_id = ${id} AND op != 'delete'
- ORDER BY record_id, created_at DESC
- LIMIT 1
- `);
- const rows = result as unknown as Record<string, unknown>[];
- if (!rows[0]) return c.json({ error: 'Article not found' }, 404);
- return c.json(rows[0]);
- });
- // ─── Extract (content extraction) ───────────────────────────
+ // ─── Extract (content extraction for user-pasted URLs) ─────
routes.post('/extract/preview', async (c) => {
const { url } = await c.req.json<{ url: string }>();
@@ -162,7 +169,7 @@ routes.post('/extract/save', async (c) => {
try {
const extracted = await extractFromUrl(url);
- // Return extracted data -- client saves to local-first store
+ // Return extracted data — client saves to local-first store.
return c.json({
id: crypto.randomUUID(),
type: 'saved',