mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-16 20:19:39 +02:00
feat(news): backend ingester service + curated feed API
Adds the services/news-ingester Bun service that pulls 25 public RSS/JSON feeds into news.curated_articles every 15 min, with Mozilla Readability fallback for thin RSS bodies and 30-day retention. apps/api /feed is rewritten to read from the new pool table directly instead of the sync_changes hack, with topics/lang/since/limit/offset query params. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
45790ffbb8
commit
9ef97a1877
17 changed files with 1058 additions and 64 deletions
|
|
@ -1,9 +1,13 @@
|
|||
/**
 * News module — Reads the curated article pool + extracts ad-hoc URLs.
 *
 * Pool population: handled by the standalone `services/news-ingester`
 * Bun service, which writes into `news.curated_articles` on a 15 min
 * loop. This route file just reads from that table.
 *
 * Saved articles (the user's personal reading list) live entirely in
 * the unified Mana app's local-first IndexedDB and sync via mana-sync;
 * this module never sees them.
 */
|
||||
import { Hono } from 'hono';
|
||||
|
|
@ -13,15 +17,15 @@ import { drizzle } from 'drizzle-orm/postgres-js';
|
|||
import postgres from 'postgres';
|
||||
import { sql } from 'drizzle-orm';
|
||||
|
||||
// ─── DB Connection (reads from sync_changes for feed) ───────
|
||||
// ─── DB Connection (reads from news.curated_articles) ──────
|
||||
|
||||
const DATABASE_URL =
|
||||
process.env.DATABASE_URL ?? 'postgresql://mana:devpassword@localhost:5432/mana_sync';
|
||||
process.env.DATABASE_URL ?? 'postgresql://mana:devpassword@localhost:5432/mana_platform';
|
||||
|
||||
const connection = postgres(DATABASE_URL, { max: 10 });
|
||||
const db = drizzle(connection);
|
||||
|
||||
// ─── Extract Service (Readability fallback for ad-hoc URLs) ─
|
||||
|
||||
interface ExtractedArticle {
|
||||
title: string;
|
||||
|
|
@ -73,75 +77,78 @@ async function extractFromUrl(url: string): Promise<ExtractedArticle> {
|
|||
|
||||
const routes = new Hono();
|
||||
|
||||
// ─── Feed (reads from news.curated_articles) ───────────────
//
// Query params:
//   topics — comma-separated topic slugs (tech,wissenschaft,…). If
//            omitted, all topics are returned.
//   lang   — 'de' | 'en' | 'all' (default 'all')
//   since  — ISO timestamp; only articles published after this
//   limit  — default 50, max 200
//   offset — default 0
//
// Returns the full article body so the client can render the reader
// without a second round-trip. Curated articles are small (≤30 KB
// each) and the client caches them locally for offline reading.
|
||||
|
||||
routes.get('/feed', async (c) => {
|
||||
const type = c.req.query('type');
|
||||
const categoryId = c.req.query('categoryId');
|
||||
const limit = parseInt(c.req.query('limit') || '20', 10);
|
||||
const topicsParam = c.req.query('topics');
|
||||
const lang = c.req.query('lang') ?? 'all';
|
||||
const since = c.req.query('since');
|
||||
const limit = Math.min(parseInt(c.req.query('limit') || '50', 10), 200);
|
||||
const offset = parseInt(c.req.query('offset') || '0', 10);
|
||||
|
||||
let whereClause = sql`app_id = 'news' AND table_name = 'articles' AND op != 'delete'`;
|
||||
const conditions: ReturnType<typeof sql>[] = [];
|
||||
|
||||
if (type) {
|
||||
whereClause = sql`${whereClause} AND data->>'type' = ${type}`;
|
||||
if (topicsParam) {
|
||||
const topics = topicsParam
|
||||
.split(',')
|
||||
.map((t) => t.trim())
|
||||
.filter(Boolean);
|
||||
if (topics.length > 0) {
|
||||
conditions.push(sql`topic = ANY(${topics})`);
|
||||
}
|
||||
}
|
||||
if (categoryId) {
|
||||
whereClause = sql`${whereClause} AND data->>'categoryId' = ${categoryId}`;
|
||||
if (lang === 'de' || lang === 'en') {
|
||||
conditions.push(sql`language = ${lang}`);
|
||||
}
|
||||
if (since) {
|
||||
conditions.push(sql`published_at > ${since}`);
|
||||
}
|
||||
|
||||
const whereClause =
|
||||
conditions.length > 0
|
||||
? sql.join([sql`WHERE`, sql.join(conditions, sql` AND `)], sql` `)
|
||||
: sql``;
|
||||
|
||||
const result = await db.execute(sql`
|
||||
SELECT DISTINCT ON (record_id)
|
||||
record_id as id,
|
||||
data->>'title' as title,
|
||||
data->>'excerpt' as excerpt,
|
||||
data->>'author' as author,
|
||||
data->>'imageUrl' as "imageUrl",
|
||||
data->>'type' as type,
|
||||
data->>'categoryId' as "categoryId",
|
||||
(data->>'wordCount')::int as "wordCount",
|
||||
(data->>'readingTimeMinutes')::int as "readingTimeMinutes",
|
||||
data->>'publishedAt' as "publishedAt",
|
||||
created_at as "createdAt"
|
||||
FROM sync_changes
|
||||
WHERE ${whereClause}
|
||||
ORDER BY record_id, created_at DESC
|
||||
SELECT
|
||||
id,
|
||||
original_url AS "originalUrl",
|
||||
title,
|
||||
excerpt,
|
||||
content,
|
||||
html_content AS "htmlContent",
|
||||
author,
|
||||
site_name AS "siteName",
|
||||
source_slug AS "sourceSlug",
|
||||
image_url AS "imageUrl",
|
||||
topic,
|
||||
language,
|
||||
word_count AS "wordCount",
|
||||
reading_time_minutes AS "readingTimeMinutes",
|
||||
published_at AS "publishedAt",
|
||||
ingested_at AS "ingestedAt"
|
||||
FROM news.curated_articles
|
||||
${whereClause}
|
||||
ORDER BY published_at DESC NULLS LAST, ingested_at DESC
|
||||
LIMIT ${limit} OFFSET ${offset}
|
||||
`);
|
||||
|
||||
return c.json(result as unknown as Record<string, unknown>[]);
|
||||
});
|
||||
|
||||
routes.get('/feed/:id', async (c) => {
|
||||
const id = c.req.param('id');
|
||||
|
||||
const result = await db.execute(sql`
|
||||
SELECT DISTINCT ON (record_id)
|
||||
record_id as id,
|
||||
data->>'title' as title,
|
||||
data->>'content' as content,
|
||||
data->>'htmlContent' as "htmlContent",
|
||||
data->>'excerpt' as excerpt,
|
||||
data->>'author' as author,
|
||||
data->>'imageUrl' as "imageUrl",
|
||||
data->>'originalUrl' as "originalUrl",
|
||||
data->>'type' as type,
|
||||
(data->>'wordCount')::int as "wordCount",
|
||||
(data->>'readingTimeMinutes')::int as "readingTimeMinutes",
|
||||
data->>'publishedAt' as "publishedAt",
|
||||
created_at as "createdAt"
|
||||
FROM sync_changes
|
||||
WHERE app_id = 'news' AND table_name = 'articles' AND record_id = ${id} AND op != 'delete'
|
||||
ORDER BY record_id, created_at DESC
|
||||
LIMIT 1
|
||||
`);
|
||||
|
||||
const rows = result as unknown as Record<string, unknown>[];
|
||||
if (!rows[0]) return c.json({ error: 'Article not found' }, 404);
|
||||
return c.json(rows[0]);
|
||||
});
|
||||
|
||||
// ─── Extract (content extraction for user-pasted URLs) ─────
|
||||
|
||||
routes.post('/extract/preview', async (c) => {
|
||||
const { url } = await c.req.json<{ url: string }>();
|
||||
|
|
@ -162,7 +169,7 @@ routes.post('/extract/save', async (c) => {
|
|||
try {
|
||||
const extracted = await extractFromUrl(url);
|
||||
|
||||
// Return extracted data — client saves to local-first store.
|
||||
return c.json({
|
||||
id: crypto.randomUUID(),
|
||||
type: 'saved',
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue