mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-20 14:29:23 +02:00
Adds the services/news-ingester Bun service that pulls 25 public RSS/JSON feeds into news.curated_articles every 15 min, with Mozilla Readability fallback for thin RSS bodies and 30-day retention. apps/api /feed is rewritten to read from the new pool table directly instead of the sync_changes hack, with topics/lang/since/limit/offset query params. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
193 lines
5.7 KiB
TypeScript
193 lines
5.7 KiB
TypeScript
/**
 * News module — Reads the curated article pool + extracts ad-hoc URLs.
 *
 * Pool population: handled by the standalone `services/news-ingester`
 * Bun service, which writes into `news.curated_articles` on a 15 min
 * loop. This route file just reads from that table.
 *
 * Saved articles (the user's personal reading list) live entirely in
 * the unified Mana app's local-first IndexedDB and sync via mana-sync;
 * this module never sees them.
 */
|
|
|
|
import { Hono } from 'hono';
|
|
import { Readability } from '@mozilla/readability';
|
|
import { JSDOM } from 'jsdom';
|
|
import { drizzle } from 'drizzle-orm/postgres-js';
|
|
import postgres from 'postgres';
|
|
import { sql } from 'drizzle-orm';
|
|
|
|
// ─── DB Connection (reads from news.curated_articles) ──────
|
|
|
|
// Connection string for the database that holds `news.curated_articles`.
// When the DATABASE_URL environment variable is unset, the local
// development instance is used instead.
// NOTE(review): the fallback embeds development credentials — confirm that
// DATABASE_URL is always provided outside local development.
const {
  DATABASE_URL = 'postgresql://mana:devpassword@localhost:5432/mana_platform',
} = process.env;

// This module's pool is capped at 10 connections.
const poolOptions = { max: 10 };

const connection = postgres(DATABASE_URL, poolOptions);
const db = drizzle(connection);
|
|
|
|
// ─── Extract Service (Readability fallback for ad-hoc URLs) ─
|
|
|
|
/**
 * Result of running Readability over a fetched page, as produced by
 * `extractFromUrl`.
 */
type ExtractedArticle = {
  /** Article title as reported by Readability. */
  title: string;
  /** Plain-text article body (Readability's `textContent`). */
  content: string;
  /** Article body as HTML (Readability's `content`). */
  htmlContent: string;
  /** Readability's excerpt, or the first 200 characters of the plain text. */
  excerpt: string;
  /** Author line, or `null` when none was detected. */
  byline: string | null;
  /** Publishing site name, or `null` when none was detected. */
  siteName: string | null;
  /** Number of whitespace-separated tokens in the plain-text body. */
  wordCount: number;
  /** Estimated reading time at 200 words per minute, never below 1. */
  readingTimeMinutes: number;
};
|
|
|
|
/** Upper bound, in milliseconds, for downloading a page before the request is aborted. */
const EXTRACT_FETCH_TIMEOUT_MS = 15000;

/**
 * Parses a caller-supplied URL and ensures it uses the http or https scheme.
 *
 * @throws Error when the string is not an absolute URL or uses another scheme.
 */
function parseHttpUrl(rawUrl: string): URL {
  let parsed: URL;
  try {
    parsed = new URL(rawUrl);
  } catch {
    throw new Error('Invalid URL');
  }
  if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
    throw new Error('Only http(s) URLs can be extracted');
  }
  return parsed;
}

/**
 * Downloads the page at `url` and extracts its main article with Readability.
 *
 * @param url - Absolute http(s) URL of the page to extract.
 * @returns The extracted article together with word-count statistics.
 * @throws Error when the URL is invalid or not http(s), the download fails,
 *   times out or returns a non-2xx status, or Readability finds no article.
 */
async function extractFromUrl(url: string): Promise<ExtractedArticle> {
  // The URL originates from a request body, so reject anything that is not a
  // plain web address before handing it to fetch.
  // NOTE(review): this does not block http(s) URLs that resolve to internal
  // hosts — add an allow/deny list if this service can reach private networks.
  const target = parseHttpUrl(url);

  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), EXTRACT_FETCH_TIMEOUT_MS);

  let html: string;
  try {
    const response = await fetch(target.href, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; ManaNews/1.0; +https://mana.how)',
      },
      signal: controller.signal,
    });

    if (!response.ok) {
      throw new Error(`Failed to fetch URL: ${response.status}`);
    }

    // Reading the body is still covered by the timeout.
    html = await response.text();
  } catch (err) {
    if (controller.signal.aborted) {
      throw new Error(`Timed out fetching URL after ${EXTRACT_FETCH_TIMEOUT_MS} ms`);
    }
    throw err;
  } finally {
    clearTimeout(timer);
  }

  const dom = new JSDOM(html, { url: target.href });
  let article: ReturnType<Readability['parse']>;
  try {
    article = new Readability(dom.window.document).parse();
  } finally {
    // Only the parsed result is used from here on, so the window can be closed.
    dom.window.close();
  }

  if (!article) {
    throw new Error('Could not extract article content');
  }

  // Guard against a missing text body rather than assuming it is a string.
  const text = article.textContent ?? '';
  const wordCount = text.split(/\s+/).filter(Boolean).length;
  const readingTimeMinutes = Math.max(1, Math.ceil(wordCount / 200));

  return {
    title: article.title ?? '',
    content: text,
    htmlContent: article.content ?? '',
    excerpt: article.excerpt || text.slice(0, 200),
    byline: article.byline || null,
    siteName: article.siteName || null,
    wordCount,
    readingTimeMinutes,
  };
}
|
|
|
|
// ─── Routes ─────────────────────────────────────────────────
|
|
|
|
// Router for the news module; the handlers below are registered on it.
const routes = new Hono();
|
|
|
|
// ─── Feed (reads from news.curated_articles) ───────────────
//
// Query params:
//   topics — comma-separated topic slugs (tech,wissenschaft,…). If
//            omitted, all topics are returned.
//   lang   — 'de' | 'en' | 'all' (default 'all'); any other value is
//            treated like 'all'.
//   since  — ISO timestamp; only articles published after this
//   limit  — default 50, clamped to 0…200
//   offset — default 0, never negative
//
// Returns the full article body so the client can render the reader
// without a second round-trip. Curated articles are small (≤30 KB
// each) and the client caches them locally for offline reading.

routes.get('/feed', async (c) => {
  const topicsParam = c.req.query('topics');
  const lang = c.req.query('lang') ?? 'all';
  const since = c.req.query('since');

  // parseInt yields NaN for missing or non-numeric input and happily returns
  // negatives; neither is a valid LIMIT/OFFSET, so fall back to the defaults
  // and clamp to a sane range instead of letting the query fail.
  const requestedLimit = Number.parseInt(c.req.query('limit') ?? '', 10);
  const limit = Number.isNaN(requestedLimit) ? 50 : Math.min(Math.max(requestedLimit, 0), 200);

  const requestedOffset = Number.parseInt(c.req.query('offset') ?? '', 10);
  const offset = Number.isNaN(requestedOffset)
    ? 0
    : Math.min(Math.max(requestedOffset, 0), Number.MAX_SAFE_INTEGER);

  const conditions: ReturnType<typeof sql>[] = [];

  if (topicsParam) {
    const topics = topicsParam
      .split(',')
      .map((t) => t.trim())
      .filter(Boolean);
    if (topics.length > 0) {
      // Bind every slug as its own scalar parameter. This keeps the filter
      // independent of how a JS array embedded in the template would be
      // rendered or serialised.
      const topicParams = topics.map((topic) => sql`${topic}`);
      conditions.push(sql`topic IN (${sql.join(topicParams, sql`, `)})`);
    }
  }
  if (lang === 'de' || lang === 'en') {
    conditions.push(sql`language = ${lang}`);
  }
  if (since) {
    // NOTE(review): the value is handed to the database as-is, so a malformed
    // timestamp surfaces as a query error rather than a 400.
    conditions.push(sql`published_at > ${since}`);
  }

  const whereClause =
    conditions.length > 0 ? sql`WHERE ${sql.join(conditions, sql` AND `)}` : sql``;

  const result = await db.execute(sql`
    SELECT
      id,
      original_url AS "originalUrl",
      title,
      excerpt,
      content,
      html_content AS "htmlContent",
      author,
      site_name AS "siteName",
      source_slug AS "sourceSlug",
      image_url AS "imageUrl",
      topic,
      language,
      word_count AS "wordCount",
      reading_time_minutes AS "readingTimeMinutes",
      published_at AS "publishedAt",
      ingested_at AS "ingestedAt"
    FROM news.curated_articles
    ${whereClause}
    ORDER BY published_at DESC NULLS LAST, ingested_at DESC
    LIMIT ${limit} OFFSET ${offset}
  `);

  return c.json(result as unknown as Record<string, unknown>[]);
});
|
|
|
|
// ─── Extract (content extraction for user-pasted URLs) ─────
|
|
|
|
// POST /extract/preview — body `{ url: string }`.
// Responds with the extracted article, 400 when no usable URL was supplied,
// or 500 with the failure message when extraction fails.
routes.post('/extract/preview', async (c) => {
  // A missing or malformed JSON body is a client error: treat it like an
  // absent URL instead of letting the parse failure escape the handler.
  let payload: unknown;
  try {
    payload = await c.req.json();
  } catch {
    payload = undefined;
  }

  const url =
    typeof payload === 'object' && payload !== null
      ? (payload as { url?: unknown }).url
      : undefined;
  if (typeof url !== 'string' || url.trim() === '') {
    return c.json({ error: 'URL is required' }, 400);
  }

  try {
    const article = await extractFromUrl(url);
    return c.json(article);
  } catch (err) {
    return c.json({ error: err instanceof Error ? err.message : 'Extraction failed' }, 500);
  }
});
|
|
|
|
// POST /extract/save — body `{ url: string }`.
// Extracts the article and responds with a ready-to-store record carrying a
// freshly generated id; nothing is persisted here. Responds 400 when no
// usable URL was supplied, or 500 with the failure message when extraction
// fails.
routes.post('/extract/save', async (c) => {
  // A missing or malformed JSON body is a client error: treat it like an
  // absent URL instead of letting the parse failure escape the handler.
  let payload: unknown;
  try {
    payload = await c.req.json();
  } catch {
    payload = undefined;
  }

  const url =
    typeof payload === 'object' && payload !== null
      ? (payload as { url?: unknown }).url
      : undefined;
  if (typeof url !== 'string' || url.trim() === '') {
    return c.json({ error: 'URL is required' }, 400);
  }

  try {
    const extracted = await extractFromUrl(url);

    // Return extracted data — client saves to local-first store.
    return c.json({
      id: crypto.randomUUID(),
      type: 'saved',
      sourceOrigin: 'user_saved',
      originalUrl: url,
      title: extracted.title,
      content: extracted.content,
      htmlContent: extracted.htmlContent,
      excerpt: extracted.excerpt,
      author: extracted.byline,
      siteName: extracted.siteName,
      wordCount: extracted.wordCount,
      readingTimeMinutes: extracted.readingTimeMinutes,
      isArchived: false,
    });
  } catch (err) {
    return c.json({ error: err instanceof Error ? err.message : 'Extraction failed' }, 500);
  }
});
|
|
|
|
// Export the router under the public name `newsRoutes`.
export { routes as newsRoutes };
|