feat(news-research): RSS feed discovery, filter, and AI-context export

New sibling module to news/. Discovers topic-matched RSS feeds via
SearXNG (mana-search) or rel="alternate" probing of a site URL,
filters articles by keyword with a recency + title-match boost,
and exports the top hits as a markdown context block for the AI.

- API: /api/v1/news-research/{discover,validate,search,extract}
- Frontend: /news-research route + workbench ListView (compact card)
- Tool: research_news LLM tool (read-only, runs auto)
- Pin feeds → newsPreferences.customFeeds (encrypted) — covers the
  long-missing custom-RSS subscription gap; reading-list saves still
  go through articlesStore.saveFromUrl into the existing newsArticles
- shared-branding: new news-research entry + binoculars icon

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Till JS 2026-04-15 22:31:07 +02:00
parent b768a0ffce
commit fdd643f4b4
16 changed files with 1586 additions and 2 deletions

View file

@ -29,6 +29,7 @@ import { foodRoutes } from './modules/food/routes';
import { guidesRoutes } from './modules/guides/routes';
import { moodlitRoutes } from './modules/moodlit/routes';
import { newsRoutes } from './modules/news/routes';
import { newsResearchRoutes } from './modules/news-research/routes';
import { tracesRoutes } from './modules/traces/routes';
import { presiRoutes } from './modules/presi/routes';
import { researchRoutes } from './modules/research/routes';
@ -61,6 +62,7 @@ app.route('/api/v1/food', foodRoutes);
app.route('/api/v1/guides', guidesRoutes);
app.route('/api/v1/moodlit', moodlitRoutes);
app.route('/api/v1/news', newsRoutes);
app.route('/api/v1/news-research', newsResearchRoutes);
app.route('/api/v1/traces', tracesRoutes);
app.route('/api/v1/presi', presiRoutes);
app.route('/api/v1/research', researchRoutes);

View file

@ -0,0 +1,219 @@
/**
* News Research module feed discovery, validation, and topic-scoped
* article search over user-selected feeds. Stateless: every request is a
* fresh fetch. Consumers are the /news-research UI and the `research_news`
* LLM tool.
*/
import { Hono } from 'hono';
import {
discoverFeeds,
validateFeed,
parseFeedUrl,
extractFromUrl,
type NormalizedFeedItem,
} from '@mana/shared-rss';
import { webSearch } from '../../lib/search';
const routes = new Hono();
const MAX_FEEDS_PER_SEARCH = 12;
const MAX_ARTICLES_PER_FEED = 40;
// ─── POST /discover ─────────────────────────────────────────
// Input shapes: { siteUrl } or { query } (with optional language).
// Returns a list of discovered feeds. For query mode we run a web
// search and then attempt feed discovery on each top result.
routes.post('/discover', async (c) => {
const body = await c.req.json<{
siteUrl?: string;
query?: string;
language?: string;
limit?: number;
}>();
const limit = Math.min(body.limit ?? 10, 20);
if (body.siteUrl) {
const feeds = await discoverFeeds(body.siteUrl);
return c.json({ feeds });
}
if (body.query) {
let hits: Awaited<ReturnType<typeof webSearch>>;
try {
hits = await webSearch({
query: `${body.query} rss feed`,
limit,
language: body.language,
categories: ['general', 'news'],
});
} catch (err) {
const message = err instanceof Error ? err.message : String(err);
console.warn('[news-research] webSearch failed:', message);
return c.json(
{
error: 'web-search-unavailable',
message: `Websuche fehlgeschlagen: ${message}. Läuft mana-search (Port 3021)?`,
},
502
);
}
const siteUrls = Array.from(new Set(hits.map((h) => h.url).filter(Boolean))).slice(0, limit);
const perSite = await Promise.all(
siteUrls.map(async (url) => {
const feeds = await discoverFeeds(url).catch(() => []);
return feeds.map((f) => ({ ...f, sourceHit: url }));
})
);
const seen = new Set<string>();
const feeds = perSite
.flat()
.filter((f) => {
if (seen.has(f.url)) return false;
seen.add(f.url);
return true;
})
.slice(0, MAX_FEEDS_PER_SEARCH);
return c.json({ feeds, searched: siteUrls.length });
}
return c.json({ error: 'Provide either siteUrl or query' }, 400);
});
// ─── POST /validate ─────────────────────────────────────────
routes.post('/validate', async (c) => {
const { url } = await c.req.json<{ url: string }>();
if (!url) return c.json({ error: 'url required' }, 400);
const result = await validateFeed(url);
return c.json(result);
});
// ─── POST /search ───────────────────────────────────────────
// Fetch selected feeds in parallel, score their items against the
// query, return top N. Scoring is plain keyword frequency for v1;
// BM25/embeddings can replace `scoreItem` later.
interface ScoredArticle extends NormalizedFeedItem {
feedUrl: string;
score: number;
}
const STOPWORDS = new Set([
'der',
'die',
'das',
'und',
'oder',
'aber',
'the',
'a',
'an',
'of',
'to',
'in',
'for',
'on',
'with',
]);
function tokenize(text: string): string[] {
return text
.toLowerCase()
.normalize('NFKD')
.replace(/[^\p{L}\p{N}\s]/gu, ' ')
.split(/\s+/)
.filter((t) => t.length > 2 && !STOPWORDS.has(t));
}
function scoreItem(item: NormalizedFeedItem, queryTokens: string[]): number {
if (queryTokens.length === 0) return 0;
const haystack = `${item.title} ${item.excerpt ?? ''} ${item.content ?? ''}`.toLowerCase();
let score = 0;
for (const q of queryTokens) {
let from = 0;
while ((from = haystack.indexOf(q, from)) !== -1) {
score += 1;
from += q.length;
}
}
// Title matches count extra.
const title = item.title.toLowerCase();
for (const q of queryTokens) {
if (title.includes(q)) score += 3;
}
// Recency boost.
if (item.publishedAt) {
const ageDays = (Date.now() - item.publishedAt.getTime()) / 86_400_000;
if (ageDays < 1) score += 2;
else if (ageDays < 7) score += 1;
}
return score;
}
routes.post('/search', async (c) => {
const body = await c.req.json<{
feeds: string[];
query: string;
limit?: number;
sinceIso?: string;
}>();
if (!Array.isArray(body.feeds) || body.feeds.length === 0) {
return c.json({ error: 'feeds[] required' }, 400);
}
if (!body.query || typeof body.query !== 'string') {
return c.json({ error: 'query required' }, 400);
}
const queryTokens = tokenize(body.query);
const limit = Math.min(body.limit ?? 25, 100);
const since = body.sinceIso ? new Date(body.sinceIso) : null;
const feeds = body.feeds.slice(0, MAX_FEEDS_PER_SEARCH);
const perFeed = await Promise.all(
feeds.map(async (url) => {
try {
const items = await parseFeedUrl(url);
return items.slice(0, MAX_ARTICLES_PER_FEED).map((item) => ({ url, item }));
} catch {
return [];
}
})
);
const scored: ScoredArticle[] = perFeed
.flat()
.filter(({ item }) => !since || (item.publishedAt && item.publishedAt >= since))
.map(({ url, item }) => ({
...item,
feedUrl: url,
score: scoreItem(item, queryTokens),
}))
.filter((a) => a.score > 0)
.sort((a, b) => b.score - a.score)
.slice(0, limit);
return c.json({ articles: scored, feedCount: feeds.length });
});
// ─── POST /extract ──────────────────────────────────────────
// Readability for a single URL. Thin wrapper so the news-research
// client doesn't need to hit the legacy /news/extract/save path.
routes.post('/extract', async (c) => {
const { url } = await c.req.json<{ url: string }>();
if (!url) return c.json({ error: 'url required' }, 400);
const article = await extractFromUrl(url);
if (!article) return c.json({ error: 'Extraction failed' }, 502);
return c.json({ url, ...article });
});
export { routes as newsResearchRoutes };