managarten/services/mana-crawler/src/parser/parser.service.ts
Till-JS 4a3295d1d0 feat(mana-crawler): add web crawler service
NestJS-based web crawler service for structured content extraction.

Features:
- Depth-controlled crawling with URL pattern filtering
- robots.txt compliance
- HTML/PDF/Markdown content extraction
- BullMQ job queue for async processing
- Redis caching layer
- Prometheus metrics
2026-01-29 22:00:36 +01:00

245 lines
5.9 KiB
TypeScript

import { Injectable, Logger } from '@nestjs/common';
import { ConfigService } from '@nestjs/config';
import * as cheerio from 'cheerio';
import TurndownService from 'turndown';
import { CrawlSelectors } from '../db/schema';
export interface ParsedPage {
title: string;
content: string;
markdown?: string;
html?: string;
links: string[];
metadata: Record<string, unknown>;
}
export interface ParseOptions {
selectors?: CrawlSelectors;
includeMarkdown?: boolean;
includeHtml?: boolean;
baseUrl: string;
}
@Injectable()
export class ParserService {
private readonly logger = new Logger(ParserService.name);
private readonly turndown: TurndownService;
constructor(private readonly configService: ConfigService) {
this.turndown = new TurndownService({
headingStyle: 'atx',
codeBlockStyle: 'fenced',
bulletListMarker: '-',
});
// Custom rules for better Markdown output
this.turndown.addRule('codeBlocks', {
filter: ['pre'],
replacement: (content: string) => `\n\`\`\`\n${content}\n\`\`\`\n`,
});
this.turndown.addRule('inlineCode', {
filter: ['code'],
replacement: (content: string) => `\`${content}\``,
});
// Remove script and style elements
this.turndown.remove(['script', 'style', 'noscript']);
}
parse(html: string, options: ParseOptions): ParsedPage {
const $ = cheerio.load(html);
const { selectors, includeMarkdown, includeHtml, baseUrl } = options;
// Remove unwanted elements
$('script, style, noscript, iframe, svg').remove();
// Extract title
const title = this.extractTitle($, selectors?.title);
// Extract main content
const contentHtml = this.extractContent($, selectors?.content);
const content = this.cleanText(contentHtml);
// Extract links
const links = this.extractLinks($, baseUrl, selectors?.links);
// Extract metadata
const metadata = this.extractMetadata($);
// Extract custom selectors
if (selectors?.custom) {
for (const [key, selector] of Object.entries(selectors.custom)) {
try {
metadata[key] = $(selector).text().trim() || $(selector).attr('content');
} catch {
this.logger.warn(`Failed to extract custom selector: ${key}`);
}
}
}
const result: ParsedPage = {
title,
content,
links,
metadata,
};
if (includeMarkdown && contentHtml) {
result.markdown = this.turndown.turndown(contentHtml);
}
if (includeHtml) {
result.html = contentHtml;
}
return result;
}
private extractTitle($: cheerio.CheerioAPI, selector?: string): string {
if (selector) {
const customTitle = $(selector).text().trim();
if (customTitle) return customTitle;
}
// Try common title patterns
const h1 = $('h1').first().text().trim();
if (h1) return h1;
const title = $('title').text().trim();
if (title) return title;
const ogTitle = $('meta[property="og:title"]').attr('content');
if (ogTitle) return ogTitle;
return '';
}
private extractContent($: cheerio.CheerioAPI, selector?: string): string {
if (selector) {
const customContent = $(selector).html();
if (customContent) return customContent;
}
// Try common content patterns
const contentSelectors = [
'article',
'main',
'[role="main"]',
'.main-content',
'.content',
'.post-content',
'.article-content',
'.entry-content',
'#content',
'#main',
];
for (const sel of contentSelectors) {
const content = $(sel).html();
if (content && content.length > 100) {
return content;
}
}
// Fallback to body
return $('body').html() || '';
}
private extractLinks(
$: cheerio.CheerioAPI,
baseUrl: string,
selector?: string,
): string[] {
const links = new Set<string>();
const baseUrlObj = new URL(baseUrl);
const linkSelector = selector || 'a[href]';
$(linkSelector).each((_, element) => {
const href = $(element).attr('href');
if (!href) return;
try {
// Skip non-http links
if (
href.startsWith('javascript:') ||
href.startsWith('mailto:') ||
href.startsWith('tel:') ||
href.startsWith('#')
) {
return;
}
// Resolve relative URLs
const absoluteUrl = new URL(href, baseUrl);
// Only include same-origin links (or all if needed)
if (absoluteUrl.origin === baseUrlObj.origin) {
// Remove hash and normalize
absoluteUrl.hash = '';
links.add(absoluteUrl.href);
}
} catch {
// Invalid URL, skip
}
});
return Array.from(links);
}
private extractMetadata($: cheerio.CheerioAPI): Record<string, unknown> {
const metadata: Record<string, unknown> = {};
// OpenGraph metadata
$('meta[property^="og:"]').each((_, element) => {
const property = $(element).attr('property')?.replace('og:', '');
const content = $(element).attr('content');
if (property && content) {
metadata[`og_${property}`] = content;
}
});
// Standard meta tags
const description = $('meta[name="description"]').attr('content');
if (description) metadata.description = description;
const keywords = $('meta[name="keywords"]').attr('content');
if (keywords) metadata.keywords = keywords;
const author = $('meta[name="author"]').attr('content');
if (author) metadata.author = author;
const canonical = $('link[rel="canonical"]').attr('href');
if (canonical) metadata.canonical = canonical;
// Schema.org JSON-LD
$('script[type="application/ld+json"]').each((_, element) => {
try {
const json = JSON.parse($(element).html() || '');
if (!metadata.jsonLd) {
metadata.jsonLd = [];
}
(metadata.jsonLd as unknown[]).push(json);
} catch {
// Invalid JSON, skip
}
});
return metadata;
}
private cleanText(html: string): string {
return html
.replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '')
.replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '')
.replace(/<[^>]+>/g, ' ')
.replace(/&nbsp;/g, ' ')
.replace(/&amp;/g, '&')
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&quot;/g, '"')
.replace(/\s+/g, ' ')
.trim();
}
}