managarten/services/mana-crawler/src/robots/robots.service.ts
Till-JS 4a3295d1d0 feat(mana-crawler): add web crawler service
NestJS-based web crawler service for structured content extraction.

Features:
- Depth-controlled crawling with URL pattern filtering
- robots.txt compliance
- HTML/PDF/Markdown content extraction
- BullMQ job queue for async processing
- Redis caching layer
- Prometheus metrics
2026-01-29 22:00:36 +01:00

143 lines
3.9 KiB
TypeScript

import { Injectable, Logger } from '@nestjs/common';
import { ConfigService } from '@nestjs/config';
import robotsParser from 'robots-parser';
import { CacheService } from '../cache/cache.service';
/**
 * Serialisable summary of one host's robots.txt, as stored in the cache.
 */
interface RobotsData {
  /**
   * Fallback verdict used when no robots.txt body is available
   * (see `robotsTxt`). Always written as `true` by this service.
   */
  allowed: boolean;
  /** Crawl delay reported by robots-parser for the configured user agent. */
  crawlDelay?: number;
  /** Sitemap URLs listed in robots.txt. */
  sitemaps: string[];
  /**
   * Raw robots.txt body, kept so rules can be evaluated per URL.
   * Absent when the fetch failed or returned a non-2xx status, and on cache
   * entries written before this field existed.
   */
  robotsTxt?: string;
}

/**
 * Answers robots.txt questions (is a URL crawlable, crawl delay, sitemaps)
 * for the crawler, caching one entry per host under the key `robots:<host>`.
 *
 * The service is deliberately fail-open: any error while fetching or
 * evaluating robots.txt results in the URL being treated as allowed.
 */
@Injectable()
export class RobotsService {
  private readonly logger = new Logger(RobotsService.name);
  private readonly userAgent: string;
  private readonly cacheTtl: number;

  constructor(
    private readonly configService: ConfigService,
    private readonly cacheService: CacheService,
  ) {
    this.userAgent = this.configService.get<string>(
      'crawler.userAgent',
      'ManaCoreCrawler/1.0',
    );
    this.cacheTtl = this.configService.get<number>('cache.robotsTtl', 86400);
  }

  /**
   * Checks whether `url` may be crawled by the configured user agent.
   *
   * Uses the cached robots.txt for the URL's host when present; otherwise
   * fetches it, caches it for `cacheTtl`, and evaluates the rules.
   *
   * @param url Absolute URL to check.
   * @returns `false` only when robots.txt explicitly disallows the URL;
   *   `true` otherwise, including when the URL is invalid or any step fails.
   */
  async isAllowed(url: string): Promise<boolean> {
    try {
      const urlObj = new URL(url);
      const cacheKey = `robots:${urlObj.host}`;

      // Treat both null and undefined as a cache miss.
      const cached = await this.cacheService.get<RobotsData>(cacheKey);
      if (cached != null) {
        return this.checkUrl(cached, url);
      }

      const robotsData = await this.fetchRobots(
        this.robotsUrlFor(urlObj),
        urlObj.host,
      );
      await this.cacheService.set(cacheKey, robotsData, this.cacheTtl);

      return this.checkUrl(robotsData, url);
    } catch (error) {
      this.logger.warn(`Error checking robots.txt for ${url}: ${error}`);
      // If we can't check, allow by default
      return true;
    }
  }

  /**
   * Returns the cached crawl delay for a host.
   *
   * Reads the cache only; it never fetches robots.txt. The result is
   * `undefined` when nothing is cached for the host or no delay was recorded.
   *
   * @param domain Host in the same form as `URL.host` (the cache key used by
   *   `isAllowed`).
   */
  async getCrawlDelay(domain: string): Promise<number | undefined> {
    const cached = await this.cacheService.get<RobotsData>(`robots:${domain}`);
    return cached?.crawlDelay;
  }

  /**
   * Returns the cached sitemap URLs for a host.
   *
   * Reads the cache only; it never fetches robots.txt. The result is an empty
   * array when nothing is cached for the host.
   *
   * @param domain Host in the same form as `URL.host` (the cache key used by
   *   `isAllowed`).
   */
  async getSitemaps(domain: string): Promise<string[]> {
    const cached = await this.cacheService.get<RobotsData>(`robots:${domain}`);
    return cached?.sitemaps ?? [];
  }

  /** Builds the robots.txt URL for the origin of `urlObj`. */
  private robotsUrlFor(urlObj: URL): string {
    return `${urlObj.protocol}//${urlObj.host}/robots.txt`;
  }

  /** Issues the robots.txt request with our user agent and a 5 s timeout. */
  private requestRobots(robotsUrl: string): Promise<Response> {
    return fetch(robotsUrl, {
      headers: {
        'User-Agent': this.userAgent,
      },
      signal: AbortSignal.timeout(5000),
    });
  }

  /**
   * Downloads and summarises robots.txt for a host.
   *
   * Never throws: a non-2xx response or a network/timeout error yields an
   * allow-all entry without a `robotsTxt` body.
   *
   * @param robotsUrl Full URL of the robots.txt file.
   * @param host Host name, used for logging only.
   */
  private async fetchRobots(robotsUrl: string, host: string): Promise<RobotsData> {
    try {
      const response = await this.requestRobots(robotsUrl);

      if (!response.ok) {
        // No robots.txt or error - allow all
        this.logger.debug(`No robots.txt found for ${host} (${response.status})`);
        return { allowed: true, sitemaps: [] };
      }

      const robotsTxt = await response.text();
      const robots = robotsParser(robotsUrl, robotsTxt);
      const crawlDelay = robots.getCrawlDelay(this.userAgent);

      return {
        allowed: true, // Fallback only; the real verdict is computed per URL
        crawlDelay: crawlDelay ? Number(crawlDelay) : undefined,
        sitemaps: robots.getSitemaps(),
        // Keep the body so checkUrl can evaluate Allow/Disallow rules.
        robotsTxt,
      };
    } catch (error) {
      this.logger.warn(`Failed to fetch robots.txt from ${robotsUrl}: ${error}`);
      return { allowed: true, sitemaps: [] };
    }
  }

  /**
   * Evaluates `url` against a cached robots.txt entry.
   *
   * @param robotsData Cache entry for the URL's host.
   * @param url Absolute URL to check.
   * @returns The parser's verdict for the configured user agent, or the
   *   entry's fallback `allowed` flag when no robots.txt body is stored.
   */
  private checkUrl(robotsData: RobotsData, url: string): boolean {
    if (typeof robotsData.robotsTxt !== 'string') {
      return robotsData.allowed;
    }

    // The cache is keyed by host only, so derive the robots.txt URL from the
    // URL being checked: the parser returns undefined for a URL whose origin
    // does not match the robots.txt URL it was constructed with.
    const robots = robotsParser(
      this.robotsUrlFor(new URL(url)),
      robotsData.robotsTxt,
    );

    return robots.isAllowed(url, this.userAgent) ?? true;
  }

  /**
   * Fetches robots.txt for the URL's origin and evaluates it immediately,
   * bypassing the cache in both directions (no read, no write).
   *
   * @param url Absolute URL to check.
   * @returns The allow verdict and, when present, the crawl delay for the
   *   configured user agent. Resolves to `{ allowed: true }` on a non-2xx
   *   response, an invalid URL, or any fetch/parse error.
   */
  async checkUrlWithRobots(url: string): Promise<{
    allowed: boolean;
    crawlDelay?: number;
  }> {
    try {
      const robotsUrl = this.robotsUrlFor(new URL(url));
      const response = await this.requestRobots(robotsUrl);

      if (!response.ok) {
        return { allowed: true };
      }

      const robotsTxt = await response.text();
      const robots = robotsParser(robotsUrl, robotsTxt);
      const allowed = robots.isAllowed(url, this.userAgent) ?? true;
      const crawlDelay = robots.getCrawlDelay(this.userAgent);

      return {
        allowed,
        crawlDelay: crawlDelay ? Number(crawlDelay) : undefined,
      };
    } catch (error) {
      this.logger.warn(`Error checking robots.txt for ${url}: ${error}`);
      return { allowed: true };
    }
  }
}