mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-23 23:56:43 +02:00
NestJS-based web crawler service for structured content extraction. Features: - Depth-controlled crawling with URL pattern filtering - robots.txt compliance - HTML/PDF/Markdown content extraction - BullMQ job queue for async processing - Redis caching layer - Prometheus metrics
143 lines
3.9 KiB
TypeScript
143 lines
3.9 KiB
TypeScript
import { Injectable, Logger } from '@nestjs/common';
|
|
import { ConfigService } from '@nestjs/config';
|
|
import robotsParser from 'robots-parser';
|
|
import { CacheService } from '../cache/cache.service';
|
|
|
|
/**
 * Per-host robots.txt information as stored in the cache.
 */
interface RobotsData {
  /**
   * Fallback verdict used when no robots.txt body is stored with the entry
   * (file missing, fetch failed, or the entry was written before the body
   * was cached).
   */
  allowed: boolean;
  /** Crawl delay the parser reported for this service's user agent, if any. */
  crawlDelay?: number;
  /** Sitemap URLs listed in the robots.txt file. */
  sitemaps: string[];
  /**
   * Raw robots.txt body. Kept so that allow/disallow rules can be evaluated
   * for each individual URL instead of once per host.
   */
  robotsTxt?: string;
}

/**
 * Fetches, caches and evaluates robots.txt files on behalf of the crawler.
 *
 * Entries are cached under `robots:<host>` for `cache.robotsTtl` seconds
 * (default 86400). Any failure to obtain or evaluate robots.txt results in
 * the URL being treated as allowed.
 */
@Injectable()
export class RobotsService {
  private readonly logger = new Logger(RobotsService.name);
  private readonly userAgent: string;
  private readonly cacheTtl: number;

  constructor(
    private readonly configService: ConfigService,
    private readonly cacheService: CacheService,
  ) {
    this.userAgent = this.configService.get<string>(
      'crawler.userAgent',
      'ManaCoreCrawler/1.0',
    );
    this.cacheTtl = this.configService.get<number>('cache.robotsTtl', 86400);
  }

  /**
   * Checks whether `url` may be crawled according to its host's robots.txt.
   *
   * The robots.txt data is taken from the cache when present; otherwise it is
   * fetched and cached. The rules are then evaluated for this specific URL.
   *
   * @param url Absolute URL to check.
   * @returns `true` when crawling is permitted, or when the check itself
   *   failed (invalid URL, cache error, ...).
   */
  async isAllowed(url: string): Promise<boolean> {
    try {
      const urlObj = new URL(url);
      const robotsUrl = this.robotsUrlFor(urlObj);
      const cacheKey = `robots:${urlObj.host}`;

      // Check cache first. `!= null` treats both null and undefined as a miss.
      const cached = await this.cacheService.get<RobotsData>(cacheKey);
      if (cached != null) {
        return this.checkUrl(cached, url, robotsUrl);
      }

      // Fetch robots.txt
      const robotsData = await this.fetchRobots(robotsUrl, urlObj.host);

      // Cache the result
      await this.cacheService.set(cacheKey, robotsData, this.cacheTtl);

      return this.checkUrl(robotsData, url, robotsUrl);
    } catch (error) {
      this.logger.warn(`Error checking robots.txt for ${url}: ${error}`);
      // If we can't check, allow by default
      return true;
    }
  }

  /**
   * Returns the cached crawl delay for a host.
   *
   * Reads the cache only; nothing is fetched.
   *
   * @param domain Host part used in the cache key (`robots:<domain>`).
   * @returns The cached crawl delay, or `undefined` when the host has no
   *   cache entry or no delay was recorded.
   */
  async getCrawlDelay(domain: string): Promise<number | undefined> {
    const cacheKey = `robots:${domain}`;
    const cached = await this.cacheService.get<RobotsData>(cacheKey);
    return cached?.crawlDelay;
  }

  /**
   * Returns the cached sitemap URLs for a host.
   *
   * Reads the cache only; nothing is fetched.
   *
   * @param domain Host part used in the cache key (`robots:<domain>`).
   * @returns The cached sitemap list, or an empty array when the host has no
   *   cache entry.
   */
  async getSitemaps(domain: string): Promise<string[]> {
    const cacheKey = `robots:${domain}`;
    const cached = await this.cacheService.get<RobotsData>(cacheKey);
    return cached?.sitemaps ?? [];
  }

  /**
   * Checks `url` against a freshly fetched robots.txt, bypassing the cache
   * for both reading and writing.
   *
   * @param url Absolute URL to check.
   * @returns The verdict for `url` and the crawl delay for this service's
   *   user agent. Falls back to `{ allowed: true }` on any error.
   */
  async checkUrlWithRobots(url: string): Promise<{
    allowed: boolean;
    crawlDelay?: number;
  }> {
    try {
      const urlObj = new URL(url);
      const robotsUrl = this.robotsUrlFor(urlObj);
      const robotsData = await this.fetchRobots(robotsUrl, urlObj.host);

      return {
        allowed: this.checkUrl(robotsData, url, robotsUrl),
        crawlDelay: robotsData.crawlDelay,
      };
    } catch (error) {
      this.logger.warn(`Error checking robots.txt for ${url}: ${error}`);
      return { allowed: true };
    }
  }

  /** Builds the robots.txt URL for the origin of `urlObj`. */
  private robotsUrlFor(urlObj: URL): string {
    return `${urlObj.protocol}//${urlObj.host}/robots.txt`;
  }

  /**
   * Downloads and parses a robots.txt file.
   *
   * @param robotsUrl Full URL of the robots.txt file.
   * @param host Host name, used for log messages only.
   * @returns The parsed data including the raw body. When the response is
   *   not OK or the request fails, an allow-all entry without a body.
   */
  private async fetchRobots(robotsUrl: string, host: string): Promise<RobotsData> {
    try {
      const response = await fetch(robotsUrl, {
        headers: {
          'User-Agent': this.userAgent,
        },
        // Abort the request if the server has not answered within 5 seconds.
        signal: AbortSignal.timeout(5000),
      });

      if (!response.ok) {
        // No robots.txt or error - allow all
        this.logger.debug(`No robots.txt found for ${host} (${response.status})`);
        return { allowed: true, sitemaps: [] };
      }

      const robotsTxt = await response.text();
      const robots = robotsParser(robotsUrl, robotsTxt);
      const crawlDelay = robots.getCrawlDelay(this.userAgent);

      return {
        // Only used as a fallback; the real verdict is computed per URL
        // from `robotsTxt` in checkUrl().
        allowed: true,
        crawlDelay: crawlDelay ? Number(crawlDelay) : undefined,
        sitemaps: robots.getSitemaps(),
        robotsTxt,
      };
    } catch (error) {
      this.logger.warn(`Failed to fetch robots.txt from ${robotsUrl}: ${error}`);
      return { allowed: true, sitemaps: [] };
    }
  }

  /**
   * Evaluates the robots.txt rules for a single URL.
   *
   * @param robotsData Cached or freshly fetched robots data for the host.
   * @param url URL to evaluate.
   * @param robotsUrl robots.txt URL derived from `url`; the parser uses it to
   *   decide whether `url` belongs to this robots.txt at all.
   * @returns The parser's verdict, `true` when the parser has no verdict, or
   *   `robotsData.allowed` when no robots.txt body is available.
   */
  private checkUrl(robotsData: RobotsData, url: string, robotsUrl: string): boolean {
    if (robotsData.robotsTxt === undefined) {
      return robotsData.allowed;
    }

    const robots = robotsParser(robotsUrl, robotsData.robotsTxt);
    return robots.isAllowed(url, this.userAgent) ?? true;
  }
}
|