feat(mana-crawler): add web crawler service

NestJS-based web crawler service for structured content extraction.

Features:
- Depth-controlled crawling with URL pattern filtering
- robots.txt compliance
- HTML/PDF/Markdown content extraction
- BullMQ job queue for async processing
- Redis caching layer
- Prometheus metrics
This commit is contained in:
Till-JS 2026-01-29 22:00:36 +01:00
parent c64b4d6ac9
commit 4a3295d1d0
39 changed files with 2795 additions and 0 deletions

View file

@@ -0,0 +1,20 @@
import { Module, forwardRef } from '@nestjs/common';
import { CrawlProcessor } from './processors/crawl.processor';
import { ParserModule } from '../parser/parser.module';
import { RobotsModule } from '../robots/robots.module';
import { CacheModule } from '../cache/cache.module';
import { MetricsModule } from '../metrics/metrics.module';
import { QueueModule } from './queue.module';
import { CRAWL_QUEUE } from './constants';
/**
 * Nest module that registers {@link CrawlProcessor} as a provider.
 *
 * It pulls in the parser, robots, cache and metrics modules alongside the
 * queue module, which is imported lazily through `forwardRef`.
 *
 * NOTE(review): the `forwardRef` wrapper suggests QueueModule and this module
 * reference each other — confirm against queue.module before removing it.
 */
@Module({
  providers: [CrawlProcessor],
  imports: [
    // Resolved lazily: the callback is only invoked when Nest needs the class.
    forwardRef(function resolveQueueModule() {
      return QueueModule;
    }),
    ParserModule,
    RobotsModule,
    CacheModule,
    MetricsModule,
  ],
})
export class ProcessorModule {}