mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-20 08:13:39 +02:00
✨ feat(mana-crawler): add web crawler service
NestJS-based web crawler service for structured content extraction.

Features:
- Depth-controlled crawling with URL pattern filtering
- robots.txt compliance
- HTML/PDF/Markdown content extraction
- BullMQ job queue for async processing
- Redis caching layer
- Prometheus metrics
This commit is contained in:
parent
c64b4d6ac9
commit
4a3295d1d0
39 changed files with 2795 additions and 0 deletions
20
services/mana-crawler/src/queue/processor.module.ts
Normal file
20
services/mana-crawler/src/queue/processor.module.ts
Normal file
|
|
@@ -0,0 +1,20 @@
|
|||
import { Module, forwardRef } from '@nestjs/common';
|
||||
import { CrawlProcessor } from './processors/crawl.processor';
|
||||
import { ParserModule } from '../parser/parser.module';
|
||||
import { RobotsModule } from '../robots/robots.module';
|
||||
import { CacheModule } from '../cache/cache.module';
|
||||
import { MetricsModule } from '../metrics/metrics.module';
|
||||
import { QueueModule } from './queue.module';
|
||||
import { CRAWL_QUEUE } from './constants';
|
||||
|
||||
/**
 * NestJS module that registers {@link CrawlProcessor} as a provider.
 *
 * It imports the queue, parser, robots, cache and metrics modules so that
 * whatever those modules export is available for injection here.
 *
 * NOTE(review): `CRAWL_QUEUE` is imported at the top of this file but is not
 * referenced anywhere in this module definition.
 */
@Module({
  imports: [
    // QueueModule is referenced lazily through forwardRef. Presumably
    // QueueModule and ProcessorModule import each other — confirm against
    // queue.module.ts before removing the wrapper.
    forwardRef(() => QueueModule),
    ParserModule,
    RobotsModule,
    CacheModule,
    MetricsModule,
  ],
  providers: [CrawlProcessor],
})
export class ProcessorModule {}
|
||||
Loading…
Add table
Add a link
Reference in a new issue