mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-18 09:49:40 +02:00
NestJS-based web crawler service for structured content extraction. Features: depth-controlled crawling with URL pattern filtering; robots.txt compliance; HTML/PDF/Markdown content extraction; a BullMQ job queue for async processing; a Redis caching layer; and Prometheus metrics.
74 lines
2 KiB
TypeScript
74 lines
2 KiB
TypeScript
import {
|
|
Controller,
|
|
Post,
|
|
Get,
|
|
Delete,
|
|
Body,
|
|
Param,
|
|
Query,
|
|
ParseUUIDPipe,
|
|
ParseIntPipe,
|
|
DefaultValuePipe,
|
|
HttpCode,
|
|
HttpStatus,
|
|
} from '@nestjs/common';
|
|
import { CrawlerService } from './crawler.service';
|
|
import { StartCrawlDto } from './dto/start-crawl.dto';
|
|
import { CrawlJobResponse, CrawlResultResponse, PaginatedResults } from './dto/crawl-response.dto';
|
|
|
|
/**
 * HTTP entry points for crawl jobs, registered under the `crawl` route prefix.
 *
 * Each handler is a thin pass-through: the Nest pipes attached to its
 * parameters run first, then the arguments are forwarded to the matching
 * {@link CrawlerService} method and that method's result is returned.
 */
@Controller('crawl')
export class CrawlerController {
  constructor(private readonly crawlerService: CrawlerService) {}

  /**
   * `POST /crawl` — hands the request body to `CrawlerService.startCrawl`.
   *
   * @param dto Request body bound as a {@link StartCrawlDto}.
   * @returns The job description resolved by the service.
   */
  @Post()
  async startCrawl(@Body() dto: StartCrawlDto): Promise<CrawlJobResponse> {
    const job = await this.crawlerService.startCrawl(dto);
    return job;
  }

  /**
   * `GET /crawl` — one page of jobs from `CrawlerService.listJobs`.
   *
   * @param page `page` query parameter, parsed as an integer; defaults to 1.
   * @param limit `limit` query parameter, parsed as an integer; defaults to 20.
   * @param status Optional `status` query parameter, forwarded to the service
   *   unchanged (no validation happens in this controller).
   * @returns The paginated job list resolved by the service.
   */
  @Get()
  async listJobs(
    @Query('page', new DefaultValuePipe(1), ParseIntPipe) page: number,
    @Query('limit', new DefaultValuePipe(20), ParseIntPipe) limit: number,
    @Query('status') status?: string,
  ): Promise<PaginatedResults<CrawlJobResponse>> {
    const jobsPage = await this.crawlerService.listJobs(page, limit, status);
    return jobsPage;
  }

  /**
   * `GET /crawl/:jobId` — a single job from `CrawlerService.getJob`.
   *
   * @param jobId `jobId` route parameter, run through `ParseUUIDPipe`.
   * @returns The job description resolved by the service.
   */
  @Get(':jobId')
  async getJob(@Param('jobId', ParseUUIDPipe) jobId: string): Promise<CrawlJobResponse> {
    const job = await this.crawlerService.getJob(jobId);
    return job;
  }

  /**
   * `GET /crawl/:jobId/results` — one page of a job's results from
   * `CrawlerService.getJobResults`.
   *
   * @param jobId `jobId` route parameter, run through `ParseUUIDPipe`.
   * @param page `page` query parameter, parsed as an integer; defaults to 1.
   * @param limit `limit` query parameter, parsed as an integer; defaults to 50.
   * @returns The paginated result list resolved by the service.
   */
  @Get(':jobId/results')
  async getJobResults(
    @Param('jobId', ParseUUIDPipe) jobId: string,
    @Query('page', new DefaultValuePipe(1), ParseIntPipe) page: number,
    @Query('limit', new DefaultValuePipe(50), ParseIntPipe) limit: number,
  ): Promise<PaginatedResults<CrawlResultResponse>> {
    const resultsPage = await this.crawlerService.getJobResults(jobId, page, limit);
    return resultsPage;
  }

  /**
   * `DELETE /crawl/:jobId` — forwards to `CrawlerService.cancelJob`.
   *
   * NOTE(review): the route is declared `204 No Content`, yet the handler
   * returns the service's job payload. A 204 response cannot carry a body, so
   * clients presumably never receive that payload — confirm whether the status
   * should be 200 or the handler should return nothing.
   *
   * @param jobId `jobId` route parameter, run through `ParseUUIDPipe`.
   * @returns Whatever `CrawlerService.cancelJob` resolves to.
   */
  @Delete(':jobId')
  @HttpCode(HttpStatus.NO_CONTENT)
  async cancelJob(@Param('jobId', ParseUUIDPipe) jobId: string): Promise<CrawlJobResponse> {
    const job = await this.crawlerService.cancelJob(jobId);
    return job;
  }

  /**
   * `POST /crawl/:jobId/pause` — forwards to `CrawlerService.pauseJob`.
   *
   * No `@HttpCode` is set here, so the framework's default status for POST
   * handlers applies.
   *
   * @param jobId `jobId` route parameter, run through `ParseUUIDPipe`.
   * @returns The job description resolved by the service.
   */
  @Post(':jobId/pause')
  async pauseJob(@Param('jobId', ParseUUIDPipe) jobId: string): Promise<CrawlJobResponse> {
    const job = await this.crawlerService.pauseJob(jobId);
    return job;
  }

  /**
   * `POST /crawl/:jobId/resume` — forwards to `CrawlerService.resumeJob`.
   *
   * No `@HttpCode` is set here, so the framework's default status for POST
   * handlers applies.
   *
   * @param jobId `jobId` route parameter, run through `ParseUUIDPipe`.
   * @returns The job description resolved by the service.
   */
  @Post(':jobId/resume')
  async resumeJob(@Param('jobId', ParseUUIDPipe) jobId: string): Promise<CrawlJobResponse> {
    const job = await this.crawlerService.resumeJob(jobId);
    return job;
  }
}
|