mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-14 20:21:09 +02:00
NestJS-based web crawler service for structured content extraction. Features: depth-controlled crawling with URL pattern filtering; robots.txt compliance; HTML/PDF/Markdown content extraction; a BullMQ job queue for async processing; a Redis caching layer; and Prometheus metrics.
24 lines
575 B
Text
24 lines
575 B
Text
# Server
|
|
PORT=3023
|
|
NODE_ENV=development
|
|
|
|
# Database
|
|
DATABASE_URL=postgresql://manacore:devpassword@localhost:5432/manacore
|
|
|
|
# Redis (BullMQ job queue and cache)
|
|
REDIS_HOST=localhost
|
|
REDIS_PORT=6379
|
|
REDIS_PASSWORD=
|
|
|
|
# Crawling
|
|
CRAWLER_USER_AGENT=ManaCoreCrawler/1.0 (+https://manacore.io/bot)
|
|
CRAWLER_DEFAULT_RATE_LIMIT=2
|
|
CRAWLER_DEFAULT_MAX_DEPTH=3
|
|
CRAWLER_DEFAULT_MAX_PAGES=100
|
|
CRAWLER_TIMEOUT=30000
|
|
|
|
# External Services (optional; used as a fallback for single-page extraction)
|
|
MANA_SEARCH_URL=http://localhost:3021
|
|
|
|
# CORS
|
|
CORS_ORIGINS=http://localhost:3000,http://localhost:5173,http://localhost:8081
|