package crawler

import (
	"context"
	"crypto/tls"
	"encoding/json"
	"fmt"
	"io"
	"log/slog"
	"net/http"
	"net/url"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/jackc/pgx/v5/pgxpool"
	"github.com/manacore/mana-crawler/internal/parser"
	"github.com/manacore/mana-crawler/internal/robots"
)

// CrawlConfig holds configuration for a crawl job.
type CrawlConfig struct {
	MaxDepth        int               `json:"maxDepth"`
	MaxPages        int               `json:"maxPages"`
	RateLimit       int               `json:"rateLimit"` // requests/second
	RespectRobots   bool              `json:"respectRobots"`
	IncludePatterns []string          `json:"includePatterns"`
	ExcludePatterns []string          `json:"excludePatterns"`
	Selectors       *parser.Selectors `json:"selectors"`
	OutputFormat    string            `json:"format"` // text, html, markdown
}
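
// A typical job configuration (an illustrative sketch; these values are
// examples, not defaults defined by this package):
//
//	cfg := CrawlConfig{
//		MaxDepth:      3,
//		MaxPages:      500,
//		RateLimit:     2, // two requests per second
//		RespectRobots: true,
//		OutputFormat:  "markdown",
//	}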

// Progress tracks crawl progress.
type Progress struct {
	Discovered int `json:"discovered"`
	Crawled    int `json:"crawled"`
	Failed     int `json:"failed"`
	Queued     int `json:"queued"`
}

// CrawlJob represents a running crawl job.
type CrawlJob struct {
	ID        string
	StartURL  string
	Domain    string
	Config    CrawlConfig
	Status    string // pending, running, paused, completed, failed, cancelled
	Progress  Progress
	Error     string
	StartedAt *time.Time
	CreatedAt time.Time
}

// Crawler manages crawl jobs.
type Crawler struct {
	pool        *pgxpool.Pool
	robots      *robots.Checker
	httpClient  *http.Client
	userAgent   string
	concurrency int

	mu   sync.RWMutex
	jobs map[string]context.CancelFunc // active job cancellation
}

// New creates a new Crawler.
func New(pool *pgxpool.Pool, robotsChecker *robots.Checker, userAgent string, concurrency int, timeout time.Duration) *Crawler {
	// Skip TLS verification for outgoing crawl requests.
	// Required in Docker Desktop for Mac (TLS proxy) and for crawling
	// sites with self-signed or expired certificates.
	transport := &http.Transport{
		TLSClientConfig:     &tls.Config{InsecureSkipVerify: true},
		MaxIdleConns:        100,
		MaxIdleConnsPerHost: 10,
		IdleConnTimeout:     90 * time.Second,
	}

	return &Crawler{
		pool:   pool,
		robots: robotsChecker,
		httpClient: &http.Client{
			Timeout:   timeout,
			Transport: transport,
			CheckRedirect: func(req *http.Request, via []*http.Request) error {
				if len(via) >= 10 {
					return fmt.Errorf("too many redirects")
				}
				return nil
			},
		},
		userAgent:   userAgent,
		concurrency: concurrency,
		jobs:        make(map[string]context.CancelFunc),
	}
}
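
// Typical construction (a minimal sketch; how the robots checker is built
// depends on the robots package, so `checker` is assumed to exist):
//
//	pool, err := pgxpool.New(ctx, os.Getenv("DATABASE_URL"))
//	if err != nil { /* handle error */ }
//	c := New(pool, checker, "mana-crawler/1.0", 8, 30*time.Second)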

// StartJob begins a new crawl job. The crawl runs in a background
// goroutine whose context is derived from ctx, so callers should pass a
// long-lived context rather than a per-request one.
func (c *Crawler) StartJob(ctx context.Context, jobID, startURL string, cfg CrawlConfig) error {
	jobCtx, cancel := context.WithCancel(ctx)

	c.mu.Lock()
	c.jobs[jobID] = cancel
	c.mu.Unlock()

	// Update job status to running.
	now := time.Now()
	c.pool.Exec(ctx, `UPDATE crawler.crawl_jobs SET status='running', started_at=$2, updated_at=NOW() WHERE id=$1`, jobID, now)

	go c.runCrawl(jobCtx, jobID, startURL, cfg)
	return nil
}

// CancelJob cancels a running job.
func (c *Crawler) CancelJob(jobID string) {
	c.mu.Lock()
	if cancel, ok := c.jobs[jobID]; ok {
		cancel()
		delete(c.jobs, jobID)
	}
	c.mu.Unlock()
}
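
// Job lifecycle from an API handler (an illustrative sketch; jobID
// generation and the surrounding handler are assumptions, not part of
// this file):
//
//	if err := c.StartJob(context.Background(), jobID, startURL, cfg); err != nil {
//		// handle error
//	}
//	// Later, e.g. from a cancel endpoint:
//	c.CancelJob(jobID)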

func (c *Crawler) runCrawl(ctx context.Context, jobID, startURL string, cfg CrawlConfig) {
	defer func() {
		c.mu.Lock()
		delete(c.jobs, jobID)
		c.mu.Unlock()
	}()

	slog.Info("crawl started", "job", jobID, "url", startURL)

	base, err := url.Parse(startURL)
	if err != nil {
		c.failJob(jobID, "invalid start URL: "+err.Error())
		return
	}

	// Track visited URLs.
	visited := &sync.Map{}
	var crawled, failed atomic.Int32

	// Work queue (channel-based instead of BullMQ).
	type workItem struct {
		url       string
		parentURL string
		depth     int
	}

	queue := make(chan workItem, cfg.MaxPages*2)
	queue <- workItem{url: startURL, depth: 0}
	visited.Store(startURL, true)

	// Rate limiter. Guard against a zero or negative RateLimit, which
	// would otherwise yield a non-positive ticker interval and panic.
	rate := cfg.RateLimit
	if rate <= 0 {
		rate = 1
	}
	delay := time.Duration(float64(time.Second) / float64(rate))
	ticker := time.NewTicker(delay)
	defer ticker.Stop()

	// Worker pool, bounded by a semaphore channel.
	var wg sync.WaitGroup
	sem := make(chan struct{}, c.concurrency)

	done := false
	for !done {
		select {
		case <-ctx.Done():
			c.pool.Exec(context.Background(), `UPDATE crawler.crawl_jobs SET status='cancelled', updated_at=NOW() WHERE id=$1`, jobID)
			slog.Info("crawl cancelled", "job", jobID)
			return

		case item, ok := <-queue:
			if !ok {
				done = true
				break
			}

			if int(crawled.Load()) >= cfg.MaxPages {
				done = true
				break
			}

			// Rate limit.
			<-ticker.C

			sem <- struct{}{}
			wg.Add(1)

			go func(item workItem) {
				defer wg.Done()
				defer func() { <-sem }()

				// Check robots.txt.
				if cfg.RespectRobots {
					allowed, _ := c.robots.IsAllowed(ctx, item.url)
					if !allowed {
						slog.Debug("blocked by robots.txt", "url", item.url)
						return
					}
				}

				// Fetch and parse.
				result, statusCode, fetchErr := c.fetchAndParse(ctx, item.url, base, &cfg)

				if fetchErr != nil {
					failed.Add(1)
					c.saveResult(ctx, jobID, item.url, item.parentURL, item.depth, nil, 0, fetchErr.Error())
				} else {
					crawled.Add(1)
					c.saveResult(ctx, jobID, item.url, item.parentURL, item.depth, result, statusCode, "")

					// Queue discovered links.
					if item.depth < cfg.MaxDepth && result != nil {
						for _, link := range result.Links {
							if _, loaded := visited.LoadOrStore(link, true); !loaded {
								if matchesPatterns(link, cfg.IncludePatterns, cfg.ExcludePatterns) {
									select {
									case queue <- workItem{url: link, parentURL: item.url, depth: item.depth + 1}:
									default:
										// Queue full; drop the link rather than block the worker.
									}
								}
							}
						}
					}
				}

				// Update progress.
				prog := Progress{
					Crawled: int(crawled.Load()),
					Failed:  int(failed.Load()),
				}
				progJSON, _ := json.Marshal(prog)
				c.pool.Exec(ctx, `UPDATE crawler.crawl_jobs SET progress=$2, updated_at=NOW() WHERE id=$1`, jobID, string(progJSON))
			}(item)

		default:
			// If the queue is empty, wait briefly for in-flight workers to
			// enqueue more URLs; if it is still empty, assume the crawl has
			// drained.
			if len(queue) == 0 {
				time.Sleep(500 * time.Millisecond)
				if len(queue) == 0 {
					done = true
				}
			}
		}
	}

	wg.Wait()

	// Mark completed.
	c.pool.Exec(context.Background(), `UPDATE crawler.crawl_jobs SET status='completed', completed_at=NOW(), updated_at=NOW() WHERE id=$1`, jobID)
	slog.Info("crawl completed", "job", jobID, "crawled", crawled.Load(), "failed", failed.Load())
}
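
// Observing progress from outside (a minimal sketch; it reads back the
// columns written by the UPDATE statements above):
//
//	var status string
//	var progJSON []byte
//	err := pool.QueryRow(ctx,
//		`SELECT status, progress FROM crawler.crawl_jobs WHERE id=$1`, jobID,
//	).Scan(&status, &progJSON)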

func (c *Crawler) fetchAndParse(ctx context.Context, rawURL string, base *url.URL, cfg *CrawlConfig) (*parser.Result, int, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
	if err != nil {
		return nil, 0, err
	}
	req.Header.Set("User-Agent", c.userAgent)
	req.Header.Set("Accept", "text/html,application/xhtml+xml")

	start := time.Now()
	resp, err := c.httpClient.Do(req)
	if err != nil {
		return nil, 0, err
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 400 {
		return nil, resp.StatusCode, fmt.Errorf("HTTP %d", resp.StatusCode)
	}

	contentType := resp.Header.Get("Content-Type")
	if !strings.Contains(contentType, "text/html") && !strings.Contains(contentType, "application/xhtml") {
		return nil, resp.StatusCode, fmt.Errorf("not HTML: %s", contentType)
	}

	body, err := io.ReadAll(io.LimitReader(resp.Body, 10*1024*1024)) // 10MB limit
	if err != nil {
		return nil, resp.StatusCode, err
	}

	_ = time.Since(start) // fetch duration; measured but not yet persisted

	result, err := parser.Parse(string(body), rawURL, cfg.Selectors)
	if err != nil {
		return nil, resp.StatusCode, err
	}

	return result, resp.StatusCode, nil
}

func (c *Crawler) saveResult(ctx context.Context, jobID, url, parentURL string, depth int, result *parser.Result, statusCode int, errMsg string) {
	// Nullable columns are passed as *string: nil maps to SQL NULL.
	var title, content, markdown, linksJSON *string
	var metadataJSON *string

	if result != nil {
		if result.Title != "" {
			title = &result.Title
		}
		if result.Content != "" {
			content = &result.Content
		}
		if result.Markdown != "" {
			markdown = &result.Markdown
		}
		if len(result.Links) > 0 {
			b, _ := json.Marshal(result.Links)
			s := string(b)
			linksJSON = &s
		}
		if len(result.Metadata) > 0 {
			b, _ := json.Marshal(result.Metadata)
			s := string(b)
			metadataJSON = &s
		}
	}

	var parentPtr *string
	if parentURL != "" {
		parentPtr = &parentURL
	}
	var errPtr *string
	if errMsg != "" {
		errPtr = &errMsg
	}

	c.pool.Exec(ctx, `
		INSERT INTO crawler.crawl_results (job_id, url, parent_url, depth, title, content, markdown, links, metadata, status_code, error)
		VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)
	`, jobID, url, parentPtr, depth, title, content, markdown, linksJSON, metadataJSON, statusCode, errPtr)
}

func (c *Crawler) failJob(jobID, errMsg string) {
	c.pool.Exec(context.Background(), `UPDATE crawler.crawl_jobs SET status='failed', error=$2, updated_at=NOW() WHERE id=$1`, jobID, errMsg)
	slog.Error("crawl failed", "job", jobID, "error", errMsg)
}

func matchesPatterns(u string, include, exclude []string) bool {
	// Patterns are simple substring matches: a leading or trailing "*" is
	// stripped and the remainder must appear somewhere in the URL.

	// If include patterns are specified, the URL must match at least one.
	if len(include) > 0 {
		matched := false
		for _, pattern := range include {
			if strings.Contains(u, strings.TrimSuffix(strings.TrimPrefix(pattern, "*"), "*")) {
				matched = true
				break
			}
		}
		if !matched {
			return false
		}
	}

	// If exclude patterns are specified, the URL must not match any.
	for _, pattern := range exclude {
		if strings.Contains(u, strings.TrimSuffix(strings.TrimPrefix(pattern, "*"), "*")) {
			return false
		}
	}

	return true
}
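
// Pattern semantics in brief (illustrative calls, not test fixtures):
//
//	matchesPatterns("https://example.com/docs/setup", []string{"*/docs/*"}, nil)  // true
//	matchesPatterns("https://example.com/blog/post", []string{"*/docs/*"}, nil)   // false
//	matchesPatterns("https://example.com/docs/old", nil, []string{"*/docs/old*"}) // false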