managarten/services/mana-crawler/internal/robots/robots.go

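// Package robots fetches, parses, and caches robots.txt files so the crawler
// can check whether a URL may be crawled and which crawl delay applies.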
package robots

import (
	"context"
	"fmt"
	"io"
	"log/slog"
	"net/http"
	"sync"
	"time"

	"github.com/temoto/robotstxt"
)
// Checker checks robots.txt rules for URLs.
type Checker struct {
	userAgent string
	client    *http.Client
	mu        sync.RWMutex
	cache     map[string]*cacheEntry
}

type cacheEntry struct {
	data      *robotstxt.RobotsData
	expiresAt time.Time
}

// NewChecker creates a new robots.txt checker.
func NewChecker(userAgent string) *Checker {
	return &Checker{
		userAgent: userAgent,
		client:    &http.Client{Timeout: 5 * time.Second},
		cache:     make(map[string]*cacheEntry),
	}
}
// IsAllowed checks if a URL can be crawled.
func (c *Checker) IsAllowed(ctx context.Context, rawURL string) (bool, error) {
	u, err := parseHost(rawURL)
	if err != nil {
		return true, nil
	}
	robots, err := c.getRobots(ctx, u.scheme, u.host)
	if err != nil {
		return true, nil // Allow on error
	}
	group := robots.FindGroup(c.userAgent)
	if group == nil {
		return true, nil
	}
	// Group.Test matches against the URL path, not the full URL.
	return group.Test(u.path), nil
}
// GetCrawlDelay returns the crawl delay for a domain.
func (c *Checker) GetCrawlDelay(ctx context.Context, rawURL string) time.Duration {
	u, err := parseHost(rawURL)
	if err != nil {
		return 0
	}
	robots, err := c.getRobots(ctx, u.scheme, u.host)
	if err != nil {
		return 0
	}
	group := robots.FindGroup(c.userAgent)
	if group == nil {
		return 0
	}
	return group.CrawlDelay
}
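// getRobots returns the parsed robots.txt for scheme://host, serving it from
// the cache when a non-expired entry exists and fetching it otherwise.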
func (c *Checker) getRobots(ctx context.Context, scheme, host string) (*robotstxt.RobotsData, error) {
	c.mu.RLock()
	entry, ok := c.cache[host]
	c.mu.RUnlock()
	if ok && time.Now().Before(entry.expiresAt) {
		return entry.data, nil
	}

	// Fetch
	robotsURL := fmt.Sprintf("%s://%s/robots.txt", scheme, host)
	req, err := http.NewRequestWithContext(ctx, "GET", robotsURL, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", c.userAgent)
	resp, err := c.client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// No robots.txt → allow all
		empty := &robotstxt.RobotsData{}
		c.cacheSet(host, empty)
		return empty, nil
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}
	robots, err := robotstxt.FromBytes(body)
	if err != nil {
		slog.Warn("invalid robots.txt", "host", host, "error", err)
		empty := &robotstxt.RobotsData{}
		c.cacheSet(host, empty)
		return empty, nil
	}
	c.cacheSet(host, robots)
	return robots, nil
}
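// cacheSet stores parsed robots.txt data for host with a 24-hour expiry.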
func (c *Checker) cacheSet(host string, data *robotstxt.RobotsData) {
	c.mu.Lock()
	c.cache[host] = &cacheEntry{data: data, expiresAt: time.Now().Add(24 * time.Hour)}
	c.mu.Unlock()
}
type hostInfo struct {
	scheme string
	host   string
	path   string
}
func parseHost(rawURL string) (hostInfo, error) {
	// Simple parsing without importing net/url to avoid circular deps
	scheme := "https"
	rest := rawURL
	if idx := len("https://"); len(rawURL) > idx && rawURL[:idx] == "https://" {
		rest = rawURL[idx:]
	} else if idx := len("http://"); len(rawURL) > idx && rawURL[:idx] == "http://" {
		scheme = "http"
		rest = rawURL[idx:]
	}
	// Split the remainder into host and path; default the path to "/".
	path := "/"
	if slashIdx := indexByte(rest, '/'); slashIdx > 0 {
		path = rest[slashIdx:]
		rest = rest[:slashIdx]
	}
	return hostInfo{scheme: scheme, host: rest, path: path}, nil
}
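// indexByte returns the index of the first occurrence of b in s, or -1 if b
// is not present (a local stand-in for strings.IndexByte).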
func indexByte(s string, b byte) int {
	for i := 0; i < len(s); i++ {
		if s[i] == b {
			return i
		}
	}
	return -1
}
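
// Illustrative usage (not part of the original file): a minimal sketch of how a
// crawler loop might consult the checker before fetching a page. The user agent
// string and URL below are assumptions made for the example.
//
//	c := NewChecker("mana-crawler/1.0")
//	ok, err := c.IsAllowed(ctx, "https://example.com/articles/42")
//	if err == nil && ok {
//		time.Sleep(c.GetCrawlDelay(ctx, "https://example.com/articles/42"))
//		// fetch the page here
//	}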