managarten/services/mana-crawler/internal/handler/handler.go

package handler

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"log/slog"
	"net/http"
	"net/url"
	"strconv"
	"time"

	"github.com/jackc/pgx/v5"
	"github.com/jackc/pgx/v5/pgxpool"
	"github.com/manacore/mana-crawler/internal/crawler"
)

// Handler serves the crawler HTTP API.
type Handler struct {
	pool    *pgxpool.Pool
	crawler *crawler.Crawler
}

// NewHandler creates a new handler.
func NewHandler(pool *pgxpool.Pool, c *crawler.Crawler) *Handler {
	return &Handler{pool: pool, crawler: c}
}

// StartCrawl handles POST /api/v1/crawl
func (h *Handler) StartCrawl(w http.ResponseWriter, r *http.Request) {
	var body struct {
		StartURL   string               `json:"startUrl"`
		Config     *crawler.CrawlConfig `json:"config"`
		WebhookURL string               `json:"webhookUrl"` // accepted but not yet used by this handler
	}
	if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
		writeJSON(w, http.StatusBadRequest, map[string]string{"error": "invalid request"})
		return
	}
	if body.StartURL == "" {
		writeJSON(w, http.StatusBadRequest, map[string]string{"error": "startUrl is required"})
		return
	}
	parsed, err := url.Parse(body.StartURL)
	if err != nil || parsed.Host == "" {
		writeJSON(w, http.StatusBadRequest, map[string]string{"error": "invalid URL"})
		return
	}

	// Apply defaults, then overlay any client-supplied overrides.
	cfg := crawler.CrawlConfig{
		MaxDepth:      3,
		MaxPages:      100,
		RateLimit:     2,
		RespectRobots: true,
		OutputFormat:  "markdown",
	}
	if body.Config != nil {
		if body.Config.MaxDepth > 0 {
			cfg.MaxDepth = body.Config.MaxDepth
		}
		if body.Config.MaxPages > 0 {
			cfg.MaxPages = body.Config.MaxPages
		}
		if body.Config.RateLimit > 0 {
			cfg.RateLimit = body.Config.RateLimit
		}
		// Note: a supplied config always overrides RespectRobots, so a
		// request that omits the field disables robots.txt handling.
		cfg.RespectRobots = body.Config.RespectRobots
		cfg.IncludePatterns = body.Config.IncludePatterns
		cfg.ExcludePatterns = body.Config.ExcludePatterns
		cfg.Selectors = body.Config.Selectors
		if body.Config.OutputFormat != "" {
			cfg.OutputFormat = body.Config.OutputFormat
		}
	}

	// Insert the job as 'pending'; the crawler is expected to move it to
	// running once it picks the job up. Marshal the output descriptor
	// instead of building JSON with Sprintf so any special characters in
	// OutputFormat stay properly escaped.
	configJSON, _ := json.Marshal(cfg) // marshaling a plain struct cannot fail here
	outputJSON, _ := json.Marshal(map[string]string{"format": cfg.OutputFormat})
	var jobID string
	err = h.pool.QueryRow(r.Context(), `
		INSERT INTO crawler.crawl_jobs (start_url, domain, max_depth, max_pages, rate_limit, respect_robots, selectors, output, status)
		VALUES ($1, $2, $3, $4, $5, $6, $7, $8, 'pending')
		RETURNING id
	`, body.StartURL, parsed.Host, cfg.MaxDepth, cfg.MaxPages, cfg.RateLimit, cfg.RespectRobots,
		string(configJSON), string(outputJSON)).Scan(&jobID)
	if err != nil {
		slog.Error("create job failed", "error", err)
		writeJSON(w, http.StatusInternalServerError, map[string]string{"error": "failed to create job"})
		return
	}

	// Start the crawl with a background context so it outlives the HTTP request.
	if err := h.crawler.StartJob(context.Background(), jobID, body.StartURL, cfg); err != nil {
		slog.Error("start job failed", "jobId", jobID, "error", err)
		writeJSON(w, http.StatusInternalServerError, map[string]string{"error": "failed to start crawl"})
		return
	}
	writeJSON(w, http.StatusCreated, map[string]any{
		"jobId":    jobID,
		"status":   "running",
		"startUrl": body.StartURL,
		"domain":   parsed.Host,
		"config":   cfg,
	})
}

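// A request sketch for StartCrawl. The listen address is an assumption (the
// real port comes from service configuration), and the config field names
// assume CrawlConfig uses the same camelCase JSON tags as the request
// envelope above:
//
//	curl -X POST http://localhost:8080/api/v1/crawl \
//	  -H 'Content-Type: application/json' \
//	  -d '{"startUrl": "https://example.com", "config": {"maxDepth": 2, "maxPages": 50}}'
//
// On success the handler responds 201 Created with
// {"jobId": ..., "status": "running", "startUrl": ..., "domain": ..., "config": ...}.
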
// GetJob handles GET /api/v1/crawl/{jobId}
func (h *Handler) GetJob(w http.ResponseWriter, r *http.Request) {
	jobID := r.PathValue("jobId")
	var job struct {
		ID          string     `json:"jobId"`
		StartURL    string     `json:"startUrl"`
		Domain      string     `json:"domain"`
		Status      string     `json:"status"`
		Progress    string     `json:"progress"`
		Error       *string    `json:"error"`
		StartedAt   *time.Time `json:"startedAt"`
		CompletedAt *time.Time `json:"completedAt"`
		CreatedAt   time.Time  `json:"createdAt"`
	}
	err := h.pool.QueryRow(r.Context(), `
		SELECT id, start_url, domain, status, COALESCE(progress::text, '{}'), error, started_at, completed_at, created_at
		FROM crawler.crawl_jobs WHERE id = $1
	`, jobID).Scan(&job.ID, &job.StartURL, &job.Domain, &job.Status, &job.Progress, &job.Error, &job.StartedAt, &job.CompletedAt, &job.CreatedAt)
	// Only report 404 for a genuinely missing row; other errors are 500s.
	if errors.Is(err, pgx.ErrNoRows) {
		writeJSON(w, http.StatusNotFound, map[string]string{"error": "job not found"})
		return
	}
	if err != nil {
		slog.Error("get job failed", "jobId", jobID, "error", err)
		writeJSON(w, http.StatusInternalServerError, map[string]string{"error": "query failed"})
		return
	}
	writeJSON(w, http.StatusOK, job)
}

// GetJobResults handles GET /api/v1/crawl/{jobId}/results
func (h *Handler) GetJobResults(w http.ResponseWriter, r *http.Request) {
	jobID := r.PathValue("jobId")
	page, _ := strconv.Atoi(r.URL.Query().Get("page"))
	limit, _ := strconv.Atoi(r.URL.Query().Get("limit"))
	if page < 1 {
		page = 1
	}
	if limit < 1 || limit > 100 {
		limit = 50
	}
	offset := (page - 1) * limit

	// Count total; on failure, total stays 0 and pagination under-reports.
	var total int
	if err := h.pool.QueryRow(r.Context(), `SELECT COUNT(*) FROM crawler.crawl_results WHERE job_id = $1`, jobID).Scan(&total); err != nil {
		slog.Error("count results failed", "jobId", jobID, "error", err)
	}

	rows, err := h.pool.Query(r.Context(), `
		SELECT id, url, parent_url, depth, title, content, markdown, links, metadata, status_code, error, created_at
		FROM crawler.crawl_results WHERE job_id = $1 ORDER BY created_at LIMIT $2 OFFSET $3
	`, jobID, limit, offset)
	if err != nil {
		writeJSON(w, http.StatusInternalServerError, map[string]string{"error": "query failed"})
		return
	}
	defer rows.Close()
	var results []map[string]any
	for rows.Next() {
		var id, u string
		var parentURL, title, content, markdown, links, metadata, errMsg *string
		var depth, statusCode int
		var createdAt time.Time
		if err := rows.Scan(&id, &u, &parentURL, &depth, &title, &content, &markdown, &links, &metadata, &statusCode, &errMsg, &createdAt); err != nil {
			slog.Error("scan result failed", "jobId", jobID, "error", err)
			continue
		}
		results = append(results, map[string]any{
			"id": id, "url": u, "parentUrl": parentURL, "depth": depth,
			"title": title, "content": content, "markdown": markdown,
			"links": links, "metadata": metadata,
			"statusCode": statusCode, "error": errMsg, "createdAt": createdAt,
		})
	}
	if err := rows.Err(); err != nil {
		writeJSON(w, http.StatusInternalServerError, map[string]string{"error": "query failed"})
		return
	}
	if results == nil {
		results = []map[string]any{} // keep the JSON field an array, not null
	}
	writeJSON(w, http.StatusOK, map[string]any{
		"results": results,
		"pagination": map[string]any{
			"page": page, "limit": limit, "total": total,
			"totalPages": (total + limit - 1) / limit,
		},
	})
}

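// Results are paginated, defaulting to 50 per page and capped at 100, e.g.
// (host and port are assumptions, as above):
//
//	curl 'http://localhost:8080/api/v1/crawl/<jobId>/results?page=2&limit=25'
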
// ListJobs handles GET /api/v1/crawl
func (h *Handler) ListJobs(w http.ResponseWriter, r *http.Request) {
	page, _ := strconv.Atoi(r.URL.Query().Get("page"))
	limit, _ := strconv.Atoi(r.URL.Query().Get("limit"))
	if page < 1 {
		page = 1
	}
	if limit < 1 || limit > 100 {
		limit = 20
	}
	offset := (page - 1) * limit
	status := r.URL.Query().Get("status")

	var total int
	var err error
	if status != "" {
		err = h.pool.QueryRow(r.Context(), `SELECT COUNT(*) FROM crawler.crawl_jobs WHERE status = $1`, status).Scan(&total)
	} else {
		err = h.pool.QueryRow(r.Context(), `SELECT COUNT(*) FROM crawler.crawl_jobs`).Scan(&total)
	}
	if err != nil {
		slog.Error("count jobs failed", "error", err)
	}

	// The optional status filter is appended as $3; $1/$2 remain limit/offset.
	query := `SELECT id, start_url, domain, status, COALESCE(progress::text, '{}'), created_at FROM crawler.crawl_jobs ORDER BY created_at DESC LIMIT $1 OFFSET $2`
	args := []any{limit, offset}
	if status != "" {
		query = `SELECT id, start_url, domain, status, COALESCE(progress::text, '{}'), created_at FROM crawler.crawl_jobs WHERE status = $3 ORDER BY created_at DESC LIMIT $1 OFFSET $2`
		args = append(args, status)
	}
	rows, err := h.pool.Query(r.Context(), query, args...)
	if err != nil {
		writeJSON(w, http.StatusInternalServerError, map[string]string{"error": "query failed"})
		return
	}
	defer rows.Close()
	var jobs []map[string]any
	for rows.Next() {
		var id, startURL, domain, st, progress string
		var createdAt time.Time
		if err := rows.Scan(&id, &startURL, &domain, &st, &progress, &createdAt); err != nil {
			slog.Error("scan job failed", "error", err)
			continue
		}
		jobs = append(jobs, map[string]any{
			"jobId": id, "startUrl": startURL, "domain": domain,
			"status": st, "progress": progress, "createdAt": createdAt,
		})
	}
	if jobs == nil {
		jobs = []map[string]any{} // keep the JSON field an array, not null
	}
	writeJSON(w, http.StatusOK, map[string]any{
		"results": jobs,
		"pagination": map[string]any{
			"page": page, "limit": limit, "total": total,
		},
	})
}

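// Jobs can be filtered by status. The status values below are the ones this
// file writes or queries (pending, running, completed, failed, cancelled):
//
//	curl 'http://localhost:8080/api/v1/crawl?status=running&limit=10'
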
// CancelJob handles DELETE /api/v1/crawl/{jobId}
func (h *Handler) CancelJob(w http.ResponseWriter, r *http.Request) {
	jobID := r.PathValue("jobId")
	h.crawler.CancelJob(jobID)
	if _, err := h.pool.Exec(r.Context(), `UPDATE crawler.crawl_jobs SET status = 'cancelled', updated_at = NOW() WHERE id = $1`, jobID); err != nil {
		slog.Error("cancel job update failed", "jobId", jobID, "error", err)
	}
	w.WriteHeader(http.StatusNoContent)
}

// Health handles GET /health
func (h *Handler) Health(w http.ResponseWriter, r *http.Request) {
	dbOK := "ok"
	if err := h.pool.Ping(r.Context()); err != nil {
		dbOK = "error"
	}
	status := "ok"
	if dbOK != "ok" {
		status = "degraded"
	}
	writeJSON(w, http.StatusOK, map[string]any{
		"status": status, "service": "mana-crawler", "database": dbOK,
		"timestamp": time.Now().UTC().Format(time.RFC3339),
	})
}

// Metrics handles GET /metrics
func (h *Handler) Metrics(w http.ResponseWriter, r *http.Request) {
	// Count queries ignore errors; a DB failure leaves the affected gauge at
	// zero rather than failing the scrape.
	var running, completed, failed int
	h.pool.QueryRow(r.Context(), `SELECT COUNT(*) FROM crawler.crawl_jobs WHERE status = 'running'`).Scan(&running)
	h.pool.QueryRow(r.Context(), `SELECT COUNT(*) FROM crawler.crawl_jobs WHERE status = 'completed'`).Scan(&completed)
	h.pool.QueryRow(r.Context(), `SELECT COUNT(*) FROM crawler.crawl_jobs WHERE status = 'failed'`).Scan(&failed)
	// Prometheus text exposition format content type.
	w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
	fmt.Fprintf(w, "# HELP mana_crawler_jobs Crawl jobs by status\n")
	fmt.Fprintf(w, "# TYPE mana_crawler_jobs gauge\n")
	fmt.Fprintf(w, "mana_crawler_jobs{status=%q} %d\n", "running", running)
	fmt.Fprintf(w, "mana_crawler_jobs{status=%q} %d\n", "completed", completed)
	fmt.Fprintf(w, "mana_crawler_jobs{status=%q} %d\n", "failed", failed)
}

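// Example scrape output (the counts are illustrative):
//
//	# HELP mana_crawler_jobs Crawl jobs by status
//	# TYPE mana_crawler_jobs gauge
//	mana_crawler_jobs{status="running"} 2
//	mana_crawler_jobs{status="completed"} 41
//	mana_crawler_jobs{status="failed"} 3
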
// writeJSON sets the Content-Type and status line, then encodes data as JSON.
// The status is already committed by the time Encode runs, so an encode
// failure can only be logged, not turned into an error response.
func writeJSON(w http.ResponseWriter, status int, data any) {
	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(status)
	if err := json.NewEncoder(w).Encode(data); err != nil {
		slog.Error("encode response failed", "error", err)
	}
}