managarten/services/mana-crawler/internal/handler/handler.go

package handler

import (
	"context"
	"encoding/json"
	"fmt"
	"log/slog"
	"net/http"

	"github.com/mana/shared-go/httputil"
	"net/url"
	"strconv"
	"time"

	"github.com/jackc/pgx/v5/pgxpool"
	"github.com/mana/mana-crawler/internal/crawler"
)

// Handler serves the crawler HTTP API.
type Handler struct {
	pool    *pgxpool.Pool
	crawler *crawler.Crawler
}

// NewHandler creates a new handler.
func NewHandler(pool *pgxpool.Pool, c *crawler.Crawler) *Handler {
	return &Handler{pool: pool, crawler: c}
}

// StartCrawl handles POST /api/v1/crawl
func (h *Handler) StartCrawl(w http.ResponseWriter, r *http.Request) {
	var body struct {
		StartURL   string               `json:"startUrl"`
		Config     *crawler.CrawlConfig `json:"config"`
		WebhookURL string               `json:"webhookUrl"`
	}
	if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
		httputil.WriteJSON(w, http.StatusBadRequest, map[string]string{"error": "invalid request"})
		return
	}

	if body.StartURL == "" {
		httputil.WriteJSON(w, http.StatusBadRequest, map[string]string{"error": "startUrl is required"})
		return
	}

	parsed, err := url.Parse(body.StartURL)
	if err != nil || parsed.Host == "" {
		httputil.WriteJSON(w, http.StatusBadRequest, map[string]string{"error": "invalid URL"})
		return
	}

	// Defaults
	cfg := crawler.CrawlConfig{
		MaxDepth:      3,
		MaxPages:      100,
		RateLimit:     2,
		RespectRobots: true,
		OutputFormat:  "markdown",
	}
	if body.Config != nil {
		if body.Config.MaxDepth > 0 {
			cfg.MaxDepth = body.Config.MaxDepth
		}
		if body.Config.MaxPages > 0 {
			cfg.MaxPages = body.Config.MaxPages
		}
		if body.Config.RateLimit > 0 {
			cfg.RateLimit = body.Config.RateLimit
		}
		cfg.RespectRobots = body.Config.RespectRobots
		cfg.IncludePatterns = body.Config.IncludePatterns
		cfg.ExcludePatterns = body.Config.ExcludePatterns
		cfg.Selectors = body.Config.Selectors
		if body.Config.OutputFormat != "" {
			cfg.OutputFormat = body.Config.OutputFormat
		}
	}

	// Insert job
	var jobID string
	configJSON, _ := json.Marshal(cfg)
	err = h.pool.QueryRow(r.Context(), `
		INSERT INTO crawler.crawl_jobs (start_url, domain, max_depth, max_pages, rate_limit, respect_robots, selectors, output, status)
		VALUES ($1, $2, $3, $4, $5, $6, $7, $8, 'pending')
		RETURNING id
	`, body.StartURL, parsed.Host, cfg.MaxDepth, cfg.MaxPages, cfg.RateLimit, cfg.RespectRobots,
		string(configJSON), fmt.Sprintf(`{"format":"%s"}`, cfg.OutputFormat)).Scan(&jobID)
	if err != nil {
		slog.Error("create job failed", "error", err)
		httputil.WriteJSON(w, http.StatusInternalServerError, map[string]string{"error": "failed to create job"})
		return
	}

	// Start crawl (use background context so it outlives the HTTP request)
	if err := h.crawler.StartJob(context.Background(), jobID, body.StartURL, cfg); err != nil {
		httputil.WriteJSON(w, http.StatusInternalServerError, map[string]string{"error": "failed to start crawl"})
		return
	}

	httputil.WriteJSON(w, http.StatusCreated, map[string]any{
		"jobId":    jobID,
		"status":   "running",
		"startUrl": body.StartURL,
		"domain":   parsed.Host,
		"config":   cfg,
	})
}

// GetJob handles GET /api/v1/crawl/{jobId}
func (h *Handler) GetJob(w http.ResponseWriter, r *http.Request) {
	jobID := r.PathValue("jobId")

	var job struct {
		ID          string     `json:"jobId"`
		StartURL    string     `json:"startUrl"`
		Domain      string     `json:"domain"`
		Status      string     `json:"status"`
		Progress    string     `json:"progress"`
		Error       *string    `json:"error"`
		StartedAt   *time.Time `json:"startedAt"`
		CompletedAt *time.Time `json:"completedAt"`
		CreatedAt   time.Time  `json:"createdAt"`
	}

	err := h.pool.QueryRow(r.Context(), `
		SELECT id, start_url, domain, status, COALESCE(progress::text, '{}'), error, started_at, completed_at, created_at
		FROM crawler.crawl_jobs WHERE id = $1
	`, jobID).Scan(&job.ID, &job.StartURL, &job.Domain, &job.Status, &job.Progress, &job.Error, &job.StartedAt, &job.CompletedAt, &job.CreatedAt)
	if err != nil {
		httputil.WriteJSON(w, http.StatusNotFound, map[string]string{"error": "job not found"})
		return
	}

	httputil.WriteJSON(w, http.StatusOK, job)
}

// GetJobResults handles GET /api/v1/crawl/{jobId}/results
func (h *Handler) GetJobResults(w http.ResponseWriter, r *http.Request) {
	jobID := r.PathValue("jobId")
	page, _ := strconv.Atoi(r.URL.Query().Get("page"))
	limit, _ := strconv.Atoi(r.URL.Query().Get("limit"))
	if page < 1 {
		page = 1
	}
	if limit < 1 || limit > 100 {
		limit = 50
	}
	offset := (page - 1) * limit

	// Count total
	var total int
	h.pool.QueryRow(r.Context(), `SELECT COUNT(*) FROM crawler.crawl_results WHERE job_id = $1`, jobID).Scan(&total)

	rows, err := h.pool.Query(r.Context(), `
		SELECT id, url, parent_url, depth, title, content, markdown, links, metadata, status_code, error, created_at
		FROM crawler.crawl_results WHERE job_id = $1 ORDER BY created_at LIMIT $2 OFFSET $3
	`, jobID, limit, offset)
	if err != nil {
		httputil.WriteJSON(w, http.StatusInternalServerError, map[string]string{"error": "query failed"})
		return
	}
	defer rows.Close()

	var results []map[string]any
	for rows.Next() {
		var id, u string
		var parentURL, title, content, markdown, links, metadata, errMsg *string
		var depth, statusCode int
		var createdAt time.Time

		rows.Scan(&id, &u, &parentURL, &depth, &title, &content, &markdown, &links, &metadata, &statusCode, &errMsg, &createdAt)
		results = append(results, map[string]any{
			"id": id, "url": u, "parentUrl": parentURL, "depth": depth,
			"title": title, "content": content, "markdown": markdown,
			"links": links, "metadata": metadata,
			"statusCode": statusCode, "error": errMsg, "createdAt": createdAt,
		})
	}

	if results == nil {
		results = []map[string]any{}
	}

	httputil.WriteJSON(w, http.StatusOK, map[string]any{
		"results": results,
		"pagination": map[string]any{
			"page": page, "limit": limit, "total": total,
			"totalPages": (total + limit - 1) / limit,
		},
	})
}

// ListJobs handles GET /api/v1/crawl
func (h *Handler) ListJobs(w http.ResponseWriter, r *http.Request) {
	page, _ := strconv.Atoi(r.URL.Query().Get("page"))
	limit, _ := strconv.Atoi(r.URL.Query().Get("limit"))
	if page < 1 {
		page = 1
	}
	if limit < 1 || limit > 100 {
		limit = 20
	}
	offset := (page - 1) * limit
	status := r.URL.Query().Get("status")

	var total int
	if status != "" {
		h.pool.QueryRow(r.Context(), `SELECT COUNT(*) FROM crawler.crawl_jobs WHERE status=$1`, status).Scan(&total)
	} else {
		h.pool.QueryRow(r.Context(), `SELECT COUNT(*) FROM crawler.crawl_jobs`).Scan(&total)
	}

	query := `SELECT id, start_url, domain, status, COALESCE(progress::text,'{}'), created_at FROM crawler.crawl_jobs ORDER BY created_at DESC LIMIT $1 OFFSET $2`
	args := []any{limit, offset}
	if status != "" {
		query = `SELECT id, start_url, domain, status, COALESCE(progress::text,'{}'), created_at FROM crawler.crawl_jobs WHERE status=$3 ORDER BY created_at DESC LIMIT $1 OFFSET $2`
		args = append(args, status)
	}

	rows, err := h.pool.Query(r.Context(), query, args...)
	if err != nil {
		httputil.WriteJSON(w, http.StatusInternalServerError, map[string]string{"error": "query failed"})
		return
	}
	defer rows.Close()

	var jobs []map[string]any
	for rows.Next() {
		var id, startURL, domain, st, progress string
		var createdAt time.Time
		rows.Scan(&id, &startURL, &domain, &st, &progress, &createdAt)
		jobs = append(jobs, map[string]any{
			"jobId": id, "startUrl": startURL, "domain": domain,
			"status": st, "progress": progress, "createdAt": createdAt,
		})
	}
	if jobs == nil {
		jobs = []map[string]any{}
	}

	httputil.WriteJSON(w, http.StatusOK, map[string]any{
		"results": jobs,
		"pagination": map[string]any{
			"page": page, "limit": limit, "total": total,
		},
	})
}

// CancelJob handles DELETE /api/v1/crawl/{jobId}
func (h *Handler) CancelJob(w http.ResponseWriter, r *http.Request) {
	jobID := r.PathValue("jobId")
	h.crawler.CancelJob(jobID)
	h.pool.Exec(r.Context(), `UPDATE crawler.crawl_jobs SET status='cancelled', updated_at=NOW() WHERE id=$1`, jobID)
	w.WriteHeader(http.StatusNoContent)
}

// Health handles GET /health
func (h *Handler) Health(w http.ResponseWriter, r *http.Request) {
	dbOK := "ok"
	if err := h.pool.Ping(r.Context()); err != nil {
		dbOK = "error"
	}
	status := "ok"
	if dbOK != "ok" {
		status = "degraded"
	}
	httputil.WriteJSON(w, http.StatusOK, map[string]any{
		"status": status, "service": "mana-crawler", "database": dbOK,
		"timestamp": time.Now().UTC().Format(time.RFC3339),
	})
}

// Metrics handles GET /metrics
func (h *Handler) Metrics(w http.ResponseWriter, r *http.Request) {
	var running, completed, failed int
	h.pool.QueryRow(r.Context(), `SELECT COUNT(*) FROM crawler.crawl_jobs WHERE status='running'`).Scan(&running)
	h.pool.QueryRow(r.Context(), `SELECT COUNT(*) FROM crawler.crawl_jobs WHERE status='completed'`).Scan(&completed)
	h.pool.QueryRow(r.Context(), `SELECT COUNT(*) FROM crawler.crawl_jobs WHERE status='failed'`).Scan(&failed)

	w.Header().Set("Content-Type", "text/plain")
	fmt.Fprintf(w, "# HELP mana_crawler_jobs Crawl jobs by status\n")
	fmt.Fprintf(w, "# TYPE mana_crawler_jobs gauge\n")
	fmt.Fprintf(w, "mana_crawler_jobs{status=\"running\"} %d\n", running)
	fmt.Fprintf(w, "mana_crawler_jobs{status=\"completed\"} %d\n", completed)
	fmt.Fprintf(w, "mana_crawler_jobs{status=\"failed\"} %d\n", failed)
}