mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-22 13:26:42 +02:00
refactor(services): rename Go services, remove -go suffix
Renames: mana-search-go → mana-search, mana-notify-go → mana-notify, mana-crawler-go → mana-crawler, mana-api-gateway-go → mana-api-gateway. The legacy NestJS versions are deleted, so the suffix is no longer needed. Updated all references in docker-compose, CLAUDE.md, package.json, Forgejo workflows, and service package.json files. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
79080d6654
commit
7e931b1c6d
90 changed files with 41 additions and 38 deletions
291
services/mana-crawler/internal/handler/handler.go
Normal file
291
services/mana-crawler/internal/handler/handler.go
Normal file
|
|
@ -0,0 +1,291 @@
|
|||
package handler
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"github.com/jackc/pgx/v5/pgxpool"
|
||||
"github.com/manacore/mana-crawler/internal/crawler"
|
||||
)
|
||||
|
||||
// Handler serves the crawler HTTP API.
|
||||
type Handler struct {
|
||||
pool *pgxpool.Pool
|
||||
crawler *crawler.Crawler
|
||||
}
|
||||
|
||||
// NewHandler creates a new handler.
|
||||
func NewHandler(pool *pgxpool.Pool, c *crawler.Crawler) *Handler {
|
||||
return &Handler{pool: pool, crawler: c}
|
||||
}
|
||||
|
||||
// StartCrawl handles POST /api/v1/crawl
|
||||
func (h *Handler) StartCrawl(w http.ResponseWriter, r *http.Request) {
|
||||
var body struct {
|
||||
StartURL string `json:"startUrl"`
|
||||
Config *crawler.CrawlConfig `json:"config"`
|
||||
WebhookURL string `json:"webhookUrl"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
|
||||
writeJSON(w, http.StatusBadRequest, map[string]string{"error": "invalid request"})
|
||||
return
|
||||
}
|
||||
|
||||
if body.StartURL == "" {
|
||||
writeJSON(w, http.StatusBadRequest, map[string]string{"error": "startUrl is required"})
|
||||
return
|
||||
}
|
||||
|
||||
parsed, err := url.Parse(body.StartURL)
|
||||
if err != nil || parsed.Host == "" {
|
||||
writeJSON(w, http.StatusBadRequest, map[string]string{"error": "invalid URL"})
|
||||
return
|
||||
}
|
||||
|
||||
// Defaults
|
||||
cfg := crawler.CrawlConfig{
|
||||
MaxDepth: 3,
|
||||
MaxPages: 100,
|
||||
RateLimit: 2,
|
||||
RespectRobots: true,
|
||||
OutputFormat: "markdown",
|
||||
}
|
||||
if body.Config != nil {
|
||||
if body.Config.MaxDepth > 0 {
|
||||
cfg.MaxDepth = body.Config.MaxDepth
|
||||
}
|
||||
if body.Config.MaxPages > 0 {
|
||||
cfg.MaxPages = body.Config.MaxPages
|
||||
}
|
||||
if body.Config.RateLimit > 0 {
|
||||
cfg.RateLimit = body.Config.RateLimit
|
||||
}
|
||||
cfg.RespectRobots = body.Config.RespectRobots
|
||||
cfg.IncludePatterns = body.Config.IncludePatterns
|
||||
cfg.ExcludePatterns = body.Config.ExcludePatterns
|
||||
cfg.Selectors = body.Config.Selectors
|
||||
if body.Config.OutputFormat != "" {
|
||||
cfg.OutputFormat = body.Config.OutputFormat
|
||||
}
|
||||
}
|
||||
|
||||
// Insert job
|
||||
var jobID string
|
||||
configJSON, _ := json.Marshal(cfg)
|
||||
err = h.pool.QueryRow(r.Context(), `
|
||||
INSERT INTO crawler.crawl_jobs (start_url, domain, max_depth, max_pages, rate_limit, respect_robots, selectors, output, status)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, 'pending')
|
||||
RETURNING id
|
||||
`, body.StartURL, parsed.Host, cfg.MaxDepth, cfg.MaxPages, cfg.RateLimit, cfg.RespectRobots,
|
||||
string(configJSON), fmt.Sprintf(`{"format":"%s"}`, cfg.OutputFormat)).Scan(&jobID)
|
||||
if err != nil {
|
||||
slog.Error("create job failed", "error", err)
|
||||
writeJSON(w, http.StatusInternalServerError, map[string]string{"error": "failed to create job"})
|
||||
return
|
||||
}
|
||||
|
||||
// Start crawl (use background context so it outlives the HTTP request)
|
||||
if err := h.crawler.StartJob(context.Background(), jobID, body.StartURL, cfg); err != nil {
|
||||
writeJSON(w, http.StatusInternalServerError, map[string]string{"error": "failed to start crawl"})
|
||||
return
|
||||
}
|
||||
|
||||
writeJSON(w, http.StatusCreated, map[string]any{
|
||||
"jobId": jobID,
|
||||
"status": "running",
|
||||
"startUrl": body.StartURL,
|
||||
"domain": parsed.Host,
|
||||
"config": cfg,
|
||||
})
|
||||
}
|
||||
|
||||
// GetJob handles GET /api/v1/crawl/{jobId}
|
||||
func (h *Handler) GetJob(w http.ResponseWriter, r *http.Request) {
|
||||
jobID := r.PathValue("jobId")
|
||||
|
||||
var job struct {
|
||||
ID string `json:"jobId"`
|
||||
StartURL string `json:"startUrl"`
|
||||
Domain string `json:"domain"`
|
||||
Status string `json:"status"`
|
||||
Progress string `json:"progress"`
|
||||
Error *string `json:"error"`
|
||||
StartedAt *time.Time `json:"startedAt"`
|
||||
CompletedAt *time.Time `json:"completedAt"`
|
||||
CreatedAt time.Time `json:"createdAt"`
|
||||
}
|
||||
|
||||
err := h.pool.QueryRow(r.Context(), `
|
||||
SELECT id, start_url, domain, status, COALESCE(progress::text, '{}'), error, started_at, completed_at, created_at
|
||||
FROM crawler.crawl_jobs WHERE id = $1
|
||||
`, jobID).Scan(&job.ID, &job.StartURL, &job.Domain, &job.Status, &job.Progress, &job.Error, &job.StartedAt, &job.CompletedAt, &job.CreatedAt)
|
||||
if err != nil {
|
||||
writeJSON(w, http.StatusNotFound, map[string]string{"error": "job not found"})
|
||||
return
|
||||
}
|
||||
|
||||
writeJSON(w, http.StatusOK, job)
|
||||
}
|
||||
|
||||
// GetJobResults handles GET /api/v1/crawl/{jobId}/results
|
||||
func (h *Handler) GetJobResults(w http.ResponseWriter, r *http.Request) {
|
||||
jobID := r.PathValue("jobId")
|
||||
page, _ := strconv.Atoi(r.URL.Query().Get("page"))
|
||||
limit, _ := strconv.Atoi(r.URL.Query().Get("limit"))
|
||||
if page < 1 {
|
||||
page = 1
|
||||
}
|
||||
if limit < 1 || limit > 100 {
|
||||
limit = 50
|
||||
}
|
||||
offset := (page - 1) * limit
|
||||
|
||||
// Count total
|
||||
var total int
|
||||
h.pool.QueryRow(r.Context(), `SELECT COUNT(*) FROM crawler.crawl_results WHERE job_id = $1`, jobID).Scan(&total)
|
||||
|
||||
rows, err := h.pool.Query(r.Context(), `
|
||||
SELECT id, url, parent_url, depth, title, content, markdown, links, metadata, status_code, error, created_at
|
||||
FROM crawler.crawl_results WHERE job_id = $1 ORDER BY created_at LIMIT $2 OFFSET $3
|
||||
`, jobID, limit, offset)
|
||||
if err != nil {
|
||||
writeJSON(w, http.StatusInternalServerError, map[string]string{"error": "query failed"})
|
||||
return
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var results []map[string]any
|
||||
for rows.Next() {
|
||||
var id, u string
|
||||
var parentURL, title, content, markdown, links, metadata, errMsg *string
|
||||
var depth, statusCode int
|
||||
var createdAt time.Time
|
||||
|
||||
rows.Scan(&id, &u, &parentURL, &depth, &title, &content, &markdown, &links, &metadata, &statusCode, &errMsg, &createdAt)
|
||||
results = append(results, map[string]any{
|
||||
"id": id, "url": u, "parentUrl": parentURL, "depth": depth,
|
||||
"title": title, "content": content, "markdown": markdown,
|
||||
"links": links, "metadata": metadata,
|
||||
"statusCode": statusCode, "error": errMsg, "createdAt": createdAt,
|
||||
})
|
||||
}
|
||||
|
||||
if results == nil {
|
||||
results = []map[string]any{}
|
||||
}
|
||||
|
||||
writeJSON(w, http.StatusOK, map[string]any{
|
||||
"results": results,
|
||||
"pagination": map[string]any{
|
||||
"page": page, "limit": limit, "total": total,
|
||||
"totalPages": (total + limit - 1) / limit,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
// ListJobs handles GET /api/v1/crawl
|
||||
func (h *Handler) ListJobs(w http.ResponseWriter, r *http.Request) {
|
||||
page, _ := strconv.Atoi(r.URL.Query().Get("page"))
|
||||
limit, _ := strconv.Atoi(r.URL.Query().Get("limit"))
|
||||
if page < 1 {
|
||||
page = 1
|
||||
}
|
||||
if limit < 1 || limit > 100 {
|
||||
limit = 20
|
||||
}
|
||||
offset := (page - 1) * limit
|
||||
status := r.URL.Query().Get("status")
|
||||
|
||||
var total int
|
||||
if status != "" {
|
||||
h.pool.QueryRow(r.Context(), `SELECT COUNT(*) FROM crawler.crawl_jobs WHERE status=$1`, status).Scan(&total)
|
||||
} else {
|
||||
h.pool.QueryRow(r.Context(), `SELECT COUNT(*) FROM crawler.crawl_jobs`).Scan(&total)
|
||||
}
|
||||
|
||||
query := `SELECT id, start_url, domain, status, COALESCE(progress::text,'{}'), created_at FROM crawler.crawl_jobs ORDER BY created_at DESC LIMIT $1 OFFSET $2`
|
||||
args := []any{limit, offset}
|
||||
if status != "" {
|
||||
query = `SELECT id, start_url, domain, status, COALESCE(progress::text,'{}'), created_at FROM crawler.crawl_jobs WHERE status=$3 ORDER BY created_at DESC LIMIT $1 OFFSET $2`
|
||||
args = append(args, status)
|
||||
}
|
||||
|
||||
rows, err := h.pool.Query(r.Context(), query, args...)
|
||||
if err != nil {
|
||||
writeJSON(w, http.StatusInternalServerError, map[string]string{"error": "query failed"})
|
||||
return
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var jobs []map[string]any
|
||||
for rows.Next() {
|
||||
var id, startURL, domain, st, progress string
|
||||
var createdAt time.Time
|
||||
rows.Scan(&id, &startURL, &domain, &st, &progress, &createdAt)
|
||||
jobs = append(jobs, map[string]any{
|
||||
"jobId": id, "startUrl": startURL, "domain": domain,
|
||||
"status": st, "progress": progress, "createdAt": createdAt,
|
||||
})
|
||||
}
|
||||
if jobs == nil {
|
||||
jobs = []map[string]any{}
|
||||
}
|
||||
|
||||
writeJSON(w, http.StatusOK, map[string]any{
|
||||
"results": jobs,
|
||||
"pagination": map[string]any{
|
||||
"page": page, "limit": limit, "total": total,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
// CancelJob handles DELETE /api/v1/crawl/{jobId}
|
||||
func (h *Handler) CancelJob(w http.ResponseWriter, r *http.Request) {
|
||||
jobID := r.PathValue("jobId")
|
||||
h.crawler.CancelJob(jobID)
|
||||
h.pool.Exec(r.Context(), `UPDATE crawler.crawl_jobs SET status='cancelled', updated_at=NOW() WHERE id=$1`, jobID)
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
}
|
||||
|
||||
// Health handles GET /health
|
||||
func (h *Handler) Health(w http.ResponseWriter, r *http.Request) {
|
||||
dbOK := "ok"
|
||||
if err := h.pool.Ping(r.Context()); err != nil {
|
||||
dbOK = "error"
|
||||
}
|
||||
status := "ok"
|
||||
if dbOK != "ok" {
|
||||
status = "degraded"
|
||||
}
|
||||
writeJSON(w, http.StatusOK, map[string]any{
|
||||
"status": status, "service": "mana-crawler", "database": dbOK,
|
||||
"timestamp": time.Now().UTC().Format(time.RFC3339),
|
||||
})
|
||||
}
|
||||
|
||||
// Metrics handles GET /metrics
|
||||
func (h *Handler) Metrics(w http.ResponseWriter, r *http.Request) {
|
||||
var running, completed, failed int
|
||||
h.pool.QueryRow(r.Context(), `SELECT COUNT(*) FROM crawler.crawl_jobs WHERE status='running'`).Scan(&running)
|
||||
h.pool.QueryRow(r.Context(), `SELECT COUNT(*) FROM crawler.crawl_jobs WHERE status='completed'`).Scan(&completed)
|
||||
h.pool.QueryRow(r.Context(), `SELECT COUNT(*) FROM crawler.crawl_jobs WHERE status='failed'`).Scan(&failed)
|
||||
|
||||
w.Header().Set("Content-Type", "text/plain")
|
||||
fmt.Fprintf(w, "# HELP mana_crawler_jobs Crawl jobs by status\n")
|
||||
fmt.Fprintf(w, "# TYPE mana_crawler_jobs gauge\n")
|
||||
fmt.Fprintf(w, "mana_crawler_jobs{status=\"running\"} %d\n", running)
|
||||
fmt.Fprintf(w, "mana_crawler_jobs{status=\"completed\"} %d\n", completed)
|
||||
fmt.Fprintf(w, "mana_crawler_jobs{status=\"failed\"} %d\n", failed)
|
||||
}
|
||||
|
||||
// writeJSON sends data as a JSON response with the given HTTP status.
//
// The payload is encoded before anything is written, so an encoding failure
// can still be reported: it is logged and the client receives a 500 with a
// generic error body instead of the requested status with an empty or
// truncated body. The body ends with a newline, matching json.Encoder output.
// A failure while writing to the client is only logged, because the status
// line has already been sent at that point.
func writeJSON(w http.ResponseWriter, status int, data any) {
	payload, err := json.Marshal(data)
	if err != nil {
		slog.Error("encode response failed", "error", err)
		status = http.StatusInternalServerError
		payload = []byte(`{"error":"internal error"}`)
	}

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(status)
	if _, err := w.Write(append(payload, '\n')); err != nil {
		slog.Warn("write response failed", "error", err)
	}
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue