mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-18 21:01:23 +02:00
Goroutine-based crawler replacing NestJS mana-crawler:
- goquery for HTML parsing (title, content, links, metadata)
- robots.txt checker with 24h cache
- Worker pool with configurable concurrency + rate limiting
- PostgreSQL for job/result storage
- Same API surface: POST/GET/DELETE /api/v1/crawl

11 MB binary, ~15 MB Docker image vs ~200 MB NestJS.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
23 lines
594 B
Docker
23 lines
594 B
Docker
# ---- Stage 1: compile the crawler binary ----
# All COPY paths are relative to the build context, which therefore has to
# contain the services/mana-crawler-go/ directory.
FROM golang:1.25-alpine AS builder

WORKDIR /app

# Bring in only the module manifests first: the dependency-download layer
# below stays cached until go.mod or go.sum changes.
COPY services/mana-crawler-go/go.mod services/mana-crawler-go/go.sum ./
RUN go mod download

# Now the service sources; edits here invalidate only the layers from this
# point on.
COPY services/mana-crawler-go/ .

# cgo is disabled so the resulting binary does not link against a C library;
# the linker flags -s and -w omit the symbol table and DWARF debug data.
RUN CGO_ENABLED=0 GOOS=linux \
    go build \
      -ldflags="-s -w" \
      -o /mana-crawler \
      ./cmd/server
# Runtime stage
# Minimal Alpine image that carries only the compiled binary from the
# builder stage plus the few OS packages it needs at run time.
FROM alpine:3.21

# ca-certificates: trusted root CAs for outbound TLS connections.
# tzdata:          time zone database.
# The dedicated system user/group is created in the same layer so the service
# does not run as root. A fixed numeric UID/GID lets orchestrators verify the
# non-root identity (e.g. Kubernetes runAsNonRoot).
RUN apk --no-cache add ca-certificates tzdata \
    && addgroup -S -g 10001 mana \
    && adduser -S -D -H -u 10001 -G mana mana

COPY --from=builder /mana-crawler /usr/local/bin/mana-crawler

# Drop privileges for everything that follows, including the health check
# and the entrypoint. Port 3023 is unprivileged, so no extra capabilities are
# required to bind it.
USER 10001:10001

# Documentation only: the port probed by the health check below.
EXPOSE 3023

# Probe the service's /health endpoint with BusyBox wget (shipped in the base
# image); --spider checks the URL without downloading the response body.
HEALTHCHECK --interval=30s --timeout=5s --start-period=5s --retries=3 \
    CMD wget -q --spider http://localhost:3023/health || exit 1

# Exec form: the binary is started directly (no wrapping shell) and is
# resolved via PATH to /usr/local/bin/mana-crawler.
ENTRYPOINT ["mana-crawler"]