mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-14 19:41:09 +02:00
fix(crawler): skip TLS verification for outgoing crawl requests
Required for Docker Desktop for Mac (TLS proxy) and for sites with self-signed or expired certificates. Crawlers routinely need this.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
585cdc1753
commit
313779f439
1 changed file with 13 additions and 1 deletion
|
|
@@ -2,6 +2,7 @@ package crawler
|
|||
|
||||
import (
|
||||
"context"
|
||||
"crypto/tls"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
|
|
@@ -65,11 +66,22 @@ type Crawler struct {
|
|||
|
||||
// New creates a new Crawler.
|
||||
func New(pool *pgxpool.Pool, robotsChecker *robots.Checker, userAgent string, concurrency int, timeout time.Duration) *Crawler {
|
||||
// Skip TLS verification for outgoing crawl requests.
|
||||
// Required in Docker Desktop for Mac (TLS proxy) and for crawling
|
||||
// sites with self-signed or expired certificates.
|
||||
transport := &http.Transport{
|
||||
TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
|
||||
MaxIdleConns: 100,
|
||||
MaxIdleConnsPerHost: 10,
|
||||
IdleConnTimeout: 90 * time.Second,
|
||||
}
|
||||
|
||||
return &Crawler{
|
||||
pool: pool,
|
||||
robots: robotsChecker,
|
||||
httpClient: &http.Client{
|
||||
Timeout: timeout,
|
||||
Timeout: timeout,
|
||||
Transport: transport,
|
||||
CheckRedirect: func(req *http.Request, via []*http.Request) error {
|
||||
if len(via) >= 10 {
|
||||
return fmt.Errorf("too many redirects")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue