mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-23 20:36:42 +02:00
refactor(services): rename Go services, remove -go suffix
Renamed services: mana-search-go → mana-search, mana-notify-go → mana-notify, mana-crawler-go → mana-crawler, mana-api-gateway-go → mana-api-gateway. The legacy NestJS versions have been deleted, so the suffix is no longer needed. Updated all references in docker-compose, CLAUDE.md, package.json, Forgejo workflows, and service package.json files. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
79080d6654
commit
7e931b1c6d
90 changed files with 41 additions and 38 deletions
219
services/mana-crawler/internal/parser/parser.go
Normal file
219
services/mana-crawler/internal/parser/parser.go
Normal file
|
|
@ -0,0 +1,219 @@
|
|||
package parser
|
||||
|
||||
import (
|
||||
"net/url"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
)
|
||||
|
||||
// Result is the structured output Parse produces for a single HTML page.
type Result struct {
	// Title is the page title selected by extractTitle.
	Title string

	// Content is the main content reduced to plain text by cleanText.
	Content string

	// Markdown is the main content converted by htmlToMarkdown.
	Markdown string

	// Links lists the URLs collected by extractLinks.
	Links []string

	// Metadata holds the key/value pairs collected by extractMetadata.
	Metadata map[string]string
}
|
||||
|
||||
// Selectors carries optional caller-supplied CSS selectors. An empty field
// means "use the built-in default" for that piece of the extraction.
type Selectors struct {
	// Title selects the element whose text becomes the page title.
	Title string `json:"title"`

	// Content selects the element whose inner HTML is the main content.
	Content string `json:"content"`

	// Links selects the elements whose href attributes are collected.
	Links string `json:"links"`
}
|
||||
|
||||
// Shared patterns used when stripping HTML down to text.
var (
	// reScript matches a whole <script> element, case-insensitively and
	// across line breaks.
	reScript = regexp.MustCompile(`(?is)<script.*?</script>`)

	// reStyle matches a whole <style> element, case-insensitively and
	// across line breaks.
	reStyle = regexp.MustCompile(`(?is)<style.*?</style>`)

	// reTags matches any single HTML tag.
	reTags = regexp.MustCompile(`<[^>]+>`)

	// reSpaces matches a run of whitespace.
	reSpaces = regexp.MustCompile(`\s+`)
)
|
||||
|
||||
// Parse extracts content from HTML.
|
||||
func Parse(html string, baseURL string, selectors *Selectors) (*Result, error) {
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
base, _ := url.Parse(baseURL)
|
||||
|
||||
result := &Result{
|
||||
Title: extractTitle(doc, selectors),
|
||||
Metadata: extractMetadata(doc),
|
||||
}
|
||||
|
||||
// Extract content
|
||||
contentHTML := extractContentHTML(doc, selectors)
|
||||
result.Content = cleanText(contentHTML)
|
||||
result.Markdown = htmlToMarkdown(contentHTML)
|
||||
|
||||
// Extract links
|
||||
result.Links = extractLinks(doc, base, selectors)
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func extractTitle(doc *goquery.Document, sel *Selectors) string {
|
||||
if sel != nil && sel.Title != "" {
|
||||
if t := doc.Find(sel.Title).First().Text(); t != "" {
|
||||
return strings.TrimSpace(t)
|
||||
}
|
||||
}
|
||||
if t := doc.Find("h1").First().Text(); t != "" {
|
||||
return strings.TrimSpace(t)
|
||||
}
|
||||
if t := doc.Find("title").First().Text(); t != "" {
|
||||
return strings.TrimSpace(t)
|
||||
}
|
||||
if t, _ := doc.Find(`meta[property="og:title"]`).Attr("content"); t != "" {
|
||||
return t
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func extractContentHTML(doc *goquery.Document, sel *Selectors) string {
|
||||
if sel != nil && sel.Content != "" {
|
||||
if h, err := doc.Find(sel.Content).First().Html(); err == nil && h != "" {
|
||||
return h
|
||||
}
|
||||
}
|
||||
|
||||
contentSelectors := []string{
|
||||
"article", "main", `[role="main"]`,
|
||||
".main-content", ".content", ".post-content", ".article-content", ".entry-content",
|
||||
"#content", "#main",
|
||||
}
|
||||
for _, s := range contentSelectors {
|
||||
if h, err := doc.Find(s).First().Html(); err == nil && h != "" {
|
||||
return h
|
||||
}
|
||||
}
|
||||
|
||||
h, _ := doc.Find("body").Html()
|
||||
return h
|
||||
}
|
||||
|
||||
func extractLinks(doc *goquery.Document, base *url.URL, sel *Selectors) []string {
|
||||
linkSel := "a[href]"
|
||||
if sel != nil && sel.Links != "" {
|
||||
linkSel = sel.Links
|
||||
}
|
||||
|
||||
seen := make(map[string]bool)
|
||||
var links []string
|
||||
|
||||
doc.Find(linkSel).Each(func(_ int, s *goquery.Selection) {
|
||||
href, exists := s.Attr("href")
|
||||
if !exists {
|
||||
return
|
||||
}
|
||||
href = strings.TrimSpace(href)
|
||||
|
||||
// Skip non-HTTP
|
||||
if strings.HasPrefix(href, "javascript:") || strings.HasPrefix(href, "mailto:") ||
|
||||
strings.HasPrefix(href, "tel:") || strings.HasPrefix(href, "#") {
|
||||
return
|
||||
}
|
||||
|
||||
// Resolve relative
|
||||
parsed, err := url.Parse(href)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
resolved := base.ResolveReference(parsed)
|
||||
|
||||
// Same origin only
|
||||
if resolved.Host != base.Host {
|
||||
return
|
||||
}
|
||||
|
||||
resolved.Fragment = ""
|
||||
u := resolved.String()
|
||||
if !seen[u] {
|
||||
seen[u] = true
|
||||
links = append(links, u)
|
||||
}
|
||||
})
|
||||
|
||||
return links
|
||||
}
|
||||
|
||||
func extractMetadata(doc *goquery.Document) map[string]string {
|
||||
meta := make(map[string]string)
|
||||
|
||||
// OpenGraph
|
||||
doc.Find(`meta[property^="og:"]`).Each(func(_ int, s *goquery.Selection) {
|
||||
prop, _ := s.Attr("property")
|
||||
content, _ := s.Attr("content")
|
||||
if prop != "" && content != "" {
|
||||
meta[prop] = content
|
||||
}
|
||||
})
|
||||
|
||||
// Standard meta
|
||||
for _, name := range []string{"description", "keywords", "author"} {
|
||||
if content, _ := doc.Find(`meta[name="` + name + `"]`).Attr("content"); content != "" {
|
||||
meta[name] = content
|
||||
}
|
||||
}
|
||||
|
||||
// Canonical
|
||||
if href, _ := doc.Find(`link[rel="canonical"]`).Attr("href"); href != "" {
|
||||
meta["canonical"] = href
|
||||
}
|
||||
|
||||
return meta
|
||||
}
|
||||
|
||||
func cleanText(html string) string {
|
||||
text := reScript.ReplaceAllString(html, "")
|
||||
text = reStyle.ReplaceAllString(text, "")
|
||||
text = reTags.ReplaceAllString(text, " ")
|
||||
text = strings.ReplaceAll(text, " ", " ")
|
||||
text = strings.ReplaceAll(text, "&", "&")
|
||||
text = strings.ReplaceAll(text, "<", "<")
|
||||
text = strings.ReplaceAll(text, ">", ">")
|
||||
text = strings.ReplaceAll(text, """, `"`)
|
||||
text = reSpaces.ReplaceAllString(text, " ")
|
||||
return strings.TrimSpace(text)
|
||||
}
|
||||
|
||||
// htmlToMarkdown does a basic HTML → Markdown conversion.
|
||||
func htmlToMarkdown(html string) string {
|
||||
// Remove scripts/styles
|
||||
md := reScript.ReplaceAllString(html, "")
|
||||
md = reStyle.ReplaceAllString(md, "")
|
||||
|
||||
// Headings
|
||||
for i := 6; i >= 1; i-- {
|
||||
prefix := strings.Repeat("#", i)
|
||||
re := regexp.MustCompile(`(?i)<h` + strings.Repeat("", 0) + string(rune('0'+i)) + `[^>]*>(.*?)</h` + string(rune('0'+i)) + `>`)
|
||||
md = re.ReplaceAllString(md, "\n"+prefix+" $1\n")
|
||||
}
|
||||
// Paragraphs
|
||||
md = regexp.MustCompile(`(?i)<p[^>]*>`).ReplaceAllString(md, "\n")
|
||||
md = strings.ReplaceAll(md, "</p>", "\n")
|
||||
// Line breaks
|
||||
md = regexp.MustCompile(`(?i)<br\s*/?\s*>`).ReplaceAllString(md, "\n")
|
||||
// Bold
|
||||
md = regexp.MustCompile(`(?i)<(?:strong|b)>(.*?)</(?:strong|b)>`).ReplaceAllString(md, "**$1**")
|
||||
// Italic
|
||||
md = regexp.MustCompile(`(?i)<(?:em|i)>(.*?)</(?:em|i)>`).ReplaceAllString(md, "*$1*")
|
||||
// Code
|
||||
md = regexp.MustCompile(`(?i)<code>(.*?)</code>`).ReplaceAllString(md, "`$1`")
|
||||
// Pre
|
||||
md = regexp.MustCompile(`(?i)<pre[^>]*>(.*?)</pre>`).ReplaceAllString(md, "\n```\n$1\n```\n")
|
||||
// Links
|
||||
md = regexp.MustCompile(`(?i)<a[^>]*href="([^"]*)"[^>]*>(.*?)</a>`).ReplaceAllString(md, "[$2]($1)")
|
||||
// Lists
|
||||
md = regexp.MustCompile(`(?i)<li[^>]*>`).ReplaceAllString(md, "- ")
|
||||
md = strings.ReplaceAll(md, "</li>", "\n")
|
||||
// Remove remaining tags
|
||||
md = reTags.ReplaceAllString(md, "")
|
||||
// Clean up whitespace
|
||||
md = regexp.MustCompile(`\n{3,}`).ReplaceAllString(md, "\n\n")
|
||||
return strings.TrimSpace(md)
|
||||
}
|
||||
131
services/mana-crawler/internal/parser/parser_test.go
Normal file
131
services/mana-crawler/internal/parser/parser_test.go
Normal file
|
|
@ -0,0 +1,131 @@
|
|||
package parser
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestParse_Title(t *testing.T) {
|
||||
html := `<html><head><title>Page Title</title></head><body><h1>Main Heading</h1><p>Content</p></body></html>`
|
||||
result, err := Parse(html, "https://example.com", nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if result.Title != "Main Heading" {
|
||||
t.Errorf("title = %q, want %q", result.Title, "Main Heading")
|
||||
}
|
||||
}
|
||||
|
||||
func TestParse_Links(t *testing.T) {
|
||||
html := `<html><body>
|
||||
<a href="/page1">Page 1</a>
|
||||
<a href="https://example.com/page2">Page 2</a>
|
||||
<a href="https://other.com/ext">External</a>
|
||||
<a href="mailto:test@test.com">Email</a>
|
||||
<a href="#section">Anchor</a>
|
||||
</body></html>`
|
||||
|
||||
result, err := Parse(html, "https://example.com", nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Should have page1 and page2 (same origin), not external, mailto, or anchor
|
||||
if len(result.Links) != 2 {
|
||||
t.Errorf("links count = %d, want 2, got: %v", len(result.Links), result.Links)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParse_Metadata(t *testing.T) {
|
||||
html := `<html><head>
|
||||
<meta name="description" content="Test description">
|
||||
<meta property="og:title" content="OG Title">
|
||||
<link rel="canonical" href="https://example.com/canonical">
|
||||
</head><body></body></html>`
|
||||
|
||||
result, err := Parse(html, "https://example.com", nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if result.Metadata["description"] != "Test description" {
|
||||
t.Errorf("description = %q", result.Metadata["description"])
|
||||
}
|
||||
if result.Metadata["og:title"] != "OG Title" {
|
||||
t.Errorf("og:title = %q", result.Metadata["og:title"])
|
||||
}
|
||||
if result.Metadata["canonical"] != "https://example.com/canonical" {
|
||||
t.Errorf("canonical = %q", result.Metadata["canonical"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestCleanText(t *testing.T) {
|
||||
html := `<p>Hello <strong>world</strong></p><script>alert('x')</script>`
|
||||
got := cleanText(html)
|
||||
if got != "Hello world" {
|
||||
t.Errorf("cleanText = %q, want %q", got, "Hello world")
|
||||
}
|
||||
}
|
||||
|
||||
func TestMatchesPatterns(t *testing.T) {
|
||||
tests := []struct {
|
||||
url string
|
||||
include []string
|
||||
exclude []string
|
||||
want bool
|
||||
}{
|
||||
{"https://example.com/docs/page", []string{"/docs/"}, nil, true},
|
||||
{"https://example.com/api/v1", []string{"/docs/"}, nil, false},
|
||||
{"https://example.com/docs/page", nil, []string{"/api/"}, true},
|
||||
{"https://example.com/api/page", nil, []string{"/api/"}, false},
|
||||
{"https://example.com/any", nil, nil, true},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
// Use the crawler package's matchesPatterns — testing indirectly via parser
|
||||
// Here we just test the logic inline
|
||||
got := matchPatterns(tt.url, tt.include, tt.exclude)
|
||||
if got != tt.want {
|
||||
t.Errorf("matchPatterns(%q, %v, %v) = %v, want %v", tt.url, tt.include, tt.exclude, got, tt.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func matchPatterns(u string, include, exclude []string) bool {
|
||||
if len(include) > 0 {
|
||||
matched := false
|
||||
for _, p := range include {
|
||||
if len(p) > 0 && containsPattern(u, p) {
|
||||
matched = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !matched {
|
||||
return false
|
||||
}
|
||||
}
|
||||
for _, p := range exclude {
|
||||
if containsPattern(u, p) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func containsPattern(u, pattern string) bool {
|
||||
// Strip wildcards
|
||||
p := pattern
|
||||
if len(p) > 0 && p[0] == '*' {
|
||||
p = p[1:]
|
||||
}
|
||||
if len(p) > 0 && p[len(p)-1] == '*' {
|
||||
p = p[:len(p)-1]
|
||||
}
|
||||
return len(p) > 0 && len(u) > 0 && indexOf(u, p) >= 0
|
||||
}
|
||||
|
||||
// indexOf returns the byte offset of the first occurrence of sub in s, or -1
// when sub does not occur. An empty sub is found at offset 0.
func indexOf(s, sub string) int {
	width := len(sub)
	// Slide a window of len(sub) bytes across s.
	for start, end := 0, width; end <= len(s); start, end = start+1, end+1 {
		if s[start:end] == sub {
			return start
		}
	}
	return -1
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue