refactor(services): rename Go services, remove -go suffix

Renames: mana-search-go → mana-search, mana-notify-go → mana-notify, mana-crawler-go → mana-crawler, mana-api-gateway-go → mana-api-gateway. The legacy NestJS versions have been deleted, so the -go suffix is no longer needed. Updated all references in docker-compose, CLAUDE.md, package.json, the Forgejo workflows, and the service package.json files. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-23 20:36:42 +02:00 · 2026-03-28 10:18:40 +01:00 · 2026-03-28 10:18:40 +01:00 · 7e931b1c6d
commit 7e931b1c6d
parent 79080d6654
90 changed files with 41 additions and 38 deletions
--- a/services/mana-crawler/internal/parser/parser.go
+++ b/services/mana-crawler/internal/parser/parser.go
@ -0,0 +1,219 @@
+package parser
+
+import (
+	"net/url"
+	"regexp"
+	"strings"
+
+	"github.com/PuerkitoBio/goquery"
+)
+
+// Result holds the extracted content from a page.
+// It is the value returned by Parse.
+type Result struct {
+	// Title is the page title; empty when none could be found.
+	Title    string
+	// Content is the main content as plain text: tags removed and runs of
+	// whitespace collapsed to single spaces.
+	Content  string
+	// Markdown is a basic Markdown rendering of the same main-content HTML.
+	Markdown string
+	// Links lists the absolute URLs of links that share the base URL's host,
+	// with fragments removed and duplicates dropped.
+	Links    []string
+	// Metadata maps OpenGraph properties ("og:*"), the "description",
+	// "keywords" and "author" meta tags, and "canonical" to their values.
+	// Only non-empty values are stored.
+	Metadata map[string]string
+}
+
+// Selectors defines custom CSS selectors for extraction.
+// Every field is optional; an empty string leaves the built-in rule in place.
+type Selectors struct {
+	// Title is tried first when looking for the page title, before the
+	// built-in h1 / title / og:title fallbacks.
+	Title   string `json:"title"`
+	// Content selects the main-content element; the inner HTML of its first
+	// match is used when it is non-empty.
+	Content string `json:"content"`
+	// Links replaces the default "a[href]" selector. Matched elements still
+	// need an href attribute to be considered.
+	Links   string `json:"links"`
+}
+
+// Regular expressions shared by cleanText and htmlToMarkdown.
+var (
+	// reScript matches a complete <script> element including its body;
+	// case-insensitive, and "." also matches newlines.
+	reScript = regexp.MustCompile(`(?is)<script.*?</script>`)
+	// reStyle matches a complete <style> element including its body;
+	// case-insensitive, and "." also matches newlines.
+	reStyle  = regexp.MustCompile(`(?is)<style.*?</style>`)
+	// reTags matches any single tag, i.e. "<" up to the next ">".
+	reTags   = regexp.MustCompile(`<[^>]+>`)
+	// reSpaces matches a run of one or more whitespace characters.
+	reSpaces = regexp.MustCompile(`\s+`)
+)
+
+// Parse extracts the title, main content (as plain text and as Markdown),
+// same-host links and metadata from an HTML document.
+//
+// baseURL is the address the page came from; relative links are resolved
+// against it. selectors may be nil, in which case only the built-in
+// extraction rules are used.
+//
+// An error is returned when the HTML cannot be read or when baseURL is not a
+// valid URL.
+func Parse(html string, baseURL string, selectors *Selectors) (*Result, error) {
+	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
+	if err != nil {
+		return nil, err
+	}
+
+	// Fail early on a malformed base URL: url.Parse returns a nil *url.URL on
+	// error, and resolving links against it would dereference a nil pointer.
+	base, err := url.Parse(baseURL)
+	if err != nil {
+		return nil, err
+	}
+
+	result := &Result{
+		Title:    extractTitle(doc, selectors),
+		Metadata: extractMetadata(doc),
+	}
+
+	// The same content HTML feeds both the plain-text and Markdown renderings.
+	contentHTML := extractContentHTML(doc, selectors)
+	result.Content = cleanText(contentHTML)
+	result.Markdown = htmlToMarkdown(contentHTML)
+
+	result.Links = extractLinks(doc, base, selectors)
+
+	return result, nil
+}
+
+// extractTitle returns the page title with surrounding whitespace removed.
+//
+// Candidates are tried in order: the custom title selector (if any), the
+// first <h1>, the <title> element, and finally the og:title meta tag. A
+// candidate that is empty or consists only of whitespace is skipped so that
+// the next one gets a chance. It returns "" when nothing usable is found.
+func extractTitle(doc *goquery.Document, sel *Selectors) string {
+	candidates := make([]string, 0, 3)
+	if sel != nil && sel.Title != "" {
+		candidates = append(candidates, sel.Title)
+	}
+	candidates = append(candidates, "h1", "title")
+
+	for _, css := range candidates {
+		// Trim before testing for emptiness; otherwise a whitespace-only
+		// element would end the search with an empty title.
+		if t := strings.TrimSpace(doc.Find(css).First().Text()); t != "" {
+			return t
+		}
+	}
+
+	// A missing attribute yields "", so the "exists" result is not needed.
+	og, _ := doc.Find(`meta[property="og:title"]`).Attr("content")
+	return strings.TrimSpace(og)
+}
+
+// extractContentHTML returns the inner HTML of the element that most likely
+// holds the page's main content.
+//
+// The custom content selector, when set, is tried first, followed by a fixed
+// list of common content containers. The first selector whose first match has
+// non-empty inner HTML wins; if none does, the inner HTML of <body> is
+// returned.
+func extractContentHTML(doc *goquery.Document, sel *Selectors) string {
+	candidates := []string{
+		"article", "main", `[role="main"]`,
+		".main-content", ".content", ".post-content", ".article-content", ".entry-content",
+		"#content", "#main",
+	}
+	if sel != nil && sel.Content != "" {
+		candidates = append([]string{sel.Content}, candidates...)
+	}
+
+	for _, css := range candidates {
+		inner, err := doc.Find(css).First().Html()
+		if err != nil || inner == "" {
+			continue
+		}
+		return inner
+	}
+
+	body, _ := doc.Find("body").Html()
+	return body
+}
+
+// extractLinks collects the links on the page that point to the same host as
+// base.
+//
+// Elements are selected with the custom links selector, or "a[href]" when
+// none is set. Each href is resolved against base, its fragment is dropped,
+// and duplicates are removed; document order is preserved. In-page anchors
+// ("#..."), hrefs that do not parse as URLs, and hrefs with an explicit scheme
+// other than http or https (javascript:, mailto:, tel:, data:, ftp:, ...) are
+// skipped. It returns nil when base is nil or no link qualifies.
+func extractLinks(doc *goquery.Document, base *url.URL, sel *Selectors) []string {
+	// Without a base URL there is nothing to resolve against or to compare
+	// hosts with, and ResolveReference would panic on the nil receiver.
+	if base == nil {
+		return nil
+	}
+
+	linkSel := "a[href]"
+	if sel != nil && sel.Links != "" {
+		linkSel = sel.Links
+	}
+
+	seen := make(map[string]struct{})
+	var links []string
+
+	doc.Find(linkSel).Each(func(_ int, s *goquery.Selection) {
+		href, ok := s.Attr("href")
+		if !ok {
+			return
+		}
+		href = strings.TrimSpace(href)
+
+		// Pure in-page anchors never lead to another document.
+		if strings.HasPrefix(href, "#") {
+			return
+		}
+
+		ref, err := url.Parse(href)
+		if err != nil {
+			return
+		}
+
+		// Skip non-HTTP targets. url.Parse lower-cases the scheme, so this
+		// also catches spellings such as "JavaScript:" or "MAILTO:". An empty
+		// scheme means a relative reference, which inherits base's scheme.
+		switch ref.Scheme {
+		case "", "http", "https":
+		default:
+			return
+		}
+
+		resolved := base.ResolveReference(ref)
+
+		// Same host only.
+		if resolved.Host != base.Host {
+			return
+		}
+
+		resolved.Fragment = ""
+		u := resolved.String()
+		if _, dup := seen[u]; dup {
+			return
+		}
+		seen[u] = struct{}{}
+		links = append(links, u)
+	})
+
+	return links
+}
+
+// extractMetadata gathers page metadata into a map: every OpenGraph property
+// (keyed by its "og:*" name), the "description", "keywords" and "author" meta
+// tags, and the canonical link (keyed "canonical"). Entries with an empty
+// value are left out; the returned map is never nil.
+func extractMetadata(doc *goquery.Document) map[string]string {
+	out := map[string]string{}
+
+	// put records a value unless it is empty.
+	put := func(key, value string) {
+		if value != "" {
+			out[key] = value
+		}
+	}
+
+	// OpenGraph properties.
+	doc.Find(`meta[property^="og:"]`).Each(func(_ int, tag *goquery.Selection) {
+		key, _ := tag.Attr("property")
+		value, _ := tag.Attr("content")
+		if key == "" {
+			return
+		}
+		put(key, value)
+	})
+
+	// Standard named meta tags.
+	standard := [...]string{"description", "keywords", "author"}
+	for i := range standard {
+		value, _ := doc.Find(`meta[name="` + standard[i] + `"]`).Attr("content")
+		put(standard[i], value)
+	}
+
+	// Canonical URL.
+	canonical, _ := doc.Find(`link[rel="canonical"]`).Attr("href")
+	put("canonical", canonical)
+
+	return out
+}
+
+func cleanText(html string) string {
+	text := reScript.ReplaceAllString(html, "")
+	text = reStyle.ReplaceAllString(text, "")
+	text = reTags.ReplaceAllString(text, " ")
+	text = strings.ReplaceAll(text, "&nbsp;", " ")
+	text = strings.ReplaceAll(text, "&amp;", "&")
+	text = strings.ReplaceAll(text, "&lt;", "<")
+	text = strings.ReplaceAll(text, "&gt;", ">")
+	text = strings.ReplaceAll(text, "&quot;", `"`)
+	text = reSpaces.ReplaceAllString(text, " ")
+	return strings.TrimSpace(text)
+}
+
+// htmlToMarkdown does a basic HTML → Markdown conversion.
+func htmlToMarkdown(html string) string {
+	// Remove scripts/styles
+	md := reScript.ReplaceAllString(html, "")
+	md = reStyle.ReplaceAllString(md, "")
+
+	// Headings
+	for i := 6; i >= 1; i-- {
+		prefix := strings.Repeat("#", i)
+		re := regexp.MustCompile(`(?i)<h` + strings.Repeat("", 0) + string(rune('0'+i)) + `[^>]*>(.*?)</h` + string(rune('0'+i)) + `>`)
+		md = re.ReplaceAllString(md, "\n"+prefix+" $1\n")
+	}
+	// Paragraphs
+	md = regexp.MustCompile(`(?i)<p[^>]*>`).ReplaceAllString(md, "\n")
+	md = strings.ReplaceAll(md, "</p>", "\n")
+	// Line breaks
+	md = regexp.MustCompile(`(?i)<br\s*/?\s*>`).ReplaceAllString(md, "\n")
+	// Bold
+	md = regexp.MustCompile(`(?i)<(?:strong|b)>(.*?)</(?:strong|b)>`).ReplaceAllString(md, "**$1**")
+	// Italic
+	md = regexp.MustCompile(`(?i)<(?:em|i)>(.*?)</(?:em|i)>`).ReplaceAllString(md, "*$1*")
+	// Code
+	md = regexp.MustCompile(`(?i)<code>(.*?)</code>`).ReplaceAllString(md, "`$1`")
+	// Pre
+	md = regexp.MustCompile(`(?i)<pre[^>]*>(.*?)</pre>`).ReplaceAllString(md, "\n```\n$1\n```\n")
+	// Links
+	md = regexp.MustCompile(`(?i)<a[^>]*href="([^"]*)"[^>]*>(.*?)</a>`).ReplaceAllString(md, "[$2]($1)")
+	// Lists
+	md = regexp.MustCompile(`(?i)<li[^>]*>`).ReplaceAllString(md, "- ")
+	md = strings.ReplaceAll(md, "</li>", "\n")
+	// Remove remaining tags
+	md = reTags.ReplaceAllString(md, "")
+	// Clean up whitespace
+	md = regexp.MustCompile(`\n{3,}`).ReplaceAllString(md, "\n\n")
+	return strings.TrimSpace(md)
+}
--- a/services/mana-crawler/internal/parser/parser_test.go
+++ b/services/mana-crawler/internal/parser/parser_test.go
@ -0,0 +1,131 @@
+package parser
+
+import "testing"
+
+func TestParse_Title(t *testing.T) {
+	html := `<html><head><title>Page Title</title></head><body><h1>Main Heading</h1><p>Content</p></body></html>`
+	result, err := Parse(html, "https://example.com", nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if result.Title != "Main Heading" {
+		t.Errorf("title = %q, want %q", result.Title, "Main Heading")
+	}
+}
+
+// TestParse_Links checks that only same-host links are returned, resolved to
+// absolute URLs and in document order. External, mailto: and in-page anchor
+// links must be left out.
+func TestParse_Links(t *testing.T) {
+	html := `<html><body>
+		<a href="/page1">Page 1</a>
+		<a href="https://example.com/page2">Page 2</a>
+		<a href="https://other.com/ext">External</a>
+		<a href="mailto:test@test.com">Email</a>
+		<a href="#section">Anchor</a>
+	</body></html>`
+
+	result, err := Parse(html, "https://example.com", nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Compare the URLs themselves, not just how many there are, so that a
+	// wrongly resolved or wrongly filtered link is caught.
+	want := []string{
+		"https://example.com/page1",
+		"https://example.com/page2",
+	}
+	if len(result.Links) != len(want) {
+		t.Fatalf("links count = %d, want %d, got: %v", len(result.Links), len(want), result.Links)
+	}
+	for i, w := range want {
+		if result.Links[i] != w {
+			t.Errorf("links[%d] = %q, want %q", i, result.Links[i], w)
+		}
+	}
+}
+
+func TestParse_Metadata(t *testing.T) {
+	html := `<html><head>
+		<meta name="description" content="Test description">
+		<meta property="og:title" content="OG Title">
+		<link rel="canonical" href="https://example.com/canonical">
+	</head><body></body></html>`
+
+	result, err := Parse(html, "https://example.com", nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if result.Metadata["description"] != "Test description" {
+		t.Errorf("description = %q", result.Metadata["description"])
+	}
+	if result.Metadata["og:title"] != "OG Title" {
+		t.Errorf("og:title = %q", result.Metadata["og:title"])
+	}
+	if result.Metadata["canonical"] != "https://example.com/canonical" {
+		t.Errorf("canonical = %q", result.Metadata["canonical"])
+	}
+}
+
+func TestCleanText(t *testing.T) {
+	html := `<p>Hello <strong>world</strong></p><script>alert('x')</script>`
+	got := cleanText(html)
+	if got != "Hello world" {
+		t.Errorf("cleanText = %q, want %q", got, "Hello world")
+	}
+}
+
+func TestMatchesPatterns(t *testing.T) {
+	tests := []struct {
+		url     string
+		include []string
+		exclude []string
+		want    bool
+	}{
+		{"https://example.com/docs/page", []string{"/docs/"}, nil, true},
+		{"https://example.com/api/v1", []string{"/docs/"}, nil, false},
+		{"https://example.com/docs/page", nil, []string{"/api/"}, true},
+		{"https://example.com/api/page", nil, []string{"/api/"}, false},
+		{"https://example.com/any", nil, nil, true},
+	}
+
+	for _, tt := range tests {
+		// Use the crawler package's matchesPatterns — testing indirectly via parser
+		// Here we just test the logic inline
+		got := matchPatterns(tt.url, tt.include, tt.exclude)
+		if got != tt.want {
+			t.Errorf("matchPatterns(%q, %v, %v) = %v, want %v", tt.url, tt.include, tt.exclude, got, tt.want)
+		}
+	}
+}
+
+// matchPatterns reports whether u passes the include/exclude filters.
+//
+// A URL passes when it matches no exclude pattern and, if any include
+// patterns are given, matches at least one of them. Matching is delegated to
+// containsPattern.
+func matchPatterns(u string, include, exclude []string) bool {
+	// An exclude hit rejects the URL whatever the include list says.
+	for _, p := range exclude {
+		if containsPattern(u, p) {
+			return false
+		}
+	}
+
+	// With no include list, everything that was not excluded is accepted.
+	if len(include) == 0 {
+		return true
+	}
+
+	// containsPattern already returns false for an empty pattern.
+	for _, p := range include {
+		if containsPattern(u, p) {
+			return true
+		}
+	}
+	return false
+}
+
+// containsPattern reports whether u contains pattern as a substring, after
+// removing at most one leading and one trailing '*' from the pattern. It
+// returns false when u is empty or when the stripped pattern is empty.
+func containsPattern(u, pattern string) bool {
+	needle := pattern
+	if needle != "" && needle[0] == '*' {
+		needle = needle[1:]
+	}
+	if n := len(needle); n > 0 && needle[n-1] == '*' {
+		needle = needle[:n-1]
+	}
+
+	if needle == "" || u == "" {
+		return false
+	}
+	return indexOf(u, needle) >= 0
+}
+
+// indexOf returns the byte index of the first occurrence of sub in s, or -1
+// when sub does not occur. An empty sub is found at index 0.
+func indexOf(s, sub string) int {
+	n := len(sub)
+	// last is the final start position at which sub could still fit; it is
+	// negative when sub is longer than s, so the loop body never runs.
+	for start, last := 0, len(s)-n; start <= last; start++ {
+		if s[start:start+n] == sub {
+			return start
+		}
+	}
+	return -1
+}