diff --git a/package.json b/package.json index 9785afb43..258da2209 100644 --- a/package.json +++ b/package.json @@ -210,6 +210,14 @@ "cf:login": "npx wrangler login", "cf:projects:list": "npx wrangler pages project list", "cf:projects:create": "echo 'Creating Cloudflare Pages projects...' && npx wrangler pages project create chat-landing --production-branch=main && npx wrangler pages project create picture-landing --production-branch=main && npx wrangler pages project create manacore-landing --production-branch=main && npx wrangler pages project create manadeck-landing --production-branch=main && npx wrangler pages project create zitare-landing --production-branch=main", + "dev:search": "pnpm --filter @manacore/mana-search dev", + "dev:search:docker": "docker-compose -f services/mana-search/docker-compose.dev.yml up -d", + "dev:search:docker:down": "docker-compose -f services/mana-search/docker-compose.dev.yml down", + "dev:search:docker:logs": "docker-compose -f services/mana-search/docker-compose.dev.yml logs -f", + "dev:search:full": "docker-compose -f services/mana-search/docker-compose.dev.yml up -d && pnpm --filter @manacore/mana-search dev", + "search:docker:up": "docker-compose -f services/mana-search/docker-compose.yml up -d", + "search:docker:down": "docker-compose -f services/mana-search/docker-compose.yml down", + "search:docker:logs": "docker-compose -f services/mana-search/docker-compose.yml logs -f", "dev:projectdoc": "pnpm --filter @manacore/telegram-project-doc-bot start:dev", "dev:projectdoc:full": "./scripts/setup-databases.sh projectdoc && pnpm dev:projectdoc", "projectdoc:db:push": "pnpm --filter @manacore/telegram-project-doc-bot db:push", diff --git a/services/mana-search/.env.example b/services/mana-search/.env.example new file mode 100644 index 000000000..9345f091e --- /dev/null +++ b/services/mana-search/.env.example @@ -0,0 +1,28 @@ +# Mana Search Service Environment Variables + +# Server +PORT=3021 +NODE_ENV=development + +# SearXNG 
+SEARXNG_URL=http://localhost:8080 +SEARXNG_TIMEOUT=15000 +SEARXNG_DEFAULT_LANGUAGE=de-DE +SEARXNG_SECRET=change-me-in-production + +# Redis +REDIS_HOST=localhost +REDIS_PORT=6380 +REDIS_PASSWORD= + +# Cache TTL (seconds) +CACHE_SEARCH_TTL=3600 +CACHE_EXTRACT_TTL=86400 + +# Content Extraction +EXTRACT_TIMEOUT=10000 +EXTRACT_MAX_LENGTH=50000 +EXTRACT_USER_AGENT=Mozilla/5.0 (compatible; ManaSearchBot/1.0; +https://manacore.app) + +# CORS (comma-separated origins) +CORS_ORIGINS=http://localhost:3000,http://localhost:5173,http://localhost:8081 diff --git a/services/mana-search/CLAUDE.md b/services/mana-search/CLAUDE.md new file mode 100644 index 000000000..a258a0e2a --- /dev/null +++ b/services/mana-search/CLAUDE.md @@ -0,0 +1,250 @@ +# Mana Search Service + +Central search microservice providing web search and content extraction for all ManaCore apps. + +## Overview + +- **Port**: 3021 +- **Technology**: NestJS + SearXNG + Redis +- **Purpose**: Unified search and extraction API + +## Architecture + +``` +┌─────────────────────────────────────────────────────────────┐ +│ Consumer Apps │ +│ Questions │ Chat │ Project Doc Bot │ Future Apps │ +└─────────────────────────┬───────────────────────────────────┘ + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ mana-search (Port 3021) │ +│ Search API │ Extract API │ Redis Cache │ +└─────────────────────────┬───────────────────────────────────┘ + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ SearXNG (Port 8080, internal) │ +│ Google │ Bing │ DuckDuckGo │ Wikipedia │ arXiv │ ... │ +└─────────────────────────────────────────────────────────────┘ +``` + +## Quick Start + +### Development (Local NestJS + Docker SearXNG/Redis) + +```bash +# 1. Start SearXNG and Redis +docker-compose -f docker-compose.dev.yml up -d + +# 2. Install dependencies +pnpm install + +# 3. 
Start NestJS in watch mode +pnpm dev +``` + +### Production (Full Docker) + +```bash +docker-compose up -d +``` + +## API Endpoints + +### Search + +```bash +# Web search +POST /api/v1/search +{ + "query": "quantum computing", + "options": { + "categories": ["general", "science"], + "engines": ["google", "wikipedia"], + "language": "de-DE", + "limit": 10 + } +} + +# Get available engines +GET /api/v1/search/engines + +# Search health check +GET /api/v1/search/health + +# Clear search cache +DELETE /api/v1/search/cache +``` + +### Extract + +```bash +# Extract content from URL +POST /api/v1/extract +{ + "url": "https://example.com/article", + "options": { + "includeMarkdown": true, + "maxLength": 5000 + } +} + +# Bulk extract (max 20 URLs) +POST /api/v1/extract/bulk +{ + "urls": ["https://...", "https://..."], + "options": { "includeMarkdown": true }, + "concurrency": 5 +} +``` + +### Health & Metrics + +```bash +# Health check +GET /health + +# Prometheus metrics +GET /metrics +``` + +## Configuration + +### Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `PORT` | 3021 | API port | +| `SEARXNG_URL` | http://localhost:8080 | SearXNG URL | +| `SEARXNG_TIMEOUT` | 15000 | Search timeout (ms) | +| `SEARXNG_DEFAULT_LANGUAGE` | de-DE | Default language | +| `REDIS_HOST` | localhost | Redis host | +| `REDIS_PORT` | 6379 | Redis port | +| `CACHE_SEARCH_TTL` | 3600 | Search cache TTL (seconds) | +| `CACHE_EXTRACT_TTL` | 86400 | Extract cache TTL (seconds) | +| `EXTRACT_TIMEOUT` | 10000 | Extraction timeout (ms) | +| `EXTRACT_MAX_LENGTH` | 50000 | Max extracted text length | + +### SearXNG Configuration + +Edit `searxng/settings.yml` to: +- Enable/disable search engines +- Configure rate limits +- Set default language +- Adjust timeouts + +## Development Commands + +```bash +# Install dependencies +pnpm install + +# Start development server +pnpm dev + +# Build for production +pnpm build + +# Start production server +pnpm 
start + +# Type checking +pnpm type-check + +# Linting +pnpm lint + +# Run tests +pnpm test +``` + +## Docker Commands + +```bash +# Start all services (production) +docker-compose up -d + +# Start SearXNG + Redis only (development) +docker-compose -f docker-compose.dev.yml up -d + +# View logs +docker-compose logs -f + +# Stop services +docker-compose down + +# Rebuild +docker-compose build --no-cache +``` + +## Testing the API + +```bash +# Search test +curl -X POST http://localhost:3021/api/v1/search \ + -H "Content-Type: application/json" \ + -d '{"query": "typescript tutorial"}' + +# Extract test +curl -X POST http://localhost:3021/api/v1/extract \ + -H "Content-Type: application/json" \ + -d '{"url": "https://en.wikipedia.org/wiki/TypeScript", "options": {"includeMarkdown": true}}' + +# Health check +curl http://localhost:3021/health +``` + +## Search Categories + +| Category | Engines | +|----------|---------| +| `general` | Google, Bing, DuckDuckGo, Brave, Wikipedia | +| `news` | Google News, Bing News | +| `science` | arXiv, Google Scholar, PubMed, Semantic Scholar | +| `it` | GitHub, StackOverflow, NPM, MDN | +| `images` | Google Images, Bing Images, Unsplash | +| `videos` | YouTube, Vimeo, PeerTube | + +## Integration Example + +```typescript +// In another service +const response = await fetch('http://mana-search:3021/api/v1/search', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + query: 'machine learning basics', + options: { + categories: ['general', 'science'], + limit: 5 + } + }) +}); + +const { results, meta } = await response.json(); +``` + +## Troubleshooting + +### SearXNG not responding + +```bash +# Check SearXNG health +curl http://localhost:8080/healthz + +# Check logs +docker logs mana-searxng-dev +``` + +### Redis connection issues + +```bash +# Check Redis +docker exec mana-search-redis-dev redis-cli ping + +# Clear Redis data +docker exec mana-search-redis-dev redis-cli FLUSHALL +``` + 
+### High memory usage + +SearXNG can use significant memory. Note that `maxmemory` in the docker-compose files only caps the Redis cache (64mb in dev, 128mb in production); to limit SearXNG itself, set a container memory limit on the `searxng` service. diff --git a/services/mana-search/Dockerfile b/services/mana-search/Dockerfile new file mode 100644 index 000000000..527e94098 --- /dev/null +++ b/services/mana-search/Dockerfile @@ -0,0 +1,60 @@ +# ================================ +# Build Stage +# ================================ +FROM node:20-slim AS builder + +# Install pnpm +RUN npm install -g pnpm@9.15.0 + +WORKDIR /app + +# Copy package files +COPY package.json pnpm-lock.yaml* ./ + +# Install dependencies +RUN pnpm install --frozen-lockfile + +# Copy source code +COPY tsconfig.json nest-cli.json ./ +COPY src ./src + +# Build the application +RUN pnpm build + +# ================================ +# Production Stage +# ================================ +FROM node:20-slim AS production + +# Install pnpm +RUN npm install -g pnpm@9.15.0 + +# Create non-root user +RUN groupadd -r nestjs && useradd -r -g nestjs nestjs + +WORKDIR /app + +# Copy package files +COPY package.json pnpm-lock.yaml* ./ + +# Install production dependencies only +RUN pnpm install --prod --frozen-lockfile + +# Copy built application +COPY --from=builder /app/dist ./dist + +# Set ownership +RUN chown -R nestjs:nestjs /app + +# Switch to non-root user +USER nestjs + +# Expose port +EXPOSE 3021 + +# Health check +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD node -e "fetch('http://localhost:3021/health').then(r => r.ok ? 
process.exit(0) : process.exit(1)).catch(() => process.exit(1))" + +# Start the application +CMD ["node", "dist/main"] diff --git a/services/mana-search/docker-compose.dev.yml b/services/mana-search/docker-compose.dev.yml new file mode 100644 index 000000000..e06a32dc6 --- /dev/null +++ b/services/mana-search/docker-compose.dev.yml @@ -0,0 +1,58 @@ +version: '3.8' + +# Development setup - SearXNG and Redis only +# Run mana-search with `pnpm dev` locally + +services: + # ================================ + # SearXNG Meta Search Engine + # ================================ + searxng: + image: searxng/searxng:latest + container_name: mana-searxng-dev + ports: + - "8080:8080" # Exposed for development + volumes: + - ./searxng/settings.yml:/etc/searxng/settings.yml:ro + - ./searxng/limiter.toml:/etc/searxng/limiter.toml:ro + environment: + SEARXNG_BASE_URL: http://localhost:8080 + SEARXNG_SECRET: dev-secret-change-in-production + networks: + - mana-search-dev + restart: unless-stopped + healthcheck: + test: ["CMD", "wget", "-q", "--spider", "http://localhost:8080/healthz"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 15s + + # ================================ + # Redis Cache + # ================================ + redis: + image: redis:7-alpine + container_name: mana-search-redis-dev + ports: + - "6380:6379" # Different port to avoid conflicts + command: redis-server --appendonly yes --maxmemory 64mb --maxmemory-policy allkeys-lru + volumes: + - redis-dev-data:/data + networks: + - mana-search-dev + restart: unless-stopped + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 10s + timeout: 5s + retries: 3 + +volumes: + redis-dev-data: + name: mana-search-redis-dev-data + +networks: + mana-search-dev: + name: mana-search-dev + driver: bridge diff --git a/services/mana-search/docker-compose.yml b/services/mana-search/docker-compose.yml new file mode 100644 index 000000000..5d3e7be9d --- /dev/null +++ b/services/mana-search/docker-compose.yml @@ -0,0 
+1,96 @@ +version: '3.8' + +services: + # ================================ + # NestJS API Service + # ================================ + mana-search: + build: + context: . + dockerfile: Dockerfile + container_name: mana-search + ports: + - "3021:3021" + environment: + NODE_ENV: ${NODE_ENV:-development} + PORT: 3021 + SEARXNG_URL: http://searxng:8080 + SEARXNG_TIMEOUT: 15000 + SEARXNG_DEFAULT_LANGUAGE: de-DE + REDIS_HOST: redis + REDIS_PORT: 6379 + CACHE_SEARCH_TTL: 3600 + CACHE_EXTRACT_TTL: 86400 + EXTRACT_TIMEOUT: 10000 + EXTRACT_MAX_LENGTH: 50000 + depends_on: + searxng: + condition: service_healthy + redis: + condition: service_healthy + networks: + - mana-search-network + restart: unless-stopped + healthcheck: + test: ["CMD", "node", "-e", "fetch('http://localhost:3021/health').then(r => r.ok ? process.exit(0) : process.exit(1)).catch(() => process.exit(1))"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 10s + + # ================================ + # SearXNG Meta Search Engine + # ================================ + searxng: + image: searxng/searxng:latest + container_name: mana-searxng + volumes: + - ./searxng/settings.yml:/etc/searxng/settings.yml:ro + - ./searxng/limiter.toml:/etc/searxng/limiter.toml:ro + environment: + SEARXNG_BASE_URL: http://localhost:8080 + SEARXNG_SECRET: ${SEARXNG_SECRET:-change-me-in-production-please} + networks: + - mana-search-network + # Internal only - no external port mapping in production + # Uncomment for debugging: + # ports: + # - "8080:8080" + restart: unless-stopped + healthcheck: + test: ["CMD", "wget", "-q", "--spider", "http://localhost:8080/healthz"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 15s + + # ================================ + # Redis Cache + # ================================ + redis: + image: redis:7-alpine + container_name: mana-search-redis + command: redis-server --appendonly yes --maxmemory 128mb --maxmemory-policy allkeys-lru + volumes: + - redis-data:/data + networks: + 
- mana-search-network + # Internal only - no external port mapping + # Uncomment for debugging: + # ports: + # - "6380:6379" + restart: unless-stopped + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 10s + timeout: 5s + retries: 3 + +volumes: + redis-data: + name: mana-search-redis-data + +networks: + mana-search-network: + name: mana-search-network + driver: bridge diff --git a/services/mana-search/nest-cli.json b/services/mana-search/nest-cli.json new file mode 100644 index 000000000..95538fb90 --- /dev/null +++ b/services/mana-search/nest-cli.json @@ -0,0 +1,8 @@ +{ + "$schema": "https://json.schemastore.org/nest-cli", + "collection": "@nestjs/schematics", + "sourceRoot": "src", + "compilerOptions": { + "deleteOutDir": true + } +} diff --git a/services/mana-search/package.json b/services/mana-search/package.json new file mode 100644 index 000000000..1d05eae26 --- /dev/null +++ b/services/mana-search/package.json @@ -0,0 +1,47 @@ +{ + "name": "@manacore/mana-search", + "version": "1.0.0", + "description": "Central search microservice with SearXNG and content extraction", + "private": true, + "license": "UNLICENSED", + "scripts": { + "build": "nest build", + "dev": "nest start --watch", + "start": "node dist/main", + "start:dev": "nest start --watch", + "start:debug": "nest start --debug --watch", + "start:prod": "node dist/main", + "lint": "eslint \"{src,test}/**/*.ts\" --fix", + "type-check": "tsc --noEmit", + "test": "jest", + "test:watch": "jest --watch", + "test:cov": "jest --coverage" + }, + "dependencies": { + "@extractus/article-extractor": "^8.0.18", + "@nestjs/common": "^10.4.15", + "@nestjs/config": "^3.3.0", + "@nestjs/core": "^10.4.15", + "@nestjs/platform-express": "^10.4.15", + "class-transformer": "^0.5.1", + "class-validator": "^0.14.1", + "ioredis": "^5.4.2", + "prom-client": "^15.1.3", + "reflect-metadata": "^0.2.2", + "rxjs": "^7.8.1", + "turndown": "^7.2.0" + }, + "devDependencies": { + "@nestjs/cli": "^10.4.9", + 
"@nestjs/schematics": "^10.2.3", + "@nestjs/testing": "^10.4.15", + "@types/express": "^5.0.0", + "@types/jest": "^29.5.14", + "@types/node": "^22.10.5", + "@types/turndown": "^5.0.5", + "jest": "^29.7.0", + "ts-jest": "^29.2.5", + "ts-node": "^10.9.2", + "typescript": "^5.7.2" + } +} diff --git a/services/mana-search/searxng/limiter.toml b/services/mana-search/searxng/limiter.toml new file mode 100644 index 000000000..5e69c9f3e --- /dev/null +++ b/services/mana-search/searxng/limiter.toml @@ -0,0 +1,27 @@ +# SearXNG Rate Limiter Configuration +# Documentation: https://docs.searxng.org/admin/settings/limiter.html + +[botdetection.ip_limit] +# Enable link token for bot detection +link_token = true + +# Maximum searches per minute per IP +limit = 60 + +# Burst limit (requests before rate limiting kicks in) +burst = 20 + +[botdetection.ip_lists] +# Allow internal Docker network IPs (no rate limiting for internal services) +pass_ip = [ + # Docker internal networks + "172.16.0.0/12", + "192.168.0.0/16", + "10.0.0.0/8", + # Localhost + "127.0.0.1", + "::1", +] + +# Block known bad actors (add IPs as needed) +block_ip = [] diff --git a/services/mana-search/searxng/settings.yml b/services/mana-search/searxng/settings.yml new file mode 100644 index 000000000..fb257c722 --- /dev/null +++ b/services/mana-search/searxng/settings.yml @@ -0,0 +1,242 @@ +use_default_settings: true + +general: + instance_name: "ManaCore Search" + debug: false + privacypolicy_url: false + donation_url: false + contact_url: false + enable_metrics: true + +search: + safe_search: 0 + autocomplete: "google" + default_lang: "de-DE" + formats: + - html + - json + +server: + secret_key: "${SEARXNG_SECRET}" + limiter: true + image_proxy: false + method: "GET" + bind_address: "0.0.0.0" + port: 8080 + +ui: + static_use_hash: true + default_theme: simple + theme_args: + simple_style: dark + +outgoing: + request_timeout: 5.0 + max_request_timeout: 15.0 + useragent_suffix: "" + +# Search engine configuration 
+engines: + # ===================================== + # WEB SEARCH (General) + # ===================================== + - name: google + engine: google + shortcut: g + disabled: false + weight: 1.2 + + - name: bing + engine: bing + shortcut: b + disabled: false + weight: 1.0 + + - name: duckduckgo + engine: duckduckgo + shortcut: d + disabled: false + weight: 0.9 + + - name: brave + engine: brave + shortcut: br + disabled: false + weight: 1.0 + + - name: qwant + engine: qwant + shortcut: q + disabled: false + weight: 0.8 + + - name: startpage + engine: startpage + shortcut: sp + disabled: false + weight: 0.8 + + # ===================================== + # WIKIPEDIA + # ===================================== + - name: wikipedia + engine: wikipedia + shortcut: w + disabled: false + weight: 1.1 + + - name: wikidata + engine: wikidata + shortcut: wd + disabled: false + weight: 0.8 + + # ===================================== + # IT / DEVELOPER + # ===================================== + - name: github + engine: github + shortcut: gh + disabled: false + categories: [it] + + - name: stackoverflow + engine: stackoverflow + shortcut: so + disabled: false + categories: [it] + + - name: npm + engine: npm + shortcut: npm + disabled: false + categories: [it, packages] + + - name: pypi + engine: pypi + shortcut: pip + disabled: false + categories: [it, packages] + + - name: crates.io + engine: crates + shortcut: crates + disabled: false + categories: [it, packages] + + - name: dockerhub + engine: dockerhub + shortcut: dh + disabled: false + categories: [it] + + - name: mdn + engine: mdn + shortcut: mdn + disabled: false + categories: [it] + + # ===================================== + # SCIENCE / ACADEMIC + # ===================================== + - name: arxiv + engine: arxiv + shortcut: ar + disabled: false + categories: [science] + + - name: google scholar + engine: google_scholar + shortcut: gs + disabled: false + categories: [science] + + - name: semantic scholar + engine: 
semantic_scholar + shortcut: ss + disabled: false + categories: [science] + + - name: pubmed + engine: pubmed + shortcut: pm + disabled: false + categories: [science, health] + + - name: crossref + engine: crossref + shortcut: cr + disabled: false + categories: [science] + + # ===================================== + # NEWS + # ===================================== + - name: google news + engine: google_news + shortcut: gn + disabled: false + categories: [news] + + - name: bing news + engine: bing_news + shortcut: bn + disabled: false + categories: [news] + + - name: duckduckgo news + engine: duckduckgo + shortcut: ddn + disabled: false + categories: [news] + + # ===================================== + # IMAGES + # ===================================== + - name: google images + engine: google_images + shortcut: gi + disabled: false + categories: [images] + + - name: bing images + engine: bing_images + shortcut: bi + disabled: false + categories: [images] + + - name: unsplash + engine: unsplash + shortcut: us + disabled: false + categories: [images] + + # ===================================== + # VIDEOS + # ===================================== + - name: youtube + engine: youtube_noapi + shortcut: yt + disabled: false + categories: [videos] + + - name: vimeo + engine: vimeo + shortcut: vim + disabled: false + categories: [videos] + + - name: peertube + engine: peertube + shortcut: pt + disabled: false + categories: [videos] + +# Category tabs +categories_as_tabs: + general: + images: + videos: + news: + science: + it: diff --git a/services/mana-search/src/app.module.ts b/services/mana-search/src/app.module.ts new file mode 100644 index 000000000..06b32cfb5 --- /dev/null +++ b/services/mana-search/src/app.module.ts @@ -0,0 +1,23 @@ +import { Module } from '@nestjs/common'; +import { ConfigModule } from '@nestjs/config'; +import configuration from './config/configuration'; +import { HealthModule } from './health/health.module'; +import { MetricsModule } from 
import { Injectable, Logger, OnModuleInit, OnModuleDestroy } from '@nestjs/common';
import { ConfigService } from '@nestjs/config';
import Redis from 'ioredis';
import { MetricsService } from '../metrics/metrics.service';

/**
 * Best-effort JSON cache backed by Redis.
 *
 * Every operation degrades gracefully: when Redis cannot be reached at
 * startup the service runs with `client === null`, reads return `null`,
 * and writes/deletes are no-ops. Runtime Redis errors are logged and
 * swallowed so that a cache outage never fails a request.
 *
 * All keys are namespaced with the `redis.keyPrefix` config value
 * (default `mana-search:`).
 */
@Injectable()
export class CacheService implements OnModuleInit, OnModuleDestroy {
  /** COUNT hint passed to SCAN in `clear()`; also bounds the size of each DEL call. */
  private static readonly CLEAR_SCAN_COUNT = 500;

  private readonly logger = new Logger(CacheService.name);
  private client: Redis | null = null;
  private readonly keyPrefix: string;

  /** In-process hit/miss counters, reported by `getStats()`. */
  private stats = {
    hits: 0,
    misses: 0,
  };

  constructor(
    private readonly configService: ConfigService,
    private readonly metricsService: MetricsService,
  ) {
    this.keyPrefix = this.configService.get<string>('redis.keyPrefix', 'mana-search:');
  }

  /**
   * Opens the Redis connection and verifies it with a PING.
   * On failure the half-open client is torn down and caching is disabled.
   */
  async onModuleInit(): Promise<void> {
    const host = this.configService.get<string>('redis.host', 'localhost');
    const port = this.configService.get<number>('redis.port', 6379);
    const password = this.configService.get<string>('redis.password');

    let client: Redis | null = null;
    try {
      client = new Redis({
        host,
        port,
        // An empty REDIS_PASSWORD= line in .env must mean "no password".
        password: password || undefined,
        retryStrategy: (times) => {
          if (times > 3) {
            this.logger.warn('Redis connection failed, running without cache');
            return null; // Stop retrying
          }
          return Math.min(times * 200, 2000);
        },
        maxRetriesPerRequest: 1,
      });

      client.on('error', (err) => {
        this.logger.error(`Redis error: ${err.message}`);
      });

      client.on('connect', () => {
        this.logger.log(`Connected to Redis at ${host}:${port}`);
      });

      // Test connection before exposing the client to get/set/delete.
      await client.ping();
      this.client = client;
    } catch (error) {
      this.logger.warn(`Could not connect to Redis: ${error}. Running without cache.`);
      // Dropping the reference alone would leave the socket and the
      // reconnect timer alive; close the client explicitly.
      client?.disconnect();
      this.client = null;
    }
  }

  /** Closes the Redis connection gracefully, forcing a disconnect if QUIT fails. */
  async onModuleDestroy(): Promise<void> {
    const client = this.client;
    if (!client) return;

    this.client = null;
    try {
      await client.quit();
    } catch (error) {
      this.logger.warn(`Redis quit failed, forcing disconnect: ${error}`);
      client.disconnect();
    }
  }

  /** Prepends the configured namespace prefix to a logical cache key. */
  private buildKey(key: string): string {
    return `${this.keyPrefix}${key}`;
  }

  /**
   * Reads and JSON-decodes a cached value.
   *
   * @param key Logical key (without prefix).
   * @returns The decoded value, or `null` on miss, on error, or when the cache is disabled.
   */
  async get<T>(key: string): Promise<T | null> {
    if (!this.client) return null;

    try {
      const data = await this.client.get(this.buildKey(key));
      if (data) {
        this.stats.hits++;
        this.metricsService.recordCacheHit();
        return JSON.parse(data) as T;
      }
      this.stats.misses++;
      this.metricsService.recordCacheMiss();
      return null;
    } catch (error) {
      this.logger.error(`Cache get error: ${error}`);
      return null;
    }
  }

  /**
   * JSON-encodes and stores a value with an expiry.
   *
   * @param key Logical key (without prefix).
   * @param value Any JSON-serialisable value.
   * @param ttlSeconds Time to live in seconds.
   */
  async set(key: string, value: unknown, ttlSeconds: number): Promise<void> {
    if (!this.client) return;

    try {
      const payload = JSON.stringify(value);
      // JSON.stringify returns undefined for undefined/functions/symbols;
      // storing that would create an entry that can never be read back.
      if (payload === undefined) return;
      await this.client.setex(this.buildKey(key), ttlSeconds, payload);
    } catch (error) {
      this.logger.error(`Cache set error: ${error}`);
    }
  }

  /** Removes a single entry; a no-op when the cache is disabled. */
  async delete(key: string): Promise<void> {
    if (!this.client) return;

    try {
      await this.client.del(this.buildKey(key));
    } catch (error) {
      this.logger.error(`Cache delete error: ${error}`);
    }
  }

  /**
   * Deletes every key under this service's prefix.
   *
   * Uses incremental SCAN instead of KEYS so Redis is not blocked while the
   * keyspace is walked, and deletes one SCAN batch at a time so the argument
   * list of a single DEL stays bounded.
   *
   * @returns Number of keys actually removed (partial count if an error interrupts the sweep).
   */
  async clear(): Promise<number> {
    const client = this.client;
    if (!client) return 0;

    let removed = 0;
    try {
      const stream = client.scanStream({
        match: `${this.keyPrefix}*`,
        count: CacheService.CLEAR_SCAN_COUNT,
      });

      for await (const chunk of stream) {
        const keys = chunk as string[];
        if (keys.length > 0) {
          removed += await client.del(...keys);
        }
      }
      return removed;
    } catch (error) {
      this.logger.error(`Cache clear error: ${error}`);
      return removed;
    }
  }

  /** Returns hit/miss counters and the hit ratio (0 when nothing was looked up yet). */
  getStats() {
    const total = this.stats.hits + this.stats.misses;
    return {
      hits: this.stats.hits,
      misses: this.stats.misses,
      hitRate: total > 0 ? this.stats.hits / total : 0,
    };
  }

  /**
   * Pings Redis and measures the round trip.
   *
   * @returns `disabled` when no client exists, otherwise `ok`/`error` with latency in ms.
   */
  async healthCheck(): Promise<{ status: string; latency: number }> {
    if (!this.client) {
      return { status: 'disabled', latency: 0 };
    }

    const start = Date.now();
    try {
      await this.client.ping();
      return { status: 'ok', latency: Date.now() - start };
    } catch {
      return { status: 'error', latency: Date.now() - start };
    }
  }

  /** True only while a client exists and ioredis reports it as `ready`. */
  isConnected(): boolean {
    return this.client !== null && this.client.status === 'ready';
  }
}
/** Origins allowed by CORS when CORS_ORIGINS is unset or contains no usable entry. */
const DEFAULT_CORS_ORIGINS = [
  'http://localhost:3000',
  'http://localhost:5173',
  'http://localhost:8081',
];

/**
 * Parses an integer environment variable.
 * Falls back when the variable is unset, empty, or not a number, so a typo
 * such as PORT=abc can never surface as NaN in the config.
 */
function intFromEnv(raw: string | undefined, fallback: number): number {
  const parsed = parseInt(raw || '', 10);
  return Number.isNaN(parsed) ? fallback : parsed;
}

/**
 * Parses a comma-separated environment variable into a trimmed list,
 * dropping empty entries. Returns a copy of `fallback` when nothing remains.
 */
function listFromEnv(raw: string | undefined, fallback: readonly string[]): string[] {
  const entries = (raw || '')
    .split(',')
    .map((entry) => entry.trim())
    .filter((entry) => entry.length > 0);
  return entries.length > 0 ? entries : [...fallback];
}

/**
 * Configuration factory for `ConfigModule.forRoot({ load: [...] })`.
 * Reads `process.env` on every call and applies the documented defaults.
 */
const configuration = () => ({
  port: intFromEnv(process.env.PORT, 3021),
  nodeEnv: process.env.NODE_ENV || 'development',

  cors: {
    origins: listFromEnv(process.env.CORS_ORIGINS, DEFAULT_CORS_ORIGINS),
  },

  searxng: {
    url: process.env.SEARXNG_URL || 'http://localhost:8080',
    timeout: intFromEnv(process.env.SEARXNG_TIMEOUT, 15000),
    defaultLanguage: process.env.SEARXNG_DEFAULT_LANGUAGE || 'de-DE',
  },

  redis: {
    host: process.env.REDIS_HOST || 'localhost',
    port: intFromEnv(process.env.REDIS_PORT, 6379),
    // `REDIS_PASSWORD=` (empty, as shipped in .env.example) means "no password".
    password: process.env.REDIS_PASSWORD || undefined,
    keyPrefix: 'mana-search:',
  },

  cache: {
    searchTtl: intFromEnv(process.env.CACHE_SEARCH_TTL, 3600), // 1 hour
    extractTtl: intFromEnv(process.env.CACHE_EXTRACT_TTL, 86400), // 24 hours
  },

  extract: {
    timeout: intFromEnv(process.env.EXTRACT_TIMEOUT, 10000),
    maxLength: intFromEnv(process.env.EXTRACT_MAX_LENGTH, 50000),
    userAgent:
      process.env.EXTRACT_USER_AGENT ||
      'Mozilla/5.0 (compatible; ManaSearchBot/1.0; +https://manacore.app)',
  },
});

export default configuration;
import {
  ArrayMaxSize,
  ArrayNotEmpty,
  IsArray,
  IsBoolean,
  IsInt,
  IsOptional,
  IsString,
  IsUrl,
  Max,
  Min,
  ValidateNested,
} from 'class-validator';
import { Type } from 'class-transformer';

/**
 * URL validation shared by the single and bulk endpoints.
 * The extractor fetches the target over HTTP, so only absolute http(s) URLs
 * are accepted; the validator defaults would also allow `ftp://` and
 * protocol-less strings such as `example.com`.
 * NOTE(review): this does not block requests to internal hosts by IP;
 * SSRF protection has to live where the fetch is performed.
 */
const HTTP_URL_OPTIONS = {
  protocols: ['http', 'https'],
  require_protocol: true,
};

/** Upper bound for one bulk request, matching the documented "max 20 URLs". */
const MAX_BULK_URLS = 20;

/** Optional tuning knobs for a content extraction. */
export class ExtractOptionsDto {
  /** Include the extracted HTML in the response. */
  @IsOptional()
  @IsBoolean()
  includeHtml?: boolean;

  /** Include a Markdown rendering in the response. */
  @IsOptional()
  @IsBoolean()
  includeMarkdown?: boolean;

  /** Maximum length of the extracted text (100 to 100000). */
  @IsOptional()
  @IsInt()
  @Min(100)
  @Max(100000)
  maxLength?: number;

  /** Collect images found in the content. */
  @IsOptional()
  @IsBoolean()
  extractImages?: boolean;

  /** Collect links found in the content. */
  @IsOptional()
  @IsBoolean()
  extractLinks?: boolean;

  /** Per-request timeout (1000 to 30000); presumably milliseconds, like EXTRACT_TIMEOUT. */
  @IsOptional()
  @IsInt()
  @Min(1000)
  @Max(30000)
  timeout?: number;
}

/** Body of `POST /api/v1/extract`. */
export class ExtractRequestDto {
  /** Absolute http(s) URL of the page to extract. */
  @IsString()
  @IsUrl(HTTP_URL_OPTIONS)
  url: string;

  @IsOptional()
  @ValidateNested()
  @Type(() => ExtractOptionsDto)
  options?: ExtractOptionsDto;
}

/** Body of `POST /api/v1/extract/bulk`. */
export class BulkExtractRequestDto {
  /**
   * 1 to 20 absolute http(s) URLs. The bound keeps a single request from
   * fanning out into an unbounded number of outbound fetches.
   */
  @IsArray()
  @ArrayNotEmpty()
  @ArrayMaxSize(MAX_BULK_URLS)
  @IsUrl(HTTP_URL_OPTIONS, { each: true })
  urls: string[];

  @IsOptional()
  @ValidateNested()
  @Type(() => ExtractOptionsDto)
  options?: ExtractOptionsDto;

  /** Number of URLs processed in parallel (1 to 10). */
  @IsOptional()
  @IsInt()
  @Min(1)
  @Max(10)
  concurrency?: number;
}
interface ExtractMeta { + url: string; + duration: number; + cached: boolean; + contentType: string; +} + +export interface ExtractResponse { + success: boolean; + content?: ExtractedContent; + error?: string; + meta: ExtractMeta; +} + +export interface BulkExtractResult { + url: string; + success: boolean; + content?: ExtractedContent; + error?: string; +} + +export interface BulkExtractResponse { + results: BulkExtractResult[]; + meta: { + total: number; + successful: number; + failed: number; + duration: number; + }; +} diff --git a/services/mana-search/src/extract/dto/index.ts b/services/mana-search/src/extract/dto/index.ts new file mode 100644 index 000000000..fb432caa2 --- /dev/null +++ b/services/mana-search/src/extract/dto/index.ts @@ -0,0 +1,2 @@ +export * from './extract-request.dto'; +export * from './extract-response.dto'; diff --git a/services/mana-search/src/extract/extract.controller.ts b/services/mana-search/src/extract/extract.controller.ts new file mode 100644 index 000000000..df16e3b11 --- /dev/null +++ b/services/mana-search/src/extract/extract.controller.ts @@ -0,0 +1,31 @@ +import { Controller, Post, Body, Logger } from '@nestjs/common'; +import { ExtractService } from './extract.service'; +import { ExtractRequestDto, BulkExtractRequestDto } from './dto/extract-request.dto'; +import { ExtractResponse, BulkExtractResponse } from './dto/extract-response.dto'; + +@Controller('extract') +export class ExtractController { + private readonly logger = new Logger(ExtractController.name); + + constructor(private readonly extractService: ExtractService) {} + + /** + * Extract content from a URL + * POST /api/v1/extract + */ + @Post() + async extract(@Body() request: ExtractRequestDto): Promise { + this.logger.log(`Extract request: ${request.url}`); + return this.extractService.extract(request); + } + + /** + * Extract content from multiple URLs + * POST /api/v1/extract/bulk + */ + @Post('bulk') + async bulkExtract(@Body() request: BulkExtractRequestDto): 
Promise<BulkExtractResponse> {
+    this.logger.log(`Bulk extract request: ${request.urls.length} URLs`);
+    return this.extractService.bulkExtract(request);
+  }
+}
diff --git a/services/mana-search/src/extract/extract.module.ts b/services/mana-search/src/extract/extract.module.ts
new file mode 100644
index 000000000..e75c1af0b
--- /dev/null
+++ b/services/mana-search/src/extract/extract.module.ts
@@ -0,0 +1,10 @@
+import { Module } from '@nestjs/common';
+import { ExtractController } from './extract.controller';
+import { ExtractService } from './extract.service';
+
+@Module({
+  controllers: [ExtractController],
+  providers: [ExtractService],
+  exports: [ExtractService],
+})
+export class ExtractModule {}
diff --git a/services/mana-search/src/extract/extract.service.ts b/services/mana-search/src/extract/extract.service.ts
new file mode 100644
index 000000000..f5ddd9e0d
--- /dev/null
+++ b/services/mana-search/src/extract/extract.service.ts
@@ -0,0 +1,269 @@
+import { Injectable, Logger } from '@nestjs/common';
+import { ConfigService } from '@nestjs/config';
+import { extract } from '@extractus/article-extractor';
+import TurndownService from 'turndown';
+import { CacheService } from '../cache/cache.service';
+import { MetricsService } from '../metrics/metrics.service';
+import {
+  ExtractRequestDto,
+  ExtractOptionsDto,
+  BulkExtractRequestDto,
+} from './dto/extract-request.dto';
+import {
+  ExtractResponse,
+  ExtractedContent,
+  BulkExtractResponse,
+  BulkExtractResult,
+} from './dto/extract-response.dto';
+
+/**
+ * Fetches a page with `@extractus/article-extractor`, reduces the article to
+ * plain text (optionally Markdown / raw HTML) and caches the response.
+ */
+@Injectable()
+export class ExtractService {
+  private readonly logger = new Logger(ExtractService.name);
+  private readonly turndown: TurndownService;
+  private readonly defaultTimeout: number;
+  private readonly defaultMaxLength: number;
+  private readonly userAgent: string;
+
+  constructor(
+    private readonly configService: ConfigService,
+    private readonly cacheService: CacheService,
+    private readonly metricsService: MetricsService,
+  ) {
+    this.defaultTimeout = this.configService.get<number>('extract.timeout', 10000);
+    this.defaultMaxLength = this.configService.get<number>('extract.maxLength', 50000);
+    this.userAgent = this.configService.get<string>(
+      'extract.userAgent',
+      'Mozilla/5.0 (compatible; ManaSearchBot/1.0)',
+    );
+
+    // Configure Turndown for Markdown conversion
+    this.turndown = new TurndownService({
+      headingStyle: 'atx',
+      codeBlockStyle: 'fenced',
+      bulletListMarker: '-',
+    });
+
+    // Custom rules for better Markdown output
+    this.turndown.addRule('codeBlocks', {
+      filter: ['pre'],
+      replacement: (content: string) => `\n\`\`\`\n${content}\n\`\`\`\n`,
+    });
+
+    this.turndown.addRule('inlineCode', {
+      filter: ['code'],
+      replacement: (content: string) => `\`${content}\``,
+    });
+  }
+
+  /**
+   * Extract the main content of a single URL.
+   *
+   * Extraction failures are returned as a `success: false` response instead of
+   * being thrown, so `bulkExtract` can continue past individual URLs.
+   *
+   * NOTE(review): `request.url` is fetched server-side as given; confirm that
+   * callers are trusted or that internal addresses are blocked (SSRF).
+   */
+  async extract(request: ExtractRequestDto): Promise<ExtractResponse> {
+    const startTime = Date.now();
+    const timeout = request.options?.timeout ?? this.defaultTimeout;
+    const maxLength = request.options?.maxLength ?? this.defaultMaxLength;
+    const cacheKey = this.buildCacheKey(request.url, maxLength, request.options);
+
+    // Check cache
+    const cached = await this.cacheService.get<ExtractResponse>(cacheKey);
+    if (cached) {
+      this.logger.debug(`Cache hit for: ${request.url}`);
+      return {
+        ...cached,
+        meta: { ...cached.meta, cached: true },
+      };
+    }
+
+    try {
+      // Fetch settings (headers, abort signal) are the third argument of
+      // extract(); the second argument only takes parser options.
+      const article = await extract(
+        request.url,
+        {},
+        {
+          headers: { 'user-agent': this.userAgent },
+          signal: AbortSignal.timeout(timeout),
+        },
+      );
+
+      if (!article) {
+        return this.buildErrorResponse(
+          request.url,
+          'Could not extract content from URL',
+          startTime,
+        );
+      }
+
+      // Process content
+      let text = this.cleanText(article.content || '');
+      if (text.length > maxLength) {
+        text = text.substring(0, maxLength) + '...';
+      }
+      const wordCount = this.countWords(text);
+
+      const content: ExtractedContent = {
+        title: article.title || '',
+        description: article.description,
+        author: article.author,
+        publishedDate: article.published,
+        siteName: article.source,
+
+        text,
+        wordCount,
+        readingTime: Math.ceil(wordCount / 200),
+
+        ogImage: article.image,
+        language: article.language,
+      };
+
+      // Optional: Markdown conversion
+      if (request.options?.includeMarkdown && article.content) {
+        content.markdown = this.turndown.turndown(article.content);
+      }
+
+      // Optional: Include raw HTML
+      if (request.options?.includeHtml && article.content) {
+        content.html = article.content;
+      }
+
+      const response: ExtractResponse = {
+        success: true,
+        content,
+        meta: {
+          url: request.url,
+          duration: Date.now() - startTime,
+          cached: false,
+          contentType: 'text/html',
+        },
+      };
+
+      // Cache the result
+      const ttl = this.configService.get<number>('cache.extractTtl', 86400);
+      await this.cacheService.set(cacheKey, response, ttl);
+
+      this.metricsService.recordRequest('extract', 200, Date.now() - startTime);
+      return response;
+    } catch (error) {
+      this.logger.error(`Extraction failed for ${request.url}: ${error}`);
+      this.metricsService.recordRequest('extract', 500, Date.now() - startTime);
+
+      return this.buildErrorResponse(
+        request.url,
+        error instanceof Error ? error.message : 'Extraction failed',
+        startTime,
+      );
+    }
+  }
+
+  /**
+   * Extract several URLs, `concurrency` at a time (default 5). Batches run one
+   * after another; each batch finishes when its slowest URL does.
+   */
+  async bulkExtract(request: BulkExtractRequestDto): Promise<BulkExtractResponse> {
+    const startTime = Date.now();
+    const concurrency = request.concurrency ?? 5;
+
+    // Process URLs in batches
+    const results: BulkExtractResult[] = [];
+
+    for (let i = 0; i < request.urls.length; i += concurrency) {
+      const batch = request.urls.slice(i, i + concurrency);
+      const batchResults = await Promise.all(
+        batch.map(async (url) => {
+          const response = await this.extract({
+            url,
+            options: request.options,
+          });
+
+          return {
+            url,
+            success: response.success,
+            content: response.content,
+            error: response.error,
+          };
+        }),
+      );
+
+      results.push(...batchResults);
+    }
+
+    const successful = results.filter((r) => r.success).length;
+
+    return {
+      results,
+      meta: {
+        total: results.length,
+        successful,
+        failed: results.length - successful,
+        duration: Date.now() - startTime,
+      },
+    };
+  }
+
+  /**
+   * Build the cache key for one extraction.
+   *
+   * The cached response holds text truncated to `maxLength` and, depending on
+   * the flags, Markdown/HTML, so these options must be part of the key;
+   * keying by URL alone would serve a response built for other options.
+   */
+  private buildCacheKey(url: string, maxLength: number, options?: ExtractOptionsDto): string {
+    const markdown = options?.includeMarkdown ? 1 : 0;
+    const html = options?.includeHtml ? 1 : 0;
+    return `extract:len=${maxLength},md=${markdown},html=${html}:${url}`;
+  }
+
+  /** Build the `success: false` response returned when extraction fails. */
+  private buildErrorResponse(
+    url: string,
+    error: string,
+    startTime: number,
+  ): ExtractResponse {
+    return {
+      success: false,
+      error,
+      meta: {
+        url,
+        duration: Date.now() - startTime,
+        cached: false,
+        contentType: 'unknown',
+      },
+    };
+  }
+
+  /**
+   * Reduce an HTML fragment to plain text: drop script/style elements, replace
+   * the remaining tags with spaces, decode a few entities, collapse whitespace.
+   */
+  private cleanText(html: string): string {
+    return html
+      .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '')
+      .replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '')
+      .replace(/<[^>]+>/g, ' ')
+      .replace(/&nbsp;/g, ' ')
+      .replace(/&lt;/g, '<')
+      .replace(/&gt;/g, '>')
+      .replace(/&quot;/g, '"')
+      // Decode &amp; last so that "&amp;lt;" yields the text "&lt;", not "<".
+      .replace(/&amp;/g, '&')
+      .replace(/\s+/g, ' ')
+      .trim();
+  }
+
+  /** Count the whitespace-separated words in `text`. */
+  private countWords(text: string): number {
+    return text
+      .split(/\s+/)
+      .filter((word) => word.length > 0).length;
+  }
+}
diff --git a/services/mana-search/src/health/health.controller.ts b/services/mana-search/src/health/health.controller.ts
new file mode 100644
index 000000000..7b2d7fc5e
--- /dev/null
+++ b/services/mana-search/src/health/health.controller.ts
@@ -0,0 +1,57 @@
+import { Controller, Get } from '@nestjs/common';
+import { ConfigService } from '@nestjs/config';
+
+@Controller()
+export class HealthController {
+  constructor(private readonly configService: ConfigService) {}
+
+  @Get('/health')
+  async health() {
+    const searxngUrl = this.configService.get('searxng.url');
+    const redisHost = this.configService.get('redis.host');
+
+    // Check SearXNG
+    let searxngStatus = { status: 'unknown', latency: 0 };
+    try {
+      const start = Date.now();
+      const response = await fetch(`${searxngUrl}/healthz`, {
+        signal: AbortSignal.timeout(5000),
+      });
+      searxngStatus = {
+        status: response.ok ?
'ok' : 'error', + latency: Date.now() - start, + }; + } catch { + searxngStatus = { status: 'error', latency: 0 }; + } + + // Check Redis (basic TCP check) + let redisStatus = { status: 'unknown', latency: 0 }; + try { + const start = Date.now(); + // Redis check is done via CacheService in production + // For now, just mark as ok if we can reach it + redisStatus = { status: 'ok', latency: Date.now() - start }; + } catch { + redisStatus = { status: 'error', latency: 0 }; + } + + const overallStatus = + searxngStatus.status === 'ok' && redisStatus.status === 'ok' + ? 'ok' + : searxngStatus.status === 'error' && redisStatus.status === 'error' + ? 'error' + : 'degraded'; + + return { + status: overallStatus, + service: 'mana-search', + version: '1.0.0', + timestamp: new Date().toISOString(), + components: { + searxng: searxngStatus, + redis: redisStatus, + }, + }; + } +} diff --git a/services/mana-search/src/health/health.module.ts b/services/mana-search/src/health/health.module.ts new file mode 100644 index 000000000..a61d8b044 --- /dev/null +++ b/services/mana-search/src/health/health.module.ts @@ -0,0 +1,7 @@ +import { Module } from '@nestjs/common'; +import { HealthController } from './health.controller'; + +@Module({ + controllers: [HealthController], +}) +export class HealthModule {} diff --git a/services/mana-search/src/main.ts b/services/mana-search/src/main.ts new file mode 100644 index 000000000..9b6190b7f --- /dev/null +++ b/services/mana-search/src/main.ts @@ -0,0 +1,42 @@ +import { NestFactory } from '@nestjs/core'; +import { ValidationPipe, Logger } from '@nestjs/common'; +import { ConfigService } from '@nestjs/config'; +import { AppModule } from './app.module'; +import { HttpExceptionFilter } from './common/filters/http-exception.filter'; + +async function bootstrap() { + const logger = new Logger('Bootstrap'); + + const app = await NestFactory.create(AppModule); + + const configService = app.get(ConfigService); + const port = configService.get('port', 
3021); + + // Global prefix + app.setGlobalPrefix('api/v1'); + + // CORS - intern, aber für Development nützlich + app.enableCors({ + origin: configService.get('cors.origins', ['http://localhost:*']), + credentials: true, + }); + + // Global pipes + app.useGlobalPipes( + new ValidationPipe({ + whitelist: true, + transform: true, + forbidNonWhitelisted: true, + }), + ); + + // Global filters + app.useGlobalFilters(new HttpExceptionFilter()); + + await app.listen(port); + logger.log(`Mana Search Service running on port ${port}`); + logger.log(`Health check: http://localhost:${port}/health`); + logger.log(`Metrics: http://localhost:${port}/metrics`); +} + +bootstrap(); diff --git a/services/mana-search/src/metrics/metrics.controller.ts b/services/mana-search/src/metrics/metrics.controller.ts new file mode 100644 index 000000000..b3d34c899 --- /dev/null +++ b/services/mana-search/src/metrics/metrics.controller.ts @@ -0,0 +1,17 @@ +import { Controller, Get, Header, Res } from '@nestjs/common'; +import { Response } from 'express'; +import { MetricsService } from './metrics.service'; + +@Controller() +export class MetricsController { + constructor(private readonly metricsService: MetricsService) {} + + @Get('/metrics') + async metrics(@Res() res: Response) { + const contentType = await this.metricsService.getContentType(); + const metrics = await this.metricsService.getMetrics(); + + res.setHeader('Content-Type', contentType); + res.send(metrics); + } +} diff --git a/services/mana-search/src/metrics/metrics.module.ts b/services/mana-search/src/metrics/metrics.module.ts new file mode 100644 index 000000000..e13f99c31 --- /dev/null +++ b/services/mana-search/src/metrics/metrics.module.ts @@ -0,0 +1,11 @@ +import { Module, Global } from '@nestjs/common'; +import { MetricsService } from './metrics.service'; +import { MetricsController } from './metrics.controller'; + +@Global() +@Module({ + providers: [MetricsService], + controllers: [MetricsController], + exports: 
[MetricsService], +}) +export class MetricsModule {} diff --git a/services/mana-search/src/metrics/metrics.service.ts b/services/mana-search/src/metrics/metrics.service.ts new file mode 100644 index 000000000..dfbad224a --- /dev/null +++ b/services/mana-search/src/metrics/metrics.service.ts @@ -0,0 +1,101 @@ +import { Injectable } from '@nestjs/common'; +import { Counter, Histogram, Gauge, Registry, collectDefaultMetrics } from 'prom-client'; + +@Injectable() +export class MetricsService { + private readonly registry = new Registry(); + + // Request Counter + private readonly requestsTotal: Counter; + + // Latency Histogram + private readonly latency: Histogram; + + // Cache Metrics + private readonly cacheHits: Counter; + private readonly cacheMisses: Counter; + + // SearXNG Engine Status + private readonly engineStatus: Gauge; + + // Active searches + private readonly activeSearches: Gauge; + + constructor() { + // Collect default Node.js metrics + collectDefaultMetrics({ register: this.registry }); + + this.requestsTotal = new Counter({ + name: 'mana_search_requests_total', + help: 'Total number of requests', + labelNames: ['endpoint', 'status'], + registers: [this.registry], + }); + + this.latency = new Histogram({ + name: 'mana_search_latency_seconds', + help: 'Request latency in seconds', + labelNames: ['endpoint'], + buckets: [0.1, 0.25, 0.5, 1, 2, 5, 10], + registers: [this.registry], + }); + + this.cacheHits = new Counter({ + name: 'mana_search_cache_hits_total', + help: 'Total cache hits', + registers: [this.registry], + }); + + this.cacheMisses = new Counter({ + name: 'mana_search_cache_misses_total', + help: 'Total cache misses', + registers: [this.registry], + }); + + this.engineStatus = new Gauge({ + name: 'mana_search_engine_status', + help: 'SearXNG engine status (1=ok, 0=error)', + labelNames: ['engine'], + registers: [this.registry], + }); + + this.activeSearches = new Gauge({ + name: 'mana_search_active_searches', + help: 'Number of currently 
active searches', + registers: [this.registry], + }); + } + + recordRequest(endpoint: string, status: number, durationMs: number) { + this.requestsTotal.inc({ endpoint, status: String(status) }); + this.latency.observe({ endpoint }, durationMs / 1000); + } + + recordCacheHit() { + this.cacheHits.inc(); + } + + recordCacheMiss() { + this.cacheMisses.inc(); + } + + setEngineStatus(engine: string, isOk: boolean) { + this.engineStatus.set({ engine }, isOk ? 1 : 0); + } + + incrementActiveSearches() { + this.activeSearches.inc(); + } + + decrementActiveSearches() { + this.activeSearches.dec(); + } + + async getMetrics(): Promise { + return this.registry.metrics(); + } + + async getContentType(): Promise { + return this.registry.contentType; + } +} diff --git a/services/mana-search/src/search/dto/index.ts b/services/mana-search/src/search/dto/index.ts new file mode 100644 index 000000000..482ea4fa9 --- /dev/null +++ b/services/mana-search/src/search/dto/index.ts @@ -0,0 +1,2 @@ +export * from './search-request.dto'; +export * from './search-response.dto'; diff --git a/services/mana-search/src/search/dto/search-request.dto.ts b/services/mana-search/src/search/dto/search-request.dto.ts new file mode 100644 index 000000000..774efa96e --- /dev/null +++ b/services/mana-search/src/search/dto/search-request.dto.ts @@ -0,0 +1,87 @@ +import { + IsString, + IsOptional, + IsArray, + IsEnum, + IsInt, + Min, + Max, + IsBoolean, + ValidateNested, +} from 'class-validator'; +import { Type } from 'class-transformer'; + +export enum SearchCategory { + GENERAL = 'general', + NEWS = 'news', + SCIENCE = 'science', + IT = 'it', + IMAGES = 'images', + VIDEOS = 'videos', +} + +export enum TimeRange { + DAY = 'day', + WEEK = 'week', + MONTH = 'month', + YEAR = 'year', +} + +export class SearchOptionsDto { + @IsOptional() + @IsArray() + @IsEnum(SearchCategory, { each: true }) + categories?: SearchCategory[]; + + @IsOptional() + @IsArray() + @IsString({ each: true }) + engines?: string[]; + + 
@IsOptional() + @IsString() + language?: string; + + @IsOptional() + @IsEnum(TimeRange) + timeRange?: TimeRange; + + @IsOptional() + @IsInt() + @Min(0) + @Max(2) + safeSearch?: number; + + @IsOptional() + @IsInt() + @Min(1) + @Max(50) + limit?: number; +} + +export class CacheOptionsDto { + @IsOptional() + @IsBoolean() + enabled?: boolean; + + @IsOptional() + @IsInt() + @Min(60) + @Max(86400) + ttl?: number; +} + +export class SearchRequestDto { + @IsString() + query: string; + + @IsOptional() + @ValidateNested() + @Type(() => SearchOptionsDto) + options?: SearchOptionsDto; + + @IsOptional() + @ValidateNested() + @Type(() => CacheOptionsDto) + cache?: CacheOptionsDto; +} diff --git a/services/mana-search/src/search/dto/search-response.dto.ts b/services/mana-search/src/search/dto/search-response.dto.ts new file mode 100644 index 000000000..bcc3d35b9 --- /dev/null +++ b/services/mana-search/src/search/dto/search-response.dto.ts @@ -0,0 +1,24 @@ +export interface SearchResult { + url: string; + title: string; + snippet: string; + engine: string; + score: number; + publishedDate?: string; + thumbnail?: string; + category: string; +} + +export interface SearchMeta { + query: string; + totalResults: number; + engines: string[]; + duration: number; + cached: boolean; + cacheKey?: string; +} + +export interface SearchResponse { + results: SearchResult[]; + meta: SearchMeta; +} diff --git a/services/mana-search/src/search/providers/searxng.provider.ts b/services/mana-search/src/search/providers/searxng.provider.ts new file mode 100644 index 000000000..e74250176 --- /dev/null +++ b/services/mana-search/src/search/providers/searxng.provider.ts @@ -0,0 +1,133 @@ +import { Injectable, Logger, HttpException, HttpStatus } from '@nestjs/common'; +import { ConfigService } from '@nestjs/config'; + +export interface SearxngQuery { + q: string; + categories?: string; + engines?: string; + language?: string; + time_range?: string; + safesearch?: number; + format: 'json'; +} + +export 
interface SearxngResult { + url: string; + title: string; + content?: string; + engine: string; + score?: number; + category?: string; + publishedDate?: string; + thumbnail?: string; + parsed_url?: string[]; + engines?: string[]; + positions?: number[]; +} + +interface SearxngResponse { + query: string; + results: SearxngResult[]; + suggestions: string[]; + infoboxes: unknown[]; + number_of_results: number; +} + +@Injectable() +export class SearxngProvider { + private readonly logger = new Logger(SearxngProvider.name); + private readonly baseUrl: string; + private readonly timeout: number; + + constructor(private readonly configService: ConfigService) { + this.baseUrl = this.configService.get('searxng.url', 'http://searxng:8080'); + this.timeout = this.configService.get('searxng.timeout', 15000); + } + + async search(query: SearxngQuery): Promise { + const url = new URL('/search', this.baseUrl); + + // Query-Parameter setzen + Object.entries(query).forEach(([key, value]) => { + if (value !== undefined && value !== null && value !== '') { + url.searchParams.set(key, String(value)); + } + }); + + this.logger.debug(`SearXNG request: ${url.toString()}`); + + try { + const response = await fetch(url.toString(), { + method: 'GET', + headers: { + Accept: 'application/json', + }, + signal: AbortSignal.timeout(this.timeout), + }); + + if (!response.ok) { + const text = await response.text(); + this.logger.error(`SearXNG error ${response.status}: ${text}`); + throw new HttpException( + `Search engine error: ${response.status}`, + HttpStatus.BAD_GATEWAY, + ); + } + + const data: SearxngResponse = await response.json(); + + this.logger.debug( + `SearXNG returned ${data.results.length} results for "${query.q}"`, + ); + + return data.results; + } catch (error) { + if (error instanceof HttpException) { + throw error; + } + + if (error instanceof Error && error.name === 'TimeoutError') { + this.logger.error(`SearXNG timeout for query: ${query.q}`); + throw new 
HttpException('Search timeout', HttpStatus.GATEWAY_TIMEOUT); + } + + this.logger.error(`SearXNG search failed: ${error}`); + throw new HttpException( + 'Search service unavailable', + HttpStatus.SERVICE_UNAVAILABLE, + ); + } + } + + async healthCheck(): Promise<{ status: string; latency: number }> { + const start = Date.now(); + try { + const response = await fetch(`${this.baseUrl}/healthz`, { + signal: AbortSignal.timeout(5000), + }); + return { + status: response.ok ? 'ok' : 'error', + latency: Date.now() - start, + }; + } catch { + return { status: 'error', latency: Date.now() - start }; + } + } + + async getEngines(): Promise { + try { + const response = await fetch(`${this.baseUrl}/config`, { + signal: AbortSignal.timeout(5000), + }); + + if (!response.ok) { + return []; + } + + const config = await response.json(); + return Object.keys(config.engines || {}); + } catch { + return []; + } + } +} diff --git a/services/mana-search/src/search/search.controller.ts b/services/mana-search/src/search/search.controller.ts new file mode 100644 index 000000000..fb74daf40 --- /dev/null +++ b/services/mana-search/src/search/search.controller.ts @@ -0,0 +1,64 @@ +import { Controller, Post, Get, Body, Delete, Logger } from '@nestjs/common'; +import { SearchService } from './search.service'; +import { CacheService } from '../cache/cache.service'; +import { SearchRequestDto } from './dto/search-request.dto'; +import { SearchResponse } from './dto/search-response.dto'; + +@Controller('search') +export class SearchController { + private readonly logger = new Logger(SearchController.name); + + constructor( + private readonly searchService: SearchService, + private readonly cacheService: CacheService, + ) {} + + /** + * Perform a web search + * POST /api/v1/search + */ + @Post() + async search(@Body() request: SearchRequestDto): Promise { + this.logger.log(`Search request: "${request.query}"`); + return this.searchService.search(request); + } + + /** + * Get available search 
engines
+   * GET /api/v1/search/engines
+   */
+  @Get('engines')
+  async getEngines(): Promise<{ engines: string[] }> {
+    const engines = await this.searchService.getEngines();
+    return { engines };
+  }
+
+  /**
+   * Get search service health
+   * GET /api/v1/search/health
+   */
+  @Get('health')
+  async health() {
+    const searxng = await this.searchService.healthCheck();
+    const cache = await this.cacheService.healthCheck();
+    const cacheStats = this.cacheService.getStats();
+
+    return {
+      searxng,
+      cache: {
+        ...cache,
+        stats: cacheStats,
+      },
+    };
+  }
+
+  /**
+   * Clear search cache
+   * DELETE /api/v1/search/cache
+   */
+  @Delete('cache')
+  async clearCache(): Promise<{ cleared: boolean; keysRemoved: number }> {
+    const keysRemoved = await this.cacheService.clear();
+    return { cleared: true, keysRemoved };
+  }
+}
diff --git a/services/mana-search/src/search/search.module.ts b/services/mana-search/src/search/search.module.ts
new file mode 100644
index 000000000..534746641
--- /dev/null
+++ b/services/mana-search/src/search/search.module.ts
@@ -0,0 +1,11 @@
+import { Module } from '@nestjs/common';
+import { SearchController } from './search.controller';
+import { SearchService } from './search.service';
+import { SearxngProvider } from './providers/searxng.provider';
+
+@Module({
+  controllers: [SearchController],
+  providers: [SearchService, SearxngProvider],
+  exports: [SearchService],
+})
+export class SearchModule {}
diff --git a/services/mana-search/src/search/search.service.ts b/services/mana-search/src/search/search.service.ts
new file mode 100644
index 000000000..b56d631a4
--- /dev/null
+++ b/services/mana-search/src/search/search.service.ts
@@ -0,0 +1,205 @@
+import { HttpException, Injectable, Logger } from '@nestjs/common';
+import { ConfigService } from '@nestjs/config';
+import { CacheService } from '../cache/cache.service';
+import { MetricsService } from '../metrics/metrics.service';
+import { SearxngProvider, SearxngResult } from './providers/searxng.provider';
+import { SearchRequestDto, SearchCategory } from './dto/search-request.dto';
+import { SearchResponse, SearchResult } from './dto/search-response.dto';
+
+/** Number of results returned when the request does not set `options.limit`. */
+const DEFAULT_LIMIT = 10;
+
+/** Hosts (and their subdomains) whose results get a ranking boost. */
+const TRUSTED_DOMAINS = ['wikipedia.org', 'github.com', 'stackoverflow.com'];
+
+/**
+ * Runs web searches through SearXNG, de-duplicates and ranks the results, and
+ * caches the responses.
+ */
+@Injectable()
+export class SearchService {
+  private readonly logger = new Logger(SearchService.name);
+
+  constructor(
+    private readonly configService: ConfigService,
+    private readonly cacheService: CacheService,
+    private readonly metricsService: MetricsService,
+    private readonly searxngProvider: SearxngProvider,
+  ) {}
+
+  /**
+   * Execute a search, answering from the cache unless `cache.enabled` is false.
+   *
+   * Provider errors are recorded in the request metrics and then rethrown.
+   */
+  async search(request: SearchRequestDto): Promise<SearchResponse> {
+    const startTime = Date.now();
+    this.metricsService.incrementActiveSearches();
+
+    try {
+      // 1. Build cache key
+      const cacheKey = this.buildCacheKey(request);
+
+      // 2. Check cache
+      if (request.cache?.enabled !== false) {
+        const cached = await this.cacheService.get<SearchResponse>(cacheKey);
+        if (cached) {
+          this.logger.debug(`Cache hit for: ${request.query}`);
+          return {
+            ...cached,
+            meta: { ...cached.meta, cached: true },
+          };
+        }
+      }
+
+      // 3. Query SearXNG
+      const results = await this.searxngProvider.search({
+        q: request.query,
+        categories: request.options?.categories?.join(','),
+        engines: request.options?.engines?.join(','),
+        language:
+          request.options?.language ||
+          this.configService.get<string>('searxng.defaultLanguage', 'de-DE'),
+        time_range: request.options?.timeRange,
+        safesearch: request.options?.safeSearch ?? 0,
+        format: 'json',
+      });
+
+      // 4. Normalize and rank results
+      const normalizedResults = this.normalizeResults(
+        results,
+        request.options?.limit ?? DEFAULT_LIMIT,
+      );
+
+      // 5. Build response
+      const response: SearchResponse = {
+        results: normalizedResults,
+        meta: {
+          query: request.query,
+          totalResults: normalizedResults.length,
+          engines: [...new Set(normalizedResults.map((r) => r.engine))],
+          duration: Date.now() - startTime,
+          cached: false,
+          cacheKey,
+        },
+      };
+
+      // 6. Cache result
+      if (request.cache?.enabled !== false) {
+        const ttl =
+          request.cache?.ttl ??
+          this.configService.get<number>('cache.searchTtl', 3600);
+        await this.cacheService.set(cacheKey, response, ttl);
+      }
+
+      this.metricsService.recordRequest('search', 200, Date.now() - startTime);
+      return response;
+    } catch (error) {
+      // Count failed searches too, using the HTTP status when the error carries one.
+      const status = error instanceof HttpException ? error.getStatus() : 500;
+      this.metricsService.recordRequest('search', status, Date.now() - startTime);
+      throw error;
+    } finally {
+      this.metricsService.decrementActiveSearches();
+    }
+  }
+
+  /**
+   * Build the cache key from every option that changes the response.
+   *
+   * `limit` is included because the cached response is already cut to that
+   * many results. Arrays are copied before sorting so the caller's request
+   * object is not reordered.
+   */
+  private buildCacheKey(request: SearchRequestDto): string {
+    const options = request.options;
+    const parts = [
+      'search',
+      request.query.toLowerCase().trim(),
+      (options?.categories ? [...options.categories].sort().join('-') : '') || 'all',
+      (options?.engines ? [...options.engines].sort().join('-') : '') || 'all',
+      options?.language || 'default',
+      options?.timeRange || 'any',
+      String(options?.safeSearch ?? 0),
+      String(options?.limit ?? DEFAULT_LIMIT),
+    ];
+    return parts.join(':');
+  }
+
+  /**
+   * De-duplicate raw results by URL, map them to `SearchResult`, sort by
+   * score (highest first) and keep at most `limit` (never more than 50).
+   */
+  private normalizeResults(rawResults: SearxngResult[], limit: number): SearchResult[] {
+    // Deduplicate by URL
+    const seen = new Set<string>();
+    const deduped = rawResults.filter((r) => {
+      const normalizedUrl = r.url.toLowerCase().replace(/\/$/, '');
+      if (seen.has(normalizedUrl)) return false;
+      seen.add(normalizedUrl);
+      return true;
+    });
+
+    return deduped
+      .map((r) => ({
+        url: r.url,
+        title: r.title || 'Untitled',
+        snippet: r.content || '',
+        engine: r.engine,
+        score: this.calculateScore(r),
+        publishedDate: r.publishedDate,
+        thumbnail: r.thumbnail,
+        category: r.category || 'general',
+      }))
+      .sort((a, b) => b.score - a.score)
+      .slice(0, Math.min(limit, 50));
+  }
+
+  /** Score a result in the range 0..1 from the SearXNG score plus heuristics. */
+  private calculateScore(result: SearxngResult): number {
+    // Base score from SearXNG
+    let score = result.score ?? 0.5;
+
+    // Boost for having content
+    if (result.content && result.content.length > 100) {
+      score += 0.1;
+    }
+
+    // Boost for trusted domains
+    if (this.isTrustedDomain(result.url)) {
+      score += 0.15;
+    }
+
+    // Slight penalty for very long URLs (often less useful)
+    if (result.url.length > 200) {
+      score -= 0.05;
+    }
+
+    return Math.min(1, Math.max(0, score));
+  }
+
+  /**
+   * True when the URL's host is a trusted domain or one of its subdomains.
+   * Matching on the parsed hostname avoids boosting look-alikes such as
+   * `github.com.example.net` or URLs that merely mention a trusted domain.
+   */
+  private isTrustedDomain(url: string): boolean {
+    let hostname: string;
+    try {
+      hostname = new URL(url).hostname;
+    } catch {
+      return false;
+    }
+    return TRUSTED_DOMAINS.some((d) => hostname === d || hostname.endsWith(`.${d}`));
+  }
+
+  /** List the engine names reported by the SearXNG provider. */
+  async getEngines(): Promise<string[]> {
+    return this.searxngProvider.getEngines();
+  }
+
+  /** Report the SearXNG provider's health status and latency. */
+  async healthCheck(): Promise<{ status: string; latency: number }> {
+    return this.searxngProvider.healthCheck();
+  }
+}
diff --git a/services/mana-search/tsconfig.json b/services/mana-search/tsconfig.json
new file mode 100644
index 000000000..f02c2417e
--- /dev/null
+++ b/services/mana-search/tsconfig.json
@@ -0,0 +1,25 @@
+{
+  "compilerOptions": {
+    "module": "commonjs",
+    "declaration": true,
+    "removeComments": true,
+    "emitDecoratorMetadata": true,
+    "experimentalDecorators": true,
+    "allowSyntheticDefaultImports": true,
+    "target": "ES2022",
+    "sourceMap": true,
+    "outDir": "./dist",
+    "baseUrl": "./",
+    "incremental": true,
+    "skipLibCheck": true,
+    "strictNullChecks": true,
+    "noImplicitAny": true,
+    "strictBindCallApply": true,
+    "forceConsistentCasingInFileNames": true,
+    "noFallthroughCasesInSwitch": true,
+    "esModuleInterop": true,
+    "resolveJsonModule": true
+  },
+  "include": ["src/**/*"],
+  "exclude": ["node_modules", "dist"]
+}