feat(monitoring): structured logging, Promtail alignment, GlitchTip config, status page

- Upgrade shared-logger to dual-mode: JSON lines in production, console
  in dev. Adds configureLogger() for service name + request ID.
- Add requestLogger middleware to shared-hono with request ID generation
  and structured request/response logging.
- Align Promtail config with new JSON field names (requestId, ts, service).
- Add PUBLIC_GLITCHTIP_DSN + PUBLIC_UMAMI_WEBSITE_ID to mana-web docker config.
- Add /status page that checks all backend /health endpoints server-side on each page load.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Till JS 2026-04-02 17:23:52 +02:00
parent 7ba82472b2
commit 7908995a29
5 changed files with 589 additions and 345 deletions

View file

@ -0,0 +1,77 @@
import type { PageServerLoad } from './$types';
/** Health states a probed service can be reported in. */
type ServiceHealth = 'up' | 'down' | 'degraded';

/** Outcome of a single health probe against one service. */
interface ServiceStatus {
  /** Human-readable service name. */
  name: string;
  /** Base URL of the service that was probed. */
  url: string;
  /** Classification of the probe result. */
  status: ServiceHealth;
  /** Measured probe duration in milliseconds. */
  responseTimeMs: number;
  /** Optional extra information about the result, e.g. a failure reason. */
  details?: string;
}
/** Builds the local fallback base URL used when no environment override is set. */
const localFallback = (port: number): string => `http://localhost:${port}`;

/**
 * Services listed on the status page, in display order.
 * Each base URL comes from its environment variable; an unset or empty
 * variable falls back to the matching localhost port.
 */
const SERVICES = [
  { name: 'Auth', url: process.env.PUBLIC_MANA_CORE_AUTH_URL || localFallback(3001) },
  { name: 'Todo API', url: process.env.PUBLIC_TODO_API_URL || localFallback(3031) },
  { name: 'Calendar API', url: process.env.PUBLIC_CALENDAR_API_URL || localFallback(3032) },
  { name: 'Contacts API', url: process.env.PUBLIC_CONTACTS_API_URL || localFallback(3033) },
  { name: 'Chat API', url: process.env.PUBLIC_CHAT_API_URL || localFallback(3030) },
  { name: 'Storage API', url: process.env.PUBLIC_STORAGE_API_URL || localFallback(3034) },
  { name: 'Cards API', url: process.env.PUBLIC_CARDS_API_URL || localFallback(3036) },
  { name: 'Mukke API', url: process.env.PUBLIC_MUKKE_API_URL || localFallback(3037) },
  { name: 'NutriPhi API', url: process.env.PUBLIC_NUTRIPHI_API_URL || localFallback(3038) },
  { name: 'Uload Server', url: process.env.PUBLIC_ULOAD_SERVER_URL || localFallback(3070) },
  { name: 'Memoro Server', url: process.env.PUBLIC_MEMORO_SERVER_URL || localFallback(3015) },
  { name: 'Media', url: process.env.PUBLIC_MANA_MEDIA_URL || localFallback(3011) },
  { name: 'LLM', url: process.env.PUBLIC_MANA_LLM_URL || localFallback(3025) },
];
/** Upper bound for a single health probe before the request is aborted. */
const HEALTH_TIMEOUT_MS = 5000;
/** Successful probes slower than this are reported as `degraded`. */
const DEGRADED_THRESHOLD_MS = 2000;

/**
 * Probes `<service.url>/health` once and classifies the outcome.
 *
 * - 2xx response: `up`, or `degraded` when it took longer than
 *   {@link DEGRADED_THRESHOLD_MS}.
 * - Non-2xx response: `down` with `details` set to `HTTP <status>`.
 * - Rejected fetch (network error, timeout): `down` with the error message.
 *
 * Fetch failures are reported in the result rather than thrown.
 *
 * @param service Display name and base URL of the service to probe.
 * @returns The probe result, including the measured duration in milliseconds.
 */
async function checkService(service: { name: string; url: string }): Promise<ServiceStatus> {
  const start = performance.now();
  // A base URL configured with trailing slashes would otherwise produce
  // `//health`, which many routers treat as a different path.
  const healthUrl = `${service.url.replace(/\/+$/, '')}/health`;

  try {
    const res = await fetch(healthUrl, {
      signal: AbortSignal.timeout(HEALTH_TIMEOUT_MS),
    });
    const responseTimeMs = Math.round(performance.now() - start);

    // The body is never read. Cancel it so the connection can be released
    // right away instead of lingering until garbage collection. Best effort:
    // a failing cancel must not turn a healthy probe into a failure.
    void res.body?.cancel().catch(() => undefined);

    if (!res.ok) {
      return {
        name: service.name,
        url: service.url,
        status: 'down',
        responseTimeMs,
        details: `HTTP ${res.status}`,
      };
    }

    return {
      name: service.name,
      url: service.url,
      status: responseTimeMs > DEGRADED_THRESHOLD_MS ? 'degraded' : 'up',
      responseTimeMs,
    };
  } catch (e) {
    return {
      name: service.name,
      url: service.url,
      status: 'down',
      responseTimeMs: Math.round(performance.now() - start),
      details: e instanceof Error ? e.message : 'Connection failed',
    };
  }
}
/**
 * Server-side loader for the status page.
 *
 * Probes every entry of `SERVICES` concurrently, tallies the results per
 * status and derives an overall verdict:
 * - `outage` when more than half of the services are down,
 * - `degraded` when at least one service is down or slow,
 * - `operational` otherwise.
 */
export const load: PageServerLoad = async () => {
  const services = await Promise.all(SERVICES.map(checkService));

  // Single pass over the results instead of one filter per status.
  const tally = { up: 0, degraded: 0, down: 0 };
  for (const { status } of services) {
    tally[status] += 1;
  }
  const total = services.length;

  const overallStatus: 'operational' | 'degraded' | 'outage' =
    tally.down > 0
      ? tally.down > total / 2
        ? 'outage'
        : 'degraded'
      : tally.degraded > 0
        ? 'degraded'
        : 'operational';

  return {
    services,
    summary: { up: tally.up, degraded: tally.degraded, down: tally.down, total },
    overallStatus,
    checkedAt: new Date().toISOString(),
  };
};

View file

@ -0,0 +1,63 @@
<script lang="ts">
	// Page data; the markup below reads overallStatus, summary, services and checkedAt.
	let { data } = $props();

	// Background class of the status dot, keyed by per-service status.
	const statusColors = {
		up: 'bg-emerald-500',
		degraded: 'bg-amber-500',
		down: 'bg-red-500',
	} as const;

	// Text colour of the headline, keyed by overall status.
	const overallColors = {
		operational: 'text-emerald-400',
		degraded: 'text-amber-400',
		outage: 'text-red-400',
	} as const;

	// Headline wording, keyed by overall status.
	const overallLabels = {
		operational: 'All Systems Operational',
		degraded: 'Partial Degradation',
		outage: 'Major Outage',
	} as const;
</script>
<!-- Document title for the status page. -->
<svelte:head>
<title>System Status | ManaCore</title>
</svelte:head>
<!-- Page shell: headline with overall verdict, one row per service, then the check timestamp. -->
<div class="min-h-screen bg-neutral-950 text-neutral-100 px-4 py-12">
<div class="max-w-2xl mx-auto">
<!-- Headline: overall status label, coloured via overallColors, plus the up/total count. -->
<div class="text-center mb-10">
<h1 class="text-3xl font-bold mb-2">ManaCore Status</h1>
<p class="text-2xl font-semibold {overallColors[data.overallStatus]}">
{overallLabels[data.overallStatus]}
</p>
<p class="text-sm text-neutral-500 mt-2">
{data.summary.up}/{data.summary.total} services up
</p>
</div>
<div class="space-y-2">
<!-- One row per service: coloured dot and name on the left, timing or failure detail on the right. -->
{#each data.services as service}
<div class="flex items-center justify-between px-4 py-3 rounded-lg bg-neutral-900">
<div class="flex items-center gap-3">
<span class="h-2.5 w-2.5 rounded-full {statusColors[service.status]}"></span>
<span class="font-medium">{service.name}</span>
</div>
<div class="flex items-center gap-3 text-sm">
<!-- up: plain timing; degraded: timing marked slow; anything else: failure detail or 'Down'. -->
{#if service.status === 'up'}
<span class="text-neutral-500">{service.responseTimeMs}ms</span>
{:else if service.status === 'degraded'}
<span class="text-amber-400">{service.responseTimeMs}ms (slow)</span>
{:else}
<span class="text-red-400">{service.details || 'Down'}</span>
{/if}
</div>
</div>
{/each}
</div>
<!-- NOTE(review): toLocaleString() formats with the runtime's locale and time zone, so server-rendered and client-rendered text may differ; confirm this is acceptable. -->
<p class="text-center text-sm text-neutral-600 mt-8">
Last checked: {new Date(data.checkedAt).toLocaleString()}
</p>
</div>
</div>

View file

@ -886,6 +886,9 @@ services:
PUBLIC_MANA_MEDIA_URL_CLIENT: https://media.mana.how
PUBLIC_MANA_LLM_URL: http://mana-llm:3025
PUBLIC_MANA_LLM_URL_CLIENT: https://llm.mana.how
# Analytics & Error Tracking
PUBLIC_UMAMI_WEBSITE_ID: face76f4-2d3e-42be-b8c8-0ea03f33a462
PUBLIC_GLITCHTIP_DSN: ${GLITCHTIP_DSN_MANACORE_WEB:-}
ports:
- "5000:5000"
healthcheck:

View file

@ -97,16 +97,18 @@ scrape_configs:
method: method
path: path
duration: duration
request_id: request_id
request_id: requestId
service_name: service
# Fall back: extract level from common log patterns
- regex:
expression: '(?i)(?P<level>error|warn|info|debug|fatal|panic)'
# Normalize level label
- labels:
level:
service_name:
# Add timestamp from log if available
- timestamp:
source: time
source: ts
format: RFC3339Nano
fallback_formats:
- "2006-01-02T15:04:05.000Z"

785
pnpm-lock.yaml generated

File diff suppressed because it is too large Load diff