managarten/services/mana-ai/src/tracing.ts
Till JS 76577869e1 feat(mana-ai): OpenTelemetry tracing + Grafana Tempo backend
Add distributed tracing to the mana-ai background runner so mission
execution can be visualized end-to-end in Grafana.

Instrumentation (services/mana-ai/):
- tracing.ts: OTel provider setup with OTLP/HTTP exporter, withSpan() helper
- tick.ts: tick.planMission span with mission/agent/user attributes
- client.ts: planner.complete span with LLM model, tokens, latency

Infrastructure:
- docker/tempo/tempo.yaml: Grafana Tempo config (OTLP HTTP on 4318)
- docker-compose: tempo service + tempo_data volume + mana-ai env var
- docker/grafana/provisioning/datasources/tempo.yml: auto-provisioned

Trace flow:
  tick.planMission (root span)
    └── planner.complete (child span)
        ├── llm.model = "gpt-4o-mini"
        ├── llm.tokens.total = 1234
        └── llm.response.length = 567

Enable: set OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318
View: Grafana → Explore → Tempo datasource

Also fixes: removed broken @mana/subscriptions workspace ref from arcade.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-16 15:21:23 +02:00

96 lines
2.9 KiB
TypeScript

/**
* OpenTelemetry tracing setup for mana-ai.
*
* Exports a tracer and initializes the trace provider on first import.
* Traces are exported via OTLP/HTTP to Grafana Tempo (or any
* OTLP-compatible backend). When no backend is configured
* (OTEL_EXPORTER_OTLP_ENDPOINT not set), tracing is a no-op.
*
* Usage in service code:
* import { tracer } from '../tracing';
* const span = tracer.startSpan('tick.planOneMission');
* try { ... } finally { span.end(); }
*
* Or with the helper:
* import { withSpan } from '../tracing';
* const result = await withSpan('tick.planOneMission', { missionId }, async (span) => {
* // ... your code
* });
*/
import {
trace,
SpanStatusCode,
type Span,
type Tracer,
type SpanOptions,
} from '@opentelemetry/api';
import {
BasicTracerProvider,
SimpleSpanProcessor,
BatchSpanProcessor,
} from '@opentelemetry/sdk-trace-base';
import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http';
import { Resource } from '@opentelemetry/resources';
import { ATTR_SERVICE_NAME, ATTR_SERVICE_VERSION } from '@opentelemetry/semantic-conventions';
const OTEL_ENDPOINT = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;

/**
 * Build the OTLP/HTTP traces URL from a base collector endpoint.
 *
 * Surrounding whitespace and trailing slashes are stripped from the base
 * first, so `http://host:4318/` yields `http://host:4318/v1/traces` rather
 * than `http://host:4318//v1/traces`.
 *
 * @param endpoint - Base collector URL (value of OTEL_EXPORTER_OTLP_ENDPOINT).
 * @returns The base URL with `/v1/traces` appended.
 */
function buildTracesUrl(endpoint: string): string {
  const base = endpoint.trim().replace(/\/+$/, '');
  return `${base}/v1/traces`;
}

// Initialize the provider once, as a side effect of loading this module.
// An unset or empty endpoint leaves tracing unconfigured.
if (OTEL_ENDPOINT) {
  const tracesUrl = buildTracesUrl(OTEL_ENDPOINT);

  const resource = new Resource({
    [ATTR_SERVICE_NAME]: 'mana-ai',
    [ATTR_SERVICE_VERSION]: '0.6.0',
  });

  const exporter = new OTLPTraceExporter({ url: tracesUrl });

  const provider = new BasicTracerProvider({ resource });

  // Use batch in production (less overhead), simple in dev (immediate export)
  const isDev = process.env.NODE_ENV === 'development';
  provider.addSpanProcessor(
    isDev ? new SimpleSpanProcessor(exporter) : new BatchSpanProcessor(exporter)
  );

  // NOTE(review): register() is called without a contextManager. With
  // BasicTracerProvider this appears to leave the global no-op context manager
  // in place, in which case startActiveSpan() cannot carry the active span
  // across awaits and nested spans would not be parented automatically.
  // Confirm against the installed SDK version and pass a context manager here
  // if parent/child traces are expected.
  provider.register();

  console.log(`[mana-ai] OTel tracing enabled → ${tracesUrl}`);
} else {
  console.log('[mana-ai] OTel tracing disabled (set OTEL_EXPORTER_OTLP_ENDPOINT to enable)');
}
/**
 * Shared tracer for the mana-ai service.
 *
 * It is obtained from the global OTel API, so when no provider has been
 * registered every operation on it is a no-op (guaranteed by the API).
 */
export const tracer: Tracer = trace.getTracer(
  'mana-ai',
  '0.6.0'
);
/**
 * Run an async function inside a new active span.
 *
 * - `attributes` are set on the span before `fn` is invoked.
 * - If `fn` throws or rejects, the failure is recorded on the span as an
 *   exception event, the span status is set to ERROR with the failure's
 *   message, and the original value is re-thrown unchanged.
 * - The span is ended in every case.
 *
 * @param name - Span name.
 * @param attributes - Attributes to attach to the span.
 * @param fn - Work to run; receives the span so it can add further attributes or events.
 * @param options - Optional span options forwarded to `startActiveSpan`.
 * @returns The value `fn` resolves to.
 * @throws Whatever `fn` throws.
 */
export async function withSpan<T>(
  name: string,
  attributes: Record<string, string | number | boolean>,
  fn: (span: Span) => Promise<T>,
  options?: SpanOptions
): Promise<T> {
  return tracer.startActiveSpan(name, options ?? {}, async (span) => {
    span.setAttributes(attributes);
    try {
      return await fn(span);
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      // Record the exception itself (type, stack) in addition to the status,
      // so the trace keeps the failure details and not just a message string.
      span.recordException(err instanceof Error ? err : message);
      span.setStatus({ code: SpanStatusCode.ERROR, message });
      throw err;
    } finally {
      span.end();
    }
  });
}