managarten/load-tests/llm-ollama.js
Till JS 9276d9a212 feat: GPU offload, signup limit, load tests & capacity planning
- Route all AI workloads (Ollama, STT, TTS, Image Gen) to GPU server
  (192.168.178.11) via LAN instead of host.docker.internal
- Upgrade default model to gemma3:12b and max concurrent to 5
- Add daily signup limit service (MAX_DAILY_SIGNUPS env var)
- Add GET /api/v1/auth/signup-status public endpoint
- Add k6 load test suite (web-apps, auth, sync-websocket, ollama)
- Add capacity planning documentation
- Fix: add eslint-config to sveltekit-base and calendar Dockerfiles

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-28 21:14:24 +01:00

82 lines
2.1 KiB
JavaScript

/* eslint-disable no-undef, @typescript-eslint/no-unused-vars */
import http from 'k6/http';
import { check, sleep } from 'k6';
import { Rate, Trend, Counter } from 'k6/metrics';
// Custom k6 metrics reported in addition to the built-in HTTP metrics.

// Fraction of iterations whose response checks did not all pass.
const errorRate = new Rate('errors');

// Generation throughput per request: eval_count / eval_duration (in seconds).
// Deliberately NOT created as a time metric (no `isTime` flag): the samples are
// tokens per second, not durations, so k6 must not render them with time units.
const tokensPerSec = new Trend('tokens_per_second');

// Running total of generated tokens across all successful requests.
const totalTokens = new Counter('total_tokens_generated');

// Target Ollama instance and model. Both can be overridden through the
// OLLAMA_URL / MODEL entries of k6's __ENV; otherwise these defaults apply.
const OLLAMA_URL = __ENV.OLLAMA_URL || 'http://localhost:11434';
const MODEL = __ENV.MODEL || 'gemma3:4b';
/**
 * k6 run configuration: a short ramp from 1 to 3 virtual users and back,
 * plus the pass/fail thresholds for the run.
 */
export const options = {
  // The LLM is effectively single-threaded, so only a few VUs are used.
  // Each tuple is [stage duration, target VU count].
  stages: [
    ['30s', 1],
    ['2m', 3],
    ['30s', 1],
  ].map(([duration, target]) => ({ duration, target })),
  thresholds: {
    // LLM responses can be slow: the 95th percentile must stay under 30000 ms.
    http_req_duration: ['p(95)<30000'],
    // The custom `errors` rate must stay below 10%.
    errors: ['rate<0.10'],
  },
};
// Pool of short German prompts; every iteration sends one chosen at random.
// The wording asks for brief answers. Umlauts are written out as ae/ue in
// the literals, which are sent exactly as spelled here.
const prompts = [
  'Was ist die Hauptstadt von Deutschland? Antworte in einem Satz.',
  'Erklaere Photosynthese in 2 Saetzen.',
  'Schreibe ein kurzes Haiku ueber Programmierung.',
  'Was ist der Unterschied zwischen TCP und UDP? Kurz.',
  'Nenne 3 Vorteile von Self-Hosting.',
];
/**
 * Parse a raw HTTP response body as a JSON object without throwing.
 *
 * @param {*} raw - Raw response body (may be null when the request failed).
 * @returns {Object|null} The parsed object, or null when the body is not
 *   valid JSON or does not decode to an object.
 */
function parseGenerateBody(raw) {
  try {
    const parsed = JSON.parse(raw);
    return parsed !== null && typeof parsed === 'object' ? parsed : null;
  } catch {
    return null;
  }
}

/**
 * One VU iteration: send a random prompt to Ollama's /api/generate endpoint,
 * validate the reply, record throughput metrics, then pause.
 *
 * Side effects: adds one sample to `errors` every iteration, and one sample
 * each to `tokens_per_second` / `total_tokens_generated` when the reply
 * passes all checks and carries eval statistics.
 */
export default function () {
  const prompt = prompts[Math.floor(Math.random() * prompts.length)];

  // Non-streaming request: the whole completion arrives in a single JSON
  // document, which makes the metrics easy to extract.
  const res = http.post(
    `${OLLAMA_URL}/api/generate`,
    JSON.stringify({
      model: MODEL,
      prompt,
      stream: false,
      options: {
        num_predict: 100, // Cap generated tokens to keep tests fast
      },
    }),
    {
      headers: { 'Content-Type': 'application/json' },
      timeout: '60s',
    }
  );

  // Parse once and share the result between the check and the metrics below.
  const body = parseGenerateBody(res.body);

  const ok = check(res, {
    'status is 200': (r) => r.status === 200,
    'has response text': () =>
      Boolean(body && body.response && body.response.length > 0),
  });

  // `ok` implies status 200 and a parsed body with non-empty response text,
  // so `body` is guaranteed to be non-null inside this branch.
  if (ok && body.eval_count && body.eval_duration) {
    // eval_duration is divided by 1e9 to get seconds (Ollama reports
    // durations in nanoseconds).
    const tps = body.eval_count / (body.eval_duration / 1e9);
    tokensPerSec.add(tps);
    totalTokens.add(body.eval_count);
  }

  errorRate.add(!ok);

  // Longer pause between LLM requests (3-8 s) to mimic realistic usage.
  sleep(Math.random() * 5 + 3);
}