feat(cards): Deck-Generierung aus Bildern und PDFs via Vision-LLM

Neuer Endpoint POST /api/v1/decks/from-image akzeptiert bis zu 5 Bilder
(PNG/JPG/WebP, max 10 MiB je) oder PDFs (max 30 MiB je) als multipart/form-data.
Alle Dateien werden in einem einzigen mana-llm Vision-Call verarbeitet
(mana/vision → llava → Gemini 2.5-flash → GPT-4o Fallback-Chain).

PDFs werden von Gemini nativ verstanden (Layout, Tabellen, Bilder im Dokument)
ohne Zwischenschritt über Text-Extraktion oder Rendering. Der google.py-Provider
reicht den MIME-Type aus dem data:-URI direkt an types.Part.from_bytes() weiter.

- llm-client: chatVisionJson() mit images[]-Array (mehrere Bilder/Dokumente)
- decks-generate: GeneratedDeckSchema + insertGeneratedDeck() exportiert
- decks-from-image: neuer Route-Handler, MIME-Filter für image/* + application/pdf
- index: neue Route gemountet
- client.ts: apiForm() für multipart-Uploads ohne JSON.stringify
- decks.ts: generateDeckFromImage(files, opts)
- NewDeckCard + /decks/new: Dropzone mit Multi-File, Thumbnail-Strip, PDF-Icon

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Till JS 2026-05-10 15:21:35 +02:00
parent 170a2825a4
commit 1212b62613
8 changed files with 667 additions and 94 deletions

View file

@ -13,6 +13,7 @@ import { dsgvoRouter } from './routes/dsgvo.ts';
import { meRouter } from './routes/me.ts';
import { mediaRouter } from './routes/media.ts';
import { decksGenerateRouter } from './routes/decks-generate.ts';
import { decksFromImageRouter } from './routes/decks-from-image.ts';
import { authorsRouter as marketplaceAuthorsRouter } from './routes/marketplace/authors.ts';
import { marketplaceDecksRouter } from './routes/marketplace/decks.ts';
import { exploreRouter as marketplaceExploreRouter } from './routes/marketplace/explore.ts';
@ -53,6 +54,7 @@ app.route('/api/v1/dsgvo', dsgvoRouter());
app.route('/api/v1/me', meRouter());
app.route('/api/v1/media', mediaRouter());
app.route('/api/v1/decks/generate', decksGenerateRouter());
// Deck generation from uploaded images/PDFs (multipart upload).
app.route('/api/v1/decks/from-image', decksFromImageRouter());
// Marketplace (phase 12). Own pgSchema, additive routes under /v1/marketplace/*.
// Plan: docs/playbooks/MARKETPLACE_RESTORE.md.

View file

@ -0,0 +1,141 @@
import { Hono } from 'hono';
import { z } from 'zod';
import { getDb, type CardsDb } from '../db/connection.ts';
import { authMiddleware, type AuthVars } from '../middleware/auth.ts';
import { chatVisionJson } from '../services/llm-client.ts';
import { GeneratedDeckSchema, insertGeneratedDeck } from './decks-generate.ts';
/** Optional dependencies for the router; `db` replaces the handle otherwise obtained from `getDb()`. */
export type FromImageDeps = { db?: CardsDb };
// Maximum number of accepted files per request.
const MAX_FILES = 5;
const MAX_BYTES_PER_IMAGE = 10 * 1024 * 1024; // 10 MiB per image
const MAX_BYTES_PER_PDF = 30 * 1024 * 1024; // 30 MiB per PDF (Gemini supports up to ~300 pages)
/**
 * Tells whether an upload's MIME type is accepted by this endpoint.
 *
 * @param mime - MIME type string reported for the uploaded file.
 * @returns `true` for exactly `application/pdf` and for any type that starts
 *   with `image/`; `false` otherwise (including the empty string).
 */
function isAllowedMime(mime: string): boolean {
  if (mime === 'application/pdf') {
    return true;
  }
  return mime.startsWith('image/');
}
/**
 * Looks up the per-file size limit that applies to a MIME type.
 *
 * @param mime - MIME type string of the uploaded file.
 * @returns `MAX_BYTES_PER_PDF` for `application/pdf`, otherwise `MAX_BYTES_PER_IMAGE`.
 */
function maxBytesFor(mime: string): number {
  if (mime === 'application/pdf') {
    return MAX_BYTES_PER_PDF;
  }
  return MAX_BYTES_PER_IMAGE;
}
// Optional form fields that accompany the upload. `language` defaults to 'de';
// `count` is coerced to a number by zod before the 1–40 integer check and defaults to 15.
const InputSchema = z.object({
language: z.enum(['de', 'en']).optional().default('de'),
count: z.coerce.number().int().min(1).max(40).optional().default(15),
});
// System prompt (German) passed to the vision call. It instructs the model to
// return only a JSON object with `deck_name`, an optional `deck_description`
// and a `cards` array of `front`/`back` pairs, combining all uploaded files
// into a single deck.
const SYSTEM_PROMPT = `Du bist ein Lerndesigner. Analysiere die Bilder oder Dokumente und erstelle daraus ein einziges zusammenhängendes Karteikarten-Deck für Spaced-Repetition-Lernen.
Du gibst NUR ein gültiges JSON-Objekt zurück, exakt mit diesem Schema:
{
"deck_name": "<kurzer Titel, max 80 Zeichen>",
"deck_description": "<eine Zeile Beschreibung, optional>",
"cards": [
{ "front": "<Frage oder Begriff>", "back": "<Antwort oder Erklärung>" },
...
]
}
Regeln:
- Front ist Frage / Begriff / Hinweis. Back ist Antwort / Definition / Erklärung.
- Eine Karte = ein Lernstoff-Bissen (atomic). Nicht mehrere Konzepte in eine Karte stopfen.
- Markdown ist erlaubt (**fett**, *kursiv*, Listen, \`code\`).
- KEIN HTML, KEIN Code-Fence außerhalb des JSON, KEINE Erklärung außerhalb des JSON.
- Erstelle ein kohärentes Deck, das den Lernstoff aller Dateien zusammenfasst.`;
/** Formats the per-file size limit for `mime` in MiB, derived from `maxBytesFor` so it cannot drift from the enforced limit. */
function limitLabelFor(mime: string): string {
  return `${maxBytesFor(mime) / (1024 * 1024)} MiB`;
}

/**
 * Builds the German wording that describes the uploaded material.
 *
 * @param fileCount - Number of accepted files.
 * @param hasPdf - Whether at least one accepted file is a PDF.
 * @returns `promptLabel` for the user message sent to the LLM and
 *   `descriptionFallback` used when the model returns no deck description.
 */
function describeUploads(
  fileCount: number,
  hasPdf: boolean,
): { promptLabel: string; descriptionFallback: string } {
  const single = fileCount === 1;
  if (hasPdf) {
    return single
      ? { promptLabel: 'diesem Dokument', descriptionFallback: 'KI-generiert aus Dokument' }
      : {
          promptLabel: `diesen ${fileCount} Dateien`,
          descriptionFallback: `KI-generiert aus ${fileCount} Dateien`,
        };
  }
  return single
    ? { promptLabel: 'diesem Bild', descriptionFallback: 'KI-generiert aus Bild' }
    : {
        promptLabel: `diesen ${fileCount} Bildern`,
        descriptionFallback: `KI-generiert aus ${fileCount} Bildern`,
      };
}

/**
 * Creates the router that turns uploaded images/PDFs into a generated deck.
 *
 * `POST /` expects a multipart body with one or more `file` entries plus the
 * optional `language` and `count` fields validated by `InputSchema`.
 *
 * Responses:
 * - 201: result of `insertGeneratedDeck`.
 * - 400: body is not multipart, no accepted file, or more than `MAX_FILES` accepted files.
 * - 413: an accepted file exceeds the limit for its MIME type.
 * - 422: `language` / `count` failed validation.
 * - 502: the LLM call failed or its output did not match `GeneratedDeckSchema`.
 *
 * @param deps - Optional overrides; `deps.db` is used instead of `getDb()` when set.
 * @returns A Hono router with `authMiddleware` applied to all routes.
 */
export function decksFromImageRouter(deps: FromImageDeps = {}): Hono<{ Variables: AuthVars }> {
  const r = new Hono<{ Variables: AuthVars }>();
  const dbOf = () => deps.db ?? getDb();

  r.use('*', authMiddleware);

  r.post('/', async (c) => {
    const userId = c.get('userId');

    const form = await c.req.formData().catch(() => null);
    if (!form) {
      return c.json({ error: 'invalid_input', detail: 'multipart body required' }, 400);
    }

    // Entries that are not files, or whose MIME type is not accepted, are dropped
    // here; the count and size checks below only see the accepted files.
    const files = form
      .getAll('file')
      .filter((f): f is File => f instanceof File && isAllowedMime(f.type));
    if (files.length === 0) {
      return c.json(
        { error: 'invalid_input', detail: 'at least one image or PDF file required' },
        400,
      );
    }
    if (files.length > MAX_FILES) {
      return c.json({ error: 'invalid_input', detail: `max ${MAX_FILES} files per request` }, 400);
    }

    const oversized = files.find((f) => f.size > maxBytesFor(f.type));
    if (oversized) {
      return c.json(
        {
          error: 'invalid_input',
          detail: `"${oversized.name}" exceeds ${limitLabelFor(oversized.type)} limit`,
        },
        413,
      );
    }

    const parsed = InputSchema.safeParse({
      language: form.get('language') ?? undefined,
      count: form.get('count') ?? undefined,
    });
    if (!parsed.success) {
      return c.json(
        { error: 'invalid_input', issues: parsed.error.issues.map((i) => i.message) },
        422,
      );
    }
    const { language, count } = parsed.data;

    // All files are base64-encoded and sent together in one vision request.
    const images = await Promise.all(
      files.map(async (f) => ({
        base64: Buffer.from(await f.arrayBuffer()).toString('base64'),
        mimeType: f.type,
      })),
    );

    const hasPdf = files.some((f) => f.type === 'application/pdf');
    const { promptLabel, descriptionFallback } = describeUploads(images.length, hasPdf);
    const userText = `Erstelle ${count} Lernkarten auf ${language === 'de' ? 'Deutsch' : 'English'} aus ${promptLabel}.`;

    let raw: unknown;
    try {
      raw = await chatVisionJson<unknown>({
        images,
        systemPrompt: SYSTEM_PROMPT,
        userText,
        timeoutMs: 120_000,
      });
    } catch (e) {
      const msg = e instanceof Error ? e.message : String(e);
      return c.json({ error: 'llm_call_failed', detail: msg }, 502);
    }

    // The model output is untrusted: validate its shape before touching the DB.
    const validated = GeneratedDeckSchema.safeParse(raw);
    if (!validated.success) {
      return c.json(
        {
          error: 'llm_returned_invalid_shape',
          issues: validated.error.issues.map((i) => `${i.path.join('.')}: ${i.message}`),
          raw,
        },
        502,
      );
    }

    const result = await insertGeneratedDeck(dbOf(), userId, validated.data, descriptionFallback);
    return c.json(result, 201);
  });

  return r;
}

View file

@ -12,15 +12,9 @@ import { chatJson } from '../services/llm-client.ts';
export type GenerateDeps = { db?: CardsDb };
const GenerateInputSchema = z.object({
prompt: z.string().min(3).max(500),
language: z.enum(['de', 'en']).optional().default('de'),
count: z.number().int().min(1).max(40).optional().default(15),
});
// Was die LLM zurückgeben muss. zod-strict damit Halluzinationen
// (extra Felder, falsche Types) hart abgelehnt werden.
const GeneratedDeckSchema = z.object({
export const GeneratedDeckSchema = z.object({
deck_name: z.string().min(1).max(80),
deck_description: z.string().max(400).optional(),
cards: z
@ -33,7 +27,97 @@ const GeneratedDeckSchema = z.object({
.min(1)
.max(40),
});
type GeneratedDeck = z.infer<typeof GeneratedDeckSchema>;
export type GeneratedDeck = z.infer<typeof GeneratedDeckSchema>;
/**
 * Persists an LLM-generated deck: one deck row, one `basic` card row per
 * generated card and one initial review row per card sub-index, all inside a
 * single transaction.
 *
 * Cards and reviews are written with one batch insert each instead of two
 * statements per card, so the number of round trips inside the transaction
 * no longer grows with the deck size.
 *
 * @param db - Database handle used for the transaction and the read-back.
 * @param userId - Owner of the new deck, cards and reviews.
 * @param generated - Validated deck payload (name, optional description, cards).
 * @param descriptionFallback - Description stored when `generated.deck_description` is absent.
 * @returns `{ deck, cards_created }` where `deck` is the re-read deck row as a
 *   snake_case DTO (or `null` if the row could not be read back) and
 *   `cards_created` is the number of inserted cards.
 */
export async function insertGeneratedDeck(
  db: CardsDb,
  userId: string,
  generated: GeneratedDeck,
  descriptionFallback: string,
) {
  const deckId = ulid();
  const now = new Date();

  const cardRowsInsert = await Promise.all(
    generated.cards.map(async (gc) => {
      const id = ulid();
      const fields = { front: gc.front, back: gc.back };
      const contentHash = await cardContentHash({ type: 'basic', fields });
      return { id, fields, contentHash };
    }),
  );

  const cardValues = cardRowsInsert.map((cr) => ({
    id: cr.id,
    deckId,
    userId,
    type: 'basic' as const,
    fields: cr.fields,
    mediaRefs: [],
    contentHash: cr.contentHash,
    createdAt: now,
    updatedAt: now,
  }));

  // The sub-index list is the same for every `basic` card, so compute it once.
  const subIndices = Array.from({ length: subIndexCount('basic') }, (_, i) => i);
  const reviewValues = cardRowsInsert.flatMap((cr) =>
    subIndices.map((subIndex) => {
      const r = newReview({ userId, cardId: cr.id, subIndex, now });
      return {
        cardId: r.card_id,
        subIndex: r.sub_index,
        userId: r.user_id,
        due: new Date(r.due),
        stability: r.stability,
        difficulty: r.difficulty,
        elapsedDays: r.elapsed_days,
        scheduledDays: r.scheduled_days,
        learningSteps: r.learning_steps,
        reps: r.reps,
        lapses: r.lapses,
        state: r.state,
        lastReview: r.last_review ? new Date(r.last_review) : null,
      };
    }),
  );

  await db.transaction(async (tx) => {
    await tx.insert(decks).values({
      id: deckId,
      userId,
      name: generated.deck_name,
      description: generated.deck_description ?? descriptionFallback,
      color: '#7c3aed',
      visibility: 'private',
      fsrsSettings: {},
      createdAt: now,
      updatedAt: now,
    });
    // Cards go in before reviews so that review rows can reference them.
    // Empty batches are skipped rather than sent as an insert without values.
    if (cardValues.length > 0) {
      await tx.insert(cards).values(cardValues);
    }
    if (reviewValues.length > 0) {
      await tx.insert(reviews).values(reviewValues);
    }
  });

  const [row] = await db.select().from(decks).where(eq(decks.id, deckId)).limit(1);
  return {
    deck: row
      ? {
          id: row.id,
          name: row.name,
          description: row.description,
          color: row.color,
          visibility: row.visibility,
          fsrs_settings: row.fsrsSettings,
          user_id: row.userId,
          created_at: row.createdAt.toISOString(),
          updated_at: row.updatedAt.toISOString(),
        }
      : null,
    cards_created: cardRowsInsert.length,
  };
}
// Input schema for prompt-based deck generation: a 3–500 character prompt, an
// optional language ('de' | 'en', default 'de') and an optional integer card
// count (1–40, default 15). Unlike a coerced schema, `count` must already be a number.
const GenerateInputSchema = z.object({
prompt: z.string().min(3).max(500),
language: z.enum(['de', 'en']).optional().default('de'),
count: z.number().int().min(1).max(40).optional().default(15),
});
const SYSTEM_PROMPT = `Du bist ein Lerndesigner und erstellst Karteikarten-Decks für Spaced-Repetition-Lernen.
@ -105,87 +189,13 @@ ${parsed.data.prompt}`;
return c.json({ error: 'llm_call_failed', detail: msg }, 502);
}
// Deck + Karten in einer Transaction anlegen.
const deckId = ulid();
const now = new Date();
const cardRowsInsert = await Promise.all(
generated.cards.map(async (gc) => {
const id = ulid();
const fields = { front: gc.front, back: gc.back };
const contentHash = await cardContentHash({ type: 'basic', fields });
return { id, fields, contentHash };
})
);
await dbOf().transaction(async (tx) => {
await tx.insert(decks).values({
id: deckId,
userId,
name: generated.deck_name,
description: generated.deck_description ?? `KI-generiert: ${parsed.data.prompt}`,
color: '#7c3aed', // purple-600 — visuelle Markierung als KI-generiert
visibility: 'private',
fsrsSettings: {},
createdAt: now,
updatedAt: now,
});
for (const cr of cardRowsInsert) {
await tx.insert(cards).values({
id: cr.id,
deckId,
userId,
type: 'basic',
fields: cr.fields,
mediaRefs: [],
contentHash: cr.contentHash,
createdAt: now,
updatedAt: now,
});
const subIndices = Array.from({ length: subIndexCount('basic') }, (_, i) => i);
const initial = subIndices.map((subIndex) => {
const r = newReview({ userId, cardId: cr.id, subIndex, now });
return {
cardId: r.card_id,
subIndex: r.sub_index,
userId: r.user_id,
due: new Date(r.due),
stability: r.stability,
difficulty: r.difficulty,
elapsedDays: r.elapsed_days,
scheduledDays: r.scheduled_days,
learningSteps: r.learning_steps,
reps: r.reps,
lapses: r.lapses,
state: r.state,
lastReview: r.last_review ? new Date(r.last_review) : null,
};
});
await tx.insert(reviews).values(initial);
}
});
// Deck-DTO zurückgeben.
const [row] = await dbOf().select().from(decks).where(eq(decks.id, deckId)).limit(1);
return c.json(
{
deck: row
? {
id: row.id,
name: row.name,
description: row.description,
color: row.color,
visibility: row.visibility,
fsrs_settings: row.fsrsSettings,
user_id: row.userId,
created_at: row.createdAt.toISOString(),
updated_at: row.updatedAt.toISOString(),
}
: null,
cards_created: cardRowsInsert.length,
},
201
const result = await insertGeneratedDeck(
dbOf(),
userId,
generated,
`KI-generiert: ${parsed.data.prompt}`,
);
return c.json(result, 201);
});
return r;

View file

@ -5,7 +5,8 @@
* Aliases statt konkreter Modelle, damit der Routing-Layer die
* Provider-Auswahl macht:
* - `mana/structured` für JSON-Output (Deck-Generation)
* - `mana/long-form` wenn freier Text gefragt wäre
* - `mana/vision` für multimodale Anfragen mit Bild-Input
* - `mana/long-form` wenn freier Text gefragt wäre
*
* Auth: heute keine — mana-llm hat `GPU_API_KEY` leer. Wenn das mal
* gesetzt wird, kommt der Key über CARDS_LLM_API_KEY rein.
@ -33,6 +34,65 @@ export class LlmError extends Error {
}
}
/**
 * Parses the JSON payload out of an LLM completion.
 *
 * Vision models sometimes wrap their JSON in a Markdown code fence. The text is
 * therefore tried in this order:
 *  1. the trimmed completion as-is — so valid JSON whose string values happen
 *     to contain a code fence is not mistaken for a fenced payload;
 *  2. the content of a fence that spans the whole completion;
 *  3. the content of the first fence found anywhere in the completion.
 *
 * @param text - Raw completion text returned by the model.
 * @returns The parsed JSON value (unvalidated).
 * @throws SyntaxError when none of the candidates is valid JSON.
 */
function parseJsonFromCompletion(text: string): unknown {
  const trimmed = text.trim();

  try {
    return JSON.parse(trimmed);
  } catch {
    // Not plain JSON — fall through to the fence handling below.
  }

  // Anchored to both ends so the lazy capture extends to the *last* closing
  // fence; inner fences inside JSON strings stay part of the payload.
  const wholeFence = /^```(?:json)?\s*([\s\S]*?)\s*```$/.exec(trimmed);
  const wholeBody = wholeFence?.[1];
  if (wholeBody !== undefined) {
    try {
      return JSON.parse(wholeBody);
    } catch {
      // Fall through to the first-fence attempt.
    }
  }

  const firstFence = /```(?:json)?\s*([\s\S]*?)\s*```/.exec(trimmed);
  const firstBody = firstFence?.[1];
  if (firstBody !== undefined) {
    return JSON.parse(firstBody);
  }

  // No fence at all: re-run the direct parse so the caller gets its SyntaxError.
  return JSON.parse(trimmed);
}
/**
 * Sends one multimodal chat-completion request to mana-llm using the
 * `mana/vision` alias and returns the completion parsed as JSON.
 *
 * Every entry of `opts.images` is attached as a `data:` URI part, followed by
 * the text instruction, in a single user message.
 *
 * @param opts.images - Base64 payloads with their MIME types.
 * @param opts.systemPrompt - Content of the system message.
 * @param opts.userText - Text part appended after the attachments.
 * @param opts.timeoutMs - Abort deadline in milliseconds (default 90 000).
 * @returns The parsed JSON value, cast to `T` without validation.
 * @throws LlmError on a non-OK HTTP status, an empty completion or unparsable JSON.
 *   An aborted request surfaces as whatever error `fetch` rejects with.
 */
export async function chatVisionJson<T>(opts: {
  images: { base64: string; mimeType: string }[];
  systemPrompt: string;
  userText: string;
  timeoutMs?: number;
}): Promise<T> {
  const requestHeaders: Record<string, string> = { 'Content-Type': 'application/json' };
  if (LLM_API_KEY) {
    requestHeaders['X-API-Key'] = LLM_API_KEY;
  }

  const abort = new AbortController();
  const deadline = setTimeout(() => abort.abort(), opts.timeoutMs ?? 90_000);

  try {
    // Attachments first, then the instruction text.
    const attachmentParts = opts.images.map((img) => ({
      type: 'image_url' as const,
      image_url: { url: `data:${img.mimeType};base64,${img.base64}` },
    }));
    const payload = JSON.stringify({
      model: 'mana/vision',
      messages: [
        { role: 'system', content: opts.systemPrompt },
        {
          role: 'user',
          content: [...attachmentParts, { type: 'text', text: opts.userText }],
        },
      ],
      temperature: 0.5,
    });

    const response = await fetch(`${LLM_URL}/v1/chat/completions`, {
      method: 'POST',
      headers: requestHeaders,
      signal: abort.signal,
      body: payload,
    });
    if (!response.ok) {
      // Best effort: include the upstream body, fall back to an empty string.
      const errorBody = await response.text().catch(() => '');
      throw new LlmError(response.status, errorBody);
    }

    const completion = (await response.json()) as ChatCompletion;
    const content = completion.choices?.[0]?.message?.content;
    if (!content) {
      throw new LlmError(0, completion, 'mana-llm: empty completion');
    }

    try {
      return parseJsonFromCompletion(content) as T;
    } catch (e) {
      throw new LlmError(0, content, `mana-llm: invalid JSON (${(e as Error).message})`);
    }
  } finally {
    // Always release the timer, whether the request succeeded or threw.
    clearTimeout(deadline);
  }
}
export async function chatJson<T>(opts: {
model: string;
messages: ChatMessage[];