From 1212b62613b0e8f4c150fa32e3a81484c0aaf76b Mon Sep 17 00:00:00 2001
From: Till JS
Date: Sun, 10 May 2026 15:21:35 +0200
Subject: [PATCH] feat(cards): Deck-Generierung aus Bildern und PDFs via Vision-LLM
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Neuer Endpoint POST /api/v1/decks/from-image akzeptiert bis zu 5 Bilder
(PNG/JPG/WebP, max 10 MiB je) oder PDFs (max 30 MiB je) als
multipart/form-data. Alle Dateien werden in einem einzigen mana-llm
Vision-Call verarbeitet (mana/vision → llava → Gemini 2.5-flash → GPT-4o
Fallback-Chain).

PDFs werden von Gemini nativ verstanden (Layout, Tabellen, Bilder im
Dokument) ohne Zwischenschritt über Text-Extraktion oder Rendering. Der
google.py-Provider reicht den MIME-Type aus dem data:-URI direkt an
types.Part.from_bytes() weiter.

- llm-client: chatVisionJson() mit images[]-Array (mehrere Bilder/Dokumente)
- decks-generate: GeneratedDeckSchema + insertGeneratedDeck() exportiert
- decks-from-image: neuer Route-Handler, MIME-Filter für image/* + application/pdf
- index: neue Route gemountet
- client.ts: apiForm() für multipart-Uploads ohne JSON.stringify
- decks.ts: generateDeckFromImage(files, opts)
- NewDeckCard + /decks/new: Dropzone mit Multi-File, Thumbnail-Strip, PDF-Icon

Co-Authored-By: Claude Sonnet 4.6
---
 apps/api/src/index.ts                         |   2 +
 apps/api/src/routes/decks-from-image.ts       | 154 +++++++++++++
 apps/api/src/routes/decks-generate.ts         | 186 ++++++++--------
 apps/api/src/services/llm-client.ts           |  62 +++++-
 apps/web/src/lib/api/client.ts                |  26 +++
 apps/web/src/lib/api/decks.ts                 |  14 +-
 .../web/src/lib/components/NewDeckCard.svelte | 198 +++++++++++++++++-
 apps/web/src/routes/decks/new/+page.svelte    | 132 +++++++++++-
 8 files changed, 680 insertions(+), 94 deletions(-)
 create mode 100644 apps/api/src/routes/decks-from-image.ts

diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts
index d5335c5..c50a9d4 100644
--- a/apps/api/src/index.ts
+++ b/apps/api/src/index.ts
@@ -13,6 +13,7 @@ import { dsgvoRouter } from './routes/dsgvo.ts';
 import { meRouter } from './routes/me.ts';
 import { mediaRouter } from './routes/media.ts';
 import { decksGenerateRouter } from './routes/decks-generate.ts';
+import { decksFromImageRouter } from './routes/decks-from-image.ts';
 import { authorsRouter as marketplaceAuthorsRouter } from './routes/marketplace/authors.ts';
 import { marketplaceDecksRouter } from './routes/marketplace/decks.ts';
 import { exploreRouter as marketplaceExploreRouter } from './routes/marketplace/explore.ts';
@@ -53,6 +54,7 @@ app.route('/api/v1/dsgvo', dsgvoRouter());
 app.route('/api/v1/me', meRouter());
 app.route('/api/v1/media', mediaRouter());
 app.route('/api/v1/decks/generate', decksGenerateRouter());
+app.route('/api/v1/decks/from-image', decksFromImageRouter());
 
 // Marketplace (Phase 12). Eigenes pgSchema, additive Routen unter /v1/marketplace/*.
 // Plan: docs/playbooks/MARKETPLACE_RESTORE.md.
diff --git a/apps/api/src/routes/decks-from-image.ts b/apps/api/src/routes/decks-from-image.ts
new file mode 100644
index 0000000..4e5b19f
--- /dev/null
+++ b/apps/api/src/routes/decks-from-image.ts
@@ -0,0 +1,154 @@
+import { Hono } from 'hono';
+import { z } from 'zod';
+
+import { getDb, type CardsDb } from '../db/connection.ts';
+import { authMiddleware, type AuthVars } from '../middleware/auth.ts';
+import { chatVisionJson } from '../services/llm-client.ts';
+import { GeneratedDeckSchema, insertGeneratedDeck } from './decks-generate.ts';
+
+export type FromImageDeps = { db?: CardsDb };
+
+const MAX_FILES = 5;
+const MAX_BYTES_PER_IMAGE = 10 * 1024 * 1024; // 10 MiB per image
+const MAX_BYTES_PER_PDF = 30 * 1024 * 1024; // 30 MiB per PDF (Gemini supports up to ~300 pages)
+
+/** Returns true for any image/* MIME type and for application/pdf. */
+function isAllowedMime(mime: string): boolean {
+	return mime.startsWith('image/') || mime === 'application/pdf';
+}
+
+/** Per-file size limit in bytes: the PDF limit for application/pdf, otherwise the image limit. */
+function maxBytesFor(mime: string): number {
+	return mime === 'application/pdf' ? MAX_BYTES_PER_PDF : MAX_BYTES_PER_IMAGE;
+}
+
+const InputSchema = z.object({
+	language: z.enum(['de', 'en']).optional().default('de'),
+	count: z.coerce.number().int().min(1).max(40).optional().default(15),
+});
+
+const SYSTEM_PROMPT = `Du bist ein Lerndesigner. Analysiere die Bilder oder Dokumente und erstelle daraus ein einziges zusammenhängendes Karteikarten-Deck für Spaced-Repetition-Lernen.
+
+Du gibst NUR ein gültiges JSON-Objekt zurück, exakt mit diesem Schema:
+{
+  "deck_name": "",
+  "deck_description": "",
+  "cards": [
+    { "front": "", "back": "" },
+    ...
+  ]
+}
+
+Regeln:
+- Front ist Frage / Begriff / Hinweis. Back ist Antwort / Definition / Erklärung.
+- Eine Karte = ein Lernstoff-Bissen (atomic). Nicht mehrere Konzepte in eine Karte stopfen.
+- Markdown ist erlaubt (**fett**, *kursiv*, Listen, \`code\`).
+- KEIN HTML, KEIN Code-Fence außerhalb des JSON, KEINE Erklärung außerhalb des JSON.
+- Erstelle ein kohärentes Deck, das den Lernstoff aller Dateien zusammenfasst.`;
+
+/**
+ * Builds the router mounted at /api/v1/decks/from-image.
+ *
+ * POST / expects multipart form data with one or more `file` entries and optional
+ * `language` / `count` fields. All accepted files go to the vision LLM in a single
+ * call; the reply is validated against GeneratedDeckSchema and then persisted.
+ *
+ * @param deps Optional overrides; `deps.db` is used instead of getDb() when set.
+ * @returns A Hono router with authMiddleware applied to all routes.
+ */
+export function decksFromImageRouter(deps: FromImageDeps = {}): Hono<{ Variables: AuthVars }> {
+	const r = new Hono<{ Variables: AuthVars }>();
+	const dbOf = () => deps.db ?? getDb();
+
+	r.use('*', authMiddleware);
+
+	r.post('/', async (c) => {
+		const userId = c.get('userId');
+
+		const form = await c.req.formData().catch(() => null);
+		if (!form) {
+			return c.json({ error: 'invalid_input', detail: 'multipart body required' }, 400);
+		}
+
+		const rawFiles = form.getAll('file');
+		// Non-File entries and unsupported MIME types are silently dropped here.
+		const files = rawFiles.filter((f): f is File => f instanceof File && isAllowedMime(f.type));
+
+		if (files.length === 0) {
+			return c.json(
+				{ error: 'invalid_input', detail: 'at least one image or PDF file required' },
+				400,
+			);
+		}
+		if (files.length > MAX_FILES) {
+			return c.json({ error: 'invalid_input', detail: `max ${MAX_FILES} files per request` }, 400);
+		}
+		const oversized = files.find((f) => f.size > maxBytesFor(f.type));
+		if (oversized) {
+			const limit = `${maxBytesFor(oversized.type) / (1024 * 1024)} MiB`;
+			return c.json(
+				{ error: 'invalid_input', detail: `"${oversized.name}" exceeds ${limit} limit` },
+				413,
+			);
+		}
+
+		const parsed = InputSchema.safeParse({
+			language: form.get('language') ?? undefined,
+			count: form.get('count') ?? undefined,
+		});
+		if (!parsed.success) {
+			return c.json(
+				{ error: 'invalid_input', issues: parsed.error.issues.map((i) => i.message) },
+				422,
+			);
+		}
+		const { language, count } = parsed.data;
+
+		const images = await Promise.all(
+			files.map(async (f) => ({
+				base64: Buffer.from(await f.arrayBuffer()).toString('base64'),
+				mimeType: f.type,
+			})),
+		);
+
+		const imageCount = images.length;
+		const hasPdf = files.some((f) => f.type === 'application/pdf');
+		const contentLabel = hasPdf
+			? imageCount === 1 ? 'diesem Dokument' : `diesen ${imageCount} Dateien`
+			: imageCount === 1 ? 'diesem Bild' : `diesen ${imageCount} Bildern`;
+		const userText = `Erstelle ${count} Lernkarten auf ${language === 'de' ? 'Deutsch' : 'English'} aus ${contentLabel}.`;
+
+		let generated: z.infer<typeof GeneratedDeckSchema>;
+		try {
+			const raw = await chatVisionJson({
+				images,
+				systemPrompt: SYSTEM_PROMPT,
+				userText,
+				timeoutMs: 120_000,
+			});
+			const r2 = GeneratedDeckSchema.safeParse(raw);
+			if (!r2.success) {
+				return c.json(
+					{
+						error: 'llm_returned_invalid_shape',
+						issues: r2.error.issues.map((i) => `${i.path.join('.')}: ${i.message}`),
+						raw,
+					},
+					502,
+				);
+			}
+			generated = r2.data;
+		} catch (e) {
+			const msg = e instanceof Error ? e.message : String(e);
+			return c.json({ error: 'llm_call_failed', detail: msg }, 502);
+		}
+
+		const fallback = hasPdf
+			? imageCount === 1 ? 'KI-generiert aus Dokument' : `KI-generiert aus ${imageCount} Dateien`
+			: imageCount === 1 ? 'KI-generiert aus Bild' : `KI-generiert aus ${imageCount} Bildern`;
+		const result = await insertGeneratedDeck(dbOf(), userId, generated, fallback);
+		return c.json(result, 201);
+	});
+
+	return r;
+}
diff --git a/apps/api/src/routes/decks-generate.ts b/apps/api/src/routes/decks-generate.ts
index 0aac656..84febfb 100644
--- a/apps/api/src/routes/decks-generate.ts
+++ b/apps/api/src/routes/decks-generate.ts
@@ -12,15 +12,9 @@ import { chatJson } from '../services/llm-client.ts';
 
 export type GenerateDeps = { db?: CardsDb };
 
-const GenerateInputSchema = z.object({
-	prompt: z.string().min(3).max(500),
-	language: z.enum(['de', 'en']).optional().default('de'),
-	count: z.number().int().min(1).max(40).optional().default(15),
-});
-
 // Was die LLM zurückgeben muss. zod-strict damit Halluzinationen
 // (extra Felder, falsche Types) hart abgelehnt werden.
-const GeneratedDeckSchema = z.object({
+export const GeneratedDeckSchema = z.object({
 	deck_name: z.string().min(1).max(80),
 	deck_description: z.string().max(400).optional(),
 	cards: z
@@ -33,7 +27,97 @@ const GeneratedDeckSchema = z.object({
 		.min(1)
 		.max(40),
 });
-type GeneratedDeck = z.infer<typeof GeneratedDeckSchema>;
+export type GeneratedDeck = z.infer<typeof GeneratedDeckSchema>;
+
+export async function insertGeneratedDeck(
+	db: CardsDb,
+	userId: string,
+	generated: GeneratedDeck,
+	descriptionFallback: string,
+) {
+	const deckId = ulid();
+	const now = new Date();
+	const cardRowsInsert = await Promise.all(
+		generated.cards.map(async (gc) => {
+			const id = ulid();
+			const fields = { front: gc.front, back: gc.back };
+			const contentHash = await cardContentHash({ type: 'basic', fields });
+			return { id, fields, contentHash };
+		})
+	);
+
+	await db.transaction(async (tx) => {
+		await tx.insert(decks).values({
+			id: deckId,
+			userId,
+			name: generated.deck_name,
+			description: generated.deck_description ?? descriptionFallback,
+			color: '#7c3aed', // purple-600 — marks the deck as AI-generated
+			visibility: 'private',
+			fsrsSettings: {},
+			createdAt: now,
+			updatedAt: now,
+		});
+
+		for (const cr of cardRowsInsert) {
+			await tx.insert(cards).values({
+				id: cr.id,
+				deckId,
+				userId,
+				type: 'basic',
+				fields: cr.fields,
+				mediaRefs: [],
+				contentHash: cr.contentHash,
+				createdAt: now,
+				updatedAt: now,
+			});
+			const subIndices = Array.from({ length: subIndexCount('basic') }, (_, i) => i);
+			const initial = subIndices.map((subIndex) => {
+				const r = newReview({ userId, cardId: cr.id, subIndex, now });
+				return {
+					cardId: r.card_id,
+					subIndex: r.sub_index,
+					userId: r.user_id,
+					due: new Date(r.due),
+					stability: r.stability,
+					difficulty: r.difficulty,
+					elapsedDays: r.elapsed_days,
+					scheduledDays: r.scheduled_days,
+					learningSteps: r.learning_steps,
+					reps: r.reps,
+					lapses: r.lapses,
+					state: r.state,
+					lastReview: r.last_review ? new Date(r.last_review) : null,
+				};
+			});
+			await tx.insert(reviews).values(initial);
+		}
+	});
+
+	const [row] = await db.select().from(decks).where(eq(decks.id, deckId)).limit(1);
+	return {
+		deck: row
+			? {
+					id: row.id,
+					name: row.name,
+					description: row.description,
+					color: row.color,
+					visibility: row.visibility,
+					fsrs_settings: row.fsrsSettings,
+					user_id: row.userId,
+					created_at: row.createdAt.toISOString(),
+					updated_at: row.updatedAt.toISOString(),
+				}
+			: null,
+		cards_created: cardRowsInsert.length,
+	};
+}
+
+const GenerateInputSchema = z.object({
+	prompt: z.string().min(3).max(500),
+	language: z.enum(['de', 'en']).optional().default('de'),
+	count: z.number().int().min(1).max(40).optional().default(15),
+});
 
 const SYSTEM_PROMPT = `Du bist ein Lerndesigner und erstellst Karteikarten-Decks für Spaced-Repetition-Lernen.
 
@@ -105,87 +189,13 @@ ${parsed.data.prompt}`;
 			return c.json({ error: 'llm_call_failed', detail: msg }, 502);
 		}
 
-		// Deck + Karten in einer Transaction anlegen.
-		const deckId = ulid();
-		const now = new Date();
-		const cardRowsInsert = await Promise.all(
-			generated.cards.map(async (gc) => {
-				const id = ulid();
-				const fields = { front: gc.front, back: gc.back };
-				const contentHash = await cardContentHash({ type: 'basic', fields });
-				return { id, fields, contentHash };
-			})
-		);
-
-		await dbOf().transaction(async (tx) => {
-			await tx.insert(decks).values({
-				id: deckId,
-				userId,
-				name: generated.deck_name,
-				description: generated.deck_description ?? `KI-generiert: ${parsed.data.prompt}`,
-				color: '#7c3aed', // purple-600 — visuelle Markierung als KI-generiert
-				visibility: 'private',
-				fsrsSettings: {},
-				createdAt: now,
-				updatedAt: now,
-			});
-
-			for (const cr of cardRowsInsert) {
-				await tx.insert(cards).values({
-					id: cr.id,
-					deckId,
-					userId,
-					type: 'basic',
-					fields: cr.fields,
-					mediaRefs: [],
-					contentHash: cr.contentHash,
-					createdAt: now,
-					updatedAt: now,
-				});
-				const subIndices = Array.from({ length: subIndexCount('basic') }, (_, i) => i);
-				const initial = subIndices.map((subIndex) => {
-					const r = newReview({ userId, cardId: cr.id, subIndex, now });
-					return {
-						cardId: r.card_id,
-						subIndex: r.sub_index,
-						userId: r.user_id,
-						due: new Date(r.due),
-						stability: r.stability,
-						difficulty: r.difficulty,
-						elapsedDays: r.elapsed_days,
-						scheduledDays: r.scheduled_days,
-						learningSteps: r.learning_steps,
-						reps: r.reps,
-						lapses: r.lapses,
-						state: r.state,
-						lastReview: r.last_review ? new Date(r.last_review) : null,
-					};
-				});
-				await tx.insert(reviews).values(initial);
-			}
-		});
-
-		// Deck-DTO zurückgeben.
-		const [row] = await dbOf().select().from(decks).where(eq(decks.id, deckId)).limit(1);
-		return c.json(
-			{
-				deck: row
-					? {
-							id: row.id,
-							name: row.name,
-							description: row.description,
-							color: row.color,
-							visibility: row.visibility,
-							fsrs_settings: row.fsrsSettings,
-							user_id: row.userId,
-							created_at: row.createdAt.toISOString(),
-							updated_at: row.updatedAt.toISOString(),
-						}
-					: null,
-				cards_created: cardRowsInsert.length,
-			},
-			201
+		const result = await insertGeneratedDeck(
+			dbOf(),
+			userId,
+			generated,
+			`KI-generiert: ${parsed.data.prompt}`,
 		);
+		return c.json(result, 201);
 	});
 
 	return r;
diff --git a/apps/api/src/services/llm-client.ts b/apps/api/src/services/llm-client.ts
index e15e468..c452b52 100644
--- a/apps/api/src/services/llm-client.ts
+++ b/apps/api/src/services/llm-client.ts
@@ -5,7 +5,8 @@
  * Aliases statt konkreter Modelle, damit der Routing-Layer die
  * Provider-Auswahl macht:
  *   - `mana/structured` für JSON-Output (Deck-Generation)
- *   - `mana/long-form` wenn freier Text gefragt wäre
+ *   - `mana/vision` für multimodale Anfragen mit Bild-Input
+ *   - `mana/long-form` wenn freier Text gefragt wäre
  *
  * Auth: heute keine — mana-llm hat `GPU_API_KEY` leer. Wenn das mal
  * gesetzt wird, kommt der Key über CARDS_LLM_API_KEY rein.
@@ -33,6 +34,65 @@ export class LlmError extends Error {
 	}
 }
 
+// Vision models sometimes wrap JSON in Markdown code fences; text that already starts as JSON is parsed as-is.
+function parseJsonFromCompletion(text: string): unknown {
+	const trimmed = text.trim();
+	const fenced = /^[{[]/.test(trimmed) ? null : trimmed.match(/```(?:json)?\s*([\s\S]*?)\s*```/);
+	return JSON.parse(fenced ? fenced[1] : trimmed);
+}
+
+export async function chatVisionJson<T>(opts: {
+	images: { base64: string; mimeType: string }[];
+	systemPrompt: string;
+	userText: string;
+	timeoutMs?: number;
+}): Promise<T> {
+	const headers: Record<string, string> = { 'Content-Type': 'application/json' };
+	if (LLM_API_KEY) headers['X-API-Key'] = LLM_API_KEY;
+
+	const controller = new AbortController();
+	const timer = setTimeout(() => controller.abort(), opts.timeoutMs ?? 90_000);
+
+	try {
+		const r = await fetch(`${LLM_URL}/v1/chat/completions`, {
+			method: 'POST',
+			headers,
+			signal: controller.signal,
+			body: JSON.stringify({
+				model: 'mana/vision',
+				messages: [
+					{ role: 'system', content: opts.systemPrompt },
+					{
+						role: 'user',
+						content: [
+							...opts.images.map((img) => ({
+								type: 'image_url' as const,
+								image_url: { url: `data:${img.mimeType};base64,${img.base64}` },
+							})),
+							{ type: 'text', text: opts.userText },
+						],
+					},
+				],
+				temperature: 0.5,
+			}),
+		});
+		if (!r.ok) {
+			const body = await r.text().catch(() => '');
+			throw new LlmError(r.status, body);
+		}
+		const data = (await r.json()) as ChatCompletion;
+		const content = data.choices?.[0]?.message?.content;
+		if (!content) throw new LlmError(0, data, 'mana-llm: empty completion');
+		try {
+			return parseJsonFromCompletion(content) as T;
+		} catch (e) {
+			throw new LlmError(0, content, `mana-llm: invalid JSON (${(e as Error).message})`);
+		}
+	} finally {
+		clearTimeout(timer);
+	}
+}
+
 export async function chatJson(opts: {
 	model: string;
 	messages: ChatMessage[];
diff --git a/apps/web/src/lib/api/client.ts b/apps/web/src/lib/api/client.ts
index 0cc871f..a1e0d14 100644
--- a/apps/web/src/lib/api/client.ts
+++ b/apps/web/src/lib/api/client.ts
@@ -56,6 +56,32 @@ async function doFetch(path: string, opts: RequestOptions): Promise {
 	});
 }
 
+// For multipart uploads (e.g. image → deck). The browser sets Content-Type
+// including the boundary automatically — no manual header needed.
+export async function apiForm<T>(path: string, form: FormData): Promise<T> {
+	await devUser.ensureFreshToken();
+	const headers: Record<string, string> = {};
+	if (devUser.token) {
+		headers['Authorization'] = `Bearer ${devUser.token}`;
+	} else if (devUser.stubId) {
+		headers['X-User-Id'] = devUser.stubId;
+	}
+
+	const res = await fetch(`${API_BASE}${path}`, { method: 'POST', headers, body: form });
+	if (!res.ok) {
+		// A response body can be read only once: take the text, then try JSON on it.
+		const text = await res.text();
+		let body: unknown = text;
+		try {
+			body = JSON.parse(text);
+		} catch {
+			// Not JSON (e.g. a proxy error page): keep the raw text as the error body.
+		}
+		throw new ApiError(res.status, body);
+	}
+	return (await res.json()) as T;
+}
+
 export async function api(path: string, opts: RequestOptions = {}): Promise {
 	// Proaktive Frische-Prüfung: wenn Token <60s gültig ist, refreshen
 	// wir, bevor der Request rausgeht. Coalesced über tryRefresh().
diff --git a/apps/web/src/lib/api/decks.ts b/apps/web/src/lib/api/decks.ts
index b84e52b..b852c5d 100644
--- a/apps/web/src/lib/api/decks.ts
+++ b/apps/web/src/lib/api/decks.ts
@@ -1,5 +1,5 @@
 import type { Deck, DeckCreate, DeckUpdate } from '@cards/domain';
-import { api } from './client.ts';
+import { api, apiForm } from './client.ts';
 
 export function listDecks(opts: { forkedFromMarketplace?: boolean } = {}) {
 	const qs = opts.forkedFromMarketplace ? '?forked_from_marketplace=true' : '';
@@ -28,3 +28,15 @@ export function generateDeck(input: { prompt: string; language?: 'de' | 'en'; co
 		body: input,
 	});
 }
+
+export function generateDeckFromImage(
+	files: File | File[],
+	opts: { language?: 'de' | 'en'; count?: number } = {},
+) {
+	const form = new FormData();
+	const arr = Array.isArray(files) ? files : [files];
+	for (const f of arr) form.append('file', f);
+	if (opts.language) form.append('language', opts.language);
+	if (opts.count != null) form.append('count', String(opts.count));
+	return apiForm<{ deck: Deck; cards_created: number }>('/api/v1/decks/from-image', form);
+}
diff --git a/apps/web/src/lib/components/NewDeckCard.svelte b/apps/web/src/lib/components/NewDeckCard.svelte
index 3c5f0af..62e19f8 100644
--- a/apps/web/src/lib/components/NewDeckCard.svelte
+++ b/apps/web/src/lib/components/NewDeckCard.svelte
@@ -1,7 +1,7 @@