Phase 8c: Anki-Import via portiertem Parser

Strategie-B-Ausnahme: parse.ts (Anki-Format-Parser via JSZip + sql.js)
und AnkiImport.svelte (UI-Stages) sind aus mana-monorepo portiert,
mit Source-Comment-Header dokumentiert. Anki-Format ist standalone
Parser-Logik, kein Architektur-Schmuggel.

Neuer server-authoritative import.ts schreibt direkt gegen die
cards-api ($lib/api/decks + cards) — keine Stores, keine Dexie.
Anki "::"-Hierarchie wird zu " / "-Strings flach. Fallback-Deck
"Anki-Import" für Karten ohne explizites Deck. Cloze-Karten kommen
first-class durch (Sub-Index pro Cluster, Sprint 8a/8b).

Phase-8-MVP-Scope: Bilder + Audio werden gedroppt (Option A) — der
sanitizeAnkiHtml entfernt <img> und [sound:…] ersatzlos. Späterer
Media-Pfad (lokaler Cards-Upload oder mana-media nach Phase 2) ist
additiv.

Neue Route /import + Top-Nav-Link. Hermetic Vitest (5 Cases): baut
zur Laufzeit ein Mini-.apkg via sql.js + JSZip und prüft den
Parser-Output (basic, basic-reverse, cloze, sanitize, dedupe auf
Note-Ebene). svelte-check 0 errors, prod-Build sauber.

sql-wasm.wasm liegt in static/ (660kB) — fix für sql.js 1.14.1, vom
Browser einmal geladen.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Till JS 2026-05-08 17:43:12 +02:00
parent 0b609c46fd
commit 2ca09fe0c3
9 changed files with 916 additions and 3 deletions

View file

@ -0,0 +1,120 @@
/**
* Server-authoritative Anki-Import.
*
* Schreibt gegen die cards-api HTTP-Endpoints — keine Dexie, keine
* lokalen Stores. Anki-Decks werden 1:1 in cards-Decks gemappt
* (Anki-`::` zu ` / ` flacht die Hierarchie aus, wie im Original).
* Karten werden mit sanitisiertem Markdown angelegt.
*
* Phase-8-MVP: Bilder + Audio werden gedroppt (siehe parse.ts
* `sanitizeAnkiHtml`). Ein späterer Media-Pfad ist additiv.
*
* No de-dupe: Re-Import derselben .apkg legt doppelte Decks an.
*/
import { createDeck } from '$lib/api/decks.ts';
import { createCard } from '$lib/api/cards.ts';
import { sanitizeAnkiHtml, type ParsedAnki } from './parse.ts';
/**
 * Summary returned by `importParsedAnki` after an import run.
 */
export interface ImportResult {
/** Number of decks created successfully, including the fallback deck when one was needed. */
decksCreated: number;
/** Number of cards created successfully. */
cardsCreated: number;
/** Number of decks or cards that could not be imported. */
failed: number;
/** Human-readable failure messages, prefixed with `deck`, `fallback deck` or `card`. */
failures: string[];
}
/**
 * Progress event passed to the optional `onProgress` callback of
 * `importParsedAnki`.
 */
export interface ImportProgress {
/** Current phase: creating decks, creating cards, or finished. */
stage: 'decks' | 'cards' | 'done';
/** Zero-based index of the item about to be processed; equals `total` in the `done` event. */
current: number;
/** Number of items in the current phase (decks or cards); the `done` event reports the card count. */
total: number;
}
/**
 * Writes a parsed Anki collection to the cards API.
 *
 * Decks are created first, one request at a time, with Anki's `::`
 * hierarchy separator flattened to ` / `. Cards are then created one by
 * one with every field passed through `sanitizeAnkiHtml`. A card whose
 * Anki deck was not created (or is unknown) goes into a lazily created
 * deck named "Anki-Import".
 *
 * Individual failures never abort the run; they are counted and described
 * in the returned summary.
 *
 * @param parsed - Output of the Anki parser.
 * @param opts - Optional `onProgress` callback, invoked before each deck
 *   and card and once more with stage `done`.
 * @returns Counters and failure messages for the whole run.
 */
export async function importParsedAnki(
  parsed: ParsedAnki,
  opts: { onProgress?: (p: ImportProgress) => void } = {}
): Promise<ImportResult> {
  const summary: ImportResult = { decksCreated: 0, cardsCreated: 0, failed: 0, failures: [] };

  // Looks the callback up on every call so it is invoked exactly like a
  // direct `opts.onProgress?.(…)`.
  const report = (progress: ImportProgress): void => {
    opts.onProgress?.(progress);
  };

  // Step 1: decks. The Anki "::" hierarchy is flattened into " / " names.
  const deckIdByAnkiId = new Map<string, string>();
  for (const [position, sourceDeck] of parsed.decks.entries()) {
    report({ stage: 'decks', current: position, total: parsed.decks.length });
    const flatName = sourceDeck.name.replace(/::/g, ' / ');
    try {
      const created = await createDeck({ name: flatName });
      deckIdByAnkiId.set(sourceDeck.ankiId, created.id);
      summary.decksCreated += 1;
    } catch (e) {
      summary.failed += 1;
      summary.failures.push(`deck "${flatName}": ${errMessage(e)}`);
    }
  }

  // Fallback deck for cards whose Anki deck has no created counterpart.
  // It is created on first demand and reused; a failed attempt is retried
  // for the next card that needs it.
  let fallbackDeckId: string | null = null;
  async function resolveFallbackDeck(): Promise<string | null> {
    if (fallbackDeckId) {
      return fallbackDeckId;
    }
    try {
      const created = await createDeck({ name: 'Anki-Import' });
      fallbackDeckId = created.id;
      summary.decksCreated += 1;
      return fallbackDeckId;
    } catch (e) {
      summary.failures.push(`fallback deck: ${errMessage(e)}`);
      return null;
    }
  }

  // Step 2: cards. Fields are sanitized, which drops media references.
  for (let index = 0; index < parsed.cards.length; index += 1) {
    report({ stage: 'cards', current: index, total: parsed.cards.length });
    const source = parsed.cards[index];

    const deckId = deckIdByAnkiId.get(source.ankiDeckId) || (await resolveFallbackDeck());
    if (!deckId) {
      summary.failed += 1;
      continue;
    }

    const sanitized: Record<string, string> = {};
    Object.entries(source.fields).forEach(([fieldName, rawValue]) => {
      sanitized[fieldName] = sanitizeAnkiHtml(rawValue);
    });

    try {
      await createCard({ deck_id: deckId, type: source.type, fields: sanitized });
      summary.cardsCreated += 1;
    } catch (e) {
      summary.failed += 1;
      summary.failures.push(`card "${preview(sanitized)}": ${errMessage(e)}`);
    }
  }

  report({ stage: 'done', current: parsed.cards.length, total: parsed.cards.length });
  return summary;
}
/**
 * Turns a caught value into a printable message: the `message` of an
 * `Error`, otherwise the value's string conversion.
 */
function errMessage(e: unknown): string {
  return e instanceof Error ? e.message : String(e);
}
/**
 * Builds a short single-line preview of a card for failure messages.
 *
 * Takes the first field value, collapses every whitespace run to a single
 * space and truncates the result to 40 characters, appending `…` when
 * something was cut off.
 *
 * Whitespace is collapsed before truncation so runs of blanks or newlines
 * do not use up the length budget. Truncation counts Unicode code points
 * rather than UTF-16 code units, so an emoji or other astral character is
 * never cut in half.
 *
 * @param fields - Card fields; only the first value is used.
 * @returns The preview text, or an empty string when there are no fields.
 */
function preview(fields: Record<string, string>): string {
  const first = Object.values(fields)[0] ?? '';
  const collapsed = first.replace(/\s+/g, ' ');
  // Array.from iterates by code point, keeping surrogate pairs intact.
  const codePoints = Array.from(collapsed);
  if (codePoints.length <= 40) {
    return collapsed;
  }
  return codePoints.slice(0, 40).join('') + '…';
}

View file

@ -0,0 +1,241 @@
/**
* Parse an Anki .apkg / .colpkg file in the browser.
*
* .apkg = ZIP archive containing a SQLite collection (`collection.anki2`
* or `collection.anki21`) plus media files. We open the SQLite blob with
* sql.js (WASM-backed in-browser SQLite) and walk Anki's three core
* tables: `col` (collection meta with JSON-encoded models + decks),
* `notes` (the user-typed content), and `cards` (one row per learnable
* unit — basic = 1, basic-reverse = 2, cloze = N).
*
* MVP scope (Cards Phase 8): basic + basic-reverse + cloze. Media is
* collected but not uploaded — image/audio refs are stripped from the
* sanitized text. Review history is skipped — FSRS state will be
* regenerated on first sight.
*
* --------------------------------------------------------------------
* STRATEGIE-B-AUSNAHME: Diese Datei ist ein bewusst portierter Lift aus
* mana-monorepo/apps/cards/apps/web/src/lib/anki/parse.ts (commit
* ~Mai 2026). Anki-Format-Logik ist standalone Parser-Code ohne
* Architektur-Übernahme — die Kopie spart 2-3 Tage Re-Implementierung
* bei null Strategy-Risiko. CardType-Import auf @cards/domain
* umgestellt, Doc-Kommentar an Phase-8-Scope angepasst.
* --------------------------------------------------------------------
*/
import JSZip, { type JSZipObject } from 'jszip';
import initSqlJs, { type Database } from 'sql.js';
import type { CardType } from '@cards/domain';
/** A deck entry read from the collection's deck metadata. */
export interface ParsedDeck {
ankiId: string; // Anki's numeric deck id, stringified
name: string; // "Studies::Spanish" — Anki uses :: as separator
}
/**
 * One importable card, produced from a single Anki note.
 */
export interface ParsedCard {
/** Stringified Anki deck id taken from the first `cards` row seen for the note. */
ankiDeckId: string;
/** Target card type derived from the note's model. */
type: CardType;
/** Raw (unsanitized) field values, keyed by target field name such as `front`, `back`, `text`, `extra`. */
fields: Record<string, string>;
}
/**
 * Complete result of parsing an Anki package.
 */
export interface ParsedAnki {
/** Decks listed in the collection metadata, without Anki's "Default" deck (id 1). */
decks: ParsedDeck[];
/** One entry per imported note. */
cards: ParsedCard[];
/** Number of notes that could not be converted into a card. */
skipped: number;
/** Messages collected while parsing, intended for display to the user. */
warnings: string[];
/**
* Mapping from the original media filename (as referenced in card
* fields, e.g. `paris.jpg` or `audio_001.mp3`) to its ZIP entry. Anki
* stores files under numeric names (`0`, `1`, …) and the JSON manifest
* (`media`) maps those numbers to the original names; we flip that
* here so the importer can look up by the name it sees in the field text.
*/
mediaByFilename: Map<string, JSZipObject>;
}
/**
 * Shape of one note-type ("model") entry in the JSON stored in the
 * `models` column of Anki's `col` table. Only the properties listed here
 * are declared; the parser reads `type` and the number of `tmpls`.
 */
interface AnkiModel {
id: number;
name: string;
type: number; // 0 = standard, 1 = cloze
// Field definitions of the model.
flds: { name: string }[];
// Card templates; exactly two templates are treated as basic-reverse.
tmpls: { name: string }[];
}
/**
 * Shape of one deck entry in the JSON stored in the `decks` column of
 * Anki's `col` table; only the properties used by the parser are declared.
 */
interface AnkiDeckJson {
id: number;
name: string;
}
/** sql.js module, cached after the first successful initialisation. */
let SQL: Awaited<ReturnType<typeof initSqlJs>> | null = null;

/**
 * Returns the sql.js module, initialising it on first use.
 *
 * Files requested by sql.js are resolved against the site root (`/`).
 */
async function getSql() {
  if (SQL) {
    return SQL;
  }
  const loaded = await initSqlJs({ locateFile: (file) => `/${file}` });
  SQL = loaded;
  return loaded;
}
/**
 * Parses an Anki package into decks, cards and a media lookup table.
 *
 * The package is opened as a ZIP archive; the SQLite collection
 * (`collection.anki21`, falling back to `collection.anki2`) is loaded into
 * sql.js and walked by `extract`.
 *
 * @param file - The package chosen by the user.
 * @returns Parsed decks and cards, skip count, warnings and the media map.
 * @throws Error when the archive contains no collection file, or when the
 *   collection cannot be read.
 */
export async function parseApkg(file: File | Blob): Promise<ParsedAnki> {
  const zip = await JSZip.loadAsync(await file.arrayBuffer());
  const collectionEntry = zip.file('collection.anki21') ?? zip.file('collection.anki2');
  if (!collectionEntry) {
    throw new Error(
      'Keine Anki-Collection-Datei in der .apkg gefunden (erwartet: collection.anki21 oder collection.anki2).'
    );
  }
  const sqliteBytes = await collectionEntry.async('uint8array');
  const sql = await getSql();
  const db: Database = new sql.Database(sqliteBytes);
  // Everything after the database is opened runs inside try/finally so the
  // database is closed even when reading the media manifest fails.
  try {
    const mediaByFilename = await extractMediaManifest(zip);
    const result = extract(db);
    return { ...result, mediaByFilename };
  } finally {
    db.close();
  }
}
/**
 * Builds the lookup table from original media filename to ZIP entry.
 *
 * The `media` entry of the archive is read as a JSON object mapping the
 * stored entry name to the original filename. Reading is best-effort: a
 * missing, unparsable or non-object manifest yields an empty map, and
 * individual entries are skipped when the name is not a string or the
 * referenced ZIP entry does not exist.
 *
 * @param zip - The opened package archive.
 * @returns Map keyed by original filename; possibly empty.
 */
async function extractMediaManifest(zip: JSZip): Promise<Map<string, JSZipObject>> {
  const out = new Map<string, JSZipObject>();
  const manifestEntry = zip.file('media');
  if (!manifestEntry) return out;

  let manifest: unknown;
  try {
    manifest = JSON.parse(await manifestEntry.async('string'));
  } catch {
    return out;
  }
  // JSON.parse may legally return null or a primitive; Object.entries
  // would throw on null, so anything that is not an object is ignored.
  if (typeof manifest !== 'object' || manifest === null) return out;

  for (const [numericKey, originalName] of Object.entries(manifest)) {
    if (typeof originalName !== 'string') continue;
    const entry = zip.file(numericKey);
    if (entry) out.set(originalName, entry);
  }
  return out;
}
// Result of the internal `extract` step: everything in ParsedAnki except
// the media map. Media is added at the parseApkg layer so the SQLite-only
// path stays focused.
type ExtractResult = Omit<ParsedAnki, 'mediaByFilename'>;
/**
 * Reads decks and notes out of an opened Anki collection database.
 *
 * Deck and model definitions come from the JSON columns of the `col`
 * table. Notes are loaded once into memory, then the `cards` table is
 * walked and each note is converted at most once: the target model
 * regenerates sibling cards (reverse side, further cloze indices) from
 * `type` + `fields`, so one entry per note is enough. The deck of the
 * first card row seen for a note is used.
 *
 * @param db - Opened sql.js database of the collection.
 * @returns Decks, converted cards, skip count and warnings.
 * @throws Error when the `col` table has no row.
 */
function extract(db: Database): ExtractResult {
  const colResult = db.exec('SELECT models, decks FROM col LIMIT 1');
  const hasColRow = colResult.length > 0 && colResult[0].values.length > 0;
  if (!hasColRow) {
    throw new Error('Anki-Collection ist leer.');
  }
  const [modelsJson, decksJson] = colResult[0].values[0] as [string, string];
  const models: Record<string, AnkiModel> = JSON.parse(modelsJson);
  const decksMap: Record<string, AnkiDeckJson> = JSON.parse(decksJson);

  // Anki's "Default" deck always has id 1 and is left out of the deck list.
  const decks: ParsedDeck[] = [];
  for (const deck of Object.values(decksMap)) {
    if (deck.id === 1) continue;
    decks.push({ ankiId: String(deck.id), name: deck.name });
  }

  // Load all notes up front so the card loop never queries SQLite.
  type NoteRow = { id: string; mid: string; flds: string };
  const notesById = new Map<string, NoteRow>();
  const notesResult = db.exec('SELECT id, mid, flds FROM notes');
  if (notesResult.length > 0) {
    notesResult[0].values.forEach((row) => {
      const [id, mid, flds] = row as [number, number, string];
      const key = String(id);
      notesById.set(key, { id: key, mid: String(mid), flds });
    });
  }

  const cardsResult = db.exec('SELECT nid, did, ord FROM cards');
  if (cardsResult.length === 0) {
    return { decks, cards: [], skipped: 0, warnings: ['Keine Karten gefunden.'] };
  }

  const warnings: string[] = [];
  const cards: ParsedCard[] = [];
  const handledNoteIds = new Set<string>();
  let skipped = 0;

  for (const row of cardsResult[0].values) {
    const [nid, did] = row as [number, number, number];
    const noteId = String(nid);
    // Several card rows can point at the same note; only the first counts.
    if (handledNoteIds.has(noteId)) continue;
    handledNoteIds.add(noteId);

    const note = notesById.get(noteId);
    if (note === undefined) {
      skipped += 1;
      continue;
    }

    const model = models[note.mid];
    if (!model) {
      skipped += 1;
      warnings.push(`Note ${nid}: unknown model ${note.mid}`);
      continue;
    }

    // Anki separates the field values of a note with the 0x1f character.
    const mapped = mapNoteToCard(model, note.flds.split('\x1f'));
    if (mapped) {
      cards.push({ ankiDeckId: String(did), ...mapped });
    } else {
      skipped += 1;
    }
  }

  if (skipped > 0) {
    warnings.unshift(`${skipped} Karten übersprungen (unbekannter Typ).`);
  }
  return { decks, cards, skipped, warnings };
}
/**
 * Converts the field values of one Anki note into a card of the target
 * model, based on the note's Anki model.
 *
 * - Cloze models (`type` 1): first field becomes `text`, a non-empty
 *   second field becomes `extra`.
 * - Standard models (`type` 0): the first two fields become `front` and
 *   `back`. Exactly two templates mean `basic-reverse`; any other template
 *   count is treated as `basic`, so custom multi-card templates lose their
 *   additional card surfaces.
 *
 * Fields beyond the second are not carried over.
 *
 * @param model - The note's Anki model.
 * @param fields - Field values in model order.
 * @returns The mapped card, or `null` for an unsupported model type.
 */
function mapNoteToCard(
  model: AnkiModel,
  fields: string[]
): { type: CardType; fields: Record<string, string> } | null {
  switch (model.type) {
    case 1: {
      const clozeFields: Record<string, string> = { text: fields[0] ?? '' };
      if (fields[1]) {
        clozeFields.extra = fields[1];
      }
      return { type: 'cloze', fields: clozeFields };
    }
    case 0: {
      const sides = { front: fields[0] ?? '', back: fields[1] ?? '' };
      const kind = model.tmpls.length === 2 ? 'basic-reverse' : 'basic';
      return { type: kind, fields: sides };
    }
    default:
      return null;
  }
}
/**
 * Converts Anki's HTML / image / sound markup to plain text + Markdown.
 *
 * Phase-8 MVP: images and `[sound:…]` references are dropped without a
 * replacement. A later media path can substitute a filename → URL map
 * here and expand the references instead of removing them.
 *
 * Conversion rules:
 * - `<br>` becomes a newline, `<p>` / `<div>` boundaries become newlines;
 * - `<b>` / `<strong>` become `**`, `<i>` / `<em>` become `*`;
 * - every other tag is removed, its text content is kept;
 * - a small set of HTML entities is decoded;
 * - runs of three or more newlines shrink to two, and the result is trimmed.
 *
 * The formatting tags are matched with or without attributes, so
 * `<div class="x">` still produces its line break.
 *
 * @param html - Raw field content as stored by Anki.
 * @returns Sanitized text.
 */
export function sanitizeAnkiHtml(html: string): string {
  return (
    html
      // Remove image tags and audio references entirely.
      .replace(/<img\b[^>]*>/gi, '')
      .replace(/\[sound:[^\]]+\]/g, '')
      .replace(/<br(?:\s[^>]*)?\/?>/gi, '\n')
      .replace(/<\/?(?:b|strong)(?:\s[^>]*)?>/gi, '**')
      .replace(/<\/?(?:i|em)(?:\s[^>]*)?>/gi, '*')
      .replace(/<\/?(?:p|div)(?:\s[^>]*)?>/gi, '\n')
      .replace(/<[^>]+>/g, '')
      .replace(/&nbsp;/g, ' ')
      .replace(/&lt;/g, '<')
      .replace(/&gt;/g, '>')
      .replace(/&quot;/g, '"')
      .replace(/&#39;/g, "'")
      // `&amp;` is decoded last: decoding it first would turn an escaped
      // entity such as `&amp;lt;` into `&lt;` and then into `<`.
      .replace(/&amp;/g, '&')
      .replace(/\n{3,}/g, '\n\n')
      .trim()
  );
}