feat(cards-web): Anki import carries images + audio along

Closes the gap from the first Anki-import pass: media files are now
uploaded alongside the cards instead of stripped.

Pipeline:
  • parse.ts: read the .apkg's `media` JSON manifest, build a
    filename → ZIP-entry map (Anki names files numerically; the
    manifest is the original-name lookup table). Returned alongside
    decks/cards as parsed.mediaByFilename.
  • import.ts: collectMediaRefs() walks every card field, gathers
    distinct <img src=…> and [sound:…] references — orphan media
    bundled in the .apkg are ignored. Referenced files upload to
    mana-media in 4 parallel workers, returning a filename → URL map.
  • parse.sanitizeAnkiHtml() now takes that map: <img src="X"> →
    <img src="<url>" alt="" />, [sound:Y] → <audio controls
    preload="metadata" src="<url>"/>. The remaining-tag stripper has
    a negative lookahead for img/audio/video/source so the new tags
    survive.
  • CardFace already renders <img>/<audio> via @mana/cards-core's
    DOMPurify config (the image/audio attachments commit added the
    allowlist), so the freshly-imported cards just work in the
    learn session.

UI:
  • AnkiImport gains an "uploading-media" stage with an X / N progress
    bar between preview and card creation.
  • Preview now shows the media count, copy promise updated from
    "Bilder/Audio bleiben raus" to "Bilder + Audio werden mit
    übernommen".
  • Result block reports `N Medien übernommen · M fehlgeschlagen`.

Phase-2 ideas: per-user media scoping in mana-media; verify-then-
upload via /media/hash/:sha256 to skip duplicates from re-imports.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Till JS 2026-05-07 14:25:43 +02:00
parent 0ae1e70bf1
commit 82db4eb794
3 changed files with 250 additions and 41 deletions

View file

@ -2,9 +2,14 @@
* Apply a `ParsedAnki` to the local DB. * Apply a `ParsedAnki` to the local DB.
* *
* Strategy: every Anki deck becomes one of our decks (1:1, name-mapped). * Strategy: every Anki deck becomes one of our decks (1:1, name-mapped).
* Card content is HTML-sanitized to plain Markdown before save. Reviews * Card content is HTML-sanitized to plain Markdown / inline media tags
* are auto-generated by reviewStore.ensureReviewsForCard — the imported * before save. Reviews are auto-generated by reviewStore.ensureReviewsForCard
* cards become "new" in the FSRS sense, no inherited schedule. * — the imported cards become "new" in the FSRS sense, no inherited schedule.
*
* Media: every referenced file is uploaded to mana-media first; the
* resulting URL replaces the original Anki filename in the field text.
* Files referenced from no card are skipped — many Anki decks bundle
* orphaned media that bloats the upload time.
* *
* No de-dupe: re-importing the same .apkg adds duplicate decks. The UI * No de-dupe: re-importing the same .apkg adds duplicate decks. The UI
* warns about this once we decide it matters. * warns about this once we decide it matters.
@ -12,20 +17,135 @@
import { deckStore } from '../stores/decks.svelte'; import { deckStore } from '../stores/decks.svelte';
import { cardStore } from '../stores/cards.svelte'; import { cardStore } from '../stores/cards.svelte';
import { uploadCardMedia, MediaUploadError } from '../media/upload';
import { sanitizeAnkiHtml, type ParsedAnki } from './parse'; import { sanitizeAnkiHtml, type ParsedAnki } from './parse';
export interface ImportResult { export interface ImportResult {
decksCreated: number; decksCreated: number;
cardsCreated: number; cardsCreated: number;
mediaUploaded: number;
mediaFailed: number;
failed: number; failed: number;
} }
export async function importParsedAnki(parsed: ParsedAnki): Promise<ImportResult> { export interface MediaProgress {
const result: ImportResult = { decksCreated: 0, cardsCreated: 0, failed: 0 }; uploaded: number;
total: number;
}
// Anki deck names use "::" as a separator for nesting — flatten with const MEDIA_CONCURRENCY = 4;
// a slash so the user sees a meaningful single-line title and we // Anki's <img src="..."> always quotes; we also catch [sound:foo.mp3].
// don't have to invent a hierarchy concept yet. const IMG_RE = /<img\b[^>]*\bsrc=["']([^"']+)["']/gi;
const SOUND_RE = /\[sound:([^\]]+)\]/g;
/**
 * Gather the distinct media filenames referenced by the parsed cards.
 *
 * Every field value of every card is scanned with `IMG_RE` and then
 * `SOUND_RE`; capture group 1 of each match is added to the result set.
 *
 * @param parsed Parsed deck data whose `cards[].fields` values are scanned.
 * @returns The set of captured reference strings (duplicates collapse).
 */
function collectMediaRefs(parsed: ParsedAnki): Set<string> {
  const found = new Set<string>();

  // Both patterns are shared, global regexes, so `lastIndex` is rewound
  // before each scan to make sure matching starts at the beginning.
  const scan = (pattern: RegExp, text: string): void => {
    pattern.lastIndex = 0;
    for (let hit = pattern.exec(text); hit; hit = pattern.exec(text)) {
      found.add(hit[1]);
    }
  };

  for (const { fields } of parsed.cards) {
    for (const text of Object.values(fields)) {
      scan(IMG_RE, text);
      scan(SOUND_RE, text);
    }
  }
  return found;
}
/**
 * Upload a single referenced media file and report the resulting URL.
 *
 * Never rejects: a filename missing from `parsed.mediaByFilename`, or any
 * error thrown while reading or uploading, yields `url: null` so the caller
 * can count the file as failed and continue.
 *
 * @param filename Name used to look the entry up in `parsed.mediaByFilename`;
 *   also used as the uploaded file's name.
 * @param parsed Parsed deck data holding the filename → ZIP-entry map.
 * @returns The filename paired with the uploaded URL, or `null` on failure.
 */
async function uploadOne(
  filename: string,
  parsed: ParsedAnki
): Promise<{ filename: string; url: string | null }> {
  const zipEntry = parsed.mediaByFilename.get(filename);
  if (!zipEntry) {
    return { filename, url: null };
  }

  try {
    const contents = await zipEntry.async('blob');
    const uploaded = await uploadCardMedia(
      new File([contents], filename, { type: guessMime(filename) })
    );
    return { filename, url: uploaded.url };
  } catch (err) {
    // MediaUploadError is logged by message only; anything else is logged
    // as the raw thrown value.
    const detail = err instanceof MediaUploadError ? err.message : err;
    console.warn(`[anki] media upload failed: ${filename}`, detail);
    return { filename, url: null };
  }
}
/**
 * Lower-cased file extension → MIME type for the media kinds handled here.
 * A `Map` is used instead of an object literal so that extensions such as
 * `constructor` or `__proto__` cannot resolve to inherited
 * `Object.prototype` members.
 */
const MIME_BY_EXTENSION: ReadonlyMap<string, string> = new Map<string, string>([
  ['jpg', 'image/jpeg'],
  ['jpeg', 'image/jpeg'],
  ['png', 'image/png'],
  ['gif', 'image/gif'],
  ['webp', 'image/webp'],
  ['svg', 'image/svg+xml'],
  ['mp3', 'audio/mpeg'],
  ['ogg', 'audio/ogg'],
  ['oga', 'audio/ogg'],
  ['wav', 'audio/wav'],
  ['m4a', 'audio/mp4'],
  ['mp4', 'video/mp4'],
  ['webm', 'video/webm'],
]);

/**
 * Guess a MIME type from a filename's extension.
 *
 * The extension is the text after the last `.`, compared case-insensitively.
 * A filename without any `.` has no extension and is never treated as one
 * (so a file literally named `mp3` is not reported as audio).
 *
 * @param filename File name to inspect.
 * @returns The matching MIME type, or `application/octet-stream` when the
 *   extension is missing or unknown. Always a string.
 */
function guessMime(filename: string): string {
  const fallback = 'application/octet-stream';
  const dot = filename.lastIndexOf('.');
  if (dot === -1) return fallback;
  const ext = filename.slice(dot + 1).toLowerCase();
  return MIME_BY_EXTENSION.get(ext) ?? fallback;
}
/**
 * Upload every referenced media file that is actually present in the
 * package, using `MEDIA_CONCURRENCY` workers that pull from a shared queue.
 *
 * References with no entry in `parsed.mediaByFilename` are filtered out up
 * front and count neither as uploaded nor as failed.
 *
 * @param parsed Parsed deck data (cards plus filename → ZIP-entry map).
 * @param onProgress Optional callback invoked after each file finishes.
 *   Its `uploaded` value is the number of files processed so far
 *   (successes plus failures). Called once with `0 / 0` when there is
 *   nothing to upload.
 * @returns The filename → URL map for successful uploads and both tallies.
 */
async function uploadAllMedia(
  parsed: ParsedAnki,
  onProgress?: (p: MediaProgress) => void
): Promise<{ urlByFilename: Map<string, string>; uploaded: number; failed: number }> {
  const queue: string[] = [];
  for (const name of collectMediaRefs(parsed)) {
    if (parsed.mediaByFilename.has(name)) queue.push(name);
  }
  const total = queue.length;

  const urlByFilename = new Map<string, string>();
  const tally = { uploaded: 0, failed: 0 };

  if (total === 0) {
    onProgress?.({ uploaded: 0, total: 0 });
    return { urlByFilename, uploaded: tally.uploaded, failed: tally.failed };
  }

  // Shared cursor: each worker claims the next unclaimed index before
  // awaiting, so no file is picked up twice.
  let cursor = 0;
  const drain = async (): Promise<void> => {
    for (let i = cursor++; i < total; i = cursor++) {
      const { filename, url } = await uploadOne(queue[i], parsed);
      if (url) {
        urlByFilename.set(filename, url);
        tally.uploaded += 1;
      } else {
        tally.failed += 1;
      }
      onProgress?.({ uploaded: tally.uploaded + tally.failed, total });
    }
  };

  const workers: Promise<void>[] = [];
  for (let w = 0; w < MEDIA_CONCURRENCY; w++) {
    workers.push(drain());
  }
  await Promise.all(workers);

  return { urlByFilename, uploaded: tally.uploaded, failed: tally.failed };
}
export async function importParsedAnki(
parsed: ParsedAnki,
opts: { onMediaProgress?: (p: MediaProgress) => void } = {}
): Promise<ImportResult> {
const result: ImportResult = {
decksCreated: 0,
cardsCreated: 0,
mediaUploaded: 0,
mediaFailed: 0,
failed: 0,
};
// 1) Media — upload before any cards so the field-text rewrite has
// real URLs to point at. Empty in the no-media case.
const { urlByFilename, uploaded, failed } = await uploadAllMedia(parsed, opts.onMediaProgress);
result.mediaUploaded = uploaded;
result.mediaFailed = failed;
// 2) Decks — Anki "::" hierarchy flattened to " / ".
const ankiIdToDeckId = new Map<string, string>(); const ankiIdToDeckId = new Map<string, string>();
for (const ankiDeck of parsed.decks) { for (const ankiDeck of parsed.decks) {
const title = ankiDeck.name.replace(/::/g, ' / '); const title = ankiDeck.name.replace(/::/g, ' / ');
@ -38,9 +158,8 @@ export async function importParsedAnki(parsed: ParsedAnki): Promise<ImportResult
result.decksCreated++; result.decksCreated++;
} }
// Cards whose Anki deck wasn't in the parsed list (e.g. the implicit // Fallback deck for cards whose Anki deck wasn't in the parsed list
// "Default" deck Anki uses for orphans) get a fallback deck so we // (the "Default" deck Anki uses for orphans, mostly).
// don't drop any user content.
const ensureFallbackDeck = (() => { const ensureFallbackDeck = (() => {
let id: string | null = null; let id: string | null = null;
return async () => { return async () => {
@ -57,7 +176,8 @@ export async function importParsedAnki(parsed: ParsedAnki): Promise<ImportResult
}; };
})(); })();
let orderByDeck = new Map<string, number>(); // 3) Cards — sanitize each field with the media URL map.
const orderByDeck = new Map<string, number>();
for (const card of parsed.cards) { for (const card of parsed.cards) {
let targetDeckId = ankiIdToDeckId.get(card.ankiDeckId); let targetDeckId = ankiIdToDeckId.get(card.ankiDeckId);
if (!targetDeckId) { if (!targetDeckId) {
@ -71,7 +191,7 @@ export async function importParsedAnki(parsed: ParsedAnki): Promise<ImportResult
const cleanFields: Record<string, string> = {}; const cleanFields: Record<string, string> = {};
for (const [key, value] of Object.entries(card.fields)) { for (const [key, value] of Object.entries(card.fields)) {
cleanFields[key] = sanitizeAnkiHtml(value); cleanFields[key] = sanitizeAnkiHtml(value, urlByFilename);
} }
const order = orderByDeck.get(targetDeckId) ?? 0; const order = orderByDeck.get(targetDeckId) ?? 0;

View file

@ -13,7 +13,7 @@
* regenerated on first sight. * regenerated on first sight.
*/ */
import JSZip from 'jszip'; import JSZip, { type JSZipObject } from 'jszip';
import initSqlJs, { type Database } from 'sql.js'; import initSqlJs, { type Database } from 'sql.js';
import type { CardType } from '@mana/cards-core'; import type { CardType } from '@mana/cards-core';
@ -33,6 +33,14 @@ export interface ParsedAnki {
cards: ParsedCard[]; cards: ParsedCard[];
skipped: number; skipped: number;
warnings: string[]; warnings: string[];
/**
* Mapping from the original media filename (as referenced in card
* fields, e.g. `paris.jpg` or `audio_001.mp3`) to its ZIP entry. Anki
* stores files numerically (`0`, `1`, …) and the JSON manifest
* (`media`) maps numbers → original names; we flip that here so the
* importer can look up by the name it sees in the field text.
*/
mediaByFilename: Map<string, JSZipObject>;
} }
interface AnkiModel { interface AnkiModel {
@ -69,14 +77,37 @@ export async function parseApkg(file: File | Blob): Promise<ParsedAnki> {
const sql = await getSql(); const sql = await getSql();
const db: Database = new sql.Database(sqliteBytes); const db: Database = new sql.Database(sqliteBytes);
const mediaByFilename = await extractMediaManifest(zip);
try { try {
return extract(db); const result = extract(db);
return { ...result, mediaByFilename };
} finally { } finally {
db.close(); db.close();
} }
} }
function extract(db: Database): ParsedAnki { async function extractMediaManifest(zip: JSZip): Promise<Map<string, JSZipObject>> {
const out = new Map<string, JSZipObject>();
const manifestEntry = zip.file('media');
if (!manifestEntry) return out;
let manifest: Record<string, string>;
try {
manifest = JSON.parse(await manifestEntry.async('string'));
} catch {
return out;
}
for (const [numericKey, originalName] of Object.entries(manifest)) {
const entry = zip.file(numericKey);
if (entry) out.set(originalName, entry);
}
return out;
}
// Internal extract returns everything except media — that's plumbed in
// at the parseApkg layer so the SQLite-only path stays focused.
type ExtractResult = Omit<ParsedAnki, 'mediaByFilename'>;
function extract(db: Database): ExtractResult {
const colRow = db.exec('SELECT models, decks FROM col LIMIT 1'); const colRow = db.exec('SELECT models, decks FROM col LIMIT 1');
if (colRow.length === 0 || colRow[0].values.length === 0) { if (colRow.length === 0 || colRow[0].values.length === 0) {
throw new Error('Anki-Collection ist leer.'); throw new Error('Anki-Collection ist leer.');
@ -169,25 +200,48 @@ function mapNoteToCard(
return null; return null;
} }
/** Strip Anki's HTML / image / sound markup down to plain text + Markdown. /**
* Conservative keeps line breaks and bold/italic but strips images * Convert Anki's HTML / image / sound markup to plain text + Markdown.
* and sound refs (Phase-2 will re-import media). */ *
export function sanitizeAnkiHtml(html: string): string { * `mediaUrlByFilename` maps the filename Anki references in the field
return html * (e.g. `paris.jpg` for `<img src="paris.jpg">` or `audio.mp3` for
.replace(/<img[^>]*>/g, '') * `[sound:audio.mp3]`) to its post-upload URL on mana-media. Anything
.replace(/\[sound:[^\]]+\]/g, '') * not in the map is dropped silently same as the no-media path.
.replace(/<br\s*\/?>/gi, '\n') */
.replace(/<\/?(?:b|strong)>/gi, '**') export function sanitizeAnkiHtml(
.replace(/<\/?(?:i|em)>/gi, '*') html: string,
.replace(/<\/?p>/gi, '\n') mediaUrlByFilename: Map<string, string> = new Map()
.replace(/<\/?div>/gi, '\n') ): string {
.replace(/<[^>]+>/g, '') // drop remaining tags const imgReplaced = html.replace(
.replace(/&nbsp;/g, ' ') /<img\b[^>]*\bsrc=["']([^"']+)["'][^>]*>/gi,
.replace(/&amp;/g, '&') (_, src: string) => {
.replace(/&lt;/g, '<') const url = mediaUrlByFilename.get(src);
.replace(/&gt;/g, '>') return url ? `<img src="${url}" alt="" />` : '';
.replace(/&quot;/g, '"') }
.replace(/&#39;/g, "'") );
.replace(/\n{3,}/g, '\n\n') const soundReplaced = imgReplaced.replace(/\[sound:([^\]]+)\]/g, (_, name: string) => {
.trim(); const url = mediaUrlByFilename.get(name);
return url ? `<audio controls preload="metadata" src="${url}"></audio>` : '';
});
return (
soundReplaced
.replace(/<br\s*\/?>/gi, '\n')
.replace(/<\/?(?:b|strong)>/gi, '**')
.replace(/<\/?(?:i|em)>/gi, '*')
.replace(/<\/?p>/gi, '\n')
.replace(/<\/?div>/gi, '\n')
// Drop remaining HTML tags except the ones we just emitted
// (img/audio/video/source) — those need to survive into the
// rendered card. Negative lookahead does that in one pass.
.replace(/<(?!\/?(?:img|audio|video|source)\b)[^>]+>/gi, '')
.replace(/&nbsp;/g, ' ')
.replace(/&amp;/g, '&')
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&quot;/g, '"')
.replace(/&#39;/g, "'")
.replace(/\n{3,}/g, '\n\n')
.trim()
);
} }

View file

@ -3,11 +3,16 @@
import { importParsedAnki, type ImportResult } from '$lib/anki/import'; import { importParsedAnki, type ImportResult } from '$lib/anki/import';
let fileInput = $state<HTMLInputElement | null>(null); let fileInput = $state<HTMLInputElement | null>(null);
let stage = $state<'idle' | 'parsing' | 'preview' | 'importing' | 'done' | 'error'>('idle'); let stage = $state<
'idle' | 'parsing' | 'preview' | 'uploading-media' | 'importing' | 'done' | 'error'
>('idle');
let parsed = $state<ParsedAnki | null>(null); let parsed = $state<ParsedAnki | null>(null);
let result = $state<ImportResult | null>(null); let result = $state<ImportResult | null>(null);
let error = $state<string | null>(null); let error = $state<string | null>(null);
let fileName = $state<string>(''); let fileName = $state<string>('');
let mediaProgress = $state<{ uploaded: number; total: number }>({ uploaded: 0, total: 0 });
const mediaCount = $derived(parsed?.mediaByFilename.size ?? 0);
async function handleFile(file: File) { async function handleFile(file: File) {
error = null; error = null;
@ -37,9 +42,17 @@
async function confirmImport() { async function confirmImport() {
if (!parsed) return; if (!parsed) return;
stage = 'importing'; mediaProgress = { uploaded: 0, total: mediaCount };
stage = mediaCount > 0 ? 'uploading-media' : 'importing';
try { try {
result = await importParsedAnki(parsed); result = await importParsedAnki(parsed, {
onMediaProgress: (p) => {
mediaProgress = p;
if (p.uploaded >= p.total && stage === 'uploading-media') {
stage = 'importing';
}
},
});
stage = 'done'; stage = 'done';
} catch (e: any) { } catch (e: any) {
error = e?.message ?? 'Import fehlgeschlagen.'; error = e?.message ?? 'Import fehlgeschlagen.';
@ -70,7 +83,7 @@
> >
<div class="mb-1">📦 .apkg-Datei hier ablegen oder klicken</div> <div class="mb-1">📦 .apkg-Datei hier ablegen oder klicken</div>
<div class="text-xs text-neutral-500"> <div class="text-xs text-neutral-500">
Basic, Basic + Reverse und Cloze werden importiert. Bilder/Audio bleiben raus. Basic, Basic + Reverse, Cloze · Bilder + Audio werden mit übernommen.
</div> </div>
</div> </div>
<input <input
@ -91,6 +104,9 @@
<ul class="ml-4 list-disc text-neutral-300"> <ul class="ml-4 list-disc text-neutral-300">
<li>{parsed.decks.length} {parsed.decks.length === 1 ? 'Deck' : 'Decks'}</li> <li>{parsed.decks.length} {parsed.decks.length === 1 ? 'Deck' : 'Decks'}</li>
<li>{parsed.cards.length} {parsed.cards.length === 1 ? 'Karte' : 'Karten'}</li> <li>{parsed.cards.length} {parsed.cards.length === 1 ? 'Karte' : 'Karten'}</li>
{#if mediaCount > 0}
<li>{mediaCount} Medien (Bilder/Audio)</li>
{/if}
{#if parsed.skipped > 0} {#if parsed.skipped > 0}
<li class="text-amber-400">{parsed.skipped} übersprungen (unbekannter Typ)</li> <li class="text-amber-400">{parsed.skipped} übersprungen (unbekannter Typ)</li>
{/if} {/if}
@ -118,6 +134,18 @@
</button> </button>
</div> </div>
</div> </div>
{:else if stage === 'uploading-media'}
<div class="py-6 text-center text-sm text-neutral-400">
<div>Lade Medien hoch · {mediaProgress.uploaded} / {mediaProgress.total}</div>
<div class="mx-auto mt-3 h-1 w-48 overflow-hidden rounded-full bg-neutral-800">
<div
class="h-full bg-indigo-500 transition-all"
style="width: {mediaProgress.total === 0
? 0
: (mediaProgress.uploaded / mediaProgress.total) * 100}%"
></div>
</div>
</div>
{:else if stage === 'importing'} {:else if stage === 'importing'}
<div class="py-6 text-center text-sm text-neutral-400"> <div class="py-6 text-center text-sm text-neutral-400">
Importiere {parsed?.cards.length ?? 0} Karten… Importiere {parsed?.cards.length ?? 0} Karten…
@ -128,6 +156,13 @@
{result.cardsCreated} Karten in {result.decksCreated} {result.cardsCreated} Karten in {result.decksCreated}
{result.decksCreated === 1 ? 'Deck' : 'Decks'} angelegt. {result.decksCreated === 1 ? 'Deck' : 'Decks'} angelegt.
</div> </div>
{#if result.mediaUploaded > 0 || result.mediaFailed > 0}
<div class="text-neutral-400">
{result.mediaUploaded} Medien übernommen{#if result.mediaFailed > 0}
<span class="text-amber-400">· {result.mediaFailed} fehlgeschlagen</span>
{/if}
</div>
{/if}
{#if result.failed > 0} {#if result.failed > 0}
<div class="text-amber-400">{result.failed} Karten konnten nicht angelegt werden.</div> <div class="text-amber-400">{result.failed} Karten konnten nicht angelegt werden.</div>
{/if} {/if}