diff --git a/apps/api/src/routes/cards.ts b/apps/api/src/routes/cards.ts index 77d1e7f..5fe9a9c 100644 --- a/apps/api/src/routes/cards.ts +++ b/apps/api/src/routes/cards.ts @@ -4,6 +4,7 @@ import { Hono } from 'hono'; import { CardCreateSchema, CardUpdateSchema, + cardContentHash, newReview, subIndexCount, subIndexCountForCloze, @@ -66,6 +67,10 @@ export function cardsRouter(deps: CardsDeps = {}): Hono<{ Variables: AuthVars }> const cardId = ulid(); const now = new Date(); const subIndices = Array.from({ length: count }, (_, i) => i); + const contentHash = await cardContentHash({ + type: parsed.data.type, + fields: parsed.data.fields, + }); const [cardRow] = await dbOf().transaction(async (tx) => { const [card] = await tx @@ -77,6 +82,7 @@ export function cardsRouter(deps: CardsDeps = {}): Hono<{ Variables: AuthVars }> type: parsed.data.type, fields: parsed.data.fields, mediaRefs: parsed.data.media_refs ?? [], + contentHash, createdAt: now, updatedAt: now, }) @@ -120,6 +126,24 @@ export function cardsRouter(deps: CardsDeps = {}): Hono<{ Variables: AuthVars }> return c.json({ cards: rows.map(toCardDto), total: rows.length }); }); + /** + * Liefert nur die content_hash-Liste des Users — kompakter Pfad für + * den Anki-Re-Import-Dedupe. Frontend lädt das einmal und prüft pro + * Karte clientseitig, statt für jeden Insert einen Round-Trip zu + * machen. Karten ohne content_hash (Pre-Phase-9j) werden weggefiltert. 
+ */ + r.get('/hashes', async (c) => { + const userId = c.get('userId'); + const rows = await dbOf() + .select({ contentHash: cards.contentHash }) + .from(cards) + .where(eq(cards.userId, userId)); + const hashes = rows + .map((r) => r.contentHash) + .filter((h): h is string => typeof h === 'string' && h.length > 0); + return c.json({ hashes, total: hashes.length }); + }); + r.get('/:id', async (c) => { const userId = c.get('userId'); const id = c.req.param('id'); diff --git a/apps/api/tests/cards.test.ts b/apps/api/tests/cards.test.ts index 0658d22..df637ef 100644 --- a/apps/api/tests/cards.test.ts +++ b/apps/api/tests/cards.test.ts @@ -28,6 +28,12 @@ describe('cardsRouter — auth-gate', () => { const res = await app.request('/api/v1/cards'); expect(res.status).toBe(401); }); + + it('GET /hashes ohne X-User-Id ist 401', async () => { + const { app } = buildApp(); + const res = await app.request('/api/v1/cards/hashes'); + expect(res.status).toBe(401); + }); }); describe('cardsRouter — Input-Validation', () => { diff --git a/apps/web/src/lib/anki/import.ts b/apps/web/src/lib/anki/import.ts index 45442b1..def34ec 100644 --- a/apps/web/src/lib/anki/import.ts +++ b/apps/web/src/lib/anki/import.ts @@ -9,16 +9,24 @@ * Phase-8-MVP: Bilder + Audio werden gedroppt (siehe parse.ts * `sanitizeAnkiHtml`). Ein späterer Media-Pfad ist additiv. * - * No de-dupe: Re-Import derselben .apkg legt doppelte Decks an. + * Phase-9j-Re-Import-Dedupe: Vor dem Insert wird der content_hash der + * Karte berechnet (gleiche Funktion wie der Server) und gegen die + * existierende Hash-Liste des Users geprüft. Duplikate werden gezählt + * und übersprungen — Re-Imports bringen also keine doppelten Karten + * mehr ins Deck. Decks werden nicht dedupliziert (gewollt: zwei + * .apkg-Files mit identischen Decknamen sollen sich nicht + * versehentlich zusammenführen). 
*/ +import { cardContentHash } from '@cards/domain'; import { createDeck } from '$lib/api/decks.ts'; -import { createCard } from '$lib/api/cards.ts'; +import { createCard, listCardHashes } from '$lib/api/cards.ts'; import { sanitizeAnkiHtml, type ParsedAnki } from './parse.ts'; export interface ImportResult { decksCreated: number; cardsCreated: number; + cardsSkippedDuplicate: number; failed: number; failures: string[]; } @@ -36,10 +44,22 @@ export async function importParsedAnki( const result: ImportResult = { decksCreated: 0, cardsCreated: 0, + cardsSkippedDuplicate: 0, failed: 0, failures: [], }; + // Vor dem Insert die Hash-Liste des Users laden — wenn der Endpoint + // fehlschlägt (z.B. älterer Server vor Phase 9j), fallen wir + // stillschweigend auf "kein Dedupe" zurück. + const existingHashes = new Set(); + try { + const r = await listCardHashes(); + for (const h of r.hashes) existingHashes.add(h); + } catch { + // Dedupe bleibt aus — Karten werden eingefügt wie zuvor. + } + // 1) Decks — Anki "::"-Hierarchie zu " / "-Strings flach machen. const ankiIdToDeckId = new Map(); let deckIdx = 0; @@ -71,11 +91,22 @@ export async function importParsedAnki( } }; - // 2) Cards — Felder sanitizen (Media-Refs werden gedroppt). + // 2) Cards — Felder sanitizen, content_hash prüfen, einfügen. 
for (let i = 0; i < parsed.cards.length; i++) { opts.onProgress?.({ stage: 'cards', current: i, total: parsed.cards.length }); const card = parsed.cards[i]; + const cleanFields: Record<string, string> = {}; + for (const [key, value] of Object.entries(card.fields)) { + cleanFields[key] = sanitizeAnkiHtml(value); + } + + const hash = await cardContentHash({ type: card.type, fields: cleanFields }); + if (existingHashes.has(hash)) { + result.cardsSkippedDuplicate++; + continue; + } + let targetDeckId = ankiIdToDeckId.get(card.ankiDeckId); if (!targetDeckId) { const fallback = await ensureFallbackDeck(); @@ -86,11 +117,6 @@ export async function importParsedAnki( targetDeckId = fallback; } - const cleanFields: Record<string, string> = {}; - for (const [key, value] of Object.entries(card.fields)) { - cleanFields[key] = sanitizeAnkiHtml(value); - } - try { await createCard({ deck_id: targetDeckId, @@ -98,6 +124,9 @@ export async function importParsedAnki( fields: cleanFields, }); result.cardsCreated++; + // Hash sofort merken — derselbe Import könnte zwei identische + // Karten enthalten (Anki-Drift), zweite würde sonst auch rein. + existingHashes.add(hash); } catch (e) { result.failed++; result.failures.push(`card "${preview(cleanFields)}": ${errMessage(e)}`); diff --git a/apps/web/src/lib/api/cards.ts b/apps/web/src/lib/api/cards.ts index 984ce0f..34ac16a 100644 --- a/apps/web/src/lib/api/cards.ts +++ b/apps/web/src/lib/api/cards.ts @@ -6,6 +6,11 @@ export function listCards(deckId?: string) { return api<{ cards: Card[]; total: number }>(`/api/v1/cards${qs}`); } +/** Holt nur die content_hash-Liste — kompakt für Anki-Re-Import-Dedupe. 
*/ +export function listCardHashes() { + return api<{ hashes: string[]; total: number }>('/api/v1/cards/hashes'); +} + export function getCard(id: string) { return api(`/api/v1/cards/${id}`); } diff --git a/apps/web/src/lib/components/AnkiImport.svelte b/apps/web/src/lib/components/AnkiImport.svelte index 733d407..6ce514a 100644 --- a/apps/web/src/lib/components/AnkiImport.svelte +++ b/apps/web/src/lib/components/AnkiImport.svelte @@ -199,6 +199,11 @@ ? t('import.done_summary_one', { cards: result.cardsCreated }) : t('import.done_summary', { cards: result.cardsCreated, decks: result.decksCreated })} + {#if result.cardsSkippedDuplicate > 0} +
+ {t('import.done_dupes', { n: result.cardsSkippedDuplicate })} +
+ {/if} {#if result.failed > 0}
{t('import.done_failures', { n: result.failed })} diff --git a/apps/web/src/lib/i18n/de.ts b/apps/web/src/lib/i18n/de.ts index 7f8589a..d4e2b32 100644 --- a/apps/web/src/lib/i18n/de.ts +++ b/apps/web/src/lib/i18n/de.ts @@ -156,6 +156,7 @@ export const de: TranslationNode = { stage_done: 'Fertig.', done_summary_one: '✓ {cards} Karten in 1 Deck angelegt.', done_summary: '✓ {cards} Karten in {decks} Decks angelegt.', + done_dupes: '{n} Duplikate übersprungen (gleicher Inhalt schon vorhanden).', done_failures: '{n} Fehler', done_more: 'Weitere Datei', error_label: 'Fehler: {msg}', diff --git a/apps/web/src/lib/i18n/en.ts b/apps/web/src/lib/i18n/en.ts index 2af2915..93f6f6f 100644 --- a/apps/web/src/lib/i18n/en.ts +++ b/apps/web/src/lib/i18n/en.ts @@ -153,6 +153,7 @@ export const en: TranslationNode = { stage_done: 'Done.', done_summary_one: '✓ {cards} cards in 1 deck.', done_summary: '✓ {cards} cards in {decks} decks.', + done_dupes: '{n} duplicates skipped (same content already exists).', done_failures: '{n} errors', done_more: 'Another file', error_label: 'Error: {msg}', diff --git a/packages/cards-domain/src/content-hash.ts b/packages/cards-domain/src/content-hash.ts new file mode 100644 index 0000000..acc3203 --- /dev/null +++ b/packages/cards-domain/src/content-hash.ts @@ -0,0 +1,35 @@ +/** + * Content-Hash für Karten — deterministisch, idempotent. + * + * Wird beim Card-Insert geschrieben (in `cards.content_hash`) und vom + * Anki-Re-Import zur Dedupe genutzt: zwei Imports derselben Karte + * landen nicht mehrfach im Deck. + * + * Eingabe: `type` + `fields`. Field-Keys werden sortiert, damit die + * Reihenfolge der Eingabe nichts ändert. Field-Values werden roh + * gehasht — Markdown-Whitespace und Cloze-Markup zählen mit + * (gewollt: zwei Karten mit demselben Text aber unterschiedlichem + * `{{c1::…}}`-Markup sind verschiedene Karten). + * + * Hash: SHA-256 → hex-string. Pure Web-Crypto (Browser + Bun + Node 20+). 
+ */ + +export interface CardContentInput { + type: string; + fields: Record<string, string>; +} + +function canonicalize(input: CardContentInput): string { + const keys = Object.keys(input.fields).sort(); + const fields = keys.map((k) => [k, input.fields[k] ?? '']); + return JSON.stringify({ type: input.type, fields }); +} + +export async function cardContentHash(input: CardContentInput): Promise<string> { + const text = canonicalize(input); + const data = new TextEncoder().encode(text); + const buf = await crypto.subtle.digest('SHA-256', data); + return Array.from(new Uint8Array(buf)) + .map((b) => b.toString(16).padStart(2, '0')) + .join(''); +} diff --git a/packages/cards-domain/src/index.ts b/packages/cards-domain/src/index.ts index e6096fb..b2c8124 100644 --- a/packages/cards-domain/src/index.ts +++ b/packages/cards-domain/src/index.ts @@ -11,3 +11,4 @@ export * from './schemas/index.ts'; export * from './fsrs.ts'; export * from './protocol/index.ts'; export * from './cloze.ts'; +export * from './content-hash.ts'; diff --git a/packages/cards-domain/tests/content-hash.test.ts b/packages/cards-domain/tests/content-hash.test.ts new file mode 100644 index 0000000..61b1b61 --- /dev/null +++ b/packages/cards-domain/tests/content-hash.test.ts @@ -0,0 +1,61 @@ +import { describe, it, expect } from 'vitest'; + +import { cardContentHash } from '../src/content-hash.ts'; + +describe('cardContentHash', () => { + it('liefert deterministischen 64-char-hex-String (SHA-256)', async () => { + const h = await cardContentHash({ + type: 'basic', + fields: { front: 'Q', back: 'A' }, + }); + expect(h).toMatch(/^[0-9a-f]{64}$/); + }); + + it('ist invariant gegenüber Field-Reihenfolge', async () => { + const a = await cardContentHash({ + type: 'basic', + fields: { front: 'Q', back: 'A' }, + }); + const b = await cardContentHash({ + type: 'basic', + fields: { back: 'A', front: 'Q' }, + }); + expect(a).toBe(b); + }); + + it('unterscheidet basic und basic-reverse', async () => { + const a = await 
cardContentHash({ + type: 'basic', + fields: { front: 'Q', back: 'A' }, + }); + const b = await cardContentHash({ + type: 'basic-reverse', + fields: { front: 'Q', back: 'A' }, + }); + expect(a).not.toBe(b); + }); + + it('unterscheidet zwei Cloze-Karten mit unterschiedlichem Cluster-Markup', async () => { + const a = await cardContentHash({ + type: 'cloze', + fields: { text: 'Die {{c1::Hauptstadt}} ist {{c2::Paris}}.' }, + }); + const b = await cardContentHash({ + type: 'cloze', + fields: { text: 'Die Hauptstadt ist {{c1::Paris}}.' }, + }); + expect(a).not.toBe(b); + }); + + it('unterscheidet Karten mit Whitespace-Drift', async () => { + const a = await cardContentHash({ + type: 'basic', + fields: { front: 'Q', back: 'A' }, + }); + const b = await cardContentHash({ + type: 'basic', + fields: { front: 'Q ', back: 'A' }, + }); + expect(a).not.toBe(b); + }); +});