Phase 9j: Anki-Re-Import-Dedupe via content_hash
Neuer Domain-Helper cardContentHash({ type, fields }) — SHA-256 über
kanonisiertes JSON ({type, sorted-fields}), pure Web-Crypto. Field-
Reihenfolge ist invariant; Whitespace + Cloze-Markup zählen mit
(zwei Karten mit identischem Text aber unterschiedlichem
{{c1::…}}-Markup sind verschiedene Karten).
cards-API POST schreibt content_hash automatisch in den schon
existierenden Schema-Slot. Neuer Endpoint GET /api/v1/cards/hashes
liefert die kompakte Hash-Liste des Users (ohne Card-Body) — eine
Anfrage pro Anki-Import statt pro Karte.
apps/web/src/lib/anki/import.ts holt die Hashes vor dem Loop und
prüft pro Karte clientseitig. Duplikate werden gezählt
(cardsSkippedDuplicate) und übersprungen, der Counter erscheint
in der AnkiImport-Done-View. Same-File-Drift (Anki-interne
Doppel-Notes) wird auch erkannt — nach erfolgreichem Insert
landet der Hash sofort im Set.
Fallback: wenn /hashes fehlschlägt (älterer Server), bleibt das
Dedupe-Set leer und Karten werden eingefügt wie zuvor — kein
Hard-Bruch.
Pre-Phase-9j-Karten haben null content_hash (Hashes-Endpoint
filtert sie weg) — sie können also irrtümlich erneut eingespielt
werden, falls noch im Anki-File. Pragmatisch akzeptiert: ein
Backfill-Script wäre Phase-10-Polish, sobald Live-User da sind.
5 neue Domain-Tests, 1 neuer API-Auth-Gate-Test (insgesamt 105 grün:
51 + 49 + 5). svelte-check 380 files 0 errors. E2E gegen lokale
Postgres bestätigt: neue Karte hat content_hash (64-char-hex),
/hashes listet sie.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
4b451f1b8d
commit
593d4475df
10 changed files with 176 additions and 8 deletions
|
|
@ -4,6 +4,7 @@ import { Hono } from 'hono';
|
|||
import {
|
||||
CardCreateSchema,
|
||||
CardUpdateSchema,
|
||||
cardContentHash,
|
||||
newReview,
|
||||
subIndexCount,
|
||||
subIndexCountForCloze,
|
||||
|
|
@ -66,6 +67,10 @@ export function cardsRouter(deps: CardsDeps = {}): Hono<{ Variables: AuthVars }>
|
|||
const cardId = ulid();
|
||||
const now = new Date();
|
||||
const subIndices = Array.from({ length: count }, (_, i) => i);
|
||||
const contentHash = await cardContentHash({
|
||||
type: parsed.data.type,
|
||||
fields: parsed.data.fields,
|
||||
});
|
||||
|
||||
const [cardRow] = await dbOf().transaction(async (tx) => {
|
||||
const [card] = await tx
|
||||
|
|
@ -77,6 +82,7 @@ export function cardsRouter(deps: CardsDeps = {}): Hono<{ Variables: AuthVars }>
|
|||
type: parsed.data.type,
|
||||
fields: parsed.data.fields,
|
||||
mediaRefs: parsed.data.media_refs ?? [],
|
||||
contentHash,
|
||||
createdAt: now,
|
||||
updatedAt: now,
|
||||
})
|
||||
|
|
@ -120,6 +126,24 @@ export function cardsRouter(deps: CardsDeps = {}): Hono<{ Variables: AuthVars }>
|
|||
return c.json({ cards: rows.map(toCardDto), total: rows.length });
|
||||
});
|
||||
|
||||
/**
 * GET /hashes — returns only the current user's content_hash values.
 *
 * Compact path for the Anki re-import dedupe: the frontend loads this list
 * once and checks every card client-side instead of making one round trip
 * per insert. Cards without a content_hash (created before Phase 9j) are
 * filtered out of the response.
 *
 * NOTE(review): this route is registered ahead of the '/:id' route; keep it
 * that way so the literal path segment "hashes" is not treated as a card id.
 */
r.get('/hashes', async (c) => {
  const userId = c.get('userId');
  const rows = await dbOf()
    .select({ contentHash: cards.contentHash })
    .from(cards)
    .where(eq(cards.userId, userId));
  // Keep only non-empty string hashes. The callback parameter is named `row`
  // so it does not shadow the router variable `r`.
  const hashes = rows
    .map((row) => row.contentHash)
    .filter((h): h is string => typeof h === 'string' && h.length > 0);
  return c.json({ hashes, total: hashes.length });
});
|
||||
|
||||
r.get('/:id', async (c) => {
|
||||
const userId = c.get('userId');
|
||||
const id = c.req.param('id');
|
||||
|
|
|
|||
|
|
@ -28,6 +28,12 @@ describe('cardsRouter — auth-gate', () => {
|
|||
const res = await app.request('/api/v1/cards');
|
||||
expect(res.status).toBe(401);
|
||||
});
|
||||
|
||||
// Auth gate: a request to the hashes endpoint without the X-User-Id header
// must be rejected with 401.
it('GET /hashes ohne X-User-Id ist 401', async () => {
  const { app } = buildApp();
  const res = await app.request('/api/v1/cards/hashes');
  expect(res.status).toBe(401);
});
|
||||
});
|
||||
|
||||
describe('cardsRouter — Input-Validation', () => {
|
||||
|
|
|
|||
|
|
@ -9,16 +9,24 @@
|
|||
* Phase-8-MVP: Bilder + Audio werden gedroppt (siehe parse.ts
|
||||
* `sanitizeAnkiHtml`). Ein späterer Media-Pfad ist additiv.
|
||||
*
|
||||
* No de-dupe: Re-Import derselben .apkg legt doppelte Decks an.
|
||||
* Phase-9j-Re-Import-Dedupe: Vor dem Insert wird der content_hash der
|
||||
* Karte berechnet (gleiche Funktion wie der Server) und gegen die
|
||||
* existierende Hash-Liste des Users geprüft. Duplikate werden gezählt
|
||||
* und übersprungen — Re-Imports bringen also keine doppelten Karten
|
||||
* mehr ins Deck. Decks werden nicht dedupliziert (gewollt: zwei
|
||||
* .apkg-Files mit identischen Decknamen sollen sich nicht
|
||||
* versehentlich zusammenführen).
|
||||
*/
|
||||
|
||||
import { cardContentHash } from '@cards/domain';
|
||||
import { createDeck } from '$lib/api/decks.ts';
|
||||
import { createCard } from '$lib/api/cards.ts';
|
||||
import { createCard, listCardHashes } from '$lib/api/cards.ts';
|
||||
import { sanitizeAnkiHtml, type ParsedAnki } from './parse.ts';
|
||||
|
||||
/** Summary counters reported by an Anki import run. */
export interface ImportResult {
  /** Number of decks created — presumably one per created deck; the increment is outside this view. */
  decksCreated: number;
  /** Number of cards that were inserted successfully. */
  cardsCreated: number;
  /** Number of cards skipped because their content hash was already known. */
  cardsSkippedDuplicate: number;
  /** Number of inserts that threw an error. */
  failed: number;
  /** One human-readable message per failure. */
  failures: string[];
}
|
||||
|
|
@ -36,10 +44,22 @@ export async function importParsedAnki(
|
|||
const result: ImportResult = {
|
||||
decksCreated: 0,
|
||||
cardsCreated: 0,
|
||||
cardsSkippedDuplicate: 0,
|
||||
failed: 0,
|
||||
failures: [],
|
||||
};
|
||||
|
||||
// Vor dem Insert die Hash-Liste des Users laden — wenn der Endpoint
|
||||
// fehlschlägt (z.B. älterer Server vor Phase 9j), fallen wir
|
||||
// stillschweigend auf "kein Dedupe" zurück.
|
||||
const existingHashes = new Set<string>();
|
||||
try {
|
||||
const r = await listCardHashes();
|
||||
for (const h of r.hashes) existingHashes.add(h);
|
||||
} catch {
|
||||
// Dedupe bleibt aus — Karten werden eingefügt wie zuvor.
|
||||
}
|
||||
|
||||
// 1) Decks — Anki "::"-Hierarchie zu " / "-Strings flach machen.
|
||||
const ankiIdToDeckId = new Map<string, string>();
|
||||
let deckIdx = 0;
|
||||
|
|
@ -71,11 +91,22 @@ export async function importParsedAnki(
|
|||
}
|
||||
};
|
||||
|
||||
// 2) Cards — Felder sanitizen (Media-Refs werden gedroppt).
|
||||
// 2) Cards — Felder sanitizen, content_hash prüfen, einfügen.
|
||||
for (let i = 0; i < parsed.cards.length; i++) {
|
||||
opts.onProgress?.({ stage: 'cards', current: i, total: parsed.cards.length });
|
||||
const card = parsed.cards[i];
|
||||
|
||||
const cleanFields: Record<string, string> = {};
|
||||
for (const [key, value] of Object.entries(card.fields)) {
|
||||
cleanFields[key] = sanitizeAnkiHtml(value);
|
||||
}
|
||||
|
||||
const hash = await cardContentHash({ type: card.type, fields: cleanFields });
|
||||
if (existingHashes.has(hash)) {
|
||||
result.cardsSkippedDuplicate++;
|
||||
continue;
|
||||
}
|
||||
|
||||
let targetDeckId = ankiIdToDeckId.get(card.ankiDeckId);
|
||||
if (!targetDeckId) {
|
||||
const fallback = await ensureFallbackDeck();
|
||||
|
|
@ -86,11 +117,6 @@ export async function importParsedAnki(
|
|||
targetDeckId = fallback;
|
||||
}
|
||||
|
||||
const cleanFields: Record<string, string> = {};
|
||||
for (const [key, value] of Object.entries(card.fields)) {
|
||||
cleanFields[key] = sanitizeAnkiHtml(value);
|
||||
}
|
||||
|
||||
try {
|
||||
await createCard({
|
||||
deck_id: targetDeckId,
|
||||
|
|
@ -98,6 +124,9 @@ export async function importParsedAnki(
|
|||
fields: cleanFields,
|
||||
});
|
||||
result.cardsCreated++;
|
||||
// Hash sofort merken — derselbe Import könnte zwei identische
|
||||
// Karten enthalten (Anki-Drift), zweite würde sonst auch rein.
|
||||
existingHashes.add(hash);
|
||||
} catch (e) {
|
||||
result.failed++;
|
||||
result.failures.push(`card "${preview(cleanFields)}": ${errMessage(e)}`);
|
||||
|
|
|
|||
|
|
@ -6,6 +6,11 @@ export function listCards(deckId?: string) {
|
|||
return api<{ cards: Card[]; total: number }>(`/api/v1/cards${qs}`);
|
||||
}
|
||||
|
||||
/**
 * Fetches only the content_hash list — a compact payload for the Anki
 * re-import dedupe.
 *
 * @returns The response of `GET /api/v1/cards/hashes`: the hash strings and
 *   their count.
 */
export function listCardHashes() {
  return api<{ hashes: string[]; total: number }>('/api/v1/cards/hashes');
}
|
||||
|
||||
/** Fetches a single card by id via `GET /api/v1/cards/:id`. */
export function getCard(id: string) {
  return api<Card>(`/api/v1/cards/${id}`);
}
|
||||
|
|
|
|||
|
|
@ -199,6 +199,11 @@
|
|||
? t('import.done_summary_one', { cards: result.cardsCreated })
|
||||
: t('import.done_summary', { cards: result.cardsCreated, decks: result.decksCreated })}
|
||||
</div>
|
||||
{#if result.cardsSkippedDuplicate > 0}
|
||||
<div class="text-[var(--color-muted)]">
|
||||
{t('import.done_dupes', { n: result.cardsSkippedDuplicate })}
|
||||
</div>
|
||||
{/if}
|
||||
{#if result.failed > 0}
|
||||
<details class="text-[var(--color-danger)]">
|
||||
<summary class="cursor-pointer">{t('import.done_failures', { n: result.failed })}</summary>
|
||||
|
|
|
|||
|
|
@ -156,6 +156,7 @@ export const de: TranslationNode = {
|
|||
stage_done: 'Fertig.',
|
||||
done_summary_one: '✓ {cards} Karten in 1 Deck angelegt.',
|
||||
done_summary: '✓ {cards} Karten in {decks} Decks angelegt.',
|
||||
done_dupes: '{n} Duplikate übersprungen (gleicher Inhalt schon vorhanden).',
|
||||
done_failures: '{n} Fehler',
|
||||
done_more: 'Weitere Datei',
|
||||
error_label: 'Fehler: {msg}',
|
||||
|
|
|
|||
|
|
@ -153,6 +153,7 @@ export const en: TranslationNode = {
|
|||
stage_done: 'Done.',
|
||||
done_summary_one: '✓ {cards} cards in 1 deck.',
|
||||
done_summary: '✓ {cards} cards in {decks} decks.',
|
||||
done_dupes: '{n} duplicates skipped (same content already exists).',
|
||||
done_failures: '{n} errors',
|
||||
done_more: 'Another file',
|
||||
error_label: 'Error: {msg}',
|
||||
|
|
|
|||
35
packages/cards-domain/src/content-hash.ts
Normal file
35
packages/cards-domain/src/content-hash.ts
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
/**
 * Content hash for cards — deterministic and idempotent.
 *
 * The hash is written on card insert (into `cards.content_hash`) and used by
 * the Anki re-import for de-duplication: importing the same card twice does
 * not put it into the deck twice.
 *
 * Input: `type` + `fields`. Field keys are sorted so the order in which the
 * caller supplies them does not matter. Field values are hashed raw —
 * Markdown whitespace and cloze markup count (intended: two cards with the
 * same text but different `{{c1::…}}` markup are different cards).
 *
 * Hash: SHA-256 → hex string. Pure Web Crypto (browser + Bun + Node 20+).
 */

/** The parts of a card that determine its content hash. */
export interface CardContentInput {
  /** Card type (e.g. `'basic'`, `'cloze'`); part of the hashed payload. */
  type: string;
  /** Field name → raw field value. */
  fields: Record<string, string>;
}

/**
 * Serialises the input into the canonical JSON string that gets hashed:
 * `{"type": …, "fields": [[key, value], …]}` with the pairs ordered by key.
 *
 * The exact output format must stay stable — hashes derived from it are
 * persisted and compared between client and server.
 */
function canonicalize(input: CardContentInput): string {
  // Default sort compares UTF-16 code units, so the order is locale-independent.
  const sortedKeys = Object.keys(input.fields).sort();
  // A missing/undefined value is hashed like the empty string.
  const fields = sortedKeys.map((key) => [key, input.fields[key] ?? '']);
  return JSON.stringify({ type: input.type, fields });
}

/**
 * Computes the SHA-256 content hash of a card.
 *
 * @param input - Card type and fields; the key order of `fields` is irrelevant.
 * @returns The digest as a 64-character lowercase hex string.
 * @throws Error if Web Crypto is unavailable (no global `crypto`, or
 *   `crypto.subtle` is undefined as in insecure browser contexts).
 */
export async function cardContentHash(input: CardContentInput): Promise<string> {
  // Fail with an explicit message instead of an opaque TypeError/ReferenceError.
  const subtle = typeof crypto === 'undefined' ? undefined : crypto.subtle;
  if (subtle === undefined) {
    throw new Error(
      'cardContentHash: Web Crypto (crypto.subtle) is not available in this environment',
    );
  }

  const bytes = new TextEncoder().encode(canonicalize(input));
  const digest = await subtle.digest('SHA-256', bytes);
  return Array.from(new Uint8Array(digest))
    .map((byte) => byte.toString(16).padStart(2, '0'))
    .join('');
}
|
||||
|
|
@ -11,3 +11,4 @@ export * from './schemas/index.ts';
|
|||
export * from './fsrs.ts';
|
||||
export * from './protocol/index.ts';
|
||||
export * from './cloze.ts';
|
||||
export * from './content-hash.ts';
|
||||
|
|
|
|||
61
packages/cards-domain/tests/content-hash.test.ts
Normal file
61
packages/cards-domain/tests/content-hash.test.ts
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
import { describe, it, expect } from 'vitest';
|
||||
|
||||
import { cardContentHash } from '../src/content-hash.ts';
|
||||
|
||||
// Unit tests for cardContentHash: output format, key-order invariance, and
// sensitivity to card type, cloze markup and whitespace.
describe('cardContentHash', () => {
  it('liefert deterministischen 64-char-hex-String (SHA-256)', async () => {
    const h = await cardContentHash({
      type: 'basic',
      fields: { front: 'Q', back: 'A' },
    });
    expect(h).toMatch(/^[0-9a-f]{64}$/);
  });

  it('ist invariant gegenüber Field-Reihenfolge', async () => {
    const a = await cardContentHash({
      type: 'basic',
      fields: { front: 'Q', back: 'A' },
    });
    const b = await cardContentHash({
      type: 'basic',
      fields: { back: 'A', front: 'Q' },
    });
    expect(a).toBe(b);
  });

  it('unterscheidet basic und basic-reverse', async () => {
    const a = await cardContentHash({
      type: 'basic',
      fields: { front: 'Q', back: 'A' },
    });
    const b = await cardContentHash({
      type: 'basic-reverse',
      fields: { front: 'Q', back: 'A' },
    });
    expect(a).not.toBe(b);
  });

  it('unterscheidet zwei Cloze-Karten mit unterschiedlichem Cluster-Markup', async () => {
    const a = await cardContentHash({
      type: 'cloze',
      fields: { text: 'Die {{c1::Hauptstadt}} ist {{c2::Paris}}.' },
    });
    const b = await cardContentHash({
      type: 'cloze',
      fields: { text: 'Die Hauptstadt ist {{c1::Paris}}.' },
    });
    expect(a).not.toBe(b);
  });

  it('unterscheidet Karten mit Whitespace-Drift', async () => {
    const a = await cardContentHash({
      type: 'basic',
      fields: { front: 'Q', back: 'A' },
    });
    // Trailing space in `front` must yield a different hash.
    const b = await cardContentHash({
      type: 'basic',
      fields: { front: 'Q ', back: 'A' },
    });
    expect(a).not.toBe(b);
  });
});
|
||||
Loading…
Add table
Add a link
Reference in a new issue