Phase 9j: Anki-Re-Import-Dedupe via content_hash
Neuer Domain-Helper cardContentHash({ type, fields }) — SHA-256 über
kanonisiertes JSON ({type, sorted-fields}), pure Web-Crypto. Field-
Reihenfolge ist invariant; Whitespace + Cloze-Markup zählen mit
(zwei Karten mit identischem Text aber unterschiedlichem
{{c1::…}}-Markup sind verschiedene Karten).
cards-API POST schreibt content_hash automatisch in den schon
existierenden Schema-Slot. Neuer Endpoint GET /api/v1/cards/hashes
liefert die kompakte Hash-Liste des Users (ohne Card-Body) — eine
Anfrage pro Anki-Import statt pro Karte.
apps/web/src/lib/anki/import.ts holt die Hashes vor dem Loop und
prüft pro Karte clientseitig. Duplikate werden gezählt
(cardsSkippedDuplicate) und übersprungen, der Counter erscheint
in der AnkiImport-Done-View. Same-File-Drift (Anki-interne
Doppel-Notes) wird auch erkannt — nach erfolgreichem Insert
landet der Hash sofort im Set.
Fallback: wenn /hashes fehlschlägt (älterer Server), bleibt das
Dedupe-Set leer und Karten werden eingefügt wie zuvor — kein
Hard-Bruch.
Pre-Phase-9j-Karten haben null content_hash (Hashes-Endpoint
filtert sie weg) — sie können also irrtümlich erneut eingespielt
werden, falls noch im Anki-File. Pragmatisch akzeptiert: ein
Backfill-Script wäre Phase-10-Polish, sobald Live-User da sind.
5 neue Domain-Tests, 1 neuer API-Auth-Gate-Test (105 grün gesamt:
51 + 49 + 5). svelte-check 380 files 0 errors. E2E gegen lokale
Postgres bestätigt: neue Karte hat content_hash (64-char-hex),
/hashes listet sie.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
4b451f1b8d
commit
593d4475df
10 changed files with 176 additions and 8 deletions
|
|
@ -4,6 +4,7 @@ import { Hono } from 'hono';
|
||||||
import {
|
import {
|
||||||
CardCreateSchema,
|
CardCreateSchema,
|
||||||
CardUpdateSchema,
|
CardUpdateSchema,
|
||||||
|
cardContentHash,
|
||||||
newReview,
|
newReview,
|
||||||
subIndexCount,
|
subIndexCount,
|
||||||
subIndexCountForCloze,
|
subIndexCountForCloze,
|
||||||
|
|
@ -66,6 +67,10 @@ export function cardsRouter(deps: CardsDeps = {}): Hono<{ Variables: AuthVars }>
|
||||||
const cardId = ulid();
|
const cardId = ulid();
|
||||||
const now = new Date();
|
const now = new Date();
|
||||||
const subIndices = Array.from({ length: count }, (_, i) => i);
|
const subIndices = Array.from({ length: count }, (_, i) => i);
|
||||||
|
const contentHash = await cardContentHash({
|
||||||
|
type: parsed.data.type,
|
||||||
|
fields: parsed.data.fields,
|
||||||
|
});
|
||||||
|
|
||||||
const [cardRow] = await dbOf().transaction(async (tx) => {
|
const [cardRow] = await dbOf().transaction(async (tx) => {
|
||||||
const [card] = await tx
|
const [card] = await tx
|
||||||
|
|
@ -77,6 +82,7 @@ export function cardsRouter(deps: CardsDeps = {}): Hono<{ Variables: AuthVars }>
|
||||||
type: parsed.data.type,
|
type: parsed.data.type,
|
||||||
fields: parsed.data.fields,
|
fields: parsed.data.fields,
|
||||||
mediaRefs: parsed.data.media_refs ?? [],
|
mediaRefs: parsed.data.media_refs ?? [],
|
||||||
|
contentHash,
|
||||||
createdAt: now,
|
createdAt: now,
|
||||||
updatedAt: now,
|
updatedAt: now,
|
||||||
})
|
})
|
||||||
|
|
@ -120,6 +126,24 @@ export function cardsRouter(deps: CardsDeps = {}): Hono<{ Variables: AuthVars }>
|
||||||
return c.json({ cards: rows.map(toCardDto), total: rows.length });
|
return c.json({ cards: rows.map(toCardDto), total: rows.length });
|
||||||
});
|
});
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Liefert nur die content_hash-Liste des Users — kompakter Pfad für
|
||||||
|
* den Anki-Re-Import-Dedupe. Frontend lädt das einmal und prüft pro
|
||||||
|
* Karte clientseitig, statt für jeden Insert einen Round-Trip zu
|
||||||
|
* machen. Karten ohne content_hash (Pre-Phase-9j) werden weggefiltert.
|
||||||
|
*/
|
||||||
|
r.get('/hashes', async (c) => {
|
||||||
|
const userId = c.get('userId');
|
||||||
|
const rows = await dbOf()
|
||||||
|
.select({ contentHash: cards.contentHash })
|
||||||
|
.from(cards)
|
||||||
|
.where(eq(cards.userId, userId));
|
||||||
|
const hashes = rows
|
||||||
|
.map((r) => r.contentHash)
|
||||||
|
.filter((h): h is string => typeof h === 'string' && h.length > 0);
|
||||||
|
return c.json({ hashes, total: hashes.length });
|
||||||
|
});
|
||||||
|
|
||||||
r.get('/:id', async (c) => {
|
r.get('/:id', async (c) => {
|
||||||
const userId = c.get('userId');
|
const userId = c.get('userId');
|
||||||
const id = c.req.param('id');
|
const id = c.req.param('id');
|
||||||
|
|
|
||||||
|
|
@ -28,6 +28,12 @@ describe('cardsRouter — auth-gate', () => {
|
||||||
const res = await app.request('/api/v1/cards');
|
const res = await app.request('/api/v1/cards');
|
||||||
expect(res.status).toBe(401);
|
expect(res.status).toBe(401);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('GET /hashes ohne X-User-Id ist 401', async () => {
|
||||||
|
const { app } = buildApp();
|
||||||
|
const res = await app.request('/api/v1/cards/hashes');
|
||||||
|
expect(res.status).toBe(401);
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
describe('cardsRouter — Input-Validation', () => {
|
describe('cardsRouter — Input-Validation', () => {
|
||||||
|
|
|
||||||
|
|
@ -9,16 +9,24 @@
|
||||||
* Phase-8-MVP: Bilder + Audio werden gedroppt (siehe parse.ts
|
* Phase-8-MVP: Bilder + Audio werden gedroppt (siehe parse.ts
|
||||||
* `sanitizeAnkiHtml`). Ein späterer Media-Pfad ist additiv.
|
* `sanitizeAnkiHtml`). Ein späterer Media-Pfad ist additiv.
|
||||||
*
|
*
|
||||||
* No de-dupe: Re-Import derselben .apkg legt doppelte Decks an.
|
* Phase-9j-Re-Import-Dedupe: Vor dem Insert wird der content_hash der
|
||||||
|
* Karte berechnet (gleiche Funktion wie der Server) und gegen die
|
||||||
|
* existierende Hash-Liste des Users geprüft. Duplikate werden gezählt
|
||||||
|
* und übersprungen — Re-Imports bringen also keine doppelten Karten
|
||||||
|
* mehr ins Deck. Decks werden nicht dedupliziert (gewollt: zwei
|
||||||
|
* .apkg-Files mit identischen Decknamen sollen sich nicht
|
||||||
|
* versehentlich zusammenführen).
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import { cardContentHash } from '@cards/domain';
|
||||||
import { createDeck } from '$lib/api/decks.ts';
|
import { createDeck } from '$lib/api/decks.ts';
|
||||||
import { createCard } from '$lib/api/cards.ts';
|
import { createCard, listCardHashes } from '$lib/api/cards.ts';
|
||||||
import { sanitizeAnkiHtml, type ParsedAnki } from './parse.ts';
|
import { sanitizeAnkiHtml, type ParsedAnki } from './parse.ts';
|
||||||
|
|
||||||
export interface ImportResult {
|
export interface ImportResult {
|
||||||
decksCreated: number;
|
decksCreated: number;
|
||||||
cardsCreated: number;
|
cardsCreated: number;
|
||||||
|
cardsSkippedDuplicate: number;
|
||||||
failed: number;
|
failed: number;
|
||||||
failures: string[];
|
failures: string[];
|
||||||
}
|
}
|
||||||
|
|
@ -36,10 +44,22 @@ export async function importParsedAnki(
|
||||||
const result: ImportResult = {
|
const result: ImportResult = {
|
||||||
decksCreated: 0,
|
decksCreated: 0,
|
||||||
cardsCreated: 0,
|
cardsCreated: 0,
|
||||||
|
cardsSkippedDuplicate: 0,
|
||||||
failed: 0,
|
failed: 0,
|
||||||
failures: [],
|
failures: [],
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Vor dem Insert die Hash-Liste des Users laden — wenn der Endpoint
|
||||||
|
// fehlschlägt (z.B. älterer Server vor Phase 9j), fallen wir
|
||||||
|
// stillschweigend auf "kein Dedupe" zurück.
|
||||||
|
const existingHashes = new Set<string>();
|
||||||
|
try {
|
||||||
|
const r = await listCardHashes();
|
||||||
|
for (const h of r.hashes) existingHashes.add(h);
|
||||||
|
} catch {
|
||||||
|
// Dedupe bleibt aus — Karten werden eingefügt wie zuvor.
|
||||||
|
}
|
||||||
|
|
||||||
// 1) Decks — Anki "::"-Hierarchie zu " / "-Strings flach machen.
|
// 1) Decks — Anki "::"-Hierarchie zu " / "-Strings flach machen.
|
||||||
const ankiIdToDeckId = new Map<string, string>();
|
const ankiIdToDeckId = new Map<string, string>();
|
||||||
let deckIdx = 0;
|
let deckIdx = 0;
|
||||||
|
|
@ -71,11 +91,22 @@ export async function importParsedAnki(
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// 2) Cards — Felder sanitizen (Media-Refs werden gedroppt).
|
// 2) Cards — Felder sanitizen, content_hash prüfen, einfügen.
|
||||||
for (let i = 0; i < parsed.cards.length; i++) {
|
for (let i = 0; i < parsed.cards.length; i++) {
|
||||||
opts.onProgress?.({ stage: 'cards', current: i, total: parsed.cards.length });
|
opts.onProgress?.({ stage: 'cards', current: i, total: parsed.cards.length });
|
||||||
const card = parsed.cards[i];
|
const card = parsed.cards[i];
|
||||||
|
|
||||||
|
const cleanFields: Record<string, string> = {};
|
||||||
|
for (const [key, value] of Object.entries(card.fields)) {
|
||||||
|
cleanFields[key] = sanitizeAnkiHtml(value);
|
||||||
|
}
|
||||||
|
|
||||||
|
const hash = await cardContentHash({ type: card.type, fields: cleanFields });
|
||||||
|
if (existingHashes.has(hash)) {
|
||||||
|
result.cardsSkippedDuplicate++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
let targetDeckId = ankiIdToDeckId.get(card.ankiDeckId);
|
let targetDeckId = ankiIdToDeckId.get(card.ankiDeckId);
|
||||||
if (!targetDeckId) {
|
if (!targetDeckId) {
|
||||||
const fallback = await ensureFallbackDeck();
|
const fallback = await ensureFallbackDeck();
|
||||||
|
|
@ -86,11 +117,6 @@ export async function importParsedAnki(
|
||||||
targetDeckId = fallback;
|
targetDeckId = fallback;
|
||||||
}
|
}
|
||||||
|
|
||||||
const cleanFields: Record<string, string> = {};
|
|
||||||
for (const [key, value] of Object.entries(card.fields)) {
|
|
||||||
cleanFields[key] = sanitizeAnkiHtml(value);
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
await createCard({
|
await createCard({
|
||||||
deck_id: targetDeckId,
|
deck_id: targetDeckId,
|
||||||
|
|
@ -98,6 +124,9 @@ export async function importParsedAnki(
|
||||||
fields: cleanFields,
|
fields: cleanFields,
|
||||||
});
|
});
|
||||||
result.cardsCreated++;
|
result.cardsCreated++;
|
||||||
|
// Hash sofort merken — derselbe Import könnte zwei identische
|
||||||
|
// Karten enthalten (Anki-Drift), zweite würde sonst auch rein.
|
||||||
|
existingHashes.add(hash);
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
result.failed++;
|
result.failed++;
|
||||||
result.failures.push(`card "${preview(cleanFields)}": ${errMessage(e)}`);
|
result.failures.push(`card "${preview(cleanFields)}": ${errMessage(e)}`);
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,11 @@ export function listCards(deckId?: string) {
|
||||||
return api<{ cards: Card[]; total: number }>(`/api/v1/cards${qs}`);
|
return api<{ cards: Card[]; total: number }>(`/api/v1/cards${qs}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Fetches only the content_hash list from `GET /api/v1/cards/hashes` —
 * a compact payload (no card bodies) used by the Anki re-import dedupe.
 */
export function listCardHashes() {
  return api<{ hashes: string[]; total: number }>('/api/v1/cards/hashes');
}
|
||||||
|
|
||||||
export function getCard(id: string) {
|
export function getCard(id: string) {
|
||||||
return api<Card>(`/api/v1/cards/${id}`);
|
return api<Card>(`/api/v1/cards/${id}`);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -199,6 +199,11 @@
|
||||||
? t('import.done_summary_one', { cards: result.cardsCreated })
|
? t('import.done_summary_one', { cards: result.cardsCreated })
|
||||||
: t('import.done_summary', { cards: result.cardsCreated, decks: result.decksCreated })}
|
: t('import.done_summary', { cards: result.cardsCreated, decks: result.decksCreated })}
|
||||||
</div>
|
</div>
|
||||||
|
{#if result.cardsSkippedDuplicate > 0}
|
||||||
|
<div class="text-[var(--color-muted)]">
|
||||||
|
{t('import.done_dupes', { n: result.cardsSkippedDuplicate })}
|
||||||
|
</div>
|
||||||
|
{/if}
|
||||||
{#if result.failed > 0}
|
{#if result.failed > 0}
|
||||||
<details class="text-[var(--color-danger)]">
|
<details class="text-[var(--color-danger)]">
|
||||||
<summary class="cursor-pointer">{t('import.done_failures', { n: result.failed })}</summary>
|
<summary class="cursor-pointer">{t('import.done_failures', { n: result.failed })}</summary>
|
||||||
|
|
|
||||||
|
|
@ -156,6 +156,7 @@ export const de: TranslationNode = {
|
||||||
stage_done: 'Fertig.',
|
stage_done: 'Fertig.',
|
||||||
done_summary_one: '✓ {cards} Karten in 1 Deck angelegt.',
|
done_summary_one: '✓ {cards} Karten in 1 Deck angelegt.',
|
||||||
done_summary: '✓ {cards} Karten in {decks} Decks angelegt.',
|
done_summary: '✓ {cards} Karten in {decks} Decks angelegt.',
|
||||||
|
done_dupes: '{n} Duplikate übersprungen (gleicher Inhalt schon vorhanden).',
|
||||||
done_failures: '{n} Fehler',
|
done_failures: '{n} Fehler',
|
||||||
done_more: 'Weitere Datei',
|
done_more: 'Weitere Datei',
|
||||||
error_label: 'Fehler: {msg}',
|
error_label: 'Fehler: {msg}',
|
||||||
|
|
|
||||||
|
|
@ -153,6 +153,7 @@ export const en: TranslationNode = {
|
||||||
stage_done: 'Done.',
|
stage_done: 'Done.',
|
||||||
done_summary_one: '✓ {cards} cards in 1 deck.',
|
done_summary_one: '✓ {cards} cards in 1 deck.',
|
||||||
done_summary: '✓ {cards} cards in {decks} decks.',
|
done_summary: '✓ {cards} cards in {decks} decks.',
|
||||||
|
done_dupes: '{n} duplicates skipped (same content already exists).',
|
||||||
done_failures: '{n} errors',
|
done_failures: '{n} errors',
|
||||||
done_more: 'Another file',
|
done_more: 'Another file',
|
||||||
error_label: 'Error: {msg}',
|
error_label: 'Error: {msg}',
|
||||||
|
|
|
||||||
35
packages/cards-domain/src/content-hash.ts
Normal file
35
packages/cards-domain/src/content-hash.ts
Normal file
|
|
@ -0,0 +1,35 @@
|
||||||
|
/**
 * Content hash for cards — deterministic and idempotent.
 *
 * Written on card insert (into `cards.content_hash`) and used by the
 * Anki re-import for dedupe: importing the same card twice does not put
 * it into the deck more than once.
 *
 * Input: `type` + `fields`. Field keys are sorted so the order in which
 * they were supplied does not matter. Field values are hashed raw —
 * Markdown whitespace and cloze markup count (intended: two cards with
 * the same text but different `{{c1::…}}` markup are different cards).
 *
 * Hash: SHA-256 → hex string. Pure Web Crypto (browser + Bun + Node 20+).
 */

/** The parts of a card that contribute to its content hash. */
export interface CardContentInput {
  type: string;
  fields: Record<string, string>;
}

/**
 * Serialises the input to a canonical JSON string.
 *
 * Fields are emitted as an array of `[key, value]` pairs ordered by key
 * (default `Array.prototype.sort` order), so two inputs that differ only
 * in property order serialise identically.
 */
function canonicalize(input: CardContentInput): string {
  const keys = Object.keys(input.fields).sort();
  // `?? ''` keeps the pair well-formed if a key maps to undefined/null.
  const fields = keys.map((k) => [k, input.fields[k] ?? '']);
  return JSON.stringify({ type: input.type, fields });
}

/**
 * Computes the SHA-256 content hash of a card.
 *
 * @param input - Card type and field map to hash.
 * @returns The digest as a 64-character lowercase hex string.
 */
export async function cardContentHash(input: CardContentInput): Promise<string> {
  const text = canonicalize(input);
  const data = new TextEncoder().encode(text);
  const buf = await crypto.subtle.digest('SHA-256', data);
  // Render each digest byte as two lowercase hex characters.
  return Array.from(new Uint8Array(buf))
    .map((b) => b.toString(16).padStart(2, '0'))
    .join('');
}
|
||||||
|
|
@ -11,3 +11,4 @@ export * from './schemas/index.ts';
|
||||||
export * from './fsrs.ts';
|
export * from './fsrs.ts';
|
||||||
export * from './protocol/index.ts';
|
export * from './protocol/index.ts';
|
||||||
export * from './cloze.ts';
|
export * from './cloze.ts';
|
||||||
|
export * from './content-hash.ts';
|
||||||
|
|
|
||||||
61
packages/cards-domain/tests/content-hash.test.ts
Normal file
61
packages/cards-domain/tests/content-hash.test.ts
Normal file
|
|
@ -0,0 +1,61 @@
|
||||||
|
import { describe, it, expect } from 'vitest';
|
||||||
|
|
||||||
|
import { cardContentHash } from '../src/content-hash.ts';
|
||||||
|
|
||||||
|
// Unit tests for cardContentHash: output format, key-order invariance,
// and sensitivity to card type, cloze markup and whitespace.
describe('cardContentHash', () => {
  // SHA-256 rendered as hex is always 64 lowercase hex characters.
  it('liefert deterministischen 64-char-hex-String (SHA-256)', async () => {
    const h = await cardContentHash({
      type: 'basic',
      fields: { front: 'Q', back: 'A' },
    });
    expect(h).toMatch(/^[0-9a-f]{64}$/);
  });

  // Same fields supplied in a different property order must hash the same.
  it('ist invariant gegenüber Field-Reihenfolge', async () => {
    const a = await cardContentHash({
      type: 'basic',
      fields: { front: 'Q', back: 'A' },
    });
    const b = await cardContentHash({
      type: 'basic',
      fields: { back: 'A', front: 'Q' },
    });
    expect(a).toBe(b);
  });

  // The card type is part of the hashed content.
  it('unterscheidet basic und basic-reverse', async () => {
    const a = await cardContentHash({
      type: 'basic',
      fields: { front: 'Q', back: 'A' },
    });
    const b = await cardContentHash({
      type: 'basic-reverse',
      fields: { front: 'Q', back: 'A' },
    });
    expect(a).not.toBe(b);
  });

  // Cloze markup is hashed raw, so different markup means a different card.
  it('unterscheidet zwei Cloze-Karten mit unterschiedlichem Cluster-Markup', async () => {
    const a = await cardContentHash({
      type: 'cloze',
      fields: { text: 'Die {{c1::Hauptstadt}} ist {{c2::Paris}}.' },
    });
    const b = await cardContentHash({
      type: 'cloze',
      fields: { text: 'Die Hauptstadt ist {{c1::Paris}}.' },
    });
    expect(a).not.toBe(b);
  });

  // Field values are not trimmed: a trailing space changes the hash.
  it('unterscheidet Karten mit Whitespace-Drift', async () => {
    const a = await cardContentHash({
      type: 'basic',
      fields: { front: 'Q', back: 'A' },
    });
    const b = await cardContentHash({
      type: 'basic',
      fields: { front: 'Q ', back: 'A' },
    });
    expect(a).not.toBe(b);
  });
});
|
||||||
Loading…
Add table
Add a link
Reference in a new issue