From 33b3f656fdf17930b909a0d15a3d1ec28987c9e3 Mon Sep 17 00:00:00 2001
From: Till JS
Date: Tue, 28 Apr 2026 22:39:17 +0200
Subject: [PATCH] test(articles): parseUrls unit tests + extract pure module
 (Phase 7)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move `parseUrls` out of stores/imports.svelte.ts (which transitively
imports Dexie via collections.ts) into a standalone parse-urls.ts so
the test file can exercise it without booting Dexie. The store
re-exports parseUrls so existing call sites (BulkImportForm, tools.ts)
keep working unchanged.

11 unit tests covering:
- empty + whitespace-only inputs
- newline / whitespace / comma / tab separator handling
- http + https accepted, ftp / mailto / javascript / file rejected
- bare domains rejected (without a scheme, `new URL` throws; our
  parser additionally requires an explicit http(s) scheme)
- duplicate detection preserves first-occurrence order
- canonicalisation (trailing slash on root, query+fragment kept)
- mixed valid / invalid / duplicate token ordering
- title-prefixed-paste behaviour (strict — surfaces non-URL words as
  invalid for the user to see)
- 50-URL stress check

Plan: docs/plans/articles-bulk-import.md.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../src/lib/modules/articles/parse-urls.ts    |  60 +++++++++
 .../modules/articles/stores/imports.svelte.ts |  53 +-------
 .../modules/articles/stores/imports.test.ts   | 120 ++++++++++++++++++
 3 files changed, 184 insertions(+), 49 deletions(-)
 create mode 100644 apps/mana/apps/web/src/lib/modules/articles/parse-urls.ts
 create mode 100644 apps/mana/apps/web/src/lib/modules/articles/stores/imports.test.ts

diff --git a/apps/mana/apps/web/src/lib/modules/articles/parse-urls.ts b/apps/mana/apps/web/src/lib/modules/articles/parse-urls.ts
new file mode 100644
index 000000000..3e1a4e132
--- /dev/null
+++ b/apps/mana/apps/web/src/lib/modules/articles/parse-urls.ts
@@ -0,0 +1,60 @@
+/**
+ * Pure URL-list parser for the bulk-import flow. Extracted into its
+ * own module so tests can import + exercise it without booting Dexie
+ * (collections.ts and stores/imports.svelte.ts have a transitive
+ * dependency on the database, which won't open under fake-indexeddb
+ * if any registered table is currently in a half-migrated state).
+ *
+ * Plan: docs/plans/articles-bulk-import.md.
+ */
+
+export interface ParsedUrls {
+  valid: string[];
+  invalid: string[];
+  duplicates: string[];
+}
+
+/**
+ * Splits the raw textarea blob on any whitespace + comma, drops empty
+ * tokens, validates with `new URL` + http(s) scheme check, and
+ * deduplicates while preserving first-occurrence order.
+ *
+ *   parseUrls('https://a.com\nhttps://a.com\nbroken')
+ *   → { valid: ['https://a.com/'],
+ *       invalid: ['broken'],
+ *       duplicates: ['https://a.com/'] }
+ */
+export function parseUrls(raw: string): ParsedUrls {
+  const tokens = raw
+    .split(/[\s,]+/)
+    .map((t) => t.trim())
+    .filter(Boolean);
+  const valid: string[] = [];
+  const invalid: string[] = [];
+  const duplicates: string[] = [];
+  const seen = new Set();
+  for (const token of tokens) {
+    // Require an explicit http(s) scheme: a bare `foo.com` makes
+    // `new URL` throw, while `foo.com:80/x` parses as an opaque URI;
+    // either way the server-side fetch would 400 on it.
+    let parsed: URL;
+    try {
+      parsed = new URL(token);
+    } catch {
+      invalid.push(token);
+      continue;
+    }
+    if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
+      invalid.push(token);
+      continue;
+    }
+    const canonical = parsed.toString();
+    if (seen.has(canonical)) {
+      duplicates.push(canonical);
+      continue;
+    }
+    seen.add(canonical);
+    valid.push(canonical);
+  }
+  return { valid, invalid, duplicates };
+}
diff --git a/apps/mana/apps/web/src/lib/modules/articles/stores/imports.svelte.ts b/apps/mana/apps/web/src/lib/modules/articles/stores/imports.svelte.ts
index 0b590f953..6eaa25ce4 100644
--- a/apps/mana/apps/web/src/lib/modules/articles/stores/imports.svelte.ts
+++ b/apps/mana/apps/web/src/lib/modules/articles/stores/imports.svelte.ts
@@ -15,61 +15,16 @@
 import { emitDomainEvent } from '$lib/data/events';
 import { articleImportJobTable, articleImportItemTable } from '../collections';
+import { parseUrls, type ParsedUrls } from '../parse-urls';
 import type {
   ArticleImportItemState,
   LocalArticleImportItem,
   LocalArticleImportJob,
 } from '../types';
-/**
- * Pure URL parser — used by both the store (`createJob` accepts a raw
- * textarea blob) and the UI's `$derived` live-validation. Splits on
- * any whitespace + comma, drops empties, validates with `new URL`,
- * deduplicates while preserving first-occurrence order.
- *
- * Exported as a standalone pure function so the unit-test file can
- * import it without booting Dexie.
- */
-export interface ParsedUrls {
-  valid: string[];
-  invalid: string[];
-  duplicates: string[];
-}
-
-export function parseUrls(raw: string): ParsedUrls {
-  const tokens = raw
-    .split(/[\s,]+/)
-    .map((t) => t.trim())
-    .filter(Boolean);
-  const valid: string[] = [];
-  const invalid: string[] = [];
-  const duplicates: string[] = [];
-  const seen = new Set();
-  for (const token of tokens) {
-    // Reject anything without an http(s) scheme — `new URL('foo.com')`
-    // would happily accept it as an opaque URI and the server-side
-    // fetch would then 400 on us.
-    let parsed: URL;
-    try {
-      parsed = new URL(token);
-    } catch {
-      invalid.push(token);
-      continue;
-    }
-    if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
-      invalid.push(token);
-      continue;
-    }
-    const canonical = parsed.toString();
-    if (seen.has(canonical)) {
-      duplicates.push(canonical);
-      continue;
-    }
-    seen.add(canonical);
-    valid.push(canonical);
-  }
-  return { valid, invalid, duplicates };
-}
+// Re-export so call sites that already imported from `stores/imports`
+// (BulkImportForm, tools.ts) keep working unchanged.
+export { parseUrls, type ParsedUrls };
 
 export const articleImportsStore = {
   /**
diff --git a/apps/mana/apps/web/src/lib/modules/articles/stores/imports.test.ts b/apps/mana/apps/web/src/lib/modules/articles/stores/imports.test.ts
new file mode 100644
index 000000000..1e4f0ba54
--- /dev/null
+++ b/apps/mana/apps/web/src/lib/modules/articles/stores/imports.test.ts
@@ -0,0 +1,120 @@
+/**
+ * Tests for the pure `parseUrls` URL-list parser. The store's mutation
+ * methods (createJob, pauseJob, …) are integration-shaped (need Dexie
+ * + the scope hook) and live under the integration suite; this file
+ * only covers the parser, which is the deterministic part.
+ *
+ * Plan: docs/plans/articles-bulk-import.md.
+ */ + +import { describe, it, expect } from 'vitest'; +import { parseUrls } from '../parse-urls'; + +describe('parseUrls', () => { + it('returns empty arrays for an empty input', () => { + expect(parseUrls('')).toEqual({ valid: [], invalid: [], duplicates: [] }); + expect(parseUrls(' \n\t ')).toEqual({ valid: [], invalid: [], duplicates: [] }); + }); + + it('parses a single newline-separated list', () => { + const r = parseUrls('https://example.com/a\nhttps://example.com/b\nhttps://example.com/c'); + expect(r.valid).toEqual([ + 'https://example.com/a', + 'https://example.com/b', + 'https://example.com/c', + ]); + expect(r.invalid).toEqual([]); + expect(r.duplicates).toEqual([]); + }); + + it('accepts whitespace + comma + tabs as separators', () => { + const r = parseUrls('https://a.com https://b.com,\thttps://c.com\nhttps://d.com'); + expect(r.valid).toEqual([ + 'https://a.com/', + 'https://b.com/', + 'https://c.com/', + 'https://d.com/', + ]); + }); + + it('accepts http and https, rejects everything else', () => { + const r = parseUrls( + [ + 'http://insecure.example', + 'https://secure.example', + 'ftp://files.example', + 'javascript:alert(1)', + 'mailto:foo@bar.com', + 'file:///etc/passwd', + ].join('\n') + ); + expect(r.valid).toEqual(['http://insecure.example/', 'https://secure.example/']); + expect(r.invalid).toHaveLength(4); + expect(r.invalid).toContain('javascript:alert(1)'); + expect(r.invalid).toContain('mailto:foo@bar.com'); + }); + + it('rejects scheme-less domains (URL accepts them as opaque)', () => { + const r = parseUrls('example.com\ngoogle.com\nhttps://valid.com'); + expect(r.valid).toEqual(['https://valid.com/']); + expect(r.invalid).toEqual(['example.com', 'google.com']); + }); + + it('flags duplicate URLs as duplicates, keeps the first occurrence', () => { + const r = parseUrls( + 'https://example.com/a\nhttps://example.com/b\nhttps://example.com/a\nhttps://example.com/b' + ); + expect(r.valid).toEqual(['https://example.com/a', 'https://example.com/b']); + expect(r.duplicates).toEqual(['https://example.com/a', 'https://example.com/b']); + }); + + it('canonicalises URLs (trailing slash on root, identical query order) so dupes are caught', () => { + const r = parseUrls('https://example.com\nhttps://example.com/'); + expect(r.valid).toEqual(['https://example.com/']); + expect(r.duplicates).toEqual(['https://example.com/']); + }); + + it('preserves first-occurrence order across mixed valid/invalid/dup tokens', () => { + const r = parseUrls( + [ + 'https://first.com', + 'not-a-url', + 'https://second.com', + 'https://first.com', // duplicate of first + 'https://third.com', + ].join('\n') + ); + expect(r.valid).toEqual(['https://first.com/', 'https://second.com/', 'https://third.com/']); + expect(r.invalid).toEqual(['not-a-url']); + expect(r.duplicates).toEqual(['https://first.com/']); + }); + + it('handles realistic paste with title prefixes (extracts URL-shaped tokens only)', () => { + // User pasted from a chat where each line had a title before the URL + // — our parser splits on whitespace, so this leaves bare URL tokens + // + title-noise as "invalid". That's the correct behaviour for a + // strict parser; the UI surfaces both counters so the user sees it. 
+    const r = parseUrls(
+      'Awesome article: https://nytimes.com/article-1\nAnother one: https://wsj.com/x'
+    );
+    expect(r.valid).toEqual(['https://nytimes.com/article-1', 'https://wsj.com/x']);
+    expect(r.invalid).toContain('Awesome');
+    expect(r.invalid).toContain('article:');
+  });
+
+  it('keeps query strings + fragments in canonical form', () => {
+    const r = parseUrls(
+      'https://example.com/a?x=1&y=2#section\nhttps://example.com/a?x=1&y=2#section'
+    );
+    expect(r.valid).toEqual(['https://example.com/a?x=1&y=2#section']);
+    expect(r.duplicates).toEqual(['https://example.com/a?x=1&y=2#section']);
+  });
+
+  it('handles a 50-URL input without choking', () => {
+    const urls = Array.from({ length: 50 }, (_, i) => `https://example.com/article-${i}`);
+    const r = parseUrls(urls.join('\n'));
+    expect(r.valid).toHaveLength(50);
+    expect(r.invalid).toEqual([]);
+    expect(r.duplicates).toEqual([]);
+  });
+});
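
Usage sketch (not part of the patch): how a Svelte 5 component such as
BulkImportForm might drive the live-validation counters off `parseUrls`,
per the removed docblock's mention of `$derived`. The `rawInput` state
name, the markup, and the `$lib` import path below are assumptions for
illustration; the real BulkImportForm is not included in this diff.

  <script lang="ts">
    import { parseUrls } from '$lib/modules/articles/parse-urls';

    // Hypothetical local state: the raw blob the user pastes into the
    // textarea.
    let rawInput = $state('');

    // parseUrls is pure and cheap at textarea scale, so live validation
    // is a straight $derived re-parse on every keystroke.
    const parsed = $derived(parseUrls(rawInput));
  </script>

  <textarea bind:value={rawInput}></textarea>
  <p>
    {parsed.valid.length} valid, {parsed.invalid.length} invalid,
    {parsed.duplicates.length} duplicates
  </p>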