test(articles): parseUrls unit tests + extract pure module (Phase 7)

Move `parseUrls` out of stores/imports.svelte.ts (which transitively
imports Dexie via collections.ts) into a standalone parse-urls.ts so
the test file can exercise it without booting Dexie. The store re-
exports parseUrls so existing call sites (BulkImportForm, tools.ts)
keep working unchanged.

11 unit tests covering:
  - empty + whitespace-only inputs
  - newline / whitespace / comma / tab separator handling
  - http + https accepted, ftp / mailto / javascript / file rejected
  - bare domains rejected (`new URL` throws on scheme-less input —
    an explicit http(s) scheme is required)
  - duplicate detection preserves first-occurrence order
  - canonicalisation (trailing slash on root, query+fragment kept)
  - mixed valid / invalid / duplicate token ordering
  - title-prefixed-paste behaviour (strict — surfaces non-URL words
    as invalid for the user to see)
  - 50-URL stress check

Plan: docs/plans/articles-bulk-import.md.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Till JS 2026-04-28 22:39:17 +02:00
parent 0fc16d1bfd
commit 33b3f656fd
3 changed files with 184 additions and 49 deletions

View file

@@ -0,0 +1,60 @@
/**
* Pure URL-list parser for the bulk-import flow. Extracted into its
* own module so tests can import + exercise it without booting Dexie
* (collections.ts and stores/imports.svelte.ts have a transitive
* dependency on the database, which won't open under fake-indexeddb
* if any registered table is currently in a half-migrated state).
*
* Plan: docs/plans/articles-bulk-import.md.
*/
export interface ParsedUrls {
  /** Canonicalised http(s) URLs, deduplicated, in first-occurrence order. */
  valid: string[];
  /** Tokens that failed `new URL` or carried a non-http(s) scheme. */
  invalid: string[];
  /** Canonical forms that appeared again after their first occurrence. */
  duplicates: string[];
}

/**
 * Splits the raw textarea blob on any run of whitespace and/or commas,
 * drops empty tokens, validates each token with `new URL` plus an
 * explicit http(s) scheme check, and deduplicates while preserving
 * first-occurrence order.
 *
 * parseUrls('https://a.com\nhttps://a.com\nbroken')
 *   { valid: ['https://a.com/'],
 *     invalid: ['broken'],
 *     duplicates: ['https://a.com/'] }
 *
 * @param raw - the raw user input (e.g. the full textarea contents)
 * @returns the tokens bucketed into valid / invalid / duplicates
 */
export function parseUrls(raw: string): ParsedUrls {
  // Splitting on /[\s,]+/ consumes every whitespace run and comma, so
  // the surviving tokens are already trimmed; filter(Boolean) drops
  // the empty strings produced at the input's edges. (A per-token
  // .trim() here would be redundant for the same reason.)
  const tokens = raw.split(/[\s,]+/).filter(Boolean);

  const valid: string[] = [];
  const invalid: string[] = [];
  const duplicates: string[] = [];
  const seen = new Set<string>();

  for (const token of tokens) {
    // `new URL` throws on scheme-less tokens like 'example.com', which
    // lands them in `invalid`. Tokens with a non-http scheme (e.g.
    // 'javascript:alert(1)' or 'mailto:x@y') DO parse, so the explicit
    // protocol check below is still required — the server-side fetch
    // would 400 on anything that isn't http(s).
    let parsed: URL;
    try {
      parsed = new URL(token);
    } catch {
      invalid.push(token);
      continue;
    }
    if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
      invalid.push(token);
      continue;
    }

    // toString() canonicalises (adds the trailing slash on a bare
    // origin, keeps query + fragment), so equivalent spellings of the
    // same URL collide in `seen` and are reported as duplicates.
    const canonical = parsed.toString();
    if (seen.has(canonical)) {
      duplicates.push(canonical);
      continue;
    }
    seen.add(canonical);
    valid.push(canonical);
  }

  return { valid, invalid, duplicates };
}

View file

@@ -15,61 +15,16 @@
import { emitDomainEvent } from '$lib/data/events';
import { articleImportJobTable, articleImportItemTable } from '../collections';
import { parseUrls, type ParsedUrls } from '../parse-urls';
import type {
ArticleImportItemState,
LocalArticleImportItem,
LocalArticleImportJob,
} from '../types';
/**
* Pure URL parser used by both the store (`createJob` accepts a raw
* textarea blob) and the UI's `$derived` live-validation. Splits on
* any whitespace + comma, drops empties, validates with `new URL`,
* deduplicates while preserving first-occurrence order.
*
* Exported as a standalone pure function so the unit-test file can
* import it without booting Dexie.
*/
export interface ParsedUrls {
// Canonicalised http(s) URLs, deduplicated, in first-occurrence order.
valid: string[];
// Tokens that failed `new URL` or carried a non-http(s) scheme.
invalid: string[];
// Canonical forms that appeared again after their first occurrence.
duplicates: string[];
}
export function parseUrls(raw: string): ParsedUrls {
const tokens = raw
.split(/[\s,]+/)
.map((t) => t.trim())
.filter(Boolean);
const valid: string[] = [];
const invalid: string[] = [];
const duplicates: string[] = [];
const seen = new Set<string>();
for (const token of tokens) {
// `new URL` throws on scheme-less tokens like 'example.com' (so the
// catch below rejects bare domains), while tokens such as
// 'javascript:alert(1)' DO parse — hence the additional explicit
// http(s) protocol check; the server-side fetch would 400 on
// anything else.
let parsed: URL;
try {
parsed = new URL(token);
} catch {
invalid.push(token);
continue;
}
if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
invalid.push(token);
continue;
}
// toString() canonicalises (e.g. adds the trailing slash on a bare
// origin), so equivalent spellings collide in `seen`.
const canonical = parsed.toString();
if (seen.has(canonical)) {
duplicates.push(canonical);
continue;
}
seen.add(canonical);
valid.push(canonical);
}
return { valid, invalid, duplicates };
}
// Re-export so call sites that already imported from `stores/imports`
// (BulkImportForm, tools.ts) keep working unchanged.
export { parseUrls, type ParsedUrls };
export const articleImportsStore = {
/**

View file

@@ -0,0 +1,120 @@
/**
* Tests for the pure `parseUrls` URL-list parser. The store's mutation
* methods (createJob, pauseJob, etc.) are integration-shaped (need Dexie
* + the scope hook) and live under the integration suite; this file
* only covers the parser, which is the deterministic part.
*
* Plan: docs/plans/articles-bulk-import.md.
*/
import { describe, it, expect } from 'vitest';
import { parseUrls } from '../parse-urls';
describe('parseUrls', () => {
  it('returns empty arrays for an empty input', () => {
    // Whitespace-only input must be indistinguishable from ''.
    expect(parseUrls('')).toEqual({ valid: [], invalid: [], duplicates: [] });
    expect(parseUrls(' \n\t ')).toEqual({ valid: [], invalid: [], duplicates: [] });
  });

  it('parses a single newline-separated list', () => {
    const result = parseUrls(
      'https://example.com/a\nhttps://example.com/b\nhttps://example.com/c'
    );
    expect(result.valid).toEqual([
      'https://example.com/a',
      'https://example.com/b',
      'https://example.com/c',
    ]);
    expect(result.invalid).toEqual([]);
    expect(result.duplicates).toEqual([]);
  });

  it('accepts whitespace + comma + tabs as separators', () => {
    const result = parseUrls('https://a.com https://b.com,\thttps://c.com\nhttps://d.com');
    expect(result.valid).toEqual([
      'https://a.com/',
      'https://b.com/',
      'https://c.com/',
      'https://d.com/',
    ]);
  });

  it('accepts http and https, rejects everything else', () => {
    const result = parseUrls(
      [
        'http://insecure.example',
        'https://secure.example',
        'ftp://files.example',
        'javascript:alert(1)',
        'mailto:foo@bar.com',
        'file:///etc/passwd',
      ].join('\n')
    );
    expect(result.valid).toEqual(['http://insecure.example/', 'https://secure.example/']);
    expect(result.invalid).toHaveLength(4);
    expect(result.invalid).toContain('javascript:alert(1)');
    expect(result.invalid).toContain('mailto:foo@bar.com');
  });

  // NOTE: the old name claimed `URL` "accepts them as opaque" — in fact
  // `new URL('example.com')` throws; that throw is what rejects them.
  it('rejects scheme-less domains (new URL throws without a scheme)', () => {
    const result = parseUrls('example.com\ngoogle.com\nhttps://valid.com');
    expect(result.valid).toEqual(['https://valid.com/']);
    expect(result.invalid).toEqual(['example.com', 'google.com']);
  });

  it('flags duplicate URLs as duplicates, keeps the first occurrence', () => {
    const result = parseUrls(
      'https://example.com/a\nhttps://example.com/b\nhttps://example.com/a\nhttps://example.com/b'
    );
    expect(result.valid).toEqual(['https://example.com/a', 'https://example.com/b']);
    expect(result.duplicates).toEqual(['https://example.com/a', 'https://example.com/b']);
  });

  // Renamed: the old name also claimed "identical query order", which
  // this test never exercises — only root-slash canonicalisation is.
  it('canonicalises URLs (trailing slash on root) so dupes are caught', () => {
    const result = parseUrls('https://example.com\nhttps://example.com/');
    expect(result.valid).toEqual(['https://example.com/']);
    expect(result.duplicates).toEqual(['https://example.com/']);
  });

  it('preserves first-occurrence order across mixed valid/invalid/dup tokens', () => {
    const result = parseUrls(
      [
        'https://first.com',
        'not-a-url',
        'https://second.com',
        'https://first.com', // duplicate of first
        'https://third.com',
      ].join('\n')
    );
    expect(result.valid).toEqual([
      'https://first.com/',
      'https://second.com/',
      'https://third.com/',
    ]);
    expect(result.invalid).toEqual(['not-a-url']);
    expect(result.duplicates).toEqual(['https://first.com/']);
  });

  it('handles realistic paste with title prefixes (extracts URL-shaped tokens only)', () => {
    // User pasted from a chat where each line had a title before the URL
    // — our parser splits on whitespace, so this leaves bare URL tokens
    // + title-noise as "invalid". That's the correct behaviour for a
    // strict parser; the UI surfaces both counters so the user sees it.
    const result = parseUrls(
      'Awesome article: https://nytimes.com/article-1\nAnother one: https://wsj.com/x'
    );
    expect(result.valid).toEqual(['https://nytimes.com/article-1', 'https://wsj.com/x']);
    expect(result.invalid).toContain('Awesome');
    expect(result.invalid).toContain('article:');
  });

  it('keeps query strings + fragments in canonical form', () => {
    const result = parseUrls(
      'https://example.com/a?x=1&y=2#section\nhttps://example.com/a?x=1&y=2#section'
    );
    expect(result.valid).toEqual(['https://example.com/a?x=1&y=2#section']);
    expect(result.duplicates).toEqual(['https://example.com/a?x=1&y=2#section']);
  });

  it('handles a 50-URL input without choking', () => {
    const urls = Array.from({ length: 50 }, (_, i) => `https://example.com/article-${i}`);
    const result = parseUrls(urls.join('\n'));
    expect(result.valid).toHaveLength(50);
    expect(result.invalid).toEqual([]);
    expect(result.duplicates).toEqual([]);
  });
});