From 33b3f656fdf17930b909a0d15a3d1ec28987c9e3 Mon Sep 17 00:00:00 2001
From: Till JS
Date: Tue, 28 Apr 2026 22:39:17 +0200
Subject: [PATCH] test(articles): parseUrls unit tests + extract pure module
 (Phase 7)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move `parseUrls` out of stores/imports.svelte.ts (which transitively
imports Dexie via collections.ts) into a standalone parse-urls.ts so
the test file can exercise it without booting Dexie. The store
re-exports parseUrls so existing call sites (BulkImportForm, tools.ts)
keep working unchanged.

11 unit tests covering:
- empty + whitespace-only inputs
- newline / whitespace / comma / tab separator handling
- http + https accepted, ftp / mailto / javascript / file rejected
- bare domains rejected (without a scheme, `new URL` throws; our
  parser additionally requires an explicit http(s) scheme)
- duplicate detection preserves first-occurrence order
- canonicalisation (trailing slash on root, query+fragment kept)
- mixed valid / invalid / duplicate token ordering
- title-prefixed-paste behaviour (strict — surfaces non-URL words as
  invalid for the user to see)
- 50-URL stress check

Plan: docs/plans/articles-bulk-import.md.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../src/lib/modules/articles/parse-urls.ts    |  60 +++++++++
 .../modules/articles/stores/imports.svelte.ts |  53 +-------
 .../modules/articles/stores/imports.test.ts   | 120 ++++++++++++++++++
 3 files changed, 184 insertions(+), 49 deletions(-)
 create mode 100644 apps/mana/apps/web/src/lib/modules/articles/parse-urls.ts
 create mode 100644 apps/mana/apps/web/src/lib/modules/articles/stores/imports.test.ts

diff --git a/apps/mana/apps/web/src/lib/modules/articles/parse-urls.ts b/apps/mana/apps/web/src/lib/modules/articles/parse-urls.ts
new file mode 100644
index 000000000..3e1a4e132
--- /dev/null
+++ b/apps/mana/apps/web/src/lib/modules/articles/parse-urls.ts
@@ -0,0 +1,60 @@
+/**
+ * Pure URL-list parser for the bulk-import flow. Extracted into its
+ * own module so tests can import + exercise it without booting Dexie
+ * (collections.ts and stores/imports.svelte.ts have a transitive
+ * dependency on the database, which won't open under fake-indexeddb
+ * if any registered table is currently in a half-migrated state).
+ *
+ * Plan: docs/plans/articles-bulk-import.md.
+ */
+
+export interface ParsedUrls {
+  valid: string[];
+  invalid: string[];
+  duplicates: string[];
+}
+
+/**
+ * Splits the raw textarea blob on any whitespace + comma, drops empty
+ * tokens, validates with `new URL` + http(s) scheme check, and
+ * deduplicates while preserving first-occurrence order.
+ *
+ *   parseUrls('https://a.com\nhttps://a.com\nbroken')
+ *   → { valid: ['https://a.com/'],
+ *       invalid: ['broken'],
+ *       duplicates: ['https://a.com/'] }
+ */
+export function parseUrls(raw: string): ParsedUrls {
+  const tokens = raw
+    .split(/[\s,]+/)
+    .map((t) => t.trim())
+    .filter(Boolean);
+  const valid: string[] = [];
+  const invalid: string[] = [];
+  const duplicates: string[] = [];
+  const seen = new Set();
+  for (const token of tokens) {
+    // Require an explicit http(s) scheme: a bare `foo.com` makes
+    // `new URL` throw, while `foo.com:80/x` parses as an opaque URI;
+    // either way the server-side fetch would 400 on it.
+    let parsed: URL;
+    try {
+      parsed = new URL(token);
+    } catch {
+      invalid.push(token);
+      continue;
+    }
+    if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
+      invalid.push(token);
+      continue;
+    }
+    const canonical = parsed.toString();
+    if (seen.has(canonical)) {
+      duplicates.push(canonical);
+      continue;
+    }
+    seen.add(canonical);
+    valid.push(canonical);
+  }
+  return { valid, invalid, duplicates };
+}
diff --git a/apps/mana/apps/web/src/lib/modules/articles/stores/imports.svelte.ts b/apps/mana/apps/web/src/lib/modules/articles/stores/imports.svelte.ts
index 0b590f953..6eaa25ce4 100644
--- a/apps/mana/apps/web/src/lib/modules/articles/stores/imports.svelte.ts
+++ b/apps/mana/apps/web/src/lib/modules/articles/stores/imports.svelte.ts
@@ -15,61 +15,16 @@
 import { emitDomainEvent } from '$lib/data/events';
 import { articleImportJobTable, articleImportItemTable } from '../collections';
+import { parseUrls, type ParsedUrls } from '../parse-urls';
 import type {
   ArticleImportItemState,
   LocalArticleImportItem,
   LocalArticleImportJob,
 } from '../types';
-/**
- * Pure URL parser — used by both the store (`createJob` accepts a raw
- * textarea blob) and the UI's `$derived` live-validation. Splits on
- * any whitespace + comma, drops empties, validates with `new URL`,
- * deduplicates while preserving first-occurrence order.
- *
- * Exported as a standalone pure function so the unit-test file can
- * import it without booting Dexie.
- */
-export interface ParsedUrls {
-  valid: string[];
-  invalid: string[];
-  duplicates: string[];
-}
-
-export function parseUrls(raw: string): ParsedUrls {
-  const tokens = raw
-    .split(/[\s,]+/)
-    .map((t) => t.trim())
-    .filter(Boolean);
-  const valid: string[] = [];
-  const invalid: string[] = [];
-  const duplicates: string[] = [];
-  const seen = new Set();
-  for (const token of tokens) {
-    // Reject anything without an http(s) scheme — `new URL('foo.com')`
-    // would happily accept it as an opaque URI and the server-side
-    // fetch would then 400 on us.
-    let parsed: URL;
-    try {
-      parsed = new URL(token);
-    } catch {
-      invalid.push(token);
-      continue;
-    }
-    if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
-      invalid.push(token);
-      continue;
-    }
-    const canonical = parsed.toString();
-    if (seen.has(canonical)) {
-      duplicates.push(canonical);
-      continue;
-    }
-    seen.add(canonical);
-    valid.push(canonical);
-  }
-  return { valid, invalid, duplicates };
-}
+// Re-export so call sites that already imported from `stores/imports`
+// (BulkImportForm, tools.ts) keep working unchanged.
+export { parseUrls, type ParsedUrls };
 
 export const articleImportsStore = {
   /**
diff --git a/apps/mana/apps/web/src/lib/modules/articles/stores/imports.test.ts b/apps/mana/apps/web/src/lib/modules/articles/stores/imports.test.ts
new file mode 100644
index 000000000..1e4f0ba54
--- /dev/null
+++ b/apps/mana/apps/web/src/lib/modules/articles/stores/imports.test.ts
@@ -0,0 +1,120 @@
+/**
+ * Tests for the pure `parseUrls` URL-list parser. The store's mutation
+ * methods (createJob, pauseJob, …) are integration-shaped (need Dexie
+ * + the scope hook) and live under the integration suite; this file
+ * only covers the parser, which is the deterministic part.
+ *
+ * Plan: docs/plans/articles-bulk-import.md.
+ */ + +import { describe, it, expect } from 'vitest'; +import { parseUrls } from '../parse-urls'; + +describe('parseUrls', () => { + it('returns empty arrays for an empty input', () => { + expect(parseUrls('')).toEqual({ valid: [], invalid: [], duplicates: [] }); + expect(parseUrls(' \n\t ')).toEqual({ valid: [], invalid: [], duplicates: [] }); + }); + + it('parses a single newline-separated list', () => { + const r = parseUrls('https://example.com/a\nhttps://example.com/b\nhttps://example.com/c'); + expect(r.valid).toEqual([ + 'https://example.com/a', + 'https://example.com/b', + 'https://example.com/c', + ]); + expect(r.invalid).toEqual([]); + expect(r.duplicates).toEqual([]); + }); + + it('accepts whitespace + comma + tabs as separators', () => { + const r = parseUrls('https://a.com https://b.com,\thttps://c.com\nhttps://d.com'); + expect(r.valid).toEqual([ + 'https://a.com/', + 'https://b.com/', + 'https://c.com/', + 'https://d.com/', + ]); + }); + + it('accepts http and https, rejects everything else', () => { + const r = parseUrls( + [ + 'http://insecure.example', + 'https://secure.example', + 'ftp://files.example', + 'javascript:alert(1)', + 'mailto:foo@bar.com', + 'file:///etc/passwd', + ].join('\n') + ); + expect(r.valid).toEqual(['http://insecure.example/', 'https://secure.example/']); + expect(r.invalid).toHaveLength(4); + expect(r.invalid).toContain('javascript:alert(1)'); + expect(r.invalid).toContain('mailto:foo@bar.com'); + }); + + it('rejects scheme-less domains (URL accepts them as opaque)', () => { + const r = parseUrls('example.com\ngoogle.com\nhttps://valid.com'); + expect(r.valid).toEqual(['https://valid.com/']); + expect(r.invalid).toEqual(['example.com', 'google.com']); + }); + + it('flags duplicate URLs as duplicates, keeps the first occurrence', () => { + const r = parseUrls( + 'https://example.com/a\nhttps://example.com/b\nhttps://example.com/a\nhttps://example.com/b' + ); + expect(r.valid).toEqual(['https://example.com/a', 'https://example.com/b']); + expect(r.duplicates).toEqual(['https://example.com/a', 'https://example.com/b']); + }); + + it('canonicalises URLs (trailing slash on root, identical query order) so dupes are caught', () => { + const r = parseUrls('https://example.com\nhttps://example.com/'); + expect(r.valid).toEqual(['https://example.com/']); + expect(r.duplicates).toEqual(['https://example.com/']); + }); + + it('preserves first-occurrence order across mixed valid/invalid/dup tokens', () => { + const r = parseUrls( + [ + 'https://first.com', + 'not-a-url', + 'https://second.com', + 'https://first.com', // duplicate of first + 'https://third.com', + ].join('\n') + ); + expect(r.valid).toEqual(['https://first.com/', 'https://second.com/', 'https://third.com/']); + expect(r.invalid).toEqual(['not-a-url']); + expect(r.duplicates).toEqual(['https://first.com/']); + }); + + it('handles realistic paste with title prefixes (extracts URL-shaped tokens only)', () => { + // User pasted from a chat where each line had a title before the URL + // — our parser splits on whitespace, so this leaves bare URL tokens + // + title-noise as "invalid". That's the correct behaviour for a + // strict parser; the UI surfaces both counters so the user sees it. 
+    const r = parseUrls(
+      'Awesome article: https://nytimes.com/article-1\nAnother one: https://wsj.com/x'
+    );
+    expect(r.valid).toEqual(['https://nytimes.com/article-1', 'https://wsj.com/x']);
+    expect(r.invalid).toContain('Awesome');
+    expect(r.invalid).toContain('article:');
+  });
+
+  it('keeps query strings + fragments in canonical form', () => {
+    const r = parseUrls(
+      'https://example.com/a?x=1&y=2#section\nhttps://example.com/a?x=1&y=2#section'
+    );
+    expect(r.valid).toEqual(['https://example.com/a?x=1&y=2#section']);
+    expect(r.duplicates).toEqual(['https://example.com/a?x=1&y=2#section']);
+  });
+
+  it('handles a 50-URL input without choking', () => {
+    const urls = Array.from({ length: 50 }, (_, i) => `https://example.com/article-${i}`);
+    const r = parseUrls(urls.join('\n'));
+    expect(r.valid).toHaveLength(50);
+    expect(r.invalid).toEqual([]);
+    expect(r.duplicates).toEqual([]);
+  });
+});
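
Usage sketch (not part of the patch): how a Svelte 5 component such as
BulkImportForm might drive the live-validation counters off `parseUrls`,
per the removed docblock's mention of `$derived`. The `rawInput` state
name, the markup, and the `$lib` import path below are assumptions for
illustration; the real BulkImportForm is not included in this diff.

  <script lang="ts">
    import { parseUrls } from '$lib/modules/articles/parse-urls';

    // Hypothetical local state: the raw blob the user pastes into the
    // textarea.
    let rawInput = $state('');

    // parseUrls is pure and cheap at textarea scale, so live validation
    // is a straight $derived re-parse on every keystroke.
    const parsed = $derived(parseUrls(rawInput));
  </script>

  <textarea bind:value={rawInput}></textarea>
  <p>
    {parsed.valid.length} valid, {parsed.invalid.length} invalid,
    {parsed.duplicates.length} duplicates
  </p>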