test(articles): parseUrls unit tests + extract pure module (Phase 7)

Move `parseUrls` out of stores/imports.svelte.ts (which transitively
imports Dexie via collections.ts) into a standalone parse-urls.ts so
the test file can exercise it without booting Dexie. The store re-
exports parseUrls so existing call sites (BulkImportForm, tools.ts)
keep working unchanged.

11 unit tests covering:
  - empty + whitespace-only inputs
  - newline / whitespace / comma / tab separator handling
  - http + https accepted, ftp / mailto / javascript / file rejected
  - bare domains rejected (`new URL` throws on scheme-less input —
    an explicit http(s) scheme is required)
  - duplicate detection preserves first-occurrence order
  - canonicalisation (trailing slash on root, query+fragment kept)
  - mixed valid / invalid / duplicate token ordering
  - title-prefixed-paste behaviour (strict — surfaces non-URL words
    as invalid for the user to see)
  - 50-URL stress check

Plan: docs/plans/articles-bulk-import.md.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Till JS 2026-04-28 22:39:17 +02:00
parent 0fc16d1bfd
commit 33b3f656fd
3 changed files with 184 additions and 49 deletions

View file

@@ -0,0 +1,60 @@
/**
* Pure URL-list parser for the bulk-import flow. Extracted into its
* own module so tests can import + exercise it without booting Dexie
* (collections.ts and stores/imports.svelte.ts have a transitive
* dependency on the database, which won't open under fake-indexeddb
* if any registered table is currently in a half-migrated state).
*
* Plan: docs/plans/articles-bulk-import.md.
*/
export interface ParsedUrls {
  /** Canonicalised http(s) URLs, deduplicated, in first-occurrence order. */
  valid: string[];
  /** Tokens that failed `new URL` or carried a non-http(s) scheme. */
  invalid: string[];
  /** Canonical forms that appeared again after their first occurrence. */
  duplicates: string[];
}

/**
 * Splits the raw textarea blob on any run of whitespace and/or commas,
 * drops empty tokens, validates each token with `new URL` plus an
 * explicit http(s) scheme check, and deduplicates while preserving
 * first-occurrence order.
 *
 * parseUrls('https://a.com\nhttps://a.com\nbroken')
 *   { valid: ['https://a.com/'],
 *     invalid: ['broken'],
 *     duplicates: ['https://a.com/'] }
 *
 * @param raw - the raw user input (e.g. the full textarea contents)
 * @returns the tokens bucketed into valid / invalid / duplicates
 */
export function parseUrls(raw: string): ParsedUrls {
  // Splitting on /[\s,]+/ consumes every whitespace run and comma, so
  // the surviving tokens are already trimmed; filter(Boolean) drops
  // the empty strings produced at the input's edges. (A per-token
  // .trim() here would be redundant for the same reason.)
  const tokens = raw.split(/[\s,]+/).filter(Boolean);

  const valid: string[] = [];
  const invalid: string[] = [];
  const duplicates: string[] = [];
  const seen = new Set<string>();

  for (const token of tokens) {
    // `new URL` throws on scheme-less tokens like 'example.com', which
    // lands them in `invalid`. Tokens with a non-http scheme (e.g.
    // 'javascript:alert(1)' or 'mailto:x@y') DO parse, so the explicit
    // protocol check below is still required — the server-side fetch
    // would 400 on anything that isn't http(s).
    let parsed: URL;
    try {
      parsed = new URL(token);
    } catch {
      invalid.push(token);
      continue;
    }
    if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
      invalid.push(token);
      continue;
    }

    // toString() canonicalises (adds the trailing slash on a bare
    // origin, keeps query + fragment), so equivalent spellings of the
    // same URL collide in `seen` and are reported as duplicates.
    const canonical = parsed.toString();
    if (seen.has(canonical)) {
      duplicates.push(canonical);
      continue;
    }
    seen.add(canonical);
    valid.push(canonical);
  }

  return { valid, invalid, duplicates };
}

View file

@@ -15,61 +15,16 @@
import { emitDomainEvent } from '$lib/data/events';
import { articleImportJobTable, articleImportItemTable } from '../collections';
import { parseUrls, type ParsedUrls } from '../parse-urls';
import type {
ArticleImportItemState,
LocalArticleImportItem,
LocalArticleImportJob,
} from '../types';
/**
* Pure URL parser used by both the store (`createJob` accepts a raw
* textarea blob) and the UI's `$derived` live-validation. Splits on
* any whitespace + comma, drops empties, validates with `new URL`,
* deduplicates while preserving first-occurrence order.
*
* Exported as a standalone pure function so the unit-test file can
* import it without booting Dexie.
*/
export interface ParsedUrls {
// Canonicalised http(s) URLs, deduplicated, in first-occurrence order.
valid: string[];
// Tokens that failed `new URL` or carried a non-http(s) scheme.
invalid: string[];
// Canonical forms that appeared again after their first occurrence.
duplicates: string[];
}
export function parseUrls(raw: string): ParsedUrls {
const tokens = raw
.split(/[\s,]+/)
.map((t) => t.trim())
.filter(Boolean);
const valid: string[] = [];
const invalid: string[] = [];
const duplicates: string[] = [];
const seen = new Set<string>();
for (const token of tokens) {
// `new URL` throws on scheme-less tokens like 'example.com' (so the
// catch below rejects bare domains), while tokens such as
// 'javascript:alert(1)' DO parse — hence the additional explicit
// http(s) protocol check; the server-side fetch would 400 on
// anything else.
let parsed: URL;
try {
parsed = new URL(token);
} catch {
invalid.push(token);
continue;
}
if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
invalid.push(token);
continue;
}
// toString() canonicalises (e.g. adds the trailing slash on a bare
// origin), so equivalent spellings collide in `seen`.
const canonical = parsed.toString();
if (seen.has(canonical)) {
duplicates.push(canonical);
continue;
}
seen.add(canonical);
valid.push(canonical);
}
return { valid, invalid, duplicates };
}
// Re-export so call sites that already imported from `stores/imports`
// (BulkImportForm, tools.ts) keep working unchanged.
export { parseUrls, type ParsedUrls };
export const articleImportsStore = {
/**

View file

@@ -0,0 +1,120 @@
/**
* Tests for the pure `parseUrls` URL-list parser. The store's mutation
* methods (createJob, pauseJob, etc.) are integration-shaped (need Dexie
* + the scope hook) and live under the integration suite; this file
* only covers the parser, which is the deterministic part.
*
* Plan: docs/plans/articles-bulk-import.md.
*/
import { describe, it, expect } from 'vitest';
import { parseUrls } from '../parse-urls';
describe('parseUrls', () => {
  it('returns empty arrays for an empty input', () => {
    // Whitespace-only input must be indistinguishable from ''.
    expect(parseUrls('')).toEqual({ valid: [], invalid: [], duplicates: [] });
    expect(parseUrls(' \n\t ')).toEqual({ valid: [], invalid: [], duplicates: [] });
  });

  it('parses a single newline-separated list', () => {
    const result = parseUrls(
      'https://example.com/a\nhttps://example.com/b\nhttps://example.com/c'
    );
    expect(result.valid).toEqual([
      'https://example.com/a',
      'https://example.com/b',
      'https://example.com/c',
    ]);
    expect(result.invalid).toEqual([]);
    expect(result.duplicates).toEqual([]);
  });

  it('accepts whitespace + comma + tabs as separators', () => {
    const result = parseUrls('https://a.com https://b.com,\thttps://c.com\nhttps://d.com');
    expect(result.valid).toEqual([
      'https://a.com/',
      'https://b.com/',
      'https://c.com/',
      'https://d.com/',
    ]);
  });

  it('accepts http and https, rejects everything else', () => {
    const result = parseUrls(
      [
        'http://insecure.example',
        'https://secure.example',
        'ftp://files.example',
        'javascript:alert(1)',
        'mailto:foo@bar.com',
        'file:///etc/passwd',
      ].join('\n')
    );
    expect(result.valid).toEqual(['http://insecure.example/', 'https://secure.example/']);
    expect(result.invalid).toHaveLength(4);
    expect(result.invalid).toContain('javascript:alert(1)');
    expect(result.invalid).toContain('mailto:foo@bar.com');
  });

  // NOTE: the old name claimed `URL` "accepts them as opaque" — in fact
  // `new URL('example.com')` throws; that throw is what rejects them.
  it('rejects scheme-less domains (new URL throws without a scheme)', () => {
    const result = parseUrls('example.com\ngoogle.com\nhttps://valid.com');
    expect(result.valid).toEqual(['https://valid.com/']);
    expect(result.invalid).toEqual(['example.com', 'google.com']);
  });

  it('flags duplicate URLs as duplicates, keeps the first occurrence', () => {
    const result = parseUrls(
      'https://example.com/a\nhttps://example.com/b\nhttps://example.com/a\nhttps://example.com/b'
    );
    expect(result.valid).toEqual(['https://example.com/a', 'https://example.com/b']);
    expect(result.duplicates).toEqual(['https://example.com/a', 'https://example.com/b']);
  });

  // Renamed: the old name also claimed "identical query order", which
  // this test never exercises — only root-slash canonicalisation is.
  it('canonicalises URLs (trailing slash on root) so dupes are caught', () => {
    const result = parseUrls('https://example.com\nhttps://example.com/');
    expect(result.valid).toEqual(['https://example.com/']);
    expect(result.duplicates).toEqual(['https://example.com/']);
  });

  it('preserves first-occurrence order across mixed valid/invalid/dup tokens', () => {
    const result = parseUrls(
      [
        'https://first.com',
        'not-a-url',
        'https://second.com',
        'https://first.com', // duplicate of first
        'https://third.com',
      ].join('\n')
    );
    expect(result.valid).toEqual([
      'https://first.com/',
      'https://second.com/',
      'https://third.com/',
    ]);
    expect(result.invalid).toEqual(['not-a-url']);
    expect(result.duplicates).toEqual(['https://first.com/']);
  });

  it('handles realistic paste with title prefixes (extracts URL-shaped tokens only)', () => {
    // User pasted from a chat where each line had a title before the URL
    // — our parser splits on whitespace, so this leaves bare URL tokens
    // + title-noise as "invalid". That's the correct behaviour for a
    // strict parser; the UI surfaces both counters so the user sees it.
    const result = parseUrls(
      'Awesome article: https://nytimes.com/article-1\nAnother one: https://wsj.com/x'
    );
    expect(result.valid).toEqual(['https://nytimes.com/article-1', 'https://wsj.com/x']);
    expect(result.invalid).toContain('Awesome');
    expect(result.invalid).toContain('article:');
  });

  it('keeps query strings + fragments in canonical form', () => {
    const result = parseUrls(
      'https://example.com/a?x=1&y=2#section\nhttps://example.com/a?x=1&y=2#section'
    );
    expect(result.valid).toEqual(['https://example.com/a?x=1&y=2#section']);
    expect(result.duplicates).toEqual(['https://example.com/a?x=1&y=2#section']);
  });

  it('handles a 50-URL input without choking', () => {
    const urls = Array.from({ length: 50 }, (_, i) => `https://example.com/article-${i}`);
    const result = parseUrls(urls.join('\n'));
    expect(result.valid).toHaveLength(50);
    expect(result.invalid).toEqual([]);
    expect(result.duplicates).toEqual([]);
  });
});