Mirror of https://github.com/Memo-2023/mana-monorepo.git (synced 2026-05-14 20:21:09 +02:00)
test(articles): parseUrls unit tests + extract pure module (Phase 7)
Move `parseUrls` out of stores/imports.svelte.ts (which transitively
imports Dexie via collections.ts) into a standalone parse-urls.ts so
the test file can exercise it without booting Dexie. The store
re-exports parseUrls so existing call sites (BulkImportForm, tools.ts)
keep working unchanged.
11 unit tests covering:
- empty + whitespace-only inputs
- newline / whitespace / comma / tab separator handling
- http + https accepted, ftp / mailto / javascript / file rejected
- bare domains rejected (`new URL('example.com')` throws; domain:port
  forms parse as opaque URIs, so an explicit http(s) scheme is required)
- duplicate detection preserves first-occurrence order
- canonicalisation (trailing slash on root, query+fragment kept)
- mixed valid / invalid / duplicate token ordering
- title-prefixed-paste behaviour (strict — surfaces non-URL words
as invalid for the user to see)
- 50-URL stress check
Plan: docs/plans/articles-bulk-import.md.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
parent 0fc16d1bfd
commit 33b3f656fd
3 changed files with 184 additions and 49 deletions
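
In short, the parser's contract as a worked example (output values follow from WHATWG `URL` canonicalisation, e.g. the trailing slash added to a root URL; the snippet is illustrative, not part of the diff below):

    import { parseUrls } from './parse-urls';

    // 'foo.com' has no scheme, so `new URL` rejects it; the second
    // 'https://a.com' canonicalises to the same string as the first.
    const result = parseUrls('https://a.com, foo.com\nhttps://a.com');
    // result = {
    //   valid: ['https://a.com/'],
    //   invalid: ['foo.com'],
    //   duplicates: ['https://a.com/'],
    // }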
apps/mana/apps/web/src/lib/modules/articles/parse-urls.ts (new file)
@ -0,0 +1,60 @@
/**
 * Pure URL-list parser for the bulk-import flow. Extracted into its
 * own module so tests can import + exercise it without booting Dexie
 * (collections.ts and stores/imports.svelte.ts have a transitive
 * dependency on the database, which won't open under fake-indexeddb
 * if any registered table is currently in a half-migrated state).
 *
 * Plan: docs/plans/articles-bulk-import.md.
 */

export interface ParsedUrls {
  valid: string[];
  invalid: string[];
  duplicates: string[];
}

/**
 * Splits the raw textarea blob on any whitespace + comma, drops empty
 * tokens, validates with `new URL` + http(s) scheme check, and
 * deduplicates while preserving first-occurrence order.
 *
 *   parseUrls('https://a.com\nhttps://a.com\nbroken')
 *     → { valid: ['https://a.com/'],
 *         invalid: ['broken'],
 *         duplicates: ['https://a.com/'] }
 */
export function parseUrls(raw: string): ParsedUrls {
  const tokens = raw
    .split(/[\s,]+/)
    .map((t) => t.trim())
    .filter(Boolean);
  const valid: string[] = [];
  const invalid: string[] = [];
  const duplicates: string[] = [];
  const seen = new Set<string>();
  for (const token of tokens) {
    // Require an explicit http(s) scheme: bare domains like 'foo.com'
    // make `new URL` throw, while 'foo.com:8080' parses as an opaque
    // non-http URI; the server-side fetch would 400 on either.
    let parsed: URL;
    try {
      parsed = new URL(token);
    } catch {
      invalid.push(token);
      continue;
    }
    if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
      invalid.push(token);
      continue;
    }
    const canonical = parsed.toString();
    if (seen.has(canonical)) {
      duplicates.push(canonical);
      continue;
    }
    seen.add(canonical);
    valid.push(canonical);
  }
  return { valid, invalid, duplicates };
}
apps/mana/apps/web/src/lib/modules/articles/stores/imports.svelte.ts
@ -15,61 +15,16 @@
 
 import { emitDomainEvent } from '$lib/data/events';
 import { articleImportJobTable, articleImportItemTable } from '../collections';
+import { parseUrls, type ParsedUrls } from '../parse-urls';
 import type {
   ArticleImportItemState,
   LocalArticleImportItem,
   LocalArticleImportJob,
 } from '../types';
 
-/**
- * Pure URL parser — used by both the store (`createJob` accepts a raw
- * textarea blob) and the UI's `$derived` live-validation. Splits on
- * any whitespace + comma, drops empties, validates with `new URL`,
- * deduplicates while preserving first-occurrence order.
- *
- * Exported as a standalone pure function so the unit-test file can
- * import it without booting Dexie.
- */
-export interface ParsedUrls {
-  valid: string[];
-  invalid: string[];
-  duplicates: string[];
-}
-
-export function parseUrls(raw: string): ParsedUrls {
-  const tokens = raw
-    .split(/[\s,]+/)
-    .map((t) => t.trim())
-    .filter(Boolean);
-  const valid: string[] = [];
-  const invalid: string[] = [];
-  const duplicates: string[] = [];
-  const seen = new Set<string>();
-  for (const token of tokens) {
-    // Reject anything without an http(s) scheme — `new URL('foo.com')`
-    // would happily accept it as an opaque URI and the server-side
-    // fetch would then 400 on us.
-    let parsed: URL;
-    try {
-      parsed = new URL(token);
-    } catch {
-      invalid.push(token);
-      continue;
-    }
-    if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
-      invalid.push(token);
-      continue;
-    }
-    const canonical = parsed.toString();
-    if (seen.has(canonical)) {
-      duplicates.push(canonical);
-      continue;
-    }
-    seen.add(canonical);
-    valid.push(canonical);
-  }
-  return { valid, invalid, duplicates };
-}
+// Re-export so call sites that already imported from `stores/imports`
+// (BulkImportForm, tools.ts) keep working unchanged.
+export { parseUrls, type ParsedUrls };
 
 export const articleImportsStore = {
   /**
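
With the re-export in place, both import paths resolve to the same function, so BulkImportForm and tools.ts need no changes. A minimal sketch of the two paths (the `$lib` alias segments are assumed from the file layout above, not spelled out in this diff):

    // Existing call sites keep importing via the store module:
    import { parseUrls } from '$lib/modules/articles/stores/imports.svelte';

    // The unit tests (and any new code) import the pure module directly,
    // skipping the store's transitive Dexie dependency:
    import { parseUrls as pureParseUrls } from '$lib/modules/articles/parse-urls';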
parseUrls unit tests (new file)
@ -0,0 +1,120 @@
/**
 * Tests for the pure `parseUrls` URL-list parser. The store's mutation
 * methods (createJob, pauseJob, …) are integration-shaped (need Dexie
 * + the scope hook) and live under the integration suite; this file
 * only covers the parser, which is the deterministic part.
 *
 * Plan: docs/plans/articles-bulk-import.md.
 */

import { describe, it, expect } from 'vitest';
import { parseUrls } from '../parse-urls';

describe('parseUrls', () => {
  it('returns empty arrays for an empty input', () => {
    expect(parseUrls('')).toEqual({ valid: [], invalid: [], duplicates: [] });
    expect(parseUrls(' \n\t ')).toEqual({ valid: [], invalid: [], duplicates: [] });
  });

  it('parses a single newline-separated list', () => {
    const r = parseUrls('https://example.com/a\nhttps://example.com/b\nhttps://example.com/c');
    expect(r.valid).toEqual([
      'https://example.com/a',
      'https://example.com/b',
      'https://example.com/c',
    ]);
    expect(r.invalid).toEqual([]);
    expect(r.duplicates).toEqual([]);
  });

  it('accepts whitespace + comma + tabs as separators', () => {
    const r = parseUrls('https://a.com https://b.com,\thttps://c.com\nhttps://d.com');
    expect(r.valid).toEqual([
      'https://a.com/',
      'https://b.com/',
      'https://c.com/',
      'https://d.com/',
    ]);
  });

  it('accepts http and https, rejects everything else', () => {
    const r = parseUrls(
      [
        'http://insecure.example',
        'https://secure.example',
        'ftp://files.example',
        'javascript:alert(1)',
        'mailto:foo@bar.com',
        'file:///etc/passwd',
      ].join('\n')
    );
    expect(r.valid).toEqual(['http://insecure.example/', 'https://secure.example/']);
    expect(r.invalid).toHaveLength(4);
    expect(r.invalid).toContain('javascript:alert(1)');
    expect(r.invalid).toContain('mailto:foo@bar.com');
  });

  it('rejects scheme-less domains (`new URL` throws without a scheme)', () => {
    const r = parseUrls('example.com\ngoogle.com\nhttps://valid.com');
    expect(r.valid).toEqual(['https://valid.com/']);
    expect(r.invalid).toEqual(['example.com', 'google.com']);
  });

  it('flags duplicate URLs as duplicates, keeps the first occurrence', () => {
    const r = parseUrls(
      'https://example.com/a\nhttps://example.com/b\nhttps://example.com/a\nhttps://example.com/b'
    );
    expect(r.valid).toEqual(['https://example.com/a', 'https://example.com/b']);
    expect(r.duplicates).toEqual(['https://example.com/a', 'https://example.com/b']);
  });

  it('canonicalises URLs (trailing slash on root, identical query order) so dupes are caught', () => {
    const r = parseUrls('https://example.com\nhttps://example.com/');
    expect(r.valid).toEqual(['https://example.com/']);
    expect(r.duplicates).toEqual(['https://example.com/']);
  });

  it('preserves first-occurrence order across mixed valid/invalid/dup tokens', () => {
    const r = parseUrls(
      [
        'https://first.com',
        'not-a-url',
        'https://second.com',
        'https://first.com', // duplicate of first
        'https://third.com',
      ].join('\n')
    );
    expect(r.valid).toEqual(['https://first.com/', 'https://second.com/', 'https://third.com/']);
    expect(r.invalid).toEqual(['not-a-url']);
    expect(r.duplicates).toEqual(['https://first.com/']);
  });

  it('handles realistic paste with title prefixes (extracts URL-shaped tokens only)', () => {
    // User pasted from a chat where each line had a title before the URL.
    // Our parser splits on whitespace, so the URLs survive as valid
    // tokens and the title words surface as "invalid". That's the right
    // behaviour for a strict parser; the UI shows both counters so the
    // user sees what was dropped.
    const r = parseUrls(
      'Awesome article: https://nytimes.com/article-1\nAnother one: https://wsj.com/x'
    );
    expect(r.valid).toEqual(['https://nytimes.com/article-1', 'https://wsj.com/x']);
    expect(r.invalid).toContain('Awesome');
    expect(r.invalid).toContain('article:');
  });

  it('keeps query strings + fragments in canonical form', () => {
    const r = parseUrls(
      'https://example.com/a?x=1&y=2#section\nhttps://example.com/a?x=1&y=2#section'
    );
    expect(r.valid).toEqual(['https://example.com/a?x=1&y=2#section']);
    expect(r.duplicates).toEqual(['https://example.com/a?x=1&y=2#section']);
  });

  it('handles a 50-URL input without choking', () => {
    const urls = Array.from({ length: 50 }, (_, i) => `https://example.com/article-${i}`);
    const r = parseUrls(urls.join('\n'));
    expect(r.valid).toHaveLength(50);
    expect(r.invalid).toEqual([]);
    expect(r.duplicates).toEqual([]);
  });
});
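
For reference, the WHATWG `URL` behaviours the parser and these tests lean on, as a standalone sketch (plain facts about the global `URL` class, not code from this commit):

    // A bare domain has no scheme, so the constructor throws:
    new URL('example.com'); // TypeError: Invalid URL

    // A domain:port token parses, but as an opaque non-http URI;
    // the parser's protocol check then routes it to `invalid`:
    new URL('localhost:3000').protocol; // 'localhost:'

    // Root URLs gain a trailing slash, which is why
    // 'https://example.com' and 'https://example.com/' collide as dupes:
    new URL('https://example.com').toString(); // 'https://example.com/'

    // Query order is preserved, not sorted: '?y=2&x=1' and '?x=1&y=2'
    // remain distinct canonical strings and are not flagged as duplicates.
    new URL('https://e.com/?y=2&x=1').toString(); // 'https://e.com/?y=2&x=1'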