managarten/packages/shared-utils/src/parsers/base-parser.ts
Till JS 5286404129 feat(parsers): add intelligent quick-create parsers for 6 apps with multilingual support
- Base parser: multilingual (DE/EN/FR/ES/IT) date, time, weekday, month parsing
- Base parser: fuzzy/typo tolerance (Levenshtein), recurrence (RRULE), relative time
- Base parser: timezone extraction, date ranges, ordinal dates, confidence scoring
- Base parser: past dates (gestern/yesterday), this/next week distinction
- Base parser: compose helper (createAppParser), multiple @references
- Calendar: event-parser with duration, time ranges, location, all-day, calendar ref
- Calendar: wire up UnifiedBar with onCreate/onParseCreate for quick event creation
- Todo: task-parser multilingual priority keywords (urgent/important/normal/later)
- Planta: plant-parser with acquisition keywords (gekauft/bought/acheté)
- Mukke: song-parser with Artist-Title format, BPM, genre, playlist/project creation
- NutriPhi: meal-parser with meal type detection, add QuickInputBar to layout
- All parsers: 210 tests across 7 test suites, all passing

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-24 22:18:05 +01:00

1260 lines
37 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* Base Natural Language Parser
*
* Shared parsing utilities for date, time, and tags across all apps.
* App-specific parsers (task-parser, event-parser, contact-parser) extend this.
*
* Supports locales: de, en, fr, es, it
*/
import {
addDays,
addWeeks,
addHours,
addMinutes,
nextMonday,
nextTuesday,
nextWednesday,
nextThursday,
nextFriday,
nextSaturday,
nextSunday,
setHours,
setMinutes,
isBefore,
startOfWeek,
} from 'date-fns';
/** Locale codes for which this module defines keyword and pattern tables. */
export type ParserLocale = 'de' | 'en' | 'fr' | 'es' | 'it';
/**
* Result of parsing one line of quick-create input.
*/
export interface BaseParsedInput {
/** Text left over once the extracted tokens were removed, with whitespace collapsed. */
title: string;
/** Extracted date, if any. */
date?: Date;
/** Extracted time of day, if any. */
time?: { hours: number; minutes: number };
/** Names of all `#tags` found, without the leading `#`. */
tagNames: string[];
/** The trimmed input string before any extraction. */
rawInput: string;
/** Confidence score 0-1. 1.0 = exact match, 0.8 = fuzzy, 0.5 = ambiguous */
confidence: number;
}
/**
* Return shape shared by the `extract*` helpers.
*/
export interface ExtractResult<T> {
/** The extracted value, or `undefined` when nothing matched. */
value: T | undefined;
/** The input with the matched text removed; the unchanged input when nothing matched. */
remaining: string;
}
// ============================================================================
// Locale-aware Pattern Definitions
// ============================================================================
/**
* A relative-date expression paired with the function that resolves it.
*/
interface DatePattern {
/** Regex that recognises the expression inside free text. */
pattern: RegExp;
/** Computes the date at call time, so results are relative to "now" when invoked. */
getDate: (match?: RegExpMatchArray) => Date;
}
/** Maps a reference date to another date (here: the next occurrence of a weekday). */
type DayFn = (date: Date) => Date;
// date-fns "next <weekday>" helpers in Monday..Sunday order, so that index i
// lines up with index i of each WEEKDAY_NAMES list.
const NEXT_DAY_FNS: DayFn[] = [
nextMonday,
nextTuesday,
nextWednesday,
nextThursday,
nextFriday,
nextSaturday,
nextSunday,
];
// Lowercase weekday names per locale in Monday..Sunday order.
// The position in each list is used as the index into NEXT_DAY_FNS, and the
// names are interpolated into regular expressions by buildDatePatterns.
const WEEKDAY_NAMES: Record<ParserLocale, string[]> = {
de: ['montag', 'dienstag', 'mittwoch', 'donnerstag', 'freitag', 'samstag', 'sonntag'],
en: ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday'],
fr: ['lundi', 'mardi', 'mercredi', 'jeudi', 'vendredi', 'samedi', 'dimanche'],
es: ['lunes', 'martes', 'miércoles', 'jueves', 'viernes', 'sábado', 'domingo'],
it: ['lunedì', 'martedì', 'mercoledì', 'giovedì', 'venerdì', 'sabato', 'domenica'],
};
/**
* Relative-date vocabulary for one locale.
*
* The string arrays hold regex source fragments (some entries contain `\\s*`
* or `'?`); buildDatePatterns wraps each one in Unicode-aware word boundaries.
* An empty array means the locale has no single expression for that concept.
*/
interface RelativeDateWords {
/** Expressions resolving to the current day. */
today: string[];
/** Expressions resolving to +1 day. */
tomorrow: string[];
/** Expressions resolving to +2 days. */
dayAfterTomorrow: string[];
/** Expressions resolving to -1 day. */
yesterday: string[];
/** Expressions resolving to -2 days. */
dayBeforeYesterday: string[];
/** Full "next week" expression (+7 days). */
nextWeek: RegExp;
/** Full "week after next" expression (+14 days). */
weekAfterNext: RegExp;
/** Prefix placed in front of a weekday name to form "next <weekday>". */
nextPrefix: RegExp;
/** Prefix placed in front of a weekday name to form "this <weekday>". */
thisPrefix: RegExp;
}
// Relative-date vocabulary for every supported locale.
// Patterns whose words start or end with a non-ASCII letter (nächste, über,
// próxima, después) use \p{L} lookarounds instead of \b, which only treats
// ASCII word characters as part of a word.
const RELATIVE_DATE_WORDS: Record<ParserLocale, RelativeDateWords> = {
de: {
today: ['heute'],
tomorrow: ['morgen'],
dayAfterTomorrow: ['übermorgen'],
yesterday: ['gestern'],
dayBeforeYesterday: ['vorgestern'],
nextWeek: /(?<!\p{L})nächste[nr]?\s*woche(?!\p{L})/iu,
weekAfterNext: /(?<!\p{L})über\s*nächste[nr]?\s*woche(?!\p{L})/iu,
nextPrefix: /(?<!\p{L})nächste[nr]?\s*/iu,
thisPrefix: /\bdiese[nr]?\s*/i,
},
en: {
today: ['today'],
tomorrow: ['tomorrow'],
// English has no single-word form for these two; left empty on purpose.
dayAfterTomorrow: [],
yesterday: ['yesterday'],
dayBeforeYesterday: [],
nextWeek: /\bnext\s*week\b/i,
weekAfterNext: /\bweek\s*after\s*next\b/i,
nextPrefix: /\bnext\s*/i,
thisPrefix: /\bthis\s*/i,
},
fr: {
// Second spelling of each pair is the variant typed without apostrophe/accent.
today: ["aujourd'hui", 'aujourdhui'],
tomorrow: ['demain'],
dayAfterTomorrow: ['après-demain', 'apres-demain'],
yesterday: ['hier'],
dayBeforeYesterday: ['avant-hier'],
nextWeek: /\bsemaine\s*prochaine\b/i,
weekAfterNext: /\bsemaine\s*d'?après\b/i,
nextPrefix: /\bprochain[e]?\s*/i,
thisPrefix: /\bce(?:tte?)?\s*/i,
},
es: {
today: ['hoy'],
tomorrow: ['mañana', 'manana'],
// Regex fragments: optional whitespace between the two words.
dayAfterTomorrow: ['pasado\\s*mañana', 'pasado\\s*manana'],
yesterday: ['ayer'],
dayBeforeYesterday: ['anteayer'],
nextWeek: /(?<!\p{L})próxima\s*semana(?!\p{L})/iu,
weekAfterNext: /(?<!\p{L})semana\s*después(?!\p{L})/iu,
nextPrefix: /(?<!\p{L})próxim[oa]\s*/iu,
thisPrefix: /\best[ea]\s*/i,
},
it: {
today: ['oggi'],
tomorrow: ['domani'],
dayAfterTomorrow: ['dopodomani'],
yesterday: ['ieri'],
// First entry is a regex fragment: optional apostrophe and optional whitespace.
dayBeforeYesterday: ["l'?altro\\s*ieri", 'avantieri'],
nextWeek: /\bprossima\s*settimana\b/i,
weekAfterNext: /\bsettimana\s*dopo\b/i,
nextPrefix: /\bprossim[oa]\s*/i,
thisPrefix: /\bquest[oa]\s*/i,
},
};
/**
 * "in X days" expressions per locale. Capture group 1 is the number of days.
 *
 * Each pattern accepts the singular and plural noun so that "in 1 day" and
 * "in 3 days" both parse.
 */
const IN_DAYS_PATTERNS: Record<ParserLocale, RegExp> = {
  de: /\bin\s*(\d+)\s*tage?n?\b/i,
  en: /\bin\s*(\d+)\s*days?\b/i,
  fr: /\bdans\s*(\d+)\s*jours?\b/i,
  es: /\ben\s*(\d+)\s*d[ií]as?\b/i,
  // Singular "giorno" and plural "giorni"; the bare stem stays accepted as before.
  it: /\btra\s*(\d+)\s*giorn[oi]?\b/i,
};
/**
 * "in X weeks" expressions per locale. Capture group 1 is the number of weeks.
 *
 * Each pattern accepts the singular and plural noun so that "in 1 week" and
 * "in 2 weeks" both parse.
 */
const IN_WEEKS_PATTERNS: Record<ParserLocale, RegExp> = {
  de: /\bin\s*(\d+)\s*wochen?\b/i,
  en: /\bin\s*(\d+)\s*weeks?\b/i,
  fr: /\bdans\s*(\d+)\s*semaines?\b/i,
  es: /\ben\s*(\d+)\s*semanas?\b/i,
  // Singular "settimana" and plural "settimane"; the bare stem stays accepted as before.
  it: /\btra\s*(\d+)\s*settiman[ae]?\b/i,
};
// Lowercase full month names per locale. The array index is the zero-based
// month number (January = 0) that is passed to the Date constructor.
const MONTH_NAMES: Record<ParserLocale, string[]> = {
de: [
'januar',
'februar',
'märz',
'april',
'mai',
'juni',
'juli',
'august',
'september',
'oktober',
'november',
'dezember',
],
en: [
'january',
'february',
'march',
'april',
'may',
'june',
'july',
'august',
'september',
'october',
'november',
'december',
],
fr: [
'janvier',
'février',
'mars',
'avril',
'mai',
'juin',
'juillet',
'août',
'septembre',
'octobre',
'novembre',
'décembre',
],
es: [
'enero',
'febrero',
'marzo',
'abril',
'mayo',
'junio',
'julio',
'agosto',
'septiembre',
'octubre',
'noviembre',
'diciembre',
],
it: [
'gennaio',
'febbraio',
'marzo',
'aprile',
'maggio',
'giugno',
'luglio',
'agosto',
'settembre',
'ottobre',
'novembre',
'dicembre',
],
};
// Three-character month abbreviations per locale, in the same January = 0
// order as MONTH_NAMES so an index found here is also a valid month number.
const SHORT_MONTH_NAMES: Record<ParserLocale, string[]> = {
de: ['jan', 'feb', 'mär', 'apr', 'mai', 'jun', 'jul', 'aug', 'sep', 'okt', 'nov', 'dez'],
en: ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'],
fr: ['jan', 'fév', 'mar', 'avr', 'mai', 'jun', 'jul', 'aoû', 'sep', 'oct', 'nov', 'déc'],
es: ['ene', 'feb', 'mar', 'abr', 'may', 'jun', 'jul', 'ago', 'sep', 'oct', 'nov', 'dic'],
it: ['gen', 'feb', 'mar', 'apr', 'mag', 'giu', 'lug', 'ago', 'set', 'ott', 'nov', 'dic'],
};
/**
 * Clock-time expressions per locale.
 *
 * Capture group 1 is the hour, capture group 2 the optional minutes.
 * The leading preposition and the trailing unit word are both optional, so a
 * bare one- or two-digit number also matches.
 * NOTE(review): that makes "3" in "buy 3 apples" a time candidate — confirm this is intended.
 */
const TIME_PATTERNS: Record<ParserLocale, RegExp> = {
  de: /\b(?:um\s*)?(\d{1,2})(?::(\d{2}))?\s*(?:uhr)?\b/i,
  en: /\b(?:at\s*)?(\d{1,2})(?::(\d{2}))?\s*(?:o'?clock|am|pm)?\b/i,
  // French differs from the other locales in two ways:
  //  - "à" is not an ASCII word character, so `\b` can never sit in front of a
  //    standalone "à"; a Unicode lookbehind is used so the preposition is
  //    consumed together with the time instead of staying in the title.
  //  - the minutes after "h" are optional, so "14h" parses as well as "14h30".
  fr: /(?<![\p{L}\p{N}])(?:à\s*)?(\d{1,2})(?:[h:](\d{2})?)?\s*(?:heures?)?\b/iu,
  es: /\b(?:a\s*las?\s*)?(\d{1,2})(?::(\d{2}))?\s*(?:horas?)?\b/i,
  it: /\b(?:alle?\s*)?(\d{1,2})(?::(\d{2}))?\b/i,
};
// Words and locale tag used when rendering a date preview:
// `today` / `tomorrow` are the labels shown for those two days, and `locale`
// is the tag handed to Date.prototype.toLocaleDateString for every other date.
const PREVIEW_WORDS: Record<ParserLocale, { today: string; tomorrow: string; locale: string }> = {
de: { today: 'Heute', tomorrow: 'Morgen', locale: 'de-DE' },
en: { today: 'Today', tomorrow: 'Tomorrow', locale: 'en-US' },
fr: { today: "Aujourd'hui", tomorrow: 'Demain', locale: 'fr-FR' },
es: { today: 'Hoy', tomorrow: 'Mañana', locale: 'es-ES' },
it: { today: 'Oggi', tomorrow: 'Domani', locale: 'it-IT' },
};
// ============================================================================
// Fuzzy Matching Utilities
// ============================================================================
/**
 * Levenshtein edit distance between two strings (intended for short words).
 *
 * Only the previous row of the distance table is kept; each new row is built
 * from it cell by cell.
 */
function levenshtein(a: string, b: string): number {
  const rows = a.length;
  const cols = b.length;
  // Row 0: turning the empty prefix of `a` into the first j chars of `b` costs j inserts.
  let previous: number[] = Array.from({ length: cols + 1 }, (_, j) => j);
  for (let i = 1; i <= rows; i += 1) {
    // Column 0: turning the first i chars of `a` into the empty string costs i deletes.
    const current: number[] = [i];
    for (let j = 1; j <= cols; j += 1) {
      const deletion = previous[j] + 1;
      const insertion = current[j - 1] + 1;
      const substitution = previous[j - 1] + (a[i - 1] === b[j - 1] ? 0 : 1);
      current.push(Math.min(deletion, insertion, substitution));
    }
    previous = current;
  }
  return previous[cols];
}
// Canonical keywords that a misspelled token may be corrected to
// (max distance 1-2 depending on length, see fuzzyMatchDateKeyword).
// The lists are not the full vocabulary: es and it only carry a subset of the
// words found in RELATIVE_DATE_WORDS / WEEKDAY_NAMES.
const FUZZY_DATE_WORDS: Record<ParserLocale, string[]> = {
de: [
'heute',
'morgen',
'übermorgen',
'montag',
'dienstag',
'mittwoch',
'donnerstag',
'freitag',
'samstag',
'sonntag',
],
en: [
'today',
'tomorrow',
'monday',
'tuesday',
'wednesday',
'thursday',
'friday',
'saturday',
'sunday',
],
fr: ['demain', 'lundi', 'mardi', 'mercredi', 'jeudi', 'vendredi', 'samedi', 'dimanche'],
es: ['hoy', 'lunes', 'martes', 'jueves', 'viernes'],
it: ['oggi', 'domani'],
};
/**
 * Find the known date keyword closest to a (possibly misspelled) word.
 *
 * The allowed edit distance is 1 for inputs of up to 5 characters and 2 for
 * longer inputs. When several keywords are equally close, the one listed
 * first for the locale wins.
 *
 * @param word - Token to check; compared case-insensitively.
 * @param locale - Locale whose keyword list is searched.
 * @returns The canonical keyword, or `undefined` when nothing is close enough.
 */
export function fuzzyMatchDateKeyword(
  word: string,
  locale: ParserLocale = 'de'
): string | undefined {
  const candidates = FUZZY_DATE_WORDS[locale];
  if (!candidates) return undefined;

  const needle = word.toLowerCase();
  const tolerance = needle.length <= 5 ? 1 : 2;

  let closest: { keyword: string; distance: number } | undefined;
  for (const candidate of candidates) {
    // A length gap larger than the tolerance already rules the candidate out.
    if (Math.abs(needle.length - candidate.length) > tolerance) continue;

    const distance = levenshtein(needle, candidate);
    if (distance > tolerance) continue;

    // Strict comparison keeps the earliest keyword on ties.
    if (closest === undefined || distance < closest.distance) {
      closest = { keyword: candidate, distance };
    }
  }
  return closest ? closest.keyword : undefined;
}
// ============================================================================
// Pattern Builder
// ============================================================================
/**
 * Wrap a regex fragment in Unicode-aware word boundaries.
 *
 * `\b` only knows ASCII word characters, so it misplaces boundaries around
 * words such as "lunedì" or "mañana". The returned source instead requires
 * that no letter or digit (in any script) touches the fragment on either
 * side. It must be compiled with the `u` flag.
 */
function wb(word: string): string {
  const letterOrDigit = '[\\p{L}\\p{N}]';
  return '(?<!' + letterOrDigit + ')' + word + '(?!' + letterOrDigit + ')';
}
/**
 * Build the ordered list of relative-date patterns for one locale.
 *
 * The order of the returned list is significant: callers take the first
 * pattern that matches. Longer expressions are therefore registered before
 * the shorter expressions they contain:
 *  - "day after tomorrow" before "tomorrow" and "day before yesterday" before
 *    "yesterday". In fr/es/it the compound form ends in the simple word,
 *    separated by a hyphen or a space ("après-demain", "pasado mañana",
 *    "l'altro ieri"), so the simple word would otherwise win and leave the
 *    prefix behind in the title.
 *  - "week after next" before "next week".
 *  - "this <weekday>" and "next <weekday>" before the plain weekday name.
 *
 * Every `getDate` reads the clock when it is called, so the list can be
 * built once and reused.
 */
function buildDatePatterns(locale: ParserLocale): DatePattern[] {
  const words = RELATIVE_DATE_WORDS[locale];
  const weekdays = WEEKDAY_NAMES[locale];
  const patterns: DatePattern[] = [];

  // Registers each keyword as "today shifted by offsetDays".
  const addDayOffsetKeywords = (keywords: string[], offsetDays: number): void => {
    for (const keyword of keywords) {
      patterns.push({
        pattern: new RegExp(wb(keyword), 'iu'),
        getDate: () => addDays(new Date(), offsetDays),
      });
    }
  };

  addDayOffsetKeywords(words.today, 0);
  // Compound phrase first, then the word it ends in.
  addDayOffsetKeywords(words.dayAfterTomorrow, 2);
  addDayOffsetKeywords(words.tomorrow, 1);
  addDayOffsetKeywords(words.dayBeforeYesterday, -2);
  addDayOffsetKeywords(words.yesterday, -1);

  // Week after next (must come before "next week")
  patterns.push({ pattern: words.weekAfterNext, getDate: () => addDays(new Date(), 14) });
  patterns.push({ pattern: words.nextWeek, getDate: () => addDays(new Date(), 7) });

  // "this <weekday>": the day inside the current Monday-based week, even when
  // it already lies in the past (useful for logging retroactively).
  weekdays.forEach((day, index) => {
    // weekdays is Monday-first; Date#getDay() is Sunday-first (Sun = 0).
    const targetDayOfWeek = (index + 1) % 7;
    patterns.push({
      pattern: new RegExp(`${words.thisPrefix.source}${day}(?![\\p{L}\\p{N}])`, 'iu'),
      getDate: () => {
        const now = new Date();
        if (now.getDay() === targetDayOfWeek) return now;
        // index doubles as the offset in days from Monday.
        return addDays(startOfWeek(now, { weekStartsOn: 1 }), index);
      },
    });
  });

  // "next <weekday>"
  weekdays.forEach((day, index) => {
    const dayFn = NEXT_DAY_FNS[index];
    patterns.push({
      pattern: new RegExp(`${words.nextPrefix.source}${day}(?![\\p{L}\\p{N}])`, 'iu'),
      getDate: () => dayFn(new Date()),
    });
  });

  // Plain weekday names (implies "next")
  weekdays.forEach((day, index) => {
    const dayFn = NEXT_DAY_FNS[index];
    patterns.push({
      pattern: new RegExp(wb(day), 'iu'),
      getDate: () => dayFn(new Date()),
    });
  });

  // Month names with a preposition: "im März", "in January", "en février"
  const monthPrepositions: Record<ParserLocale, string> = {
    de: '(?:im|in)\\s+',
    en: '(?:in)\\s+',
    fr: '(?:en)\\s+',
    es: '(?:en)\\s+',
    it: '(?:in|a)\\s+',
  };
  MONTH_NAMES[locale].forEach((monthName, monthIndex) => {
    patterns.push({
      pattern: new RegExp(`\\b${monthPrepositions[locale]}${monthName}\\b`, 'iu'),
      getDate: () => {
        const now = new Date();
        // A month that has already passed this year refers to next year.
        const year = monthIndex < now.getMonth() ? now.getFullYear() + 1 : now.getFullYear();
        return new Date(year, monthIndex, 1);
      },
    });
  });

  return patterns;
}
// Pattern lists already built, keyed by locale.
const datePatternCache = new Map<ParserLocale, DatePattern[]>();

/**
 * Return the relative-date patterns for a locale, building and caching them
 * on first use.
 */
function getDatePatterns(locale: ParserLocale): DatePattern[] {
  const cached = datePatternCache.get(locale);
  if (cached) return cached;

  const built = buildDatePatterns(locale);
  datePatternCache.set(locale, built);
  return built;
}
// ============================================================================
// Specific date pattern (DD.MM. or DD.MM.YYYY or MM/DD/YYYY)
// ============================================================================
// DD.MM / DD.MM. / DD.MM.YY / DD.MM.YYYY (European).
// Day is limited to 1-31 and month to 1-12 so that decimals and version
// numbers such as "2.15" are not turned into overflowed dates.
// The tail has three forms: ".<year>", a closing "." that is consumed with
// the date (so "15.3." leaves no stray dot behind), or nothing.
const EU_DATE_PATTERN =
  /\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])(?:\.(\d{2,4})\b|\.(?!\d)|\b)/;
// MM/DD or MM/DD/YY or MM/DD/YYYY (US), with the same 1-12 / 1-31 limits
// (so "24/7" is not a date).
const US_DATE_PATTERN = /\b(0?[1-9]|1[0-2])\/(0?[1-9]|[12]\d|3[01])(?:\/(\d{2,4}))?\b/;

/**
 * Resolve the optional year capture of a numeric date.
 * A missing year means the current year; values below 100 are read as 20YY.
 */
function resolveSpecificDateYear(rawYear: string | undefined): number {
  if (!rawYear) return new Date().getFullYear();
  const year = parseInt(rawYear, 10);
  return year < 100 ? 2000 + year : year;
}

/**
 * Pick the numeric date format for a locale.
 *
 * @param locale - 'en' selects month-first "MM/DD"; every other locale uses day-first "DD.MM.".
 * @returns The regex plus a `parse` function that turns one of its matches
 *   into a local-time Date at midnight.
 */
function getSpecificDatePattern(locale: ParserLocale): {
  pattern: RegExp;
  parse: (match: RegExpMatchArray) => Date;
} {
  if (locale === 'en') {
    return {
      pattern: US_DATE_PATTERN,
      parse: (match) => {
        const month = parseInt(match[1], 10) - 1;
        const day = parseInt(match[2], 10);
        return new Date(resolveSpecificDateYear(match[3]), month, day);
      },
    };
  }
  // European format (DE, FR, ES, IT)
  return {
    pattern: EU_DATE_PATTERN,
    parse: (match) => {
      const day = parseInt(match[1], 10);
      const month = parseInt(match[2], 10) - 1;
      return new Date(resolveSpecificDateYear(match[3]), month, day);
    },
  };
}
// ============================================================================
// Date Extraction
// ============================================================================
/**
 * Extract the first date expression from free text.
 *
 * Forms are tried in this order and the first hit wins:
 *  1. "in X weeks"
 *  2. "in X days"
 *  3. day + month name ("5. März", "3rd of May", "le 5 mars")
 *  4. numeric date ("15.3.", "3/15")
 *  5. relative keywords and weekday names ("morgen", "next friday", ...)
 *  6. the same keywords again after correcting one misspelled word
 *
 * @param text - Input to search.
 * @param locale - Locale whose vocabulary and numeric date format are used.
 * @returns The date and the text with the matched expression removed, or
 *   `value: undefined` and the unchanged text when no date was found.
 */
export function extractDate(text: string, locale: ParserLocale = 'de'): ExtractResult<Date> {
  let remaining = text;

  // "in X weeks" first (before "in X days" to avoid partial match)
  const inWeeksPattern = IN_WEEKS_PATTERNS[locale];
  const inWeeksMatch = remaining.match(inWeeksPattern);
  if (inWeeksMatch) {
    const weeks = parseInt(inWeeksMatch[1], 10);
    remaining = remaining.replace(inWeeksPattern, '').trim();
    return { value: addWeeks(new Date(), weeks), remaining };
  }

  // "in X days"
  const inDaysPattern = IN_DAYS_PATTERNS[locale];
  const inDaysMatch = remaining.match(inDaysPattern);
  if (inDaysMatch) {
    const days = parseInt(inDaysMatch[1], 10);
    remaining = remaining.replace(inDaysPattern, '').trim();
    return { value: addDays(new Date(), days), remaining };
  }

  // Day + month name: "5. März", "3rd of May", "le 5 mars".
  // Full names are listed before abbreviations so the longer form is preferred.
  const months = MONTH_NAMES[locale];
  const shortMonthsList = SHORT_MONTH_NAMES[locale];
  const monthPattern = [...months, ...shortMonthsList].join('|');
  // Only the source for the requested locale is compiled.
  const ordinalSources: Record<ParserLocale, string> = {
    de: `\\b(\\d{1,2})\\.\\s*(${monthPattern})\\b`,
    en: `\\b(\\d{1,2})(?:st|nd|rd|th)?\\s+(?:of\\s+)?(${monthPattern})\\b`,
    fr: `\\b(?:le\\s+)?(\\d{1,2})(?:er|e|ème)?\\s+(${monthPattern})\\b`,
    es: `\\b(?:el\\s+)?(\\d{1,2})\\s+(?:de\\s+)?(${monthPattern})\\b`,
    it: `\\b(?:il\\s+)?(\\d{1,2})\\s+(${monthPattern})\\b`,
  };
  const ordinalPattern = new RegExp(ordinalSources[locale], 'iu');
  const ordinalMatch = remaining.match(ordinalPattern);
  if (ordinalMatch) {
    const day = parseInt(ordinalMatch[1], 10);
    const monthStr = ordinalMatch[2].toLowerCase();
    let monthIndex = months.findIndex((m) => m.toLowerCase() === monthStr);
    if (monthIndex === -1) {
      monthIndex = shortMonthsList.findIndex((m) => m.toLowerCase() === monthStr);
    }
    if (monthIndex >= 0 && day >= 1 && day <= 31) {
      const now = new Date();
      // Compare against the start of today, not the current instant: the
      // candidate is a midnight timestamp, so comparing with `now` would push
      // today's own date into next year.
      const startOfToday = new Date(now.getFullYear(), now.getMonth(), now.getDate());
      let year = now.getFullYear();
      if (isBefore(new Date(year, monthIndex, day), startOfToday)) year++;
      remaining = remaining.replace(ordinalPattern, '').trim();
      return { value: new Date(year, monthIndex, day), remaining };
    }
  }

  // Numeric date (DD.MM. or MM/DD depending on locale)
  const { pattern: specificPattern, parse: parseSpecific } = getSpecificDatePattern(locale);
  const specificDateMatch = remaining.match(specificPattern);
  if (specificDateMatch) {
    const date = parseSpecific(specificDateMatch);
    remaining = remaining.replace(specificPattern, '').trim();
    return { value: date, remaining };
  }

  // Relative keywords and weekday names (exact spelling)
  for (const { pattern, getDate } of getDatePatterns(locale)) {
    if (pattern.test(remaining)) {
      const date = getDate();
      remaining = remaining.replace(pattern, '').trim();
      return { value: date, remaining };
    }
  }

  // Fuzzy fallback: correct one misspelled word and run the extraction again.
  for (const word of remaining.split(/\s+/)) {
    if (word.length < 3) continue; // Skip very short words
    const matched = fuzzyMatchDateKeyword(word, locale);
    if (!matched) continue;
    // The word is escaped so it is replaced literally.
    const corrected = remaining.replace(
      new RegExp(word.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), 'i'),
      matched
    );
    const retryResult = extractDate(corrected, locale);
    if (retryResult.value) {
      return retryResult;
    }
  }

  return { value: undefined, remaining };
}
// ============================================================================
// Date Range Extraction
// ============================================================================
/** A span of days described by its first and last date. */
export interface DateRange {
/** First day of the range. */
start: Date;
/** Last day of the range. */
end: Date;
}
// Numeric EU range "15.-17.3.": start day, end day and month, each followed by a dot.
// The month group is optional in the pattern; extractDateRange ignores matches without it.
const EU_DATE_RANGE_PATTERN = /\b(\d{1,2})\.\s*[-]\s*(\d{1,2})\.\s*(\d{1,2})?\.\s*/;

/**
 * Extract a date range that lies within a single month.
 *
 * Two forms are recognised:
 *  - with a full month name: "15.-17. März", "15-17 March"
 *  - numeric, European style: "15.-17.3."
 *
 * Both dates are placed in the current year. Day numbers outside 1-31 are
 * rejected rather than passed to `Date`, which would silently roll them over
 * into the following month.
 *
 * @param text - Input to search.
 * @param locale - Locale whose month names are used for the first form.
 * @returns The range and the text without the matched expression, or
 *   `value: undefined` and the unchanged text when no range was found.
 */
export function extractDateRange(
  text: string,
  locale: ParserLocale = 'de'
): ExtractResult<DateRange> {
  const isDayOfMonth = (day: number): boolean => day >= 1 && day <= 31;

  // "15.-17. März" / "15-17 March"
  const months = MONTH_NAMES[locale];
  const monthPattern = months.join('|');
  const withMonthName = new RegExp(
    `\\b(\\d{1,2})\\.?\\s*[-]\\s*(\\d{1,2})\\.?\\s+(${monthPattern})\\b`,
    'iu'
  );
  const named = text.match(withMonthName);
  if (named) {
    const startDay = parseInt(named[1], 10);
    const endDay = parseInt(named[2], 10);
    const monthStr = named[3].toLowerCase();
    const monthIndex = months.findIndex((m) => m.toLowerCase() === monthStr);
    if (monthIndex >= 0 && isDayOfMonth(startDay) && isDayOfMonth(endDay)) {
      const year = new Date().getFullYear();
      return {
        value: {
          start: new Date(year, monthIndex, startDay),
          end: new Date(year, monthIndex, endDay),
        },
        remaining: text.replace(withMonthName, '').trim(),
      };
    }
  }

  // "15.-17.3." (EU numeric format)
  const numeric = text.match(EU_DATE_RANGE_PATTERN);
  if (numeric && numeric[3]) {
    const startDay = parseInt(numeric[1], 10);
    const endDay = parseInt(numeric[2], 10);
    const month = parseInt(numeric[3], 10) - 1;
    if (isDayOfMonth(startDay) && isDayOfMonth(endDay) && month >= 0 && month <= 11) {
      const year = new Date().getFullYear();
      return {
        value: {
          start: new Date(year, month, startDay),
          end: new Date(year, month, endDay),
        },
        remaining: text.replace(EU_DATE_RANGE_PATTERN, '').trim(),
      };
    }
  }

  return { value: undefined, remaining: text };
}
// ============================================================================
// Relative Time Extraction ("in 2 Stunden", "in 30 Minuten")
// ============================================================================
/**
* A relative-time expression paired with the function that resolves it.
*/
interface RelativeTimePattern {
/** Regex that recognises the expression; may capture the amount in group 1. */
pattern: RegExp;
/** Computes the resulting moment from the regex match, relative to the time of the call. */
getDate: (match: RegExpMatchArray) => Date;
}
// Relative-time expressions per locale, each list in the same order:
// "in half an hour", "in N hours", "in N minutes".
// The half-hour entry has no capture group; the other two capture N in group 1.
const RELATIVE_TIME_PATTERNS: Record<ParserLocale, RelativeTimePattern[]> = {
de: [
{ pattern: /\bin\s+einer?\s+halben\s+stunde\b/i, getDate: () => addMinutes(new Date(), 30) },
{
pattern: /\bin\s+(\d+)\s+stunde[n]?\b/i,
getDate: (m) => addHours(new Date(), parseInt(m[1])),
},
{
pattern: /\bin\s+(\d+)\s+minute[n]?\b/i,
getDate: (m) => addMinutes(new Date(), parseInt(m[1])),
},
],
en: [
{ pattern: /\bin\s+half\s+an?\s+hour\b/i, getDate: () => addMinutes(new Date(), 30) },
{
pattern: /\bin\s+(\d+)\s+hours?\b/i,
getDate: (m) => addHours(new Date(), parseInt(m[1])),
},
{
pattern: /\bin\s+(\d+)\s+minutes?\b/i,
getDate: (m) => addMinutes(new Date(), parseInt(m[1])),
},
],
fr: [
{ pattern: /\bdans\s+une?\s+demi[e]?\s+heure\b/i, getDate: () => addMinutes(new Date(), 30) },
{
pattern: /\bdans\s+(\d+)\s+heures?\b/i,
getDate: (m) => addHours(new Date(), parseInt(m[1])),
},
{
pattern: /\bdans\s+(\d+)\s+minutes?\b/i,
getDate: (m) => addMinutes(new Date(), parseInt(m[1])),
},
],
es: [
{ pattern: /\ben\s+media\s+hora\b/i, getDate: () => addMinutes(new Date(), 30) },
{
pattern: /\ben\s+(\d+)\s+horas?\b/i,
getDate: (m) => addHours(new Date(), parseInt(m[1])),
},
{
pattern: /\ben\s+(\d+)\s+minutos?\b/i,
getDate: (m) => addMinutes(new Date(), parseInt(m[1])),
},
],
it: [
{ pattern: /\btra\s+mezz'?ora\b/i, getDate: () => addMinutes(new Date(), 30) },
{
pattern: /\btra\s+(\d+)\s+or[ea]\b/i,
getDate: (m) => addHours(new Date(), parseInt(m[1])),
},
{
pattern: /\btra\s+(\d+)\s+minut[io]\b/i,
getDate: (m) => addMinutes(new Date(), parseInt(m[1])),
},
],
};
/**
 * Extract a relative time expression such as "in 2 hours" or "in 30 minutes".
 *
 * The result is a complete Date because an offset from now fixes both the
 * day and the time of day.
 *
 * @param text - Input to search.
 * @param locale - Locale whose expressions are tried, in table order.
 * @returns The resolved moment and the text without the expression, or
 *   `value: undefined` and the unchanged text when nothing matched.
 */
export function extractRelativeTime(
  text: string,
  locale: ParserLocale = 'de'
): ExtractResult<Date> {
  for (const candidate of RELATIVE_TIME_PATTERNS[locale]) {
    const hit = text.match(candidate.pattern);
    if (hit === null) continue;

    const value = candidate.getDate(hit);
    const remaining = text.replace(candidate.pattern, '').trim();
    return { value, remaining };
  }
  return { value: undefined, remaining: text };
}
// ============================================================================
// Time Extraction
// ============================================================================
/**
 * Extract a clock time from free text.
 *
 * Candidates are examined from left to right and the first one that forms a
 * valid time (hours 0-23, minutes 0-59) is returned. A candidate that is out
 * of range, such as the room number in "Raum 42 um 14:00", is skipped rather
 * than ending the search.
 *
 * For English, a match containing "pm" is shifted into the afternoon and
 * "12 am" becomes hour 0.
 *
 * @param text - Input to search.
 * @param locale - Locale whose time pattern is used.
 * @returns The time and the text without the matched expression, or
 *   `value: undefined` and the unchanged text when no valid time was found.
 */
export function extractTime(
  text: string,
  locale: ParserLocale = 'de'
): ExtractResult<{ hours: number; minutes: number }> {
  const timePattern = TIME_PATTERNS[locale];
  // A global copy of the pattern lets exec() walk through all candidates
  // without touching the shared pattern's lastIndex.
  const scanner = new RegExp(
    timePattern.source,
    timePattern.flags.includes('g') ? timePattern.flags : `${timePattern.flags}g`
  );

  let match: RegExpExecArray | null;
  while ((match = scanner.exec(text)) !== null) {
    if (match[0].length === 0) {
      // Defensive: never loop forever on an empty match.
      scanner.lastIndex += 1;
      continue;
    }

    let hours = parseInt(match[1], 10);
    const minutes = match[2] ? parseInt(match[2], 10) : 0;

    // Handle AM/PM for English
    if (locale === 'en') {
      const fullMatch = match[0].toLowerCase();
      if (fullMatch.includes('pm') && hours < 12) hours += 12;
      if (fullMatch.includes('am') && hours === 12) hours = 0;
    }

    if (hours >= 0 && hours <= 23 && minutes >= 0 && minutes <= 59) {
      // Remove exactly the accepted candidate, which need not be the first match.
      const before = text.slice(0, match.index);
      const after = text.slice(match.index + match[0].length);
      return { value: { hours, minutes }, remaining: (before + after).trim() };
    }
  }

  return { value: undefined, remaining: text };
}
// ============================================================================
// Timezone Extraction
// ============================================================================
// Upper-case timezone abbreviations and the IANA zone each one is resolved to.
const TIMEZONE_MAP: Record<string, string> = {
  // European
  CET: 'Europe/Berlin',
  CEST: 'Europe/Berlin',
  MET: 'Europe/Berlin',
  MEST: 'Europe/Berlin',
  WET: 'Europe/London',
  WEST: 'Europe/London',
  EET: 'Europe/Athens',
  EEST: 'Europe/Athens',
  GMT: 'Europe/London',
  // US
  EST: 'America/New_York',
  EDT: 'America/New_York',
  CST: 'America/Chicago',
  CDT: 'America/Chicago',
  MST: 'America/Denver',
  MDT: 'America/Denver',
  PST: 'America/Los_Angeles',
  PDT: 'America/Los_Angeles',
  // Asia/Pacific
  JST: 'Asia/Tokyo',
  KST: 'Asia/Seoul',
  IST: 'Asia/Kolkata',
  AEST: 'Australia/Sydney',
  // Universal
  UTC: 'UTC',
};
const TIMEZONE_ABBREVS = Object.keys(TIMEZONE_MAP).join('|');
// Case-sensitive on purpose of construction: there is no `i` flag, so only
// upper-case abbreviations written as whole words match.
const TIMEZONE_PATTERN = new RegExp(`\\b(${TIMEZONE_ABBREVS})\\b`);

/**
 * Extract the first timezone abbreviation from text.
 *
 * @param text - Input to search.
 * @returns The IANA identifier for the abbreviation and the text without it,
 *   or `value: undefined` and the unchanged text when none was found.
 */
export function extractTimezone(text: string): ExtractResult<string> {
  const hit = TIMEZONE_PATTERN.exec(text);
  const zone = hit ? TIMEZONE_MAP[hit[1].toUpperCase()] : undefined;
  if (!zone) {
    return { value: undefined, remaining: text };
  }
  return { value: zone, remaining: text.replace(TIMEZONE_PATTERN, '').trim() };
}
// ============================================================================
// Tag Extraction
// ============================================================================
/**
 * Collect every `#tag` in the text.
 *
 * A tag is `#` followed by one or more non-whitespace characters; the `#` is
 * not part of the returned name.
 *
 * @param text - Input to search.
 * @returns All tag names in order of appearance (an empty array when there
 *   are none) and the trimmed text with the tags removed.
 */
export function extractTags(text: string): ExtractResult<string[]> {
  const names: string[] = [];
  // One pass: record each tag name while blanking it out of the text.
  const stripped = text.replace(/#(\S+)/g, (_whole: string, name: string) => {
    names.push(name);
    return '';
  });
  return { value: names, remaining: stripped.trim() };
}
// ============================================================================
// @ Reference Extraction (Projects, Calendars, Companies)
// ============================================================================
/**
 * Extract the first `@reference` from the text.
 *
 * A reference is `@` followed by one or more non-whitespace characters; the
 * `@` is not part of the returned value. Later references are left in place.
 *
 * @param text - Input to search.
 * @returns The reference name and the trimmed text without it, or
 *   `value: undefined` and the unchanged text when there is none.
 */
export function extractAtReference(text: string): ExtractResult<string> {
  const hit = /@(\S+)/.exec(text);
  if (hit === null) {
    return { value: undefined, remaining: text };
  }
  return { value: hit[1], remaining: text.replace(/@\S+/, '').trim() };
}
/**
 * Extract every `@reference` from the text.
 *
 * @param text - Input to search.
 * @returns All reference names in order of appearance and the trimmed text
 *   with them removed. `value` is `undefined`, not an empty array, when the
 *   text contains no reference.
 */
export function extractAtReferences(text: string): ExtractResult<string[]> {
  const found: string[] = [];
  // One pass: record each reference name while blanking it out of the text.
  const stripped = text.replace(/@(\S+)/g, (_whole: string, name: string) => {
    found.push(name);
    return '';
  });
  return { value: found.length === 0 ? undefined : found, remaining: stripped.trim() };
}
// ============================================================================
// Combined Date + Time
// ============================================================================
/**
 * Merge a calendar date and a clock time into one Date.
 *
 * @param date - The day to use; without it there is nothing to combine.
 * @param time - Hours and minutes to apply to `date`.
 * @returns `undefined` when `date` is missing, `date` itself when `time` is
 *   missing, otherwise the result of applying the minutes and then the hours
 *   with date-fns.
 */
export function combineDateAndTime(
  date?: Date,
  time?: { hours: number; minutes: number }
): Date | undefined {
  if (!date) return undefined;
  if (!time) return date;

  const withMinutes = setMinutes(date, time.minutes);
  return setHours(withMinutes, time.hours);
}
// ============================================================================
// Preview Formatting
// ============================================================================
/**
 * Render a date for the preview line.
 *
 * @param date - Date to display.
 * @param locale - Locale that selects the labels and the formatting locale.
 * @returns The locale's word for "today" or "tomorrow" when the date falls on
 *   one of those days, otherwise short weekday, day number and short month
 *   formatted by `toLocaleDateString`.
 */
export function formatDatePreview(date: Date, locale: ParserLocale = 'de'): string {
  const labels = PREVIEW_WORDS[locale];
  const today = new Date();
  // toDateString() drops the time of day, so this compares calendar days.
  const targetDay = date.toDateString();

  if (targetDay === today.toDateString()) return labels.today;
  if (targetDay === addDays(today, 1).toDateString()) return labels.tomorrow;

  return date.toLocaleDateString(labels.locale, {
    weekday: 'short',
    day: 'numeric',
    month: 'short',
  });
}
/**
 * Render a time as zero-padded `HH:MM`.
 */
export function formatTimePreview(time: { hours: number; minutes: number }): string {
  const twoDigits = (value: number): string => String(value).padStart(2, '0');
  return [time.hours, time.minutes].map((part) => twoDigits(part)).join(':');
}
/**
 * Render a date, optionally followed by a time, for the preview line.
 *
 * @param date - Date to display; without it the result is an empty string,
 *   even when a time is given.
 * @param time - Optional time appended after a single space as `HH:MM`.
 * @param locale - Locale used for the date part.
 */
export function formatDateTimePreview(
  date?: Date,
  time?: { hours: number; minutes: number },
  locale: ParserLocale = 'de'
): string {
  if (!date) return '';

  const datePart = formatDatePreview(date, locale);
  return time ? `${datePart} ${formatTimePreview(time)}` : datePart;
}
// ============================================================================
// Recurrence Extraction
// ============================================================================
/**
* A recurrence phrase paired with the recurrence rule it stands for.
*/
interface RecurrencePattern {
/** Regex that recognises the phrase; may capture an interval in group 1. */
pattern: RegExp;
/** Rule text such as `FREQ=DAILY`; a literal `$1` is a placeholder for capture group 1. */
rrule: string;
}
// Recurrence phrases per locale. Entries are tried in list order and the
// first match is used. Coverage differs between locales: only de and en have
// "every N days/weeks" entries, and the per-weekday entries are complete only
// for de and en.
const RECURRENCE_PATTERNS: Record<ParserLocale, RecurrencePattern[]> = {
de: [
{ pattern: /\bjeden\s+tag\b/i, rrule: 'FREQ=DAILY' },
{ pattern: /\btäglich\b/i, rrule: 'FREQ=DAILY' },
{ pattern: /\bjede\s+woche\b/i, rrule: 'FREQ=WEEKLY' },
{ pattern: /\bwöchentlich\b/i, rrule: 'FREQ=WEEKLY' },
{ pattern: /\bjeden\s+monat\b/i, rrule: 'FREQ=MONTHLY' },
{ pattern: /\bmonatlich\b/i, rrule: 'FREQ=MONTHLY' },
{ pattern: /\bjedes\s+jahr\b/i, rrule: 'FREQ=YEARLY' },
{ pattern: /\bjährlich\b/i, rrule: 'FREQ=YEARLY' },
{ pattern: /\bjeden\s+montag\b/i, rrule: 'FREQ=WEEKLY;BYDAY=MO' },
{ pattern: /\bjeden\s+dienstag\b/i, rrule: 'FREQ=WEEKLY;BYDAY=TU' },
{ pattern: /\bjeden\s+mittwoch\b/i, rrule: 'FREQ=WEEKLY;BYDAY=WE' },
{ pattern: /\bjeden\s+donnerstag\b/i, rrule: 'FREQ=WEEKLY;BYDAY=TH' },
{ pattern: /\bjeden\s+freitag\b/i, rrule: 'FREQ=WEEKLY;BYDAY=FR' },
{ pattern: /\bjeden\s+samstag\b/i, rrule: 'FREQ=WEEKLY;BYDAY=SA' },
{ pattern: /\bjeden\s+sonntag\b/i, rrule: 'FREQ=WEEKLY;BYDAY=SU' },
// "$1" is filled with the captured number.
{ pattern: /\balle\s+(\d+)\s+tage\b/i, rrule: 'FREQ=DAILY;INTERVAL=$1' },
{ pattern: /\balle\s+(\d+)\s+wochen\b/i, rrule: 'FREQ=WEEKLY;INTERVAL=$1' },
],
en: [
{ pattern: /\bevery\s+day\b/i, rrule: 'FREQ=DAILY' },
{ pattern: /\bdaily\b/i, rrule: 'FREQ=DAILY' },
{ pattern: /\bevery\s+week\b/i, rrule: 'FREQ=WEEKLY' },
{ pattern: /\bweekly\b/i, rrule: 'FREQ=WEEKLY' },
{ pattern: /\bevery\s+month\b/i, rrule: 'FREQ=MONTHLY' },
{ pattern: /\bmonthly\b/i, rrule: 'FREQ=MONTHLY' },
{ pattern: /\bevery\s+year\b/i, rrule: 'FREQ=YEARLY' },
{ pattern: /\byearly\b/i, rrule: 'FREQ=YEARLY' },
{ pattern: /\bevery\s+monday\b/i, rrule: 'FREQ=WEEKLY;BYDAY=MO' },
{ pattern: /\bevery\s+tuesday\b/i, rrule: 'FREQ=WEEKLY;BYDAY=TU' },
{ pattern: /\bevery\s+wednesday\b/i, rrule: 'FREQ=WEEKLY;BYDAY=WE' },
{ pattern: /\bevery\s+thursday\b/i, rrule: 'FREQ=WEEKLY;BYDAY=TH' },
{ pattern: /\bevery\s+friday\b/i, rrule: 'FREQ=WEEKLY;BYDAY=FR' },
{ pattern: /\bevery\s+saturday\b/i, rrule: 'FREQ=WEEKLY;BYDAY=SA' },
{ pattern: /\bevery\s+sunday\b/i, rrule: 'FREQ=WEEKLY;BYDAY=SU' },
// "$1" is filled with the captured number.
{ pattern: /\bevery\s+(\d+)\s+days\b/i, rrule: 'FREQ=DAILY;INTERVAL=$1' },
{ pattern: /\bevery\s+(\d+)\s+weeks\b/i, rrule: 'FREQ=WEEKLY;INTERVAL=$1' },
],
fr: [
{ pattern: /\btous\s+les\s+jours\b/i, rrule: 'FREQ=DAILY' },
{ pattern: /\bquotidien\b/i, rrule: 'FREQ=DAILY' },
{ pattern: /\bchaque\s+semaine\b/i, rrule: 'FREQ=WEEKLY' },
{ pattern: /\bhebdomadaire\b/i, rrule: 'FREQ=WEEKLY' },
{ pattern: /\bchaque\s+mois\b/i, rrule: 'FREQ=MONTHLY' },
{ pattern: /\bmensuel\b/i, rrule: 'FREQ=MONTHLY' },
{ pattern: /\bchaque\s+année\b/i, rrule: 'FREQ=YEARLY' },
{ pattern: /\bannuel\b/i, rrule: 'FREQ=YEARLY' },
{ pattern: /\bchaque\s+lundi\b/i, rrule: 'FREQ=WEEKLY;BYDAY=MO' },
{ pattern: /\bchaque\s+mardi\b/i, rrule: 'FREQ=WEEKLY;BYDAY=TU' },
{ pattern: /\bchaque\s+mercredi\b/i, rrule: 'FREQ=WEEKLY;BYDAY=WE' },
{ pattern: /\bchaque\s+jeudi\b/i, rrule: 'FREQ=WEEKLY;BYDAY=TH' },
{ pattern: /\bchaque\s+vendredi\b/i, rrule: 'FREQ=WEEKLY;BYDAY=FR' },
],
es: [
{ pattern: /\btodos\s+los\s+d[ií]as\b/i, rrule: 'FREQ=DAILY' },
{ pattern: /\bdiario\b/i, rrule: 'FREQ=DAILY' },
{ pattern: /\bcada\s+semana\b/i, rrule: 'FREQ=WEEKLY' },
{ pattern: /\bsemanal\b/i, rrule: 'FREQ=WEEKLY' },
{ pattern: /\bcada\s+mes\b/i, rrule: 'FREQ=MONTHLY' },
{ pattern: /\bmensual\b/i, rrule: 'FREQ=MONTHLY' },
{ pattern: /\bcada\s+año\b/i, rrule: 'FREQ=YEARLY' },
{ pattern: /\banual\b/i, rrule: 'FREQ=YEARLY' },
{ pattern: /\bcada\s+lunes\b/i, rrule: 'FREQ=WEEKLY;BYDAY=MO' },
],
it: [
{ pattern: /\bogni\s+giorno\b/i, rrule: 'FREQ=DAILY' },
{ pattern: /\bgiornaliero\b/i, rrule: 'FREQ=DAILY' },
{ pattern: /\bogni\s+settimana\b/i, rrule: 'FREQ=WEEKLY' },
{ pattern: /\bsettimanale\b/i, rrule: 'FREQ=WEEKLY' },
{ pattern: /\bogni\s+mese\b/i, rrule: 'FREQ=MONTHLY' },
{ pattern: /\bmensile\b/i, rrule: 'FREQ=MONTHLY' },
{ pattern: /\bogni\s+anno\b/i, rrule: 'FREQ=YEARLY' },
{ pattern: /\bannuale\b/i, rrule: 'FREQ=YEARLY' },
],
};
/**
 * Extract a recurrence phrase and translate it into an RFC 5545 RRULE string.
 *
 * @param text - Input to search.
 * @param locale - Locale whose phrases are tried, in table order.
 * @returns The rule (with `$1` replaced by the captured interval, when the
 *   phrase has one) and the text without the phrase, or `value: undefined`
 *   and the unchanged text when nothing matched.
 */
export function extractRecurrence(
  text: string,
  locale: ParserLocale = 'de'
): ExtractResult<string> {
  for (const entry of RECURRENCE_PATTERNS[locale]) {
    const hit = text.match(entry.pattern);
    if (!hit) continue;

    const interval = hit[1];
    const value = interval ? entry.rrule.replace('$1', interval) : entry.rrule;
    return { value, remaining: text.replace(entry.pattern, '').trim() };
  }
  return { value: undefined, remaining: text };
}
// ============================================================================
// Main Parser Function
// ============================================================================
/**
 * Parse base input - extracts the patterns shared by all apps (tags, date, time).
 *
 * Extraction order is tags, then date, then time; each step works on the text
 * left over by the previous one. Whatever remains, with whitespace collapsed,
 * becomes the title.
 *
 * App-specific parsers should call this first, then extract their own patterns.
 *
 * @param input - Raw user input
 * @param locale - Locale passed to the date and time extractors (defaults to 'de')
 * @returns The parsed result:
 *  - `rawInput` is the trimmed input
 *  - `date` falls back to the current date when a time was found without a date
 *  - `confidence` is 0.5 when nothing was extracted and the title is just the
 *    (whitespace-normalized) input, 0.8 when something was extracted but no
 *    title text is left, and 1.0 otherwise
 */
export function parseBaseInput(input: string, locale: ParserLocale = 'de'): BaseParsedInput {
	const rawInput = input.trim();

	// Extract tags first (they're clearly delimited)
	const tagsResult = extractTags(rawInput);
	const tagNames = tagsResult.value || [];

	// Extract date from what is left after the tags
	const dateResult = extractDate(tagsResult.remaining, locale);
	const date = dateResult.value;

	// Extract time from what is left after the date
	const timeResult = extractTime(dateResult.remaining, locale);
	const time = timeResult.value;

	// If we got time but no date, assume today
	const finalDate = time && !date ? new Date() : date;

	// Clean up multiple spaces
	const title = timeResult.remaining.replace(/\s+/g, ' ').trim();

	// Calculate confidence: how much was extracted vs raw input
	const hasExtractions = !!(finalDate || time || tagNames.length > 0);
	let confidence = 1.0;
	if (hasExtractions) {
		// Check if the remaining title still looks clean
		confidence = title.length > 0 ? 1.0 : 0.8;
	} else if (title === rawInput.replace(/\s+/g, ' ')) {
		// Nothing was extracted - ambiguous. The raw input is whitespace-normalized
		// before comparing so that repeated spaces alone do not raise confidence.
		confidence = 0.5;
	}

	return {
		title,
		date: finalDate,
		time,
		tagNames,
		rawInput,
		confidence,
	};
}
// ============================================================================
// Utility: Clean title from all patterns
// ============================================================================
/**
 * Strip every recognized pattern from `text` and return what is left as a
 * clean title.
 *
 * Removal order: #tags, @references, "in X days" phrases, specific dates,
 * relative date patterns, time. Whitespace is collapsed and trimmed at the end.
 *
 * @param text - Text to clean
 * @param locale - Locale whose date/time patterns are applied (defaults to 'de')
 * @returns The text without any of the recognized patterns
 */
export function cleanTitle(text: string, locale: ParserLocale = 'de'): string {
	// Tags and @references are locale-independent
	let cleaned = text.replace(/#\S+/g, '').replace(/@\S+/g, '');

	// "in X days"
	cleaned = cleaned.replace(IN_DAYS_PATTERNS[locale], '');

	// Specific dates
	cleaned = cleaned.replace(getSpecificDatePattern(locale).pattern, '');

	// Relative date patterns
	for (const entry of getDatePatterns(locale)) {
		cleaned = cleaned.replace(entry.pattern, '');
	}

	// Time
	cleaned = cleaned.replace(TIME_PATTERNS[locale], '');

	return cleaned.replace(/\s+/g, ' ').trim();
}
// ============================================================================
// Parser Compose Helper
// ============================================================================
/**
* A single custom extraction step for the compose helper (`createAppParser`).
*
* Steps are applied one after another: each step receives the text left over
* by the previous step and hands back whatever it did not consume.
*/
export interface ExtractionStep<T> {
/** Name of this extraction; the extracted value is stored under this key in the result */
name: string;
/**
* Extract function: takes the current text and returns the extracted value
* (`undefined` when nothing was found) together with the remaining text
* that is passed on to the next step.
*/
extract: (text: string) => { value: T | undefined; remaining: string };
}
/**
 * Build an app-specific parser out of a list of custom extraction steps.
 *
 * The returned `parse` function trims the input, runs the custom steps in the
 * given order (each one sees the text left over by the previous step), and
 * then hands the remaining text to the base parser (`parseBaseInput`) using
 * the configured locale. Every step's value is stored under its `name`, even
 * when it is `undefined`.
 *
 * Note: the collected values are cast to `T` without a runtime check, so `T`
 * must match the names and value types of the supplied steps.
 *
 * @param locale - Locale forwarded to the base parser
 * @param steps - Custom extraction steps, applied in array order
 * @returns An object exposing `parse(input)`
 *
 * @example
 * ```ts
 * const { parse } = createAppParser('de', [
 *   { name: 'priority', extract: extractPriority },
 *   { name: 'project', extract: (t) => extractAtReference(t) },
 * ]);
 * const result = parse('Task morgen @Arbeit !!!');
 * // result.base = { title, date, time, tagNames, ... }
 * // result.extractions = { priority: ..., project: ... }
 * ```
 */
export function createAppParser<T extends Record<string, unknown>>(
	locale: ParserLocale,
	steps: ExtractionStep<unknown>[]
): {
	parse: (input: string) => { base: BaseParsedInput; extractions: T };
} {
	const parse = (input: string): { base: BaseParsedInput; extractions: T } => {
		const collected: Record<string, unknown> = {};

		// Custom steps go first; each one consumes part of the text
		const leftover = steps.reduce((pending, step) => {
			const outcome = step.extract(pending);
			collected[step.name] = outcome.value;
			return outcome.remaining;
		}, input.trim());

		// The base parser only sees what the custom steps left behind
		return {
			base: parseBaseInput(leftover, locale),
			extractions: collected as T,
		};
	};

	return { parse };
}