diff --git a/src/search_filter/date_filter.py b/src/search_filter/date_filter.py
index fc04722e..0930523b 100644
--- a/src/search_filter/date_filter.py
+++ b/src/search_filter/date_filter.py
@@ -1,8 +1,25 @@
 # Standard Packages
 import re
+from datetime import timedelta, datetime
+from dateutil.relativedelta import relativedelta, MO
+from math import inf
 
 # External Packages
 import torch
+import dateparser as dtparse
+
+
+# Date Range Filter Regexes
+# Example filter queries:
+# - dt>=yesterday dt<"tomorrow"
+# - dt>="last week"
+# - dt:"next year"
+# Each filter captures two groups: the comparison operator and the date term.
+# NOTE: the date term pattern excludes whitespace, so a quoted multi-word term
+# like "last week" is only captured up to its first space.
+date_regex = r'(?:dt([:><=]{1,2})\"?([\w\-\/]+))?\"?'
+# Raw f-string: keeps \s as a regex escape rather than an invalid string escape
+date_range_regex = rf'.*?\s+{date_regex}\s*{date_regex}.*?'
 
 
 def date_filter(query, entries, embeddings):
@@ -30,4 +47,56 @@ def date_filter(query, entries, embeddings):
         del entries[id]
         embeddings = torch.cat((embeddings[:id], embeddings[id+1:]))
 
-    return query, entries, embeddings
\ No newline at end of file
+    return query, entries, embeddings
+
+
+def parse(date_str, relative_base=None):
+    """Parse the date string from a query date filter into a date range.
+
+    Args:
+        date_str: Date term from the query, e.g. 'yesterday' or '2 weeks ago'.
+        relative_base: Datetime that relative terms are resolved against. Defaults to now.
+
+    Returns:
+        The (start, end) datetime tuple from date_to_daterange, or None if date_str cannot be parsed.
+    """
+    # clean date string to handle future date parsing by date parser
+    clean_date_str = re.sub(r'later|from now|from today', '', date_str)
+
+    # parse date passed in query date filter
+    parsed_date = dtparse.parse(
+        clean_date_str,
+        settings={
+            'RELATIVE_BASE': relative_base or datetime.now(),
+            'PREFER_DAY_OF_MONTH': 'first',
+            'PREFER_DATES_FROM': 'future'
+        })
+
+    if parsed_date is None:
+        return None
+
+    return date_to_daterange(parsed_date, date_str)
+
+
+def date_to_daterange(parsed_date, date_str):
+    """Convert parsed date to a date range at natural granularity (day, week, month or year).
+
+    The granularity is picked by looking for 'year', 'month' or 'week' in date_str,
+    falling back to a single day. Returns a (start, end) tuple of datetimes.
+    """
+    start_of_day = parsed_date.replace(hour=0, minute=0, second=0, microsecond=0)
+
+    if 'year' in date_str:
+        return (datetime(parsed_date.year, 1, 1, 0, 0, 0), datetime(parsed_date.year+1, 1, 1, 0, 0, 0))
+    if 'month' in date_str:
+        start_of_month = datetime(parsed_date.year, parsed_date.month, 1, 0, 0, 0)
+        next_month = start_of_month + relativedelta(months=1)
+        return (start_of_month, next_month)
+    if 'week' in date_str:
+        # if week in date string, dateparser parses it to next week start
+        # so today = end of this week
+        start_of_week = start_of_day - timedelta(days=7)
+        return (start_of_week, start_of_day)
+    else:
+        next_day = start_of_day + relativedelta(days=1)
+        return (start_of_day, next_day)
diff --git a/tests/test_date_filter.py b/tests/test_date_filter.py
new file mode 100644
index 00000000..0c30695c
--- /dev/null
+++ b/tests/test_date_filter.py
@@ -0,0 +1,47 @@
+# Standard Packages
+import re
+from datetime import timedelta, datetime
+
+# Application Packages
+from src.search_filter import date_filter
+
+
+def test_parse():
+    """Check date_filter.parse returns the expected (start, end) range for day, week, month and year terms."""
+    test_now = datetime(1984, 4, 1, 21, 21, 21)
+
+    # day variations
+    assert date_filter.parse('today', relative_base=test_now) == (datetime(1984, 4, 1, 0, 0, 0), datetime(1984, 4, 2, 0, 0, 0))
+    assert date_filter.parse('tomorrow', relative_base=test_now) == (datetime(1984, 4, 2, 0, 0, 0), datetime(1984, 4, 3, 0, 0, 0))
+    assert date_filter.parse('yesterday', relative_base=test_now) == (datetime(1984, 3, 31, 0, 0, 0), datetime(1984, 4, 1, 0, 0, 0))
+    assert date_filter.parse('5 days ago', relative_base=test_now) == (datetime(1984, 3, 27, 0, 0, 0), datetime(1984, 3, 28, 0, 0, 0))
+
+    # week variations
+    assert date_filter.parse('last week', relative_base=test_now) == (datetime(1984, 3, 18, 0, 0, 0), datetime(1984, 3, 25, 0, 0, 0))
+    assert date_filter.parse('2 weeks ago', relative_base=test_now) == (datetime(1984, 3, 11, 0, 0, 0), datetime(1984, 3, 18, 0, 0, 0))
+
+    # month variations
+    assert date_filter.parse('next month', relative_base=test_now) == (datetime(1984, 5, 1, 0, 0, 0), datetime(1984, 6, 1, 0, 0, 0))
+    assert date_filter.parse('2 months ago', relative_base=test_now) == (datetime(1984, 2, 1, 0, 0, 0), datetime(1984, 3, 1, 0, 0, 0))
+
+    # year variations
+    assert date_filter.parse('this year', relative_base=test_now) == (datetime(1984, 1, 1, 0, 0, 0), datetime(1985, 1, 1, 0, 0, 0))
+    assert date_filter.parse('20 years later', relative_base=test_now) == (datetime(2004, 1, 1, 0, 0, 0), datetime(2005, 1, 1, 0, 0, 0))
+
+
+def test_date_filter_regex():
+    """Check date_range_regex captures the operator and date term of up to two dt filters in a query."""
+    dtrange_match = re.search(date_filter.date_range_regex, 'head dt>"today" dt:"2020-01-01" tail')
+    assert dtrange_match.groups() == ('>', 'today', ':', '2020-01-01')
+
+    dtrange_match = re.search(date_filter.date_range_regex, 'head dt>="today" dt="2020-01-01"')
+    assert dtrange_match.groups() == ('>=', 'today', '=', '2020-01-01')
+
+    dtrange_match = re.search(date_filter.date_range_regex, 'head dt<"today" tail')
+    assert dtrange_match.groups() == ('<', 'today', None, None)
+
+    dtrange_match = re.search(date_filter.date_range_regex, 'head dt<="today"')
+    assert dtrange_match.groups() == ('<=', 'today', None, None)
+
+    dtrange_match = re.search(date_filter.date_range_regex, 'head tail')
+    assert dtrange_match.groups() == (None, None, None, None)