Add, test date filter regex and date parsing to get natural date range

This commit is contained in:
Debanjum Singh Solanky
2022-07-14 16:47:32 +04:00
parent b54588717f
commit 4a201d52af
2 changed files with 98 additions and 1 deletions

View File

@@ -1,8 +1,21 @@
# Standard Packages
import re
from datetime import timedelta, datetime
from dateutil.relativedelta import relativedelta, MO
from math import inf
# External Packages
import torch
import dateparser as dtparse
# Date Range Filter Regexes
# Example filter queries:
# - dt>=yesterday dt<"tomorrow"
# - dt>="last week"
# - dt:"next year"
#
# date_regex optionally matches one `dt<operator><value>` term and captures:
#   group 1: the operator, 1-2 characters out of `:`, `>`, `<`, `=`
#   group 2: the date value, made of word characters, `-` and `/`
# Double quotes around the value are optional. The whole term is optional,
# so both groups are None when no date filter is present.
# NOTE(review): the value character class has no whitespace, so a quoted
# multi-word value such as "last week" is only captured up to the first
# space - confirm whether that is intended.
date_regex = r'(?:dt([:><=]{1,2})\"?([\w\-\/]+))?\"?'

# date_range_regex matches up to two consecutive date filter terms that
# follow some leading text and whitespace, yielding four capture groups:
# (operator_1, value_1, operator_2, value_2).
# It has to be a raw f-string: in a non-raw literal `\s` is an invalid string
# escape, which Python warns about (SyntaxWarning from 3.12 onwards).
date_range_regex = rf'.*?\s+{date_regex}\s*{date_regex}.*?'
def date_filter(query, entries, embeddings):
@@ -30,4 +43,44 @@ def date_filter(query, entries, embeddings):
del entries[id]
embeddings = torch.cat((embeddings[:id], embeddings[id+1:]))
return query, entries, embeddings
return query, entries, embeddings
def parse(date_str, relative_base=None):
    """Turn the date string from a query's date filter into a date range.

    Args:
        date_str: date expression as written in the query date filter.
        relative_base: datetime that relative expressions are resolved
            against; the current time is used when it is not given.

    Returns:
        Whatever date_to_daterange returns for the parsed date, or None
        when the date parser cannot parse the string.
    """
    # Strip the future-pointing suffixes before handing the string to the
    # date parser; future dates are requested via the settings instead
    normalized_str = re.sub(r'later|from now|from today', '', date_str)

    parser_settings = {
        'RELATIVE_BASE': relative_base or datetime.now(),
        'PREFER_DAY_OF_MONTH': 'first',
        'PREFER_DATES_FROM': 'future',
    }
    resolved_date = dtparse.parse(normalized_str, settings=parser_settings)

    # The original (unstripped) string is what decides the range granularity
    return None if resolved_date is None else date_to_daterange(resolved_date, date_str)
def date_to_daterange(parsed_date, date_str):
    """Convert a parsed date to a date range at its natural granularity.

    The granularity is picked by looking for the words 'year', 'month' or
    'week' (in that order, ignoring case) in the date string; any other
    string is treated as referring to a single day.

    Args:
        parsed_date: datetime that the date string was parsed to.
        date_str: the date expression as written in the query date filter.

    Returns:
        A (start, end) tuple of datetimes:
        - year:  1st of January of the parsed year, 1st of January of the next
        - month: 1st of the parsed month, 1st of the following month
        - week:  midnight 7 days before the parsed date, midnight of the parsed date
        - day:   midnight of the parsed date, midnight of the following day
    """
    # Match granularity keywords case-insensitively, so that e.g. 'Next Year'
    # is not silently narrowed down to a single day
    granularity_hint = date_str.lower()
    start_of_day = parsed_date.replace(hour=0, minute=0, second=0, microsecond=0)

    if 'year' in granularity_hint:
        return (datetime(parsed_date.year, 1, 1, 0, 0, 0), datetime(parsed_date.year + 1, 1, 1, 0, 0, 0))

    if 'month' in granularity_hint:
        start_of_month = datetime(parsed_date.year, parsed_date.month, 1, 0, 0, 0)
        # 32 days past the 1st always lands in the following month,
        # snapping back to day 1 gives the start of that month
        next_month = (start_of_month + timedelta(days=32)).replace(day=1)
        return (start_of_month, next_month)

    if 'week' in granularity_hint:
        # if week in date string, dateparser parses it to next week start
        # so the parsed day = end of the requested week
        start_of_week = start_of_day - timedelta(days=7)
        return (start_of_week, start_of_day)

    next_day = start_of_day + timedelta(days=1)
    return (start_of_day, next_day)

44
tests/test_date_filter.py Normal file
View File

@@ -0,0 +1,44 @@
# Standard Packages
import re
from datetime import timedelta, datetime
# Application Packages
from src.search_filter import date_filter
def test_parse():
    """Check that date strings resolve to the expected (start, end) range."""
    test_now = datetime(1984, 4, 1, 21, 21, 21)

    # (date string, expected range start, expected range end)
    expectations = [
        # day variations
        ('today', datetime(1984, 4, 1), datetime(1984, 4, 2)),
        ('tomorrow', datetime(1984, 4, 2), datetime(1984, 4, 3)),
        ('yesterday', datetime(1984, 3, 31), datetime(1984, 4, 1)),
        ('5 days ago', datetime(1984, 3, 27), datetime(1984, 3, 28)),
        # week variations
        ('last week', datetime(1984, 3, 18), datetime(1984, 3, 25)),
        ('2 weeks ago', datetime(1984, 3, 11), datetime(1984, 3, 18)),
        # month variations
        ('next month', datetime(1984, 5, 1), datetime(1984, 6, 1)),
        ('2 months ago', datetime(1984, 2, 1), datetime(1984, 3, 1)),
        # year variations
        ('this year', datetime(1984, 1, 1), datetime(1985, 1, 1)),
        ('20 years later', datetime(2004, 1, 1), datetime(2005, 1, 1)),
    ]

    for date_str, range_start, range_end in expectations:
        assert date_filter.parse(date_str, relative_base=test_now) == (range_start, range_end)
def test_date_filter_regex():
    """Check the groups captured by the date range regex for sample queries."""
    # query -> expected (operator_1, value_1, operator_2, value_2)
    expected_groups_by_query = {
        'head dt>"today" dt:"2020-01-01" tail': ('>', 'today', ':', '2020-01-01'),
        'head dt>="today" dt="2020-01-01"': ('>=', 'today', '=', '2020-01-01'),
        'head dt<"today" tail': ('<', 'today', None, None),
        'head dt<="today"': ('<=', 'today', None, None),
        # no date filter in query: every group stays unmatched
        'head tail': (None, None, None, None),
    }

    for query, expected_groups in expected_groups_by_query.items():
        dtrange_match = re.search(date_filter.date_range_regex, query)
        assert dtrange_match.groups() == expected_groups