Fix date filter usage with multi-word queries. Simplify date regex.

This commit is contained in:
Debanjum Singh Solanky
2022-07-14 21:32:58 +04:00
parent 67e9366c0f
commit 9de2097182
2 changed files with 23 additions and 24 deletions

View File

@@ -11,11 +11,10 @@ import dateparser as dtparse
# Date Range Filter Regexes # Date Range Filter Regexes
# Example filter queries: # Example filter queries:
# - dt>=yesterday dt<"tomorrow" # - dt>="yesterday" dt<"tomorrow"
# - dt>="last week" # - dt>="last week"
# - dt:"next year" # - dt:"2 years ago"
date_regex = r'(?:dt([:><=]{1,2})\"?([\w\-\/]+))?\"?' date_regex = r"dt([:><=]{1,2})\"(.*?)\""
date_range_regex=f'.*?\s+{date_regex}\s*{date_regex}.*?'
def date_filter(query, entries, embeddings): def date_filter(query, entries, embeddings):
@@ -54,20 +53,17 @@ def date_filter(query, entries, embeddings):
def extract_date_range(query): def extract_date_range(query):
# find date range filter in query # find date range filter in query
date_range_match = re.search(date_range_regex, query) date_range_matches = re.findall(date_regex, query)
if not date_range_match or date_range_match.groups() == (None, None, None, None):
return None
# extract comparators (e.g >,<,=) applied on dates in date filter if len(date_range_matches) == 0:
date_comparators = [date_cmp for date_cmp in date_range_match.groups()[0::2] if date_cmp] return None
# extract, parse natural dates ranges from date range filter passed in query # extract, parse natural dates ranges from date range filter passed in query
# e.g today maps to (start_of_day, start_of_tomorrow) # e.g today maps to (start_of_day, start_of_tomorrow)
query_dtranges = [] for index, (cmp, date_str) in enumerate(date_range_matches):
for date_str in date_range_match.groups()[1::2]: if parse(date_str):
if date_str and parse(date_str):
dt_start, dt_end = parse(date_str) dt_start, dt_end = parse(date_str)
query_dtranges.append((dt_start.timestamp(), dt_end.timestamp())) date_range_matches[index] = [cmp, (dt_start.timestamp(), dt_end.timestamp())]
# Combine dates with their comparators to form date range intervals # Combine dates with their comparators to form date range intervals
# For e.g # For e.g
@@ -76,7 +72,7 @@ def extract_date_range(query):
# --- # ---
effective_date_range = [0, inf] effective_date_range = [0, inf]
date_range_considering_comparator = [] date_range_considering_comparator = []
for ((dtrange_start, dtrange_end), cmp) in zip(query_dtranges, date_comparators): for cmp, (dtrange_start, dtrange_end) in date_range_matches:
if cmp == '>': if cmp == '>':
date_range_considering_comparator += [[dtrange_end, inf]] date_range_considering_comparator += [[dtrange_end, inf]]
elif cmp == '>=': elif cmp == '>=':

View File

@@ -94,17 +94,20 @@ def test_parse():
def test_date_filter_regex(): def test_date_filter_regex():
dtrange_match = re.search(date_filter.date_range_regex, 'head dt>"today" dt:"1984-01-01" tail') dtrange_match = re.findall(date_filter.date_regex, 'multi word head dt>"today" dt:"1984-01-01"')
assert dtrange_match.groups() == ('>', 'today', ':', '1984-01-01') assert dtrange_match == [('>', 'today'), (':', '1984-01-01')]
dtrange_match = re.search(date_filter.date_range_regex, 'head dt>="today" dt="1984-01-01"') dtrange_match = re.findall(date_filter.date_regex, 'head dt>"today" dt:"1984-01-01" multi word tail')
assert dtrange_match.groups() == ('>=', 'today', '=', '1984-01-01') assert dtrange_match == [('>', 'today'), (':', '1984-01-01')]
dtrange_match = re.search(date_filter.date_range_regex, 'head dt<"today" tail') dtrange_match = re.findall(date_filter.date_regex, 'multi word head dt>="today" dt="1984-01-01"')
assert dtrange_match.groups() == ('<', 'today', None, None) assert dtrange_match == [('>=', 'today'), ('=', '1984-01-01')]
dtrange_match = re.search(date_filter.date_range_regex, 'head dt<="today"') dtrange_match = re.findall(date_filter.date_regex, 'dt<"multi word date" multi word tail')
assert dtrange_match.groups() == ('<=', 'today', None, None) assert dtrange_match == [('<', 'multi word date')]
dtrange_match = re.search(date_filter.date_range_regex, 'head tail') dtrange_match = re.findall(date_filter.date_regex, 'head dt<="multi word date"')
assert dtrange_match.groups() == (None, None, None, None) assert dtrange_match == [('<=', 'multi word date')]
dtrange_match = re.findall(date_filter.date_regex, 'head tail')
assert dtrange_match == []