Fix date_filter to handle non-overlapping date ranges

This commit is contained in:
Debanjum Singh Solanky
2022-07-14 18:53:38 +04:00
parent 70ac35b2a5
commit 011f81fac5
2 changed files with 41 additions and 25 deletions

View File

@@ -19,23 +19,28 @@ date_range_regex=f'.*?\s+{date_regex}\s*{date_regex}.*?'
def date_filter(query, entries, embeddings): def date_filter(query, entries, embeddings):
# extract date from query "Find entries containing any dates that fall within date range specified in query"
date_regex = r'\d{4}-\d{2}-\d{2}' # extract date range specified in date filter of query
dates_in_query = re.findall(date_regex, query) query_daterange = extract_date_range(query)
# if no date in query, return all entries # if no date in query, return all entries
if dates_in_query is None or len(dates_in_query) == 0: if query_daterange is None:
return query, entries, embeddings return query, entries, embeddings
# remove dates from query # remove date range filter from query
query = re.sub(date_regex, '', query) query = re.sub(date_range_regex, '', query)
# find entries with dates from query in them # find entries containing any dates that fall within date range specified in query
entries_to_include = set() entries_to_include = set()
for id, entry in enumerate(entries): for id, entry in enumerate(entries):
for date in dates_in_query: # Extract dates from entry
if date in entry[1]: for date_in_entry_string in re.findall(r'\d{4}-\d{2}-\d{2}', entry[1]):
# Convert date string in entry to unix timestamp
date_in_entry = datetime.strptime(date_in_entry_string, '%Y-%m-%d').timestamp()
# Check if date in entry is within date range specified in query
if query_daterange[0] <= date_in_entry <= query_daterange[1]:
entries_to_include.add(id) entries_to_include.add(id)
break
# delete entries (and their embeddings) marked for exclusion # delete entries (and their embeddings) marked for exclusion
entries_to_exclude = set(range(len(entries))) - entries_to_include entries_to_exclude = set(range(len(entries))) - entries_to_include
@@ -52,6 +57,9 @@ def extract_date_range(query):
if not date_range_match or date_range_match.groups() == (None, None, None, None): if not date_range_match or date_range_match.groups() == (None, None, None, None):
return None return None
# extract comparators (e.g >,<,=) applied on dates in date filter
date_comparators = [date_cmp for date_cmp in date_range_match.groups()[0::2] if date_cmp]
# extract, parse natural date ranges from date range filter passed in query # extract, parse natural date ranges from date range filter passed in query
# e.g today maps to (start_of_day, start_of_tomorrow) # e.g today maps to (start_of_day, start_of_tomorrow)
query_dtranges = [] query_dtranges = []
@@ -60,30 +68,35 @@ def extract_date_range(query):
dt_start, dt_end = parse(date_str) dt_start, dt_end = parse(date_str)
query_dtranges.append((dt_start.timestamp(), dt_end.timestamp())) query_dtranges.append((dt_start.timestamp(), dt_end.timestamp()))
date_comparators = [date_cmp for date_cmp in date_range_match.groups()[0::2] if date_cmp]
# Combine dates with their comparators to form date range intervals # Combine dates with their comparators to form date range intervals
# For example # For example
# >=yesterday maps to [start_of_yesterday, inf) # >=yesterday maps to [start_of_yesterday, inf)
# <tomorrow maps to [0, start_of_tomorrow) # <tomorrow maps to [0, start_of_tomorrow)
# Then combine above intervals (via AND/intersect) # ---
effective_date_range = [0, inf]
date_range_considering_comparator = []
for ((dtrange_start, dtrange_end), cmp) in zip(query_dtranges, date_comparators):
if cmp == '>':
date_range_considering_comparator += [[dtrange_end, inf]]
elif cmp == '>=':
date_range_considering_comparator += [[dtrange_start, inf]]
elif cmp == '<':
date_range_considering_comparator += [[0, dtrange_start]]
elif cmp == '<=':
date_range_considering_comparator += [[0, dtrange_end]]
elif cmp == '=' or cmp == ':' or cmp == '==':
date_range_considering_comparator += [[dtrange_start, dtrange_end]]
# Combine above intervals (via AND/intersect)
# In the above example, this gives us [start_of_yesterday, start_of_tomorrow) # In the above example, this gives us [start_of_yesterday, start_of_tomorrow)
# This is the effective date range to filter entries by # This is the effective date range to filter entries by
# --- # ---
effective_date_range = [0, inf] for date_range in date_range_considering_comparator:
for ((dtrange_start, dtrange_end), cmp) in zip(query_dtranges, date_comparators): effective_date_range = [
if cmp == '>' and dtrange_end < effective_date_range[1]: max(effective_date_range[0], date_range[0]),
effective_date_range[0] = dtrange_end min(effective_date_range[1], date_range[1])]
elif cmp == '<' and dtrange_start > effective_date_range[0]:
effective_date_range[1] = dtrange_start
elif cmp == '>=' and dtrange_end < effective_date_range[1]:
effective_date_range[0] = dtrange_start
elif cmp == '<=' and dtrange_start > effective_date_range[0]:
effective_date_range[1] = dtrange_end
elif cmp == '=' or cmp == ':' or cmp == '==':
effective_date_range = [dtrange_start, dtrange_end]
if effective_date_range == [0, inf]: if effective_date_range == [0, inf] or effective_date_range[0] > effective_date_range[1]:
return None return None
else: else:
return effective_date_range return effective_date_range

View File

@@ -16,6 +16,9 @@ def test_extract_date_range():
# No date filter specified in query # No date filter specified in query
assert date_filter.extract_date_range('head tail') == None assert date_filter.extract_date_range('head tail') == None
# Non-intersecting date ranges
assert date_filter.extract_date_range('head dt>"2020-01-01" dt<"2020-01-01" tail') == None
def test_parse(): def test_parse():
test_now = datetime(1984, 4, 1, 21, 21, 21) test_now = datetime(1984, 4, 1, 21, 21, 21)