From 011f81fac55d08f9dff56ff2ce35ed00ad68e707 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 14 Jul 2022 18:53:38 +0400 Subject: [PATCH] Fix date_filter to handle non overlapping date ranges --- src/search_filter/date_filter.py | 63 +++++++++++++++++++------------- tests/test_date_filter.py | 3 ++ 2 files changed, 41 insertions(+), 25 deletions(-) diff --git a/src/search_filter/date_filter.py b/src/search_filter/date_filter.py index b1c2cadc..06cf8878 100644 --- a/src/search_filter/date_filter.py +++ b/src/search_filter/date_filter.py @@ -19,23 +19,28 @@ date_range_regex=f'.*?\s+{date_regex}\s*{date_regex}.*?' def date_filter(query, entries, embeddings): - # extract date from query - date_regex = r'\d{4}-\d{2}-\d{2}' - dates_in_query = re.findall(date_regex, query) + "Find entries containing any dates that fall within date range specified in query" + # extract date range specified in date filter of query + query_daterange = extract_date_range(query) # if no date in query, return all entries - if dates_in_query is None or len(dates_in_query) == 0: + if query_daterange is None: return query, entries, embeddings - # remove dates from query - query = re.sub(date_regex, '', query) + # remove date range filter from query + query = re.sub(date_range_regex, '', query) - # find entries with dates from query in them + # find entries containing any dates that fall with date range specified in query entries_to_include = set() for id, entry in enumerate(entries): - for date in dates_in_query: - if date in entry[1]: + # Extract dates from entry + for date_in_entry_string in re.findall(r'\d{4}-\d{2}-\d{2}', entry[1]): + # Convert date string in entry to unix timestamp + date_in_entry = datetime.strptime(date_in_entry_string, '%Y-%m-%d').timestamp() + # Check if date in entry is within date range specified in query + if query_daterange[0] <= date_in_entry <= query_daterange[1]: entries_to_include.add(id) + break # delete entries (and their embeddings) marked for exclusion entries_to_exclude = set(range(len(entries))) - entries_to_include @@ -52,6 +57,9 @@ def extract_date_range(query): if not date_range_match or date_range_match.groups() == (None, None, None, None): return None + # extract comparators (e.g >,<,=) applied on dates in date filter + date_comparators = [date_cmp for date_cmp in date_range_match.groups()[0::2] if date_cmp] + # extract, parse natural dates ranges from date range filter passed in query # e.g today maps to (start_of_day, start_of_tomorrow) query_dtranges = [] @@ -60,30 +68,35 @@ def extract_date_range(query): dt_start, dt_end = parse(date_str) query_dtranges.append((dt_start.timestamp(), dt_end.timestamp())) - date_comparators = [date_cmp for date_cmp in date_range_match.groups()[0::2] if date_cmp] - # Combine dates with their comparators to form date range intervals # For e.g # >=yesterday maps to [start_of_yesterday, inf) # ': + date_range_considering_comparator += [[dtrange_end, inf]] + elif cmp == '>=': + date_range_considering_comparator += [[dtrange_start, inf]] + elif cmp == '<': + date_range_considering_comparator += [[0, dtrange_start]] + elif cmp == '<=': + date_range_considering_comparator += [[0, dtrange_end]] + elif cmp == '=' or cmp == ':' or cmp == '==': + date_range_considering_comparator += [[dtrange_start, dtrange_end]] + + # Combine above intervals (via AND/intersect) # In the above example, this gives us [start_of_yesterday, start_of_tomorrow) # This is the effective date range to filter entries by # --- - effective_date_range = [0, inf] - for ((dtrange_start, dtrange_end), cmp) in zip(query_dtranges, date_comparators): - if cmp == '>' and dtrange_end < effective_date_range[1]: - effective_date_range[0] = dtrange_end - elif cmp == '<' and dtrange_start > effective_date_range[0]: - effective_date_range[1] = dtrange_start - elif cmp == '>=' and dtrange_end < effective_date_range[1]: - effective_date_range[0] = dtrange_start - elif cmp == '<=' and dtrange_start > effective_date_range[0]: - effective_date_range[1] = dtrange_end - elif cmp == '=' or cmp == ':' or cmp == '==': - effective_date_range = [dtrange_start, dtrange_end] + for date_range in date_range_considering_comparator: + effective_date_range = [ + max(effective_date_range[0], date_range[0]), + min(effective_date_range[1], date_range[1])] - if effective_date_range == [0, inf]: + if effective_date_range == [0, inf] or effective_date_range[0] > effective_date_range[1]: return None else: return effective_date_range diff --git a/tests/test_date_filter.py b/tests/test_date_filter.py index 76dad113..272643cf 100644 --- a/tests/test_date_filter.py +++ b/tests/test_date_filter.py @@ -16,6 +16,9 @@ def test_extract_date_range(): # No date filter specified in query assert date_filter.extract_date_range('head tail') == None + # Non intersecting date ranges + assert date_filter.extract_date_range('head dt>"2020-01-01" dt<"2020-01-01" tail') == None + def test_parse(): test_now = datetime(1984, 4, 1, 21, 21, 21)