mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-08 05:39:13 +00:00
Fix date_filter to handle non overlapping date ranges
This commit is contained in:
@@ -19,23 +19,28 @@ date_range_regex=f'.*?\s+{date_regex}\s*{date_regex}.*?'
|
|||||||
|
|
||||||
|
|
||||||
def date_filter(query, entries, embeddings):
|
def date_filter(query, entries, embeddings):
|
||||||
# extract date from query
|
"Find entries containing any dates that fall within date range specified in query"
|
||||||
date_regex = r'\d{4}-\d{2}-\d{2}'
|
# extract date range specified in date filter of query
|
||||||
dates_in_query = re.findall(date_regex, query)
|
query_daterange = extract_date_range(query)
|
||||||
|
|
||||||
# if no date in query, return all entries
|
# if no date in query, return all entries
|
||||||
if dates_in_query is None or len(dates_in_query) == 0:
|
if query_daterange is None:
|
||||||
return query, entries, embeddings
|
return query, entries, embeddings
|
||||||
|
|
||||||
# remove dates from query
|
# remove date range filter from query
|
||||||
query = re.sub(date_regex, '', query)
|
query = re.sub(date_range_regex, '', query)
|
||||||
|
|
||||||
# find entries with dates from query in them
|
# find entries containing any dates that fall with date range specified in query
|
||||||
entries_to_include = set()
|
entries_to_include = set()
|
||||||
for id, entry in enumerate(entries):
|
for id, entry in enumerate(entries):
|
||||||
for date in dates_in_query:
|
# Extract dates from entry
|
||||||
if date in entry[1]:
|
for date_in_entry_string in re.findall(r'\d{4}-\d{2}-\d{2}', entry[1]):
|
||||||
|
# Convert date string in entry to unix timestamp
|
||||||
|
date_in_entry = datetime.strptime(date_in_entry_string, '%Y-%m-%d').timestamp()
|
||||||
|
# Check if date in entry is within date range specified in query
|
||||||
|
if query_daterange[0] <= date_in_entry <= query_daterange[1]:
|
||||||
entries_to_include.add(id)
|
entries_to_include.add(id)
|
||||||
|
break
|
||||||
|
|
||||||
# delete entries (and their embeddings) marked for exclusion
|
# delete entries (and their embeddings) marked for exclusion
|
||||||
entries_to_exclude = set(range(len(entries))) - entries_to_include
|
entries_to_exclude = set(range(len(entries))) - entries_to_include
|
||||||
@@ -52,6 +57,9 @@ def extract_date_range(query):
|
|||||||
if not date_range_match or date_range_match.groups() == (None, None, None, None):
|
if not date_range_match or date_range_match.groups() == (None, None, None, None):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
# extract comparators (e.g >,<,=) applied on dates in date filter
|
||||||
|
date_comparators = [date_cmp for date_cmp in date_range_match.groups()[0::2] if date_cmp]
|
||||||
|
|
||||||
# extract, parse natural dates ranges from date range filter passed in query
|
# extract, parse natural dates ranges from date range filter passed in query
|
||||||
# e.g today maps to (start_of_day, start_of_tomorrow)
|
# e.g today maps to (start_of_day, start_of_tomorrow)
|
||||||
query_dtranges = []
|
query_dtranges = []
|
||||||
@@ -60,30 +68,35 @@ def extract_date_range(query):
|
|||||||
dt_start, dt_end = parse(date_str)
|
dt_start, dt_end = parse(date_str)
|
||||||
query_dtranges.append((dt_start.timestamp(), dt_end.timestamp()))
|
query_dtranges.append((dt_start.timestamp(), dt_end.timestamp()))
|
||||||
|
|
||||||
date_comparators = [date_cmp for date_cmp in date_range_match.groups()[0::2] if date_cmp]
|
|
||||||
|
|
||||||
# Combine dates with their comparators to form date range intervals
|
# Combine dates with their comparators to form date range intervals
|
||||||
# For e.g
|
# For e.g
|
||||||
# >=yesterday maps to [start_of_yesterday, inf)
|
# >=yesterday maps to [start_of_yesterday, inf)
|
||||||
# <tomorrow maps to [0, start_of_tomorrow)
|
# <tomorrow maps to [0, start_of_tomorrow)
|
||||||
# Then combine above intervals (via AND/intersect)
|
# ---
|
||||||
|
effective_date_range = [0, inf]
|
||||||
|
date_range_considering_comparator = []
|
||||||
|
for ((dtrange_start, dtrange_end), cmp) in zip(query_dtranges, date_comparators):
|
||||||
|
if cmp == '>':
|
||||||
|
date_range_considering_comparator += [[dtrange_end, inf]]
|
||||||
|
elif cmp == '>=':
|
||||||
|
date_range_considering_comparator += [[dtrange_start, inf]]
|
||||||
|
elif cmp == '<':
|
||||||
|
date_range_considering_comparator += [[0, dtrange_start]]
|
||||||
|
elif cmp == '<=':
|
||||||
|
date_range_considering_comparator += [[0, dtrange_end]]
|
||||||
|
elif cmp == '=' or cmp == ':' or cmp == '==':
|
||||||
|
date_range_considering_comparator += [[dtrange_start, dtrange_end]]
|
||||||
|
|
||||||
|
# Combine above intervals (via AND/intersect)
|
||||||
# In the above example, this gives us [start_of_yesterday, start_of_tomorrow)
|
# In the above example, this gives us [start_of_yesterday, start_of_tomorrow)
|
||||||
# This is the effective date range to filter entries by
|
# This is the effective date range to filter entries by
|
||||||
# ---
|
# ---
|
||||||
effective_date_range = [0, inf]
|
for date_range in date_range_considering_comparator:
|
||||||
for ((dtrange_start, dtrange_end), cmp) in zip(query_dtranges, date_comparators):
|
effective_date_range = [
|
||||||
if cmp == '>' and dtrange_end < effective_date_range[1]:
|
max(effective_date_range[0], date_range[0]),
|
||||||
effective_date_range[0] = dtrange_end
|
min(effective_date_range[1], date_range[1])]
|
||||||
elif cmp == '<' and dtrange_start > effective_date_range[0]:
|
|
||||||
effective_date_range[1] = dtrange_start
|
|
||||||
elif cmp == '>=' and dtrange_end < effective_date_range[1]:
|
|
||||||
effective_date_range[0] = dtrange_start
|
|
||||||
elif cmp == '<=' and dtrange_start > effective_date_range[0]:
|
|
||||||
effective_date_range[1] = dtrange_end
|
|
||||||
elif cmp == '=' or cmp == ':' or cmp == '==':
|
|
||||||
effective_date_range = [dtrange_start, dtrange_end]
|
|
||||||
|
|
||||||
if effective_date_range == [0, inf]:
|
if effective_date_range == [0, inf] or effective_date_range[0] > effective_date_range[1]:
|
||||||
return None
|
return None
|
||||||
else:
|
else:
|
||||||
return effective_date_range
|
return effective_date_range
|
||||||
|
|||||||
@@ -16,6 +16,9 @@ def test_extract_date_range():
|
|||||||
# No date filter specified in query
|
# No date filter specified in query
|
||||||
assert date_filter.extract_date_range('head tail') == None
|
assert date_filter.extract_date_range('head tail') == None
|
||||||
|
|
||||||
|
# Non intersecting date ranges
|
||||||
|
assert date_filter.extract_date_range('head dt>"2020-01-01" dt<"2020-01-01" tail') == None
|
||||||
|
|
||||||
|
|
||||||
def test_parse():
|
def test_parse():
|
||||||
test_now = datetime(1984, 4, 1, 21, 21, 21)
|
test_now = datetime(1984, 4, 1, 21, 21, 21)
|
||||||
|
|||||||
Reference in New Issue
Block a user