mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 21:19:12 +00:00
Add, test date filter regex and date parsing to get natural date range
This commit is contained in:
@@ -1,8 +1,21 @@
|
||||
# Standard Packages
|
||||
import re
|
||||
from datetime import timedelta, datetime
|
||||
from dateutil.relativedelta import relativedelta, MO
|
||||
from math import inf
|
||||
|
||||
# External Packages
|
||||
import torch
|
||||
import dateparser as dtparse
|
||||
|
||||
|
||||
# Date Range Filter Regexes
|
||||
# Example filter queries:
|
||||
# - dt>=yesterday dt<"tomorrow"
|
||||
# - dt>="last week"
|
||||
# - dt:"next year"
|
||||
# A single, optional date filter: `dt`, a 1-2 character comparison operator
# built from `:`, `>`, `<`, `=` (group 1), then the date value (group 2),
# optionally wrapped in double quotes.
# NOTE: the value pattern only allows word characters, `-` and `/`, so a value
# containing whitespace (e.g. "last week") is captured only up to the first space.
date_regex = r'(?:dt([:><=]{1,2})\"?([\w\-\/]+))?\"?'

# Up to two date filters following the first run of whitespace in the query.
# Yields four groups: (operator 1, value 1, operator 2, value 2); the groups of
# a filter that is absent are None.
# Raw f-string so `\s` reaches the regex engine without relying on Python
# passing an invalid string escape sequence through unchanged.
date_range_regex = rf'.*?\s+{date_regex}\s*{date_regex}.*?'
|
||||
|
||||
|
||||
def date_filter(query, entries, embeddings):
|
||||
@@ -30,4 +43,44 @@ def date_filter(query, entries, embeddings):
|
||||
del entries[id]
|
||||
embeddings = torch.cat((embeddings[:id], embeddings[id+1:]))
|
||||
|
||||
return query, entries, embeddings
|
||||
return query, entries, embeddings
|
||||
|
||||
def parse(date_str, relative_base=None):
    """Parse the date string from a query's date filter into a date range.

    Args:
        date_str: Date text as written in the query, e.g. 'yesterday',
            '2 weeks ago' or '20 years later'.
        relative_base: datetime that relative expressions are resolved
            against. Falls back to datetime.now() when not given.

    Returns:
        The (start, end) datetime tuple computed by date_to_daterange for the
        parsed date, or None when the date parser returns no date.
    """
    # Clean the date string to handle future date parsing by the date parser:
    # the "later" / "from now" / "from today" markers are stripped before
    # parsing. Matching ignores case so "20 Years Later" is cleaned the same
    # way as "20 years later".
    clean_date_str = re.sub(
        r'later|from now|from today', '', date_str, flags=re.IGNORECASE)

    # Parse the date passed in the query date filter
    parsed_date = dtparse.parse(
        clean_date_str,
        settings={
            'RELATIVE_BASE': relative_base or datetime.now(),
            'PREFER_DAY_OF_MONTH': 'first',
            'PREFER_DATES_FROM': 'future',
        })

    if parsed_date is None:
        return None

    # The uncleaned date string is passed on; date_to_daterange inspects it
    # to pick the granularity of the range.
    return date_to_daterange(parsed_date, date_str)
|
||||
|
||||
|
||||
def date_to_daterange(parsed_date, date_str):
    """Convert a parsed date to a date range at its natural granularity.

    The granularity (day, week, month or year) is chosen from the keywords
    found in the date string, checked in the order year, month, week.

    Args:
        parsed_date: datetime the date string was parsed to.
        date_str: Date string the date was parsed from. Searched
            case-insensitively for 'year', 'month' and 'week'.

    Returns:
        A (start, end) tuple of datetimes:
        - year: 1 January of the parsed year to 1 January of the next year
        - month: first day of the parsed month to first day of the next month
        - week: 7 days before the start of the parsed day to the start of
          the parsed day
        - otherwise: start of the parsed day to start of the following day
    """
    # Match granularity keywords on a lowercased copy so that "Last Week" or
    # "NEXT YEAR" resolve to the same range as their lowercase spelling
    # instead of silently falling through to the single-day range.
    granularity_hint = date_str.lower()

    start_of_day = parsed_date.replace(hour=0, minute=0, second=0, microsecond=0)

    if 'year' in granularity_hint:
        return (datetime(parsed_date.year, 1, 1, 0, 0, 0),
                datetime(parsed_date.year + 1, 1, 1, 0, 0, 0))

    if 'month' in granularity_hint:
        start_of_month = datetime(parsed_date.year, parsed_date.month, 1, 0, 0, 0)
        next_month = start_of_month + relativedelta(months=1)
        return (start_of_month, next_month)

    if 'week' in granularity_hint:
        # if week in date string, dateparser parses it to next week start
        # so the parsed day = end of the requested week
        start_of_week = start_of_day - timedelta(days=7)
        return (start_of_week, start_of_day)

    # Default: single-day range
    next_day = start_of_day + timedelta(days=1)
    return (start_of_day, next_day)
|
||||
|
||||
44
tests/test_date_filter.py
Normal file
44
tests/test_date_filter.py
Normal file
@@ -0,0 +1,44 @@
|
||||
# Standard Packages
|
||||
import re
|
||||
from datetime import timedelta, datetime
|
||||
|
||||
# Application Packages
|
||||
from src.search_filter import date_filter
|
||||
|
||||
def test_parse():
    "Check date strings resolve to the expected (start, end) range relative to a fixed base time"
    reference_time = datetime(1984, 4, 1, 21, 21, 21)

    expected_ranges = [
        # day variations
        ('today', (datetime(1984, 4, 1, 0, 0, 0), datetime(1984, 4, 2, 0, 0, 0))),
        ('tomorrow', (datetime(1984, 4, 2, 0, 0, 0), datetime(1984, 4, 3, 0, 0, 0))),
        ('yesterday', (datetime(1984, 3, 31, 0, 0, 0), datetime(1984, 4, 1, 0, 0, 0))),
        ('5 days ago', (datetime(1984, 3, 27, 0, 0, 0), datetime(1984, 3, 28, 0, 0, 0))),
        # week variations
        ('last week', (datetime(1984, 3, 18, 0, 0, 0), datetime(1984, 3, 25, 0, 0, 0))),
        ('2 weeks ago', (datetime(1984, 3, 11, 0, 0, 0), datetime(1984, 3, 18, 0, 0, 0))),
        # month variations
        ('next month', (datetime(1984, 5, 1, 0, 0, 0), datetime(1984, 6, 1, 0, 0, 0))),
        ('2 months ago', (datetime(1984, 2, 1, 0, 0, 0), datetime(1984, 3, 1, 0, 0, 0))),
        # year variations
        ('this year', (datetime(1984, 1, 1, 0, 0, 0), datetime(1985, 1, 1, 0, 0, 0))),
        ('20 years later', (datetime(2004, 1, 1, 0, 0, 0), datetime(2005, 1, 1, 0, 0, 0))),
    ]

    # Cases run in the listed order; the first mismatch fails the test
    for date_text, expected_range in expected_ranges:
        assert date_filter.parse(date_text, relative_base=reference_time) == expected_range
|
||||
|
||||
|
||||
def test_date_filter_regex():
    "Check the date range regex captures operator and value of up to two date filters in a query"
    expected_groups_by_query = [
        # two date filters, with and without trailing query text
        ('head dt>"today" dt:"2020-01-01" tail', ('>', 'today', ':', '2020-01-01')),
        ('head dt>="today" dt="2020-01-01"', ('>=', 'today', '=', '2020-01-01')),
        # single date filter, with and without trailing query text
        ('head dt<"today" tail', ('<', 'today', None, None)),
        ('head dt<="today"', ('<=', 'today', None, None)),
        # no date filter
        ('head tail', (None, None, None, None)),
    ]

    for query, expected_groups in expected_groups_by_query:
        dtrange_match = re.search(date_filter.date_range_regex, query)
        assert dtrange_match.groups() == expected_groups
|
||||
Reference in New Issue
Block a user