From b54588717f7d805e44820dee01121433923ffb28 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 13 Jul 2022 18:46:34 +0400 Subject: [PATCH 01/14] Filter for entries with dates specified by user in query - Create Date filter - Users can pass dates in YYYY-MM-DD format in their query - Use it to filter asymmetric search to user specified dates --- src/main.py | 3 ++- src/search_filter/date_filter.py | 33 ++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 src/search_filter/date_filter.py diff --git a/src/main.py b/src/main.py index 00e68612..b9723879 100644 --- a/src/main.py +++ b/src/main.py @@ -18,6 +18,7 @@ from src.utils.config import SearchType, SearchModels, ProcessorConfigModel, Con from src.utils.rawconfig import FullConfig from src.processor.conversation.gpt import converse, extract_search_type, message_to_log, message_to_prompt, understand, summarize from src.search_filter.explicit_filter import explicit_filter +from src.search_filter.date_filter import date_filter # Application Global State config = FullConfig() @@ -59,7 +60,7 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None): if (t == SearchType.Notes or t == None) and model.notes_search: # query notes - hits, entries = asymmetric.query(user_query, model.notes_search, device=device, filters=[explicit_filter]) + hits, entries = asymmetric.query(user_query, model.notes_search, device=device, filters=[explicit_filter, date_filter]) # collate and return results return asymmetric.collate_results(hits, entries, results_count) diff --git a/src/search_filter/date_filter.py b/src/search_filter/date_filter.py new file mode 100644 index 00000000..fc04722e --- /dev/null +++ b/src/search_filter/date_filter.py @@ -0,0 +1,33 @@ +# Standard Packages +import re + +# External Packages +import torch + + +def date_filter(query, entries, embeddings): + # extract date from query + date_regex = r'\d{4}-\d{2}-\d{2}' + dates_in_query = 
re.findall(date_regex, query) + + # if no date in query, return all entries + if dates_in_query is None or len(dates_in_query) == 0: + return query, entries, embeddings + + # remove dates from query + query = re.sub(date_regex, '', query) + + # find entries with dates from query in them + entries_to_include = set() + for id, entry in enumerate(entries): + for date in dates_in_query: + if date in entry[1]: + entries_to_include.add(id) + + # delete entries (and their embeddings) marked for exclusion + entries_to_exclude = set(range(len(entries))) - entries_to_include + for id in sorted(list(entries_to_exclude), reverse=True): + del entries[id] + embeddings = torch.cat((embeddings[:id], embeddings[id+1:])) + + return query, entries, embeddings \ No newline at end of file From 4a201d52afe41a678938c6acdb74c3ad08539c62 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 14 Jul 2022 16:47:32 +0400 Subject: [PATCH 02/14] Add, test date filter regex and date parsing to get natural date range --- src/search_filter/date_filter.py | 55 +++++++++++++++++++++++++++++++- tests/test_date_filter.py | 44 +++++++++++++++++++++++++ 2 files changed, 98 insertions(+), 1 deletion(-) create mode 100644 tests/test_date_filter.py diff --git a/src/search_filter/date_filter.py b/src/search_filter/date_filter.py index fc04722e..0930523b 100644 --- a/src/search_filter/date_filter.py +++ b/src/search_filter/date_filter.py @@ -1,8 +1,21 @@ # Standard Packages import re +from datetime import timedelta, datetime +from dateutil.relativedelta import relativedelta, MO +from math import inf # External Packages import torch +import dateparser as dtparse + + +# Date Range Filter Regexes +# Example filter queries: +# - dt>=yesterday dt<"tomorrow" +# - dt>="last week" +# - dt:"next year" +date_regex = r'(?:dt([:><=]{1,2})\"?([\w\-\/]+))?\"?' +date_range_regex=f'.*?\s+{date_regex}\s*{date_regex}.*?' 
def date_filter(query, entries, embeddings): @@ -30,4 +43,44 @@ def date_filter(query, entries, embeddings): del entries[id] embeddings = torch.cat((embeddings[:id], embeddings[id+1:])) - return query, entries, embeddings \ No newline at end of file + return query, entries, embeddings + +def parse(date_str, relative_base=None): + "Parse date string passed in date filter of query to datetime object" + # clean date string to handle future date parsing by date parser + clean_date_str = re.sub(r'later|from now|from today', '', date_str) + + # parse date passed in query date filter + parsed_date = dtparse.parse( + clean_date_str, + settings= { + 'RELATIVE_BASE': relative_base or datetime.now(), + 'PREFER_DAY_OF_MONTH': 'first', + 'PREFER_DATES_FROM': 'future' + }) + + if parsed_date is None: + return None + + return date_to_daterange(parsed_date, date_str) + + +def date_to_daterange(parsed_date, date_str): + "Convert parsed date to date ranges at natural granularity (day, week, month or year)" + + start_of_day = parsed_date.replace(hour=0, minute=0, second=0, microsecond=0) + + if 'year' in date_str: + return (datetime(parsed_date.year, 1, 1, 0, 0, 0), datetime(parsed_date.year+1, 1, 1, 0, 0, 0)) + if 'month' in date_str: + start_of_month = datetime(parsed_date.year, parsed_date.month, 1, 0, 0, 0) + next_month = start_of_month + relativedelta(months=1) + return (start_of_month, next_month) + if 'week' in date_str: + # if week in date string, dateparser parses it to next week start + # so today = end of this week + start_of_week = start_of_day - timedelta(days=7) + return (start_of_week, start_of_day) + else: + next_day = start_of_day + relativedelta(days=1) + return (start_of_day, next_day) diff --git a/tests/test_date_filter.py b/tests/test_date_filter.py new file mode 100644 index 00000000..0c30695c --- /dev/null +++ b/tests/test_date_filter.py @@ -0,0 +1,44 @@ +# Standard Packages +import re +from datetime import timedelta, datetime + +# Application Packages +from 
src.search_filter import date_filter + +def test_parse(): + test_now = datetime(1984, 4, 1, 21, 21, 21) + + # day variations + assert date_filter.parse('today', relative_base=test_now) == (datetime(1984, 4, 1, 0, 0, 0), datetime(1984, 4, 2, 0, 0, 0)) + assert date_filter.parse('tomorrow', relative_base=test_now) == (datetime(1984, 4, 2, 0, 0, 0), datetime(1984, 4, 3, 0, 0, 0)) + assert date_filter.parse('yesterday', relative_base=test_now) == (datetime(1984, 3, 31, 0, 0, 0), datetime(1984, 4, 1, 0, 0, 0)) + assert date_filter.parse('5 days ago', relative_base=test_now) == (datetime(1984, 3, 27, 0, 0, 0), datetime(1984, 3, 28, 0, 0, 0)) + + # week variations + assert date_filter.parse('last week', relative_base=test_now) == (datetime(1984, 3, 18, 0, 0, 0), datetime(1984, 3, 25, 0, 0, 0)) + assert date_filter.parse('2 weeks ago', relative_base=test_now) == (datetime(1984, 3, 11, 0, 0, 0), datetime(1984, 3, 18, 0, 0, 0)) + + # month variations + assert date_filter.parse('next month', relative_base=test_now) == (datetime(1984, 5, 1, 0, 0, 0), datetime(1984, 6, 1, 0, 0, 0)) + assert date_filter.parse('2 months ago', relative_base=test_now) == (datetime(1984, 2, 1, 0, 0, 0), datetime(1984, 3, 1, 0, 0, 0)) + + # year variations + assert date_filter.parse('this year', relative_base=test_now) == (datetime(1984, 1, 1, 0, 0, 0), datetime(1985, 1, 1, 0, 0, 0)) + assert date_filter.parse('20 years later', relative_base=test_now) == (datetime(2004, 1, 1, 0, 0, 0), datetime(2005, 1, 1, 0, 0, 0)) + + +def test_date_filter_regex(): + dtrange_match = re.search(date_filter.date_range_regex, 'head dt>"today" dt:"2020-01-01" tail') + assert dtrange_match.groups() == ('>', 'today', ':', '2020-01-01') + + dtrange_match = re.search(date_filter.date_range_regex, 'head dt>="today" dt="2020-01-01"') + assert dtrange_match.groups() == ('>=', 'today', '=', '2020-01-01') + + dtrange_match = re.search(date_filter.date_range_regex, 'head dt<"today" tail') + assert dtrange_match.groups() == ('<', 
'today', None, None) + + dtrange_match = re.search(date_filter.date_range_regex, 'head dt<="today"') + assert dtrange_match.groups() == ('<=', 'today', None, None) + + dtrange_match = re.search(date_filter.date_range_regex, 'head tail') + assert dtrange_match.groups() == (None, None, None, None) From e6db3e3d00a2750d998184d9cbdefb97e4414993 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 14 Jul 2022 18:13:12 +0400 Subject: [PATCH 03/14] Prefer Dates From Future only when specific words in date string - Default to looking at dates from past, as most notes are from past - Look for dates in future for cases where it's obvious query is for dates in the future but dateparser's parse doesn't parse it at all. E.g parse('5 months from now') returns nothing - Setting PREFER_DATES_FROM_FUTURE in this case and passing just parse('5 months') to dateparser.parse works as expected --- src/search_filter/date_filter.py | 6 ++++-- tests/test_date_filter.py | 4 ++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/search_filter/date_filter.py b/src/search_filter/date_filter.py index 0930523b..fe7ee5e5 100644 --- a/src/search_filter/date_filter.py +++ b/src/search_filter/date_filter.py @@ -48,7 +48,9 @@ def date_filter(query, entries, embeddings): def parse(date_str, relative_base=None): "Parse date string passed in date filter of query to datetime object" # clean date string to handle future date parsing by date parser - clean_date_str = re.sub(r'later|from now|from today', '', date_str) + future_strings = ['later', 'from now', 'from today'] + prefer_dates_from = {True: 'future', False: 'past'}[any([True for fstr in future_strings if fstr in date_str])] + clean_date_str = re.sub('|'.join(future_strings), '', date_str) # parse date passed in query date filter parsed_date = dtparse.parse( @@ -56,7 +58,7 @@ def parse(date_str, relative_base=None): settings= { 'RELATIVE_BASE': relative_base or datetime.now(), 'PREFER_DAY_OF_MONTH': 'first', - 
'PREFER_DATES_FROM': 'future' + 'PREFER_DATES_FROM': prefer_dates_from }) if parsed_date is None: diff --git a/tests/test_date_filter.py b/tests/test_date_filter.py index 0c30695c..8abec3af 100644 --- a/tests/test_date_filter.py +++ b/tests/test_date_filter.py @@ -26,6 +26,10 @@ def test_parse(): assert date_filter.parse('this year', relative_base=test_now) == (datetime(1984, 1, 1, 0, 0, 0), datetime(1985, 1, 1, 0, 0, 0)) assert date_filter.parse('20 years later', relative_base=test_now) == (datetime(2004, 1, 1, 0, 0, 0), datetime(2005, 1, 1, 0, 0, 0)) + # specific month/date variation + assert date_filter.parse('in august', relative_base=test_now) == (datetime(1983, 8, 1, 0, 0, 0), datetime(1983, 8, 2, 0, 0, 0)) + assert date_filter.parse('on 1983-08-01', relative_base=test_now) == (datetime(1983, 8, 1, 0, 0, 0), datetime(1983, 8, 2, 0, 0, 0)) + def test_date_filter_regex(): dtrange_match = re.search(date_filter.date_range_regex, 'head dt>"today" dt:"2020-01-01" tail') From 70ac35b2a5b0640e6ebc6ceacd04ae7fdd32ad95 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 14 Jul 2022 18:20:09 +0400 Subject: [PATCH 04/14] Compute Date Range to filter entries to, from Comparators, Dates in Query --- src/search_filter/date_filter.py | 44 ++++++++++++++++++++++++++++++++ tests/test_date_filter.py | 14 +++++++++- 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/src/search_filter/date_filter.py b/src/search_filter/date_filter.py index fe7ee5e5..b1c2cadc 100644 --- a/src/search_filter/date_filter.py +++ b/src/search_filter/date_filter.py @@ -45,6 +45,50 @@ def date_filter(query, entries, embeddings): return query, entries, embeddings + +def extract_date_range(query): + # find date range filter in query + date_range_match = re.search(date_range_regex, query) + if not date_range_match or date_range_match.groups() == (None, None, None, None): + return None + + # extract, parse natural dates ranges from date range filter passed in query + # e.g today maps 
to (start_of_day, start_of_tomorrow) + query_dtranges = [] + for date_str in date_range_match.groups()[1::2]: + if date_str and parse(date_str): + dt_start, dt_end = parse(date_str) + query_dtranges.append((dt_start.timestamp(), dt_end.timestamp())) + + date_comparators = [date_cmp for date_cmp in date_range_match.groups()[0::2] if date_cmp] + + # Combine dates with their comparators to form date range intervals + # For e.g + # >=yesterday maps to [start_of_yesterday, inf) + # ' and dtrange_end < effective_date_range[1]: + effective_date_range[0] = dtrange_end + elif cmp == '<' and dtrange_start > effective_date_range[0]: + effective_date_range[1] = dtrange_start + elif cmp == '>=' and dtrange_end < effective_date_range[1]: + effective_date_range[0] = dtrange_start + elif cmp == '<=' and dtrange_start > effective_date_range[0]: + effective_date_range[1] = dtrange_end + elif cmp == '=' or cmp == ':' or cmp == '==': + effective_date_range = [dtrange_start, dtrange_end] + + if effective_date_range == [0, inf]: + return None + else: + return effective_date_range + + def parse(date_str, relative_base=None): "Parse date string passed in date filter of query to datetime object" # clean date string to handle future date parsing by date parser diff --git a/tests/test_date_filter.py b/tests/test_date_filter.py index 8abec3af..76dad113 100644 --- a/tests/test_date_filter.py +++ b/tests/test_date_filter.py @@ -1,10 +1,22 @@ # Standard Packages import re -from datetime import timedelta, datetime +from datetime import datetime +from math import inf # Application Packages from src.search_filter import date_filter + +def test_extract_date_range(): + assert date_filter.extract_date_range('head dt>"2020-01-04" dt<"2020-01-07" tail') == [datetime(2020, 1, 5, 0, 0, 0).timestamp(), datetime(2020, 1, 7, 0, 0, 0).timestamp()] + assert date_filter.extract_date_range('head dt<="2020-01-01"') == [0, datetime(2020, 1, 2, 0, 0, 0).timestamp()] + assert date_filter.extract_date_range('head 
dt>="2020-01-01"') == [datetime(2020, 1, 1, 0, 0, 0).timestamp(), inf] + assert date_filter.extract_date_range('head dt:"2020-01-01"') == [datetime(2020, 1, 1, 0, 0, 0).timestamp(), datetime(2020, 1, 2, 0, 0, 0).timestamp()] + + # No date filter specified in query + assert date_filter.extract_date_range('head tail') == None + + def test_parse(): test_now = datetime(1984, 4, 1, 21, 21, 21) From 011f81fac55d08f9dff56ff2ce35ed00ad68e707 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 14 Jul 2022 18:53:38 +0400 Subject: [PATCH 05/14] Fix date_filter to handle non overlapping date ranges --- src/search_filter/date_filter.py | 63 +++++++++++++++++++------------- tests/test_date_filter.py | 3 ++ 2 files changed, 41 insertions(+), 25 deletions(-) diff --git a/src/search_filter/date_filter.py b/src/search_filter/date_filter.py index b1c2cadc..06cf8878 100644 --- a/src/search_filter/date_filter.py +++ b/src/search_filter/date_filter.py @@ -19,23 +19,28 @@ date_range_regex=f'.*?\s+{date_regex}\s*{date_regex}.*?' 
def date_filter(query, entries, embeddings): - # extract date from query - date_regex = r'\d{4}-\d{2}-\d{2}' - dates_in_query = re.findall(date_regex, query) + "Find entries containing any dates that fall within date range specified in query" + # extract date range specified in date filter of query + query_daterange = extract_date_range(query) # if no date in query, return all entries - if dates_in_query is None or len(dates_in_query) == 0: + if query_daterange is None: return query, entries, embeddings - # remove dates from query - query = re.sub(date_regex, '', query) + # remove date range filter from query + query = re.sub(date_range_regex, '', query) - # find entries with dates from query in them + # find entries containing any dates that fall with date range specified in query entries_to_include = set() for id, entry in enumerate(entries): - for date in dates_in_query: - if date in entry[1]: + # Extract dates from entry + for date_in_entry_string in re.findall(r'\d{4}-\d{2}-\d{2}', entry[1]): + # Convert date string in entry to unix timestamp + date_in_entry = datetime.strptime(date_in_entry_string, '%Y-%m-%d').timestamp() + # Check if date in entry is within date range specified in query + if query_daterange[0] <= date_in_entry <= query_daterange[1]: entries_to_include.add(id) + break # delete entries (and their embeddings) marked for exclusion entries_to_exclude = set(range(len(entries))) - entries_to_include @@ -52,6 +57,9 @@ def extract_date_range(query): if not date_range_match or date_range_match.groups() == (None, None, None, None): return None + # extract comparators (e.g >,<,=) applied on dates in date filter + date_comparators = [date_cmp for date_cmp in date_range_match.groups()[0::2] if date_cmp] + # extract, parse natural dates ranges from date range filter passed in query # e.g today maps to (start_of_day, start_of_tomorrow) query_dtranges = [] @@ -60,30 +68,35 @@ def extract_date_range(query): dt_start, dt_end = parse(date_str) 
query_dtranges.append((dt_start.timestamp(), dt_end.timestamp())) - date_comparators = [date_cmp for date_cmp in date_range_match.groups()[0::2] if date_cmp] - # Combine dates with their comparators to form date range intervals # For e.g # >=yesterday maps to [start_of_yesterday, inf) # ': + date_range_considering_comparator += [[dtrange_end, inf]] + elif cmp == '>=': + date_range_considering_comparator += [[dtrange_start, inf]] + elif cmp == '<': + date_range_considering_comparator += [[0, dtrange_start]] + elif cmp == '<=': + date_range_considering_comparator += [[0, dtrange_end]] + elif cmp == '=' or cmp == ':' or cmp == '==': + date_range_considering_comparator += [[dtrange_start, dtrange_end]] + + # Combine above intervals (via AND/intersect) # In the above example, this gives us [start_of_yesterday, start_of_tomorrow) # This is the effective date range to filter entries by # --- - effective_date_range = [0, inf] - for ((dtrange_start, dtrange_end), cmp) in zip(query_dtranges, date_comparators): - if cmp == '>' and dtrange_end < effective_date_range[1]: - effective_date_range[0] = dtrange_end - elif cmp == '<' and dtrange_start > effective_date_range[0]: - effective_date_range[1] = dtrange_start - elif cmp == '>=' and dtrange_end < effective_date_range[1]: - effective_date_range[0] = dtrange_start - elif cmp == '<=' and dtrange_start > effective_date_range[0]: - effective_date_range[1] = dtrange_end - elif cmp == '=' or cmp == ':' or cmp == '==': - effective_date_range = [dtrange_start, dtrange_end] + for date_range in date_range_considering_comparator: + effective_date_range = [ + max(effective_date_range[0], date_range[0]), + min(effective_date_range[1], date_range[1])] - if effective_date_range == [0, inf]: + if effective_date_range == [0, inf] or effective_date_range[0] > effective_date_range[1]: return None else: return effective_date_range diff --git a/tests/test_date_filter.py b/tests/test_date_filter.py index 76dad113..272643cf 100644 --- 
a/tests/test_date_filter.py +++ b/tests/test_date_filter.py @@ -16,6 +16,9 @@ def test_extract_date_range(): # No date filter specified in query assert date_filter.extract_date_range('head tail') == None + # Non intersecting date ranges + assert date_filter.extract_date_range('head dt>"2020-01-01" dt<"2020-01-01" tail') == None + def test_parse(): test_now = datetime(1984, 4, 1, 21, 21, 21) From dcb6fe479edf6f480eb596bd85b35efa6d2a3aea Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 14 Jul 2022 20:01:35 +0400 Subject: [PATCH 06/14] Fix date_filter query, entry in query range check. Add tests for it - Fix date_filter date_in_entry within query range check - Extracted_date_range is in [included_date, excluded_date) format - But check was checking for date_in_entry <= excluded_date - Fixed it to do date_in_entry < excluded_date - Fix removal of date filter from query - Add tests for date_filter --- src/search_filter/date_filter.py | 5 ++-- tests/test_date_filter.py | 47 ++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/src/search_filter/date_filter.py b/src/search_filter/date_filter.py index 06cf8878..aec28e01 100644 --- a/src/search_filter/date_filter.py +++ b/src/search_filter/date_filter.py @@ -28,7 +28,8 @@ def date_filter(query, entries, embeddings): return query, entries, embeddings # remove date range filter from query - query = re.sub(date_range_regex, '', query) + query = re.sub(f'\s+{date_regex}', ' ', query) + query = re.sub(r'\s{2,}', ' ', query).strip() # remove multiple spaces # find entries containing any dates that fall with date range specified in query entries_to_include = set() @@ -38,7 +39,7 @@ def date_filter(query, entries, embeddings): # Convert date string in entry to unix timestamp date_in_entry = datetime.strptime(date_in_entry_string, '%Y-%m-%d').timestamp() # Check if date in entry is within date range specified in query - if query_daterange[0] <= date_in_entry <= 
query_daterange[1]: + if query_daterange[0] <= date_in_entry < query_daterange[1]: entries_to_include.add(id) break diff --git a/tests/test_date_filter.py b/tests/test_date_filter.py index 272643cf..615eeb5b 100644 --- a/tests/test_date_filter.py +++ b/tests/test_date_filter.py @@ -3,10 +3,57 @@ import re from datetime import datetime from math import inf +# External Packages +import torch + # Application Packages from src.search_filter import date_filter +def test_date_filter(): + embeddings = torch.randn(3, 10) + entries = [ + ['', 'Entry with no date'], + ['', 'April Fools entry: 1984-04-01'], + ['', 'Entry with date:1984-04-02']] + + q_with_no_date_filter = 'head tail' + ret_query, ret_entries, ret_emb = date_filter.date_filter(q_with_no_date_filter, entries.copy(), embeddings) + assert ret_query == 'head tail' + assert len(ret_emb) == 3 + assert ret_entries == entries + + q_with_dtrange_non_overlapping_at_boundary = 'head dt>"1984-04-01" dt<"1984-04-02" tail' + ret_query, ret_entries, ret_emb = date_filter.date_filter(q_with_dtrange_non_overlapping_at_boundary, entries.copy(), embeddings) + assert ret_query == 'head tail' + assert len(ret_emb) == 0 + assert ret_entries == [] + + query_with_overlapping_dtrange = 'head dt>"1984-04-01" dt<"1984-04-03" tail' + ret_query, ret_entries, ret_emb = date_filter.date_filter(query_with_overlapping_dtrange, entries.copy(), embeddings) + assert ret_query == 'head tail' + assert ret_entries == [entries[2]] + assert len(ret_emb) == 1 + + query_with_overlapping_dtrange = 'head dt>="1984-04-01" dt<"1984-04-02" tail' + ret_query, ret_entries, ret_emb = date_filter.date_filter(query_with_overlapping_dtrange, entries.copy(), embeddings) + assert ret_query == 'head tail' + assert ret_entries == [entries[1]] + assert len(ret_emb) == 1 + + query_with_overlapping_dtrange = 'head dt>"1984-04-01" dt<="1984-04-02" tail' + ret_query, ret_entries, ret_emb = date_filter.date_filter(query_with_overlapping_dtrange, entries.copy(), embeddings) 
+ assert ret_query == 'head tail' + assert ret_entries == [entries[2]] + assert len(ret_emb) == 1 + + query_with_overlapping_dtrange = 'head dt>="1984-04-01" dt<="1984-04-02" tail' + ret_query, ret_entries, ret_emb = date_filter.date_filter(query_with_overlapping_dtrange, entries.copy(), embeddings) + assert ret_query == 'head tail' + assert ret_entries == [entries[1], entries[2]] + assert len(ret_emb) == 2 + + def test_extract_date_range(): assert date_filter.extract_date_range('head dt>"2020-01-04" dt<"2020-01-07" tail') == [datetime(2020, 1, 5, 0, 0, 0).timestamp(), datetime(2020, 1, 7, 0, 0, 0).timestamp()] assert date_filter.extract_date_range('head dt<="2020-01-01"') == [0, datetime(2020, 1, 2, 0, 0, 0).timestamp()] From 67e9366c0f6ed0f502a1a4b2a371c0bdbc23b741 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 14 Jul 2022 20:06:39 +0400 Subject: [PATCH 07/14] Minor style fix. Use consistent/standard dates for date_filter tests --- tests/test_date_filter.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/test_date_filter.py b/tests/test_date_filter.py index 615eeb5b..7426e14e 100644 --- a/tests/test_date_filter.py +++ b/tests/test_date_filter.py @@ -55,16 +55,16 @@ def test_date_filter(): def test_extract_date_range(): - assert date_filter.extract_date_range('head dt>"2020-01-04" dt<"2020-01-07" tail') == [datetime(2020, 1, 5, 0, 0, 0).timestamp(), datetime(2020, 1, 7, 0, 0, 0).timestamp()] - assert date_filter.extract_date_range('head dt<="2020-01-01"') == [0, datetime(2020, 1, 2, 0, 0, 0).timestamp()] - assert date_filter.extract_date_range('head dt>="2020-01-01"') == [datetime(2020, 1, 1, 0, 0, 0).timestamp(), inf] - assert date_filter.extract_date_range('head dt:"2020-01-01"') == [datetime(2020, 1, 1, 0, 0, 0).timestamp(), datetime(2020, 1, 2, 0, 0, 0).timestamp()] + assert date_filter.extract_date_range('head dt>"1984-01-04" dt<"1984-01-07" tail') == [datetime(1984, 1, 5, 0, 0, 0).timestamp(), 
datetime(1984, 1, 7, 0, 0, 0).timestamp()] + assert date_filter.extract_date_range('head dt<="1984-01-01"') == [0, datetime(1984, 1, 2, 0, 0, 0).timestamp()] + assert date_filter.extract_date_range('head dt>="1984-01-01"') == [datetime(1984, 1, 1, 0, 0, 0).timestamp(), inf] + assert date_filter.extract_date_range('head dt:"1984-01-01"') == [datetime(1984, 1, 1, 0, 0, 0).timestamp(), datetime(1984, 1, 2, 0, 0, 0).timestamp()] # No date filter specified in query assert date_filter.extract_date_range('head tail') == None # Non intersecting date ranges - assert date_filter.extract_date_range('head dt>"2020-01-01" dt<"2020-01-01" tail') == None + assert date_filter.extract_date_range('head dt>"1984-01-01" dt<"1984-01-01" tail') == None def test_parse(): @@ -94,11 +94,11 @@ def test_parse(): def test_date_filter_regex(): - dtrange_match = re.search(date_filter.date_range_regex, 'head dt>"today" dt:"2020-01-01" tail') - assert dtrange_match.groups() == ('>', 'today', ':', '2020-01-01') + dtrange_match = re.search(date_filter.date_range_regex, 'head dt>"today" dt:"1984-01-01" tail') + assert dtrange_match.groups() == ('>', 'today', ':', '1984-01-01') - dtrange_match = re.search(date_filter.date_range_regex, 'head dt>="today" dt="2020-01-01"') - assert dtrange_match.groups() == ('>=', 'today', '=', '2020-01-01') + dtrange_match = re.search(date_filter.date_range_regex, 'head dt>="today" dt="1984-01-01"') + assert dtrange_match.groups() == ('>=', 'today', '=', '1984-01-01') dtrange_match = re.search(date_filter.date_range_regex, 'head dt<"today" tail') assert dtrange_match.groups() == ('<', 'today', None, None) From 9de2097182cc2b5e0951cd633fb48a3ea19d7307 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 14 Jul 2022 21:32:58 +0400 Subject: [PATCH 08/14] Fix date filter usage with multi word queries. 
Simplify date regex --- src/search_filter/date_filter.py | 24 ++++++++++-------------- tests/test_date_filter.py | 23 +++++++++++++---------- 2 files changed, 23 insertions(+), 24 deletions(-) diff --git a/src/search_filter/date_filter.py b/src/search_filter/date_filter.py index aec28e01..578366ff 100644 --- a/src/search_filter/date_filter.py +++ b/src/search_filter/date_filter.py @@ -11,11 +11,10 @@ import dateparser as dtparse # Date Range Filter Regexes # Example filter queries: -# - dt>=yesterday dt<"tomorrow" +# - dt>="yesterday" dt<"tomorrow" # - dt>="last week" -# - dt:"next year" -date_regex = r'(?:dt([:><=]{1,2})\"?([\w\-\/]+))?\"?' -date_range_regex=f'.*?\s+{date_regex}\s*{date_regex}.*?' +# - dt:"2 years ago" +date_regex = r"dt([:><=]{1,2})\"(.*?)\"" def date_filter(query, entries, embeddings): @@ -54,20 +53,17 @@ def date_filter(query, entries, embeddings): def extract_date_range(query): # find date range filter in query - date_range_match = re.search(date_range_regex, query) - if not date_range_match or date_range_match.groups() == (None, None, None, None): - return None + date_range_matches = re.findall(date_regex, query) - # extract comparators (e.g >,<,=) applied on dates in date filter - date_comparators = [date_cmp for date_cmp in date_range_match.groups()[0::2] if date_cmp] + if len(date_range_matches) == 0: + return None # extract, parse natural dates ranges from date range filter passed in query # e.g today maps to (start_of_day, start_of_tomorrow) - query_dtranges = [] - for date_str in date_range_match.groups()[1::2]: - if date_str and parse(date_str): + for index, (cmp, date_str) in enumerate(date_range_matches): + if parse(date_str): dt_start, dt_end = parse(date_str) - query_dtranges.append((dt_start.timestamp(), dt_end.timestamp())) + date_range_matches[index] = [cmp, (dt_start.timestamp(), dt_end.timestamp())] # Combine dates with their comparators to form date range intervals # For e.g @@ -76,7 +72,7 @@ def extract_date_range(query): # 
--- effective_date_range = [0, inf] date_range_considering_comparator = [] - for ((dtrange_start, dtrange_end), cmp) in zip(query_dtranges, date_comparators): + for cmp, (dtrange_start, dtrange_end) in date_range_matches: if cmp == '>': date_range_considering_comparator += [[dtrange_end, inf]] elif cmp == '>=': diff --git a/tests/test_date_filter.py b/tests/test_date_filter.py index 7426e14e..2224b865 100644 --- a/tests/test_date_filter.py +++ b/tests/test_date_filter.py @@ -94,17 +94,20 @@ def test_parse(): def test_date_filter_regex(): - dtrange_match = re.search(date_filter.date_range_regex, 'head dt>"today" dt:"1984-01-01" tail') - assert dtrange_match.groups() == ('>', 'today', ':', '1984-01-01') + dtrange_match = re.findall(date_filter.date_regex, 'multi word head dt>"today" dt:"1984-01-01"') + assert dtrange_match == [('>', 'today'), (':', '1984-01-01')] - dtrange_match = re.search(date_filter.date_range_regex, 'head dt>="today" dt="1984-01-01"') - assert dtrange_match.groups() == ('>=', 'today', '=', '1984-01-01') + dtrange_match = re.findall(date_filter.date_regex, 'head dt>"today" dt:"1984-01-01" multi word tail') + assert dtrange_match == [('>', 'today'), (':', '1984-01-01')] - dtrange_match = re.search(date_filter.date_range_regex, 'head dt<"today" tail') - assert dtrange_match.groups() == ('<', 'today', None, None) + dtrange_match = re.findall(date_filter.date_regex, 'multi word head dt>="today" dt="1984-01-01"') + assert dtrange_match == [('>=', 'today'), ('=', '1984-01-01')] - dtrange_match = re.search(date_filter.date_range_regex, 'head dt<="today"') - assert dtrange_match.groups() == ('<=', 'today', None, None) + dtrange_match = re.findall(date_filter.date_regex, 'dt<"multi word date" multi word tail') + assert dtrange_match == [('<', 'multi word date')] - dtrange_match = re.search(date_filter.date_range_regex, 'head tail') - assert dtrange_match.groups() == (None, None, None, None) + dtrange_match = re.findall(date_filter.date_regex, 'head 
dt<="multi word date"') + assert dtrange_match == [('<=', 'multi word date')] + + dtrange_match = re.findall(date_filter.date_regex, 'head tail') + assert dtrange_match == [] \ No newline at end of file From 7640e2ab0c61efe57aa4e3ff7a32dea38f3e9a21 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 14 Jul 2022 21:38:00 +0400 Subject: [PATCH 09/14] Wrap attempt to extract dates from entry in try/catch - Not all YYYY-MM-DD strings in entry are necessarily dates --- src/search_filter/date_filter.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/search_filter/date_filter.py b/src/search_filter/date_filter.py index 578366ff..39fd28ae 100644 --- a/src/search_filter/date_filter.py +++ b/src/search_filter/date_filter.py @@ -36,7 +36,10 @@ def date_filter(query, entries, embeddings): # Extract dates from entry for date_in_entry_string in re.findall(r'\d{4}-\d{2}-\d{2}', entry[1]): # Convert date string in entry to unix timestamp - date_in_entry = datetime.strptime(date_in_entry_string, '%Y-%m-%d').timestamp() + try: + date_in_entry = datetime.strptime(date_in_entry_string, '%Y-%m-%d').timestamp() + except ValueError: + continue # Check if date in entry is within date range specified in query if query_daterange[0] <= date_in_entry < query_daterange[1]: entries_to_include.add(id) From 3aac3c7d5293c6ba160dd6d4384e62904832071c Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 14 Jul 2022 21:54:04 +0400 Subject: [PATCH 10/14] Run explicit filter on raw entry, add more terms to split entries by - With \t Last Word in Headings was suffixed by \t and so couldn't be filtered by - User interacts with raw entries, so run explicit filters on raw entry - For semantic search using the filtered entry is cleaner, still --- src/search_filter/explicit_filter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/search_filter/explicit_filter.py b/src/search_filter/explicit_filter.py index 363dbd71..e6a7f551 100644 
--- a/src/search_filter/explicit_filter.py +++ b/src/search_filter/explicit_filter.py @@ -18,8 +18,8 @@ def explicit_filter(raw_query, entries, embeddings): entries_by_word_set = [set(word.lower() for word in re.split( - r',|\.| |\]|\[\(|\)|\{|\}', # split on fullstop, comma or any brackets - entry[0]) + r',|\.| |\]|\[\(|\)|\{|\}|\t|\n|\:', # split on fullstop, comma or any brackets + entry[1]) if word != "") for entry in entries] From c3b3e8959dfdad792a9109fbc6994e8922ef95c2 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 14 Jul 2022 22:00:10 +0400 Subject: [PATCH 11/14] Put entry splitting regex in explicit filter into a variable for code readability --- src/search_filter/explicit_filter.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/search_filter/explicit_filter.py b/src/search_filter/explicit_filter.py index e6a7f551..f913a820 100644 --- a/src/search_filter/explicit_filter.py +++ b/src/search_filter/explicit_filter.py @@ -15,11 +15,11 @@ def explicit_filter(raw_query, entries, embeddings): return query, entries, embeddings # convert each entry to a set of words + # split on fullstop, comma, colon, tab, newline or any brackets + entry_splitter = r',|\.| |\]|\[\(|\)|\{|\}|\t|\n|\:' entries_by_word_set = [set(word.lower() for word - in re.split( - r',|\.| |\]|\[\(|\)|\{|\}|\t|\n|\:', # split on fullstop, comma or any brackets - entry[1]) + in re.split(entry_splitter, entry[1]) if word != "") for entry in entries] From e96253a7c19d17e59a40fdea869721f91cebff36 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 14 Jul 2022 22:29:07 +0400 Subject: [PATCH 12/14] Add dateparser library to conda environment YAML --- config/environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/config/environment.yml b/config/environment.yml index 289d20d7..bf3bb82a 100644 --- a/config/environment.yml +++ b/config/environment.yml @@ -18,3 +18,4 @@ dependencies: - jinja2=3.1.2 - aiofiles=0.8.0 - huggingface_hub=0.8.1 
+ - dateparser=1.1.1 \ No newline at end of file From a60de2c02b277c0e90cee08d406a3d5348dd9387 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 14 Jul 2022 22:37:17 +0400 Subject: [PATCH 13/14] Include date filter in asymmetric search on music as well --- src/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.py b/src/main.py index b9723879..addd55d5 100644 --- a/src/main.py +++ b/src/main.py @@ -67,7 +67,7 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None): if (t == SearchType.Music or t == None) and model.music_search: # query music library - hits, entries = asymmetric.query(user_query, model.music_search, device=device, filters=[explicit_filter]) + hits, entries = asymmetric.query(user_query, model.music_search, device=device, filters=[explicit_filter, date_filter]) # collate and return results return asymmetric.collate_results(hits, entries, results_count) From 85077bc1d1651dffb69b35c67b42e2e078c62d20 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 14 Jul 2022 22:47:23 +0400 Subject: [PATCH 14/14] Handle unparseable date range passed via date filter in query - Do not reuse the same list - Just create new list, so only parsed data is in it --- src/search_filter/date_filter.py | 7 ++++--- tests/test_date_filter.py | 3 +++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/search_filter/date_filter.py b/src/search_filter/date_filter.py index 39fd28ae..81febe1e 100644 --- a/src/search_filter/date_filter.py +++ b/src/search_filter/date_filter.py @@ -63,10 +63,11 @@ def extract_date_range(query): # extract, parse natural dates ranges from date range filter passed in query # e.g today maps to (start_of_day, start_of_tomorrow) - for index, (cmp, date_str) in enumerate(date_range_matches): + date_ranges_from_filter = [] + for (cmp, date_str) in date_range_matches: if parse(date_str): dt_start, dt_end = parse(date_str) - date_range_matches[index] = [cmp, 
(dt_start.timestamp(), dt_end.timestamp())] + date_ranges_from_filter += [[cmp, (dt_start.timestamp(), dt_end.timestamp())]] # Combine dates with their comparators to form date range intervals # For e.g @@ -75,7 +76,7 @@ def extract_date_range(query): # --- effective_date_range = [0, inf] date_range_considering_comparator = [] - for cmp, (dtrange_start, dtrange_end) in date_range_matches: + for cmp, (dtrange_start, dtrange_end) in date_ranges_from_filter: if cmp == '>': date_range_considering_comparator += [[dtrange_end, inf]] elif cmp == '>=': diff --git a/tests/test_date_filter.py b/tests/test_date_filter.py index 2224b865..525f011e 100644 --- a/tests/test_date_filter.py +++ b/tests/test_date_filter.py @@ -60,6 +60,9 @@ def test_extract_date_range(): assert date_filter.extract_date_range('head dt>="1984-01-01"') == [datetime(1984, 1, 1, 0, 0, 0).timestamp(), inf] assert date_filter.extract_date_range('head dt:"1984-01-01"') == [datetime(1984, 1, 1, 0, 0, 0).timestamp(), datetime(1984, 1, 2, 0, 0, 0).timestamp()] + # Unparseable date filter specified in query + assert date_filter.extract_date_range('head dt:"Summer of 69" tail') == None + # No date filter specified in query assert date_filter.extract_date_range('head tail') == None