Improve date filter perf. Precompute date to entry map, Cache results

- Precompute date to entry map
- Cache results for faster recall
- Log performance timers in date filter
This commit is contained in:
Debanjum Singh Solanky
2022-09-05 18:21:29 +03:00
parent 31503e7afd
commit 3707a4cdd4

View File

@@ -1,33 +1,51 @@
# Standard Packages # Standard Packages
import re import re
import time
import logging
from collections import defaultdict
from datetime import timedelta, datetime from datetime import timedelta, datetime
from dateutil.relativedelta import relativedelta from dateutil.relativedelta import relativedelta
from math import inf from math import inf
from copy import deepcopy
# External Packages # External Packages
import torch
import dateparser as dtparse import dateparser as dtparse
# Internal Packages # Internal Packages
from src.search_filter.base_filter import BaseFilter from src.search_filter.base_filter import BaseFilter
from src.utils.helpers import LRU
logger = logging.getLogger(__name__)
class DateFilter(BaseFilter): class DateFilter(BaseFilter):
# Date Range Filter Regexes # Date Range Filter Regexes
# Example filter queries: # Example filter queries:
# - dt>="yesterday" dt<"tomorrow" # - dt>="yesterday" dt<"tomorrow"
# - dt>="last week" # - dt>="last week"
# - dt:"2 years ago" # - dt:"2 years ago"
date_regex = r"dt([:><=]{1,2})\"(.*?)\"" date_regex = r"dt([:><=]{1,2})\"(.*?)\""
def __init__(self, entry_key='raw'): def __init__(self, entry_key='raw'):
self.entry_key = entry_key self.entry_key = entry_key
self.date_to_entry_ids = defaultdict(set)
self.cache = LRU()
def load(*args, **kwargs): def load(self, entries, **_):
pass start = time.time()
for id, entry in enumerate(entries):
# Extract dates from entry
for date_in_entry_string in re.findall(r'\d{4}-\d{2}-\d{2}', entry[self.entry_key]):
# Convert date string in entry to unix timestamp
try:
date_in_entry = datetime.strptime(date_in_entry_string, '%Y-%m-%d').timestamp()
except ValueError:
continue
self.date_to_entry_ids[date_in_entry].add(id)
end = time.time()
logger.debug(f"Created file filter index: {end - start} seconds")
def can_filter(self, raw_query): def can_filter(self, raw_query):
@@ -38,7 +56,10 @@ class DateFilter(BaseFilter):
def apply(self, query, raw_entries): def apply(self, query, raw_entries):
"Find entries containing any dates that fall within date range specified in query" "Find entries containing any dates that fall within date range specified in query"
# extract date range specified in date filter of query # extract date range specified in date filter of query
start = time.time()
query_daterange = self.extract_date_range(query) query_daterange = self.extract_date_range(query)
end = time.time()
logger.debug(f"Extract date range to filter from query: {end - start} seconds")
# if no date in query, return all entries # if no date in query, return all entries
if query_daterange is None: if query_daterange is None:
@@ -48,20 +69,28 @@ class DateFilter(BaseFilter):
query = re.sub(rf'\s+{self.date_regex}', ' ', query) query = re.sub(rf'\s+{self.date_regex}', ' ', query)
query = re.sub(r'\s{2,}', ' ', query).strip() # remove multiple spaces query = re.sub(r'\s{2,}', ' ', query).strip() # remove multiple spaces
# return results from cache if exists
cache_key = tuple(query_daterange)
if cache_key in self.cache:
logger.info(f"Return date filter results from cache")
entries_to_include = self.cache[cache_key]
return query, entries_to_include
if not self.date_to_entry_ids:
self.load(raw_entries)
# find entries containing any dates that fall with date range specified in query # find entries containing any dates that fall with date range specified in query
start = time.time()
entries_to_include = set() entries_to_include = set()
for id, entry in enumerate(raw_entries): for date_in_entry in self.date_to_entry_ids.keys():
# Extract dates from entry # Check if date in entry is within date range specified in query
for date_in_entry_string in re.findall(r'\d{4}-\d{2}-\d{2}', entry[self.entry_key]): if query_daterange[0] <= date_in_entry < query_daterange[1]:
# Convert date string in entry to unix timestamp entries_to_include |= self.date_to_entry_ids[date_in_entry]
try: end = time.time()
date_in_entry = datetime.strptime(date_in_entry_string, '%Y-%m-%d').timestamp() logger.debug(f"Mark entries satisfying filter: {end - start} seconds")
except ValueError:
continue # cache results
# Check if date in entry is within date range specified in query self.cache[cache_key] = entries_to_include
if query_daterange[0] <= date_in_entry < query_daterange[1]:
entries_to_include.add(id)
break
return query, entries_to_include return query, entries_to_include