mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-05 21:29:11 +00:00
Create and use a context manager to time code
Use the timer context manager in all places where code was being timed - Benefits - Deduplicate timing code scattered across codebase. - Provides single place to manage perf timing code - Use consistent timing log patterns
This commit is contained in:
@@ -12,7 +12,7 @@ import dateparser as dtparse
|
||||
|
||||
# Internal Packages
|
||||
from src.search_filter.base_filter import BaseFilter
|
||||
from src.utils.helpers import LRU
|
||||
from src.utils.helpers import LRU, timer
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -34,19 +34,16 @@ class DateFilter(BaseFilter):
|
||||
|
||||
|
||||
def load(self, entries, *args, **kwargs):
|
||||
start = time.time()
|
||||
for id, entry in enumerate(entries):
|
||||
# Extract dates from entry
|
||||
for date_in_entry_string in re.findall(r'\d{4}-\d{2}-\d{2}', getattr(entry, self.entry_key)):
|
||||
# Convert date string in entry to unix timestamp
|
||||
try:
|
||||
date_in_entry = datetime.strptime(date_in_entry_string, '%Y-%m-%d').timestamp()
|
||||
except ValueError:
|
||||
continue
|
||||
self.date_to_entry_ids[date_in_entry].add(id)
|
||||
end = time.time()
|
||||
logger.debug(f"Created date filter index: {end - start} seconds")
|
||||
|
||||
with timer("Created date filter index", logger):
|
||||
for id, entry in enumerate(entries):
|
||||
# Extract dates from entry
|
||||
for date_in_entry_string in re.findall(r'\d{4}-\d{2}-\d{2}', getattr(entry, self.entry_key)):
|
||||
# Convert date string in entry to unix timestamp
|
||||
try:
|
||||
date_in_entry = datetime.strptime(date_in_entry_string, '%Y-%m-%d').timestamp()
|
||||
except ValueError:
|
||||
continue
|
||||
self.date_to_entry_ids[date_in_entry].add(id)
|
||||
|
||||
def can_filter(self, raw_query):
|
||||
"Check if query contains date filters"
|
||||
@@ -56,10 +53,8 @@ class DateFilter(BaseFilter):
|
||||
def apply(self, query, entries):
|
||||
"Find entries containing any dates that fall within date range specified in query"
|
||||
# extract date range specified in date filter of query
|
||||
start = time.time()
|
||||
query_daterange = self.extract_date_range(query)
|
||||
end = time.time()
|
||||
logger.debug(f"Extract date range to filter from query: {end - start} seconds")
|
||||
with timer("Extract date range to filter from query", logger):
|
||||
query_daterange = self.extract_date_range(query)
|
||||
|
||||
# if no date in query, return all entries
|
||||
if query_daterange is None:
|
||||
@@ -80,14 +75,12 @@ class DateFilter(BaseFilter):
|
||||
self.load(entries)
|
||||
|
||||
# find entries containing any dates that fall with date range specified in query
|
||||
start = time.time()
|
||||
entries_to_include = set()
|
||||
for date_in_entry in self.date_to_entry_ids.keys():
|
||||
# Check if date in entry is within date range specified in query
|
||||
if query_daterange[0] <= date_in_entry < query_daterange[1]:
|
||||
entries_to_include |= self.date_to_entry_ids[date_in_entry]
|
||||
end = time.time()
|
||||
logger.debug(f"Mark entries satisfying filter: {end - start} seconds")
|
||||
with timer("Mark entries satisfying filter", logger):
|
||||
entries_to_include = set()
|
||||
for date_in_entry in self.date_to_entry_ids.keys():
|
||||
# Check if date in entry is within date range specified in query
|
||||
if query_daterange[0] <= date_in_entry < query_daterange[1]:
|
||||
entries_to_include |= self.date_to_entry_ids[date_in_entry]
|
||||
|
||||
# cache results
|
||||
self.cache[cache_key] = entries_to_include
|
||||
|
||||
@@ -7,7 +7,7 @@ from collections import defaultdict
|
||||
|
||||
# Internal Packages
|
||||
from src.search_filter.base_filter import BaseFilter
|
||||
from src.utils.helpers import LRU
|
||||
from src.utils.helpers import LRU, timer
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -22,32 +22,28 @@ class FileFilter(BaseFilter):
|
||||
self.cache = LRU()
|
||||
|
||||
def load(self, entries, *args, **kwargs):
|
||||
start = time.time()
|
||||
for id, entry in enumerate(entries):
|
||||
self.file_to_entry_map[getattr(entry, self.entry_key)].add(id)
|
||||
end = time.time()
|
||||
logger.debug(f"Created file filter index: {end - start} seconds")
|
||||
with timer("Created file filter index", logger):
|
||||
for id, entry in enumerate(entries):
|
||||
self.file_to_entry_map[getattr(entry, self.entry_key)].add(id)
|
||||
|
||||
def can_filter(self, raw_query):
|
||||
return re.search(self.file_filter_regex, raw_query) is not None
|
||||
|
||||
def apply(self, query, entries):
|
||||
# Extract file filters from raw query
|
||||
start = time.time()
|
||||
raw_files_to_search = re.findall(self.file_filter_regex, query)
|
||||
if not raw_files_to_search:
|
||||
return query, set(range(len(entries)))
|
||||
with timer("Extract files_to_search from query", logger):
|
||||
raw_files_to_search = re.findall(self.file_filter_regex, query)
|
||||
if not raw_files_to_search:
|
||||
return query, set(range(len(entries)))
|
||||
|
||||
# Convert simple file filters with no path separator into regex
|
||||
# e.g. "file:notes.org" -> "file:.*notes.org"
|
||||
files_to_search = []
|
||||
for file in sorted(raw_files_to_search):
|
||||
if '/' not in file and '\\' not in file and '*' not in file:
|
||||
files_to_search += [f'*{file}']
|
||||
else:
|
||||
files_to_search += [file]
|
||||
end = time.time()
|
||||
logger.debug(f"Extract files_to_search from query: {end - start} seconds")
|
||||
# Convert simple file filters with no path separator into regex
|
||||
# e.g. "file:notes.org" -> "file:.*notes.org"
|
||||
files_to_search = []
|
||||
for file in sorted(raw_files_to_search):
|
||||
if '/' not in file and '\\' not in file and '*' not in file:
|
||||
files_to_search += [f'*{file}']
|
||||
else:
|
||||
files_to_search += [file]
|
||||
|
||||
# Return item from cache if exists
|
||||
query = re.sub(self.file_filter_regex, '', query).strip()
|
||||
@@ -61,17 +57,13 @@ class FileFilter(BaseFilter):
|
||||
self.load(entries, regenerate=False)
|
||||
|
||||
# Mark entries that contain any blocked_words for exclusion
|
||||
start = time.time()
|
||||
|
||||
included_entry_indices = set.union(*[self.file_to_entry_map[entry_file]
|
||||
for entry_file in self.file_to_entry_map.keys()
|
||||
for search_file in files_to_search
|
||||
if fnmatch.fnmatch(entry_file, search_file)], set())
|
||||
if not included_entry_indices:
|
||||
return query, {}
|
||||
|
||||
end = time.time()
|
||||
logger.debug(f"Mark entries satisfying filter: {end - start} seconds")
|
||||
with timer("Mark entries satisfying filter", logger):
|
||||
included_entry_indices = set.union(*[self.file_to_entry_map[entry_file]
|
||||
for entry_file in self.file_to_entry_map.keys()
|
||||
for search_file in files_to_search
|
||||
if fnmatch.fnmatch(entry_file, search_file)], set())
|
||||
if not included_entry_indices:
|
||||
return query, {}
|
||||
|
||||
# Cache results
|
||||
self.cache[cache_key] = included_entry_indices
|
||||
|
||||
@@ -6,7 +6,7 @@ from collections import defaultdict
|
||||
|
||||
# Internal Packages
|
||||
from src.search_filter.base_filter import BaseFilter
|
||||
from src.utils.helpers import LRU
|
||||
from src.utils.helpers import LRU, timer
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -24,17 +24,15 @@ class WordFilter(BaseFilter):
|
||||
|
||||
|
||||
def load(self, entries, *args, **kwargs):
|
||||
start = time.time()
|
||||
self.cache = {} # Clear cache on filter (re-)load
|
||||
entry_splitter = r',|\.| |\]|\[\(|\)|\{|\}|\<|\>|\t|\n|\:|\;|\?|\!|\(|\)|\&|\^|\$|\@|\%|\+|\=|\/|\\|\||\~|\`|\"|\''
|
||||
# Create map of words to entries they exist in
|
||||
for entry_index, entry in enumerate(entries):
|
||||
for word in re.split(entry_splitter, getattr(entry, self.entry_key).lower()):
|
||||
if word == '':
|
||||
continue
|
||||
self.word_to_entry_index[word].add(entry_index)
|
||||
end = time.time()
|
||||
logger.debug(f"Created word filter index: {end - start} seconds")
|
||||
with timer("Created word filter index", logger):
|
||||
self.cache = {} # Clear cache on filter (re-)load
|
||||
entry_splitter = r',|\.| |\]|\[\(|\)|\{|\}|\<|\>|\t|\n|\:|\;|\?|\!|\(|\)|\&|\^|\$|\@|\%|\+|\=|\/|\\|\||\~|\`|\"|\''
|
||||
# Create map of words to entries they exist in
|
||||
for entry_index, entry in enumerate(entries):
|
||||
for word in re.split(entry_splitter, getattr(entry, self.entry_key).lower()):
|
||||
if word == '':
|
||||
continue
|
||||
self.word_to_entry_index[word].add(entry_index)
|
||||
|
||||
return self.word_to_entry_index
|
||||
|
||||
@@ -50,14 +48,10 @@ class WordFilter(BaseFilter):
|
||||
def apply(self, query, entries):
|
||||
"Find entries containing required and not blocked words specified in query"
|
||||
# Separate natural query from required, blocked words filters
|
||||
start = time.time()
|
||||
|
||||
required_words = set([word.lower() for word in re.findall(self.required_regex, query)])
|
||||
blocked_words = set([word.lower() for word in re.findall(self.blocked_regex, query)])
|
||||
query = re.sub(self.blocked_regex, '', re.sub(self.required_regex, '', query)).strip()
|
||||
|
||||
end = time.time()
|
||||
logger.debug(f"Extract required, blocked filters from query: {end - start} seconds")
|
||||
with timer("Extract required, blocked filters from query", logger):
|
||||
required_words = set([word.lower() for word in re.findall(self.required_regex, query)])
|
||||
blocked_words = set([word.lower() for word in re.findall(self.blocked_regex, query)])
|
||||
query = re.sub(self.blocked_regex, '', re.sub(self.required_regex, '', query)).strip()
|
||||
|
||||
if len(required_words) == 0 and len(blocked_words) == 0:
|
||||
return query, set(range(len(entries)))
|
||||
@@ -72,20 +66,16 @@ class WordFilter(BaseFilter):
|
||||
if not self.word_to_entry_index:
|
||||
self.load(entries, regenerate=False)
|
||||
|
||||
start = time.time()
|
||||
|
||||
# mark entries that contain all required_words for inclusion
|
||||
entries_with_all_required_words = set(range(len(entries)))
|
||||
if len(required_words) > 0:
|
||||
entries_with_all_required_words = set.intersection(*[self.word_to_entry_index.get(word, set()) for word in required_words])
|
||||
with timer("Mark entries satisfying filter", logger):
|
||||
entries_with_all_required_words = set(range(len(entries)))
|
||||
if len(required_words) > 0:
|
||||
entries_with_all_required_words = set.intersection(*[self.word_to_entry_index.get(word, set()) for word in required_words])
|
||||
|
||||
# mark entries that contain any blocked_words for exclusion
|
||||
entries_with_any_blocked_words = set()
|
||||
if len(blocked_words) > 0:
|
||||
entries_with_any_blocked_words = set.union(*[self.word_to_entry_index.get(word, set()) for word in blocked_words])
|
||||
|
||||
end = time.time()
|
||||
logger.debug(f"Mark entries satisfying filter: {end - start} seconds")
|
||||
# mark entries that contain any blocked_words for exclusion
|
||||
entries_with_any_blocked_words = set()
|
||||
if len(blocked_words) > 0:
|
||||
entries_with_any_blocked_words = set.union(*[self.word_to_entry_index.get(word, set()) for word in blocked_words])
|
||||
|
||||
# get entries satisfying inclusion and exclusion filters
|
||||
included_entry_indices = entries_with_all_required_words - entries_with_any_blocked_words
|
||||
|
||||
Reference in New Issue
Block a user