mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-06 05:39:12 +00:00
Pre-compute a file-to-entry map in the file filter to mark the entry ids to include faster
This commit is contained in:
@@ -3,6 +3,7 @@ import re
|
|||||||
import fnmatch
|
import fnmatch
|
||||||
import time
|
import time
|
||||||
import logging
|
import logging
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
# External Packages
|
# External Packages
|
||||||
import torch
|
import torch
|
||||||
@@ -20,10 +21,15 @@ class FileFilter(BaseFilter):
|
|||||||
|
|
||||||
def __init__(self, entry_key='file'):
|
def __init__(self, entry_key='file'):
|
||||||
self.entry_key = entry_key
|
self.entry_key = entry_key
|
||||||
|
self.file_to_entry_map = defaultdict(set)
|
||||||
self.cache = LRU()
|
self.cache = LRU()
|
||||||
|
|
||||||
def load(self, *args, **kwargs):
|
def load(self, entries, *args, **kwargs):
|
||||||
pass
|
start = time.time()
|
||||||
|
for id, entry in enumerate(entries):
|
||||||
|
self.file_to_entry_map[entry[self.entry_key]].add(id)
|
||||||
|
end = time.time()
|
||||||
|
logger.debug(f"Created file filter index: {end - start} seconds")
|
||||||
|
|
||||||
def can_filter(self, raw_query):
|
def can_filter(self, raw_query):
|
||||||
return re.search(self.file_filter_regex, raw_query) is not None
|
return re.search(self.file_filter_regex, raw_query) is not None
|
||||||
@@ -57,7 +63,10 @@ class FileFilter(BaseFilter):
|
|||||||
# Mark entries whose file matches any of the files to search for inclusion
|
# Mark entries whose file matches any of the files to search for inclusion
|
||||||
start = time.time()
|
start = time.time()
|
||||||
|
|
||||||
included_entry_indices = [id for id, entry in enumerate(raw_entries) for search_file in files_to_search if fnmatch.fnmatch(entry[self.entry_key], search_file)]
|
included_entry_indices = set.union(*[self.file_to_entry_map[entry_file]
|
||||||
|
for entry_file in self.file_to_entry_map.keys()
|
||||||
|
for search_file in files_to_search
|
||||||
|
if fnmatch.fnmatch(entry_file, search_file)], set())
|
||||||
if not included_entry_indices:
|
if not included_entry_indices:
|
||||||
return query, [], torch.empty(0)
|
return query, [], torch.empty(0)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user