mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-05 13:21:18 +00:00
Initialize the word filter dictionary as a defaultdict(set) to simplify the code
This commit is contained in:
@@ -3,6 +3,7 @@ import re
|
|||||||
import time
|
import time
|
||||||
import pickle
|
import pickle
|
||||||
import logging
|
import logging
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
from src.search_filter.base_filter import BaseFilter
|
from src.search_filter.base_filter import BaseFilter
|
||||||
@@ -37,19 +38,18 @@ class WordFilter(BaseFilter):
|
|||||||
start = time.time()
|
start = time.time()
|
||||||
self.cache = {} # Clear cache on (re-)generating entries_by_word_set
|
self.cache = {} # Clear cache on (re-)generating entries_by_word_set
|
||||||
entry_splitter = r',|\.| |\]|\[\(|\)|\{|\}|\t|\n|\:'
|
entry_splitter = r',|\.| |\]|\[\(|\)|\{|\}|\t|\n|\:'
|
||||||
|
self.word_to_entry_index = defaultdict(set)
|
||||||
# Create map of words to entries they exist in
|
# Create map of words to entries they exist in
|
||||||
for entry_index, entry in enumerate(entries):
|
for entry_index, entry in enumerate(entries):
|
||||||
for word in re.split(entry_splitter, entry[self.entry_key].lower()):
|
for word in re.split(entry_splitter, entry[self.entry_key].lower()):
|
||||||
if word == '':
|
if word == '':
|
||||||
continue
|
continue
|
||||||
if word not in self.word_to_entry_index:
|
|
||||||
self.word_to_entry_index[word] = set()
|
|
||||||
self.word_to_entry_index[word].add(entry_index)
|
self.word_to_entry_index[word].add(entry_index)
|
||||||
|
|
||||||
with self.filter_file.open('wb') as f:
|
with self.filter_file.open('wb') as f:
|
||||||
pickle.dump(self.word_to_entry_index, f)
|
pickle.dump(self.word_to_entry_index, f)
|
||||||
end = time.time()
|
end = time.time()
|
||||||
logger.debug(f"Index {self.search_type} for word filter to {self.filter_file}: {end - start} seconds")
|
logger.debug(f"Indexed {len(self.word_to_entry_index)} words of {self.search_type} type for word filter to {self.filter_file}: {end - start} seconds")
|
||||||
|
|
||||||
return self.word_to_entry_index
|
return self.word_to_entry_index
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user