mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-05 21:29:11 +00:00
Use new Text Entry class to track text entries in Intermediate Format
- Context
- The app maintains all text content in a standard, intermediate format
- The intermediate format was loaded, passed around as a dictionary
for easier, faster updates to the intermediate format schema initially
- The intermediate format is reasonably stable now, given it's usage
by all 3 text content types currently implemented
- Changes
- Concretize text entries into `Entries' class instead of using dictionaries
- Code is updated to load, pass around entries as `Entries' objects
instead of as dictionaries
- `text_search' and `text_to_jsonl' methods are annotated with
type hints for the new `Entries' type
- Code and Tests referencing entries are updated to use class style
access patterns instead of the previous dictionary access patterns
- Move `mark_entries_for_update' method into `TextToJsonl' base class
- This is a more natural location for the method as it is only
(to be) used by `text_to_jsonl' classes
- Avoid circular reference issues on importing `Entries' class
This commit is contained in:
@@ -37,7 +37,7 @@ class DateFilter(BaseFilter):
|
||||
start = time.time()
|
||||
for id, entry in enumerate(entries):
|
||||
# Extract dates from entry
|
||||
for date_in_entry_string in re.findall(r'\d{4}-\d{2}-\d{2}', entry[self.entry_key]):
|
||||
for date_in_entry_string in re.findall(r'\d{4}-\d{2}-\d{2}', getattr(entry, self.entry_key)):
|
||||
# Convert date string in entry to unix timestamp
|
||||
try:
|
||||
date_in_entry = datetime.strptime(date_in_entry_string, '%Y-%m-%d').timestamp()
|
||||
|
||||
@@ -24,7 +24,7 @@ class FileFilter(BaseFilter):
|
||||
def load(self, entries, *args, **kwargs):
|
||||
start = time.time()
|
||||
for id, entry in enumerate(entries):
|
||||
self.file_to_entry_map[entry[self.entry_key]].add(id)
|
||||
self.file_to_entry_map[getattr(entry, self.entry_key)].add(id)
|
||||
end = time.time()
|
||||
logger.debug(f"Created file filter index: {end - start} seconds")
|
||||
|
||||
|
||||
@@ -29,7 +29,7 @@ class WordFilter(BaseFilter):
|
||||
entry_splitter = r',|\.| |\]|\[\(|\)|\{|\}|\<|\>|\t|\n|\:|\;|\?|\!|\(|\)|\&|\^|\$|\@|\%|\+|\=|\/|\\|\||\~|\`|\"|\''
|
||||
# Create map of words to entries they exist in
|
||||
for entry_index, entry in enumerate(entries):
|
||||
for word in re.split(entry_splitter, entry[self.entry_key].lower()):
|
||||
for word in re.split(entry_splitter, getattr(entry, self.entry_key).lower()):
|
||||
if word == '':
|
||||
continue
|
||||
self.word_to_entry_index[word].add(entry_index)
|
||||
|
||||
Reference in New Issue
Block a user