Use new Text Entry class to track text entries in Intermediate Format

- Context - The app maintains all text content in a standard, intermediate format - The intermediate format was initially loaded and passed around as a dictionary for easier, faster updates to the intermediate format schema - The intermediate format is reasonably stable now, given its usage by all 3 text content types currently implemented - Changes - Concretize text entries into `Entries' class instead of using dictionaries - Code is updated to load and pass around entries as `Entries' objects instead of as dictionaries - `text_search' and `text_to_jsonl' methods are annotated with type hints for the new `Entries' type - Code and Tests referencing entries are updated to use class style access patterns instead of the previous dictionary access patterns - Move `mark_entries_for_update' method into `TextToJsonl' base class - This is a more natural location for the method as it is only (to be) used by `text_to_jsonl' classes - Avoid circular reference issues on importing `Entries' class
2026-03-05 21:29:11 +00:00 · 2022-09-15 23:34:43 +03:00
parent 99754970ab
commit 7e9298f315
15 changed files with 161 additions and 131 deletions
--- a/src/search_filter/date_filter.py
+++ b/src/search_filter/date_filter.py
@@ -37,7 +37,7 @@ class DateFilter(BaseFilter):
        start = time.time()
        for id, entry in enumerate(entries):
            # Extract dates from entry
-            for date_in_entry_string in re.findall(r'\d{4}-\d{2}-\d{2}', entry[self.entry_key]):
+            for date_in_entry_string in re.findall(r'\d{4}-\d{2}-\d{2}', getattr(entry, self.entry_key)):
                # Convert date string in entry to unix timestamp
                try:
                    date_in_entry = datetime.strptime(date_in_entry_string, '%Y-%m-%d').timestamp()
--- a/src/search_filter/file_filter.py
+++ b/src/search_filter/file_filter.py
@@ -24,7 +24,7 @@ class FileFilter(BaseFilter):
    def load(self, entries, *args, **kwargs):
        start = time.time()
        for id, entry in enumerate(entries):
-            self.file_to_entry_map[entry[self.entry_key]].add(id)
+            self.file_to_entry_map[getattr(entry, self.entry_key)].add(id)
        end = time.time()
        logger.debug(f"Created file filter index: {end - start} seconds")

--- a/src/search_filter/word_filter.py
+++ b/src/search_filter/word_filter.py
@@ -29,7 +29,7 @@ class WordFilter(BaseFilter):
        entry_splitter = r',|\.| |\]|\[\(|\)|\{|\}|\<|\>|\t|\n|\:|\;|\?|\!|\(|\)|\&|\^|\$|\@|\%|\+|\=|\/|\\|\||\~|\`|\"|\''
        # Create map of words to entries they exist in
        for entry_index, entry in enumerate(entries):
-            for word in re.split(entry_splitter, entry[self.entry_key].lower()):
+            for word in re.split(entry_splitter, getattr(entry, self.entry_key).lower()):
                if word == '':
                    continue
                self.word_to_entry_index[word].add(entry_index)