Use new Text Entry class to track text entries in Intermediate Format

- Context - The app maintains all text content in a standard, intermediate format - The intermediate format was loaded, passed around as a dictionary for easier, faster updates to the intermediate format schema initially - The intermediate format is reasonably stable now, given its usage by all 3 text content types currently implemented - Changes - Concretize text entries into `Entries' class instead of using dictionaries - Code is updated to load, pass around entries as `Entries' objects instead of as dictionaries - `text_search' and `text_to_jsonl' methods are annotated with type hints for the new `Entries' type - Code and Tests referencing entries are updated to use class-style access patterns instead of the previous dictionary access patterns - Move `mark_entries_for_update' method into `TextToJsonl' base class - This is a more natural location for the method as it is only (to be) used by `text_to_jsonl' classes - Avoid circular reference issues on importing `Entries' class
2026-03-04 21:29:12 +00:00 · 2022-09-15 23:34:43 +03:00
parent 99754970ab
commit 7e9298f315
15 changed files with 161 additions and 131 deletions
--- a/src/utils/helpers.py
+++ b/src/utils/helpers.py
@@ -1,8 +1,6 @@
 # Standard Packages
 from pathlib import Path
 import sys
-import time
-import hashlib
 from os.path import join
 from collections import OrderedDict
 from typing import Optional, Union
@@ -83,38 +81,3 @@ class LRU(OrderedDict):
            oldest = next(iter(self))
            del self[oldest]

-
-def mark_entries_for_update(current_entries, previous_entries, key='compiled', logger=None):
-    # Hash all current and previous entries to identify new entries
-    start = time.time()
-    current_entry_hashes = list(map(lambda e: hashlib.md5(bytes(e[key], encoding='utf-8')).hexdigest(), current_entries))
-    previous_entry_hashes = list(map(lambda e: hashlib.md5(bytes(e[key], encoding='utf-8')).hexdigest(), previous_entries))
-    end = time.time()
-    logger.debug(f"Hash previous, current entries: {end - start} seconds")
-
-    start = time.time()
-    hash_to_current_entries = dict(zip(current_entry_hashes, current_entries))
-    hash_to_previous_entries = dict(zip(previous_entry_hashes, previous_entries))
-
-    # All entries that did not exist in the previous set are to be added
-    new_entry_hashes = set(current_entry_hashes) - set(previous_entry_hashes)
-    # All entries that exist in both current and previous sets are kept
-    existing_entry_hashes = set(current_entry_hashes) & set(previous_entry_hashes)
-
-    # Mark new entries with no ids for later embeddings generation
-    new_entries = [
-        (None, hash_to_current_entries[entry_hash])
-        for entry_hash in new_entry_hashes
-    ]
-    # Set id of existing entries to their previous ids to reuse their existing encoded embeddings
-    existing_entries = [
-        (previous_entry_hashes.index(entry_hash), hash_to_previous_entries[entry_hash])
-        for entry_hash in existing_entry_hashes
-    ]
-
-    existing_entries_sorted = sorted(existing_entries, key=lambda e: e[0])
-    entries_with_ids = existing_entries_sorted + new_entries
-    end = time.time()
-    logger.debug(f"Identify, Mark, Combine new, existing entries: {end - start} seconds")
-
-    return entries_with_ids