Use new Text Entry class to track text entries in Intermediate Format

- Context
  - The app maintains all text content in a standard, intermediate format
  - The intermediate format was loaded, passed around as a dictionary
    for easier, faster updates to the intermediate format schema initially
  - The intermediate format is reasonably stable now, given it's usage
    by all 3 text content types currently implemented

- Changes
  - Concretize text entries into `Entries' class instead of using dictionaries
    - Code is updated to load, pass around entries as `Entries' objects
      instead of as dictionaries
    - `text_search' and `text_to_jsonl' methods are annotated with
       type hints for the new `Entries' type
    - Code and Tests referencing entries are updated to use class style
      access patterns instead of the previous dictionary access patterns

  - Move `mark_entries_for_update' method into `TextToJsonl' base class
    - This is a more natural location for the method as it is only
      (to be) used by `text_to_jsonl' classes
    - Avoid circular reference issues on importing `Entries' class
This commit is contained in:
Debanjum Singh Solanky
2022-09-15 23:34:43 +03:00
parent 99754970ab
commit 7e9298f315
15 changed files with 161 additions and 131 deletions

View File

@@ -1,8 +1,6 @@
# Standard Packages
from pathlib import Path
import sys
import time
import hashlib
from os.path import join
from collections import OrderedDict
from typing import Optional, Union
@@ -83,38 +81,3 @@ class LRU(OrderedDict):
oldest = next(iter(self))
del self[oldest]
def mark_entries_for_update(current_entries, previous_entries, key='compiled', logger=None):
# Hash all current and previous entries to identify new entries
start = time.time()
current_entry_hashes = list(map(lambda e: hashlib.md5(bytes(e[key], encoding='utf-8')).hexdigest(), current_entries))
previous_entry_hashes = list(map(lambda e: hashlib.md5(bytes(e[key], encoding='utf-8')).hexdigest(), previous_entries))
end = time.time()
logger.debug(f"Hash previous, current entries: {end - start} seconds")
start = time.time()
hash_to_current_entries = dict(zip(current_entry_hashes, current_entries))
hash_to_previous_entries = dict(zip(previous_entry_hashes, previous_entries))
# All entries that did not exist in the previous set are to be added
new_entry_hashes = set(current_entry_hashes) - set(previous_entry_hashes)
# All entries that exist in both current and previous sets are kept
existing_entry_hashes = set(current_entry_hashes) & set(previous_entry_hashes)
# Mark new entries with no ids for later embeddings generation
new_entries = [
(None, hash_to_current_entries[entry_hash])
for entry_hash in new_entry_hashes
]
# Set id of existing entries to their previous ids to reuse their existing encoded embeddings
existing_entries = [
(previous_entry_hashes.index(entry_hash), hash_to_previous_entries[entry_hash])
for entry_hash in existing_entry_hashes
]
existing_entries_sorted = sorted(existing_entries, key=lambda e: e[0])
entries_with_ids = existing_entries_sorted + new_entries
end = time.time()
logger.debug(f"Identify, Mark, Combine new, existing entries: {end - start} seconds")
return entries_with_ids

View File

@@ -1,4 +1,5 @@
# System Packages
import json
from pathlib import Path
from typing import List, Optional
@@ -75,4 +76,28 @@ class FullConfig(ConfigBase):
class SearchResponse(ConfigBase):
entry: str
score: str
additional: Optional[dict]
additional: Optional[dict]
class Entry():
raw: str
compiled: str
file: Optional[str]
def __init__(self, raw: str = None, compiled: str = None, file: Optional[str] = None):
self.raw = raw
self.compiled = compiled
self.file = file
def to_json(self) -> str:
return json.dumps(self.__dict__, ensure_ascii=False)
def __repr__(self) -> str:
return self.__dict__.__repr__()
@classmethod
def from_dict(cls, dictionary: dict):
return cls(
raw=dictionary['raw'],
compiled=dictionary['compiled'],
file=dictionary.get('file', None)
)