Use new Text Entry class to track text entries in Intermediate Format

- Context
  - The app maintains all text content in a standard, intermediate format
  - Initially, the intermediate format was loaded and passed around as a
    dictionary to allow easier, faster updates to its schema
  - The intermediate format is reasonably stable now, given its usage
    by all 3 text content types currently implemented

- Changes
  - Concretize text entries into `Entry' class instead of using dictionaries
    - Code is updated to load, pass around entries as `Entry' objects
      instead of as dictionaries
    - `text_search' and `text_to_jsonl' methods are annotated with
      type hints for the new `Entry' type
    - Code and Tests referencing entries are updated to use class style
      access patterns instead of the previous dictionary access patterns

  - Move `mark_entries_for_update' method into `TextToJsonl' base class
    - This is a more natural location for the method as it is only
      (to be) used by `text_to_jsonl' classes
    - Avoid circular reference issues on importing `Entry' class
This commit is contained in:
Debanjum Singh Solanky
2022-09-15 23:34:43 +03:00
parent 99754970ab
commit 7e9298f315
15 changed files with 161 additions and 131 deletions

View File

@@ -8,14 +8,15 @@ import torch
# Application Packages
from src.search_filter.date_filter import DateFilter
from src.utils.rawconfig import Entry
def test_date_filter():
embeddings = torch.randn(3, 10)
entries = [
{'compiled': '', 'raw': 'Entry with no date'},
{'compiled': '', 'raw': 'April Fools entry: 1984-04-01'},
{'compiled': '', 'raw': 'Entry with date:1984-04-02'}]
Entry(compiled='', raw='Entry with no date'),
Entry(compiled='', raw='April Fools entry: 1984-04-01'),
Entry(compiled='', raw='Entry with date:1984-04-02')
]
q_with_no_date_filter = 'head tail'
ret_query, entry_indices = DateFilter().apply(q_with_no_date_filter, entries)

View File

@@ -3,6 +3,7 @@ import torch
# Application Packages
from src.search_filter.file_filter import FileFilter
from src.utils.rawconfig import Entry
def test_no_file_filter():
@@ -104,9 +105,10 @@ def test_multiple_file_filter():
def arrange_content():
embeddings = torch.randn(4, 10)
entries = [
{'compiled': '', 'raw': 'First Entry', 'file': 'file 1.org'},
{'compiled': '', 'raw': 'Second Entry', 'file': 'file2.org'},
{'compiled': '', 'raw': 'Third Entry', 'file': 'file 1.org'},
{'compiled': '', 'raw': 'Fourth Entry', 'file': 'file2.org'}]
Entry(compiled='', raw='First Entry', file= 'file 1.org'),
Entry(compiled='', raw='Second Entry', file= 'file2.org'),
Entry(compiled='', raw='Third Entry', file= 'file 1.org'),
Entry(compiled='', raw='Fourth Entry', file= 'file2.org')
]
return embeddings, entries
return entries

View File

@@ -70,7 +70,7 @@ def test_image_search(content_config: ContentConfig, search_config: SearchConfig
image_files_url='/static/images',
count=1)
actual_image_path = output_directory.joinpath(Path(results[0]["entry"]).name)
actual_image_path = output_directory.joinpath(Path(results[0].entry).name)
actual_image = Image.open(actual_image_path)
expected_image = Image.open(content_config.image.input_directories[0].joinpath(expected_image_name))

View File

@@ -76,7 +76,7 @@ def test_asymmetric_search(content_config: ContentConfig, search_config: SearchC
# Assert
# Actual_data should contain "Khoj via Emacs" entry
search_result = results[0]["entry"]
search_result = results[0].entry
assert "git clone" in search_result

View File

@@ -1,6 +1,7 @@
# Application Packages
from src.search_filter.word_filter import WordFilter
from src.utils.config import SearchType
from src.utils.rawconfig import Entry
def test_no_word_filter():
@@ -69,9 +70,10 @@ def test_word_include_and_exclude_filter():
def arrange_content():
entries = [
{'compiled': '', 'raw': 'Minimal Entry'},
{'compiled': '', 'raw': 'Entry with exclude_word'},
{'compiled': '', 'raw': 'Entry with include_word'},
{'compiled': '', 'raw': 'Entry with include_word and exclude_word'}]
Entry(compiled='', raw='Minimal Entry'),
Entry(compiled='', raw='Entry with exclude_word'),
Entry(compiled='', raw='Entry with include_word'),
Entry(compiled='', raw='Entry with include_word and exclude_word')
]
return entries