Add method to split entries by specified max tokens limit

- Issue
   ML models truncate entries that exceed their max token limit.
   This lowers the quality of search results.

- Fix
  Split entries by max tokens before indexing.
  This should improve searching for content in longer entries.

- Miscellaneous
  - Add test for the method that splits entries by max tokens
This commit is contained in:
Debanjum Singh Solanky
2022-12-23 15:45:53 -03:00
parent d3e175370f
commit e057c8e208
2 changed files with 42 additions and 0 deletions

View File

@@ -3,6 +3,7 @@ import json
# Internal Packages
from src.processor.org_mode.org_to_jsonl import OrgToJsonl
from src.processor.text_to_jsonl import TextToJsonl
from src.utils.helpers import is_none_or_empty
@@ -35,6 +36,34 @@ def test_configure_heading_entry_to_jsonl(tmp_path):
assert is_none_or_empty(jsonl_data)
def test_entry_split_when_exceeds_max_words(tmp_path):
    """Check that an Org entry is split into several entries by the max token limit.

    Feeds one Org entry through extraction, entry conversion, splitting with
    ``max_tokens=2`` and JSONL serialization, then expects exactly two JSONL
    records to come out.
    """
    # Arrange
    # A single Org heading with a :PROPERTIES: block and one line of body text
    entry = f'''*** Heading
:PROPERTIES:
:ID: 42-42-42
:END:
\t\r
Body Line 1
'''
    orgfile = create_file(tmp_path, entry)

    # Act
    # Parse the Org file into nodes plus the node-to-file mapping
    org_nodes, node_to_file_map = OrgToJsonl.extract_org_entries(org_files=[orgfile])

    # Convert the nodes to entries, then split them using a limit of 2 tokens
    converted_entries = OrgToJsonl.convert_org_nodes_to_entries(org_nodes, node_to_file_map)
    split_entries = TextToJsonl.split_entries_by_max_tokens(converted_entries, max_tokens=2)

    # Serialize the split entries and decode every JSONL line back into an object
    jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(split_entries)
    jsonl_data = [json.loads(jsonl_line) for jsonl_line in jsonl_string.splitlines()]

    # Assert
    assert len(jsonl_data) == 2
def test_entry_with_body_to_jsonl(tmp_path):
"Ensure entries with valid body text are loaded."
# Arrange