Add method to split entries by specified max tokens limit

- Issue: ML models truncate entries that exceed some max token limit. This lowers the quality of search results.
- Fix: Split entries by max tokens before indexing. This should improve searching for content in longer entries.
- Miscellaneous: Test the method that splits entries by max tokens.
2026-03-02 21:19:12 +00:00 · 2022-12-23 15:45:53 -03:00
parent d3e175370f
commit e057c8e208
2 changed files with 42 additions and 0 deletions
--- a/tests/test_org_to_jsonl.py
+++ b/tests/test_org_to_jsonl.py
@@ -3,6 +3,7 @@ import json

 # Internal Packages
 from src.processor.org_mode.org_to_jsonl import OrgToJsonl
+from src.processor.text_to_jsonl import TextToJsonl
 from src.utils.helpers import is_none_or_empty


@@ -35,6 +36,34 @@ def test_configure_heading_entry_to_jsonl(tmp_path):
            assert is_none_or_empty(jsonl_data)


+def test_entry_split_when_exceeds_max_words(tmp_path):
+    "Ensure an org entry passed through split_entries_by_max_tokens with max_tokens=2 yields two JSONL entries."
+    # Arrange: write a single org heading with a properties drawer and a one-line body
+    entry = f'''*** Heading
+    :PROPERTIES:
+    :ID:       42-42-42
+    :END:
+    \t\r
+    Body Line 1
+    '''
+    orgfile = create_file(tmp_path, entry)
+
+    # Act
+    # Extract entries (and the entry-to-file map) from the org file created above
+    entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=[orgfile])
+
+    # Convert the org nodes to entries, split them with max_tokens=2, then serialize to JSONL
+    jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
+        TextToJsonl.split_entries_by_max_tokens(
+            OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map),
+            max_tokens = 2)
+        )
+    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
+
+    # Assert: splitting with max_tokens=2 produces exactly two JSONL entries
+    assert len(jsonl_data) == 2
+
+
 def test_entry_with_body_to_jsonl(tmp_path):
    "Ensure entries with valid body text are loaded."
    # Arrange