mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 21:19:12 +00:00
Add method to split entries by specified max tokens limit
- Issue ML Models truncate entries exceeding some max token limit. This lowers the quality of search results - Fix Split entries by max tokens before indexing. This should improve searching for content in longer entries. - Miscellaneous - Test method to split entries by max tokens
This commit is contained in:
@@ -3,6 +3,7 @@ import json
|
||||
|
||||
# Internal Packages
|
||||
from src.processor.org_mode.org_to_jsonl import OrgToJsonl
|
||||
from src.processor.text_to_jsonl import TextToJsonl
|
||||
from src.utils.helpers import is_none_or_empty
|
||||
|
||||
|
||||
@@ -35,6 +36,34 @@ def test_configure_heading_entry_to_jsonl(tmp_path):
|
||||
assert is_none_or_empty(jsonl_data)
|
||||
|
||||
|
||||
def test_entry_split_when_exceeds_max_words(tmp_path):
|
||||
"Ensure entries with compiled words exceeding max_words are split."
|
||||
# Arrange
|
||||
entry = f'''*** Heading
|
||||
:PROPERTIES:
|
||||
:ID: 42-42-42
|
||||
:END:
|
||||
\t\r
|
||||
Body Line 1
|
||||
'''
|
||||
orgfile = create_file(tmp_path, entry)
|
||||
|
||||
# Act
|
||||
# Extract Entries from specified Org files
|
||||
entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=[orgfile])
|
||||
|
||||
# Split Each Entry from specified Org files by Max Words
|
||||
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
|
||||
TextToJsonl.split_entries_by_max_tokens(
|
||||
OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map),
|
||||
max_tokens = 2)
|
||||
)
|
||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||
|
||||
# Assert
|
||||
assert len(jsonl_data) == 2
|
||||
|
||||
|
||||
def test_entry_with_body_to_jsonl(tmp_path):
|
||||
"Ensure entries with valid body text are loaded."
|
||||
# Arrange
|
||||
|
||||
Reference in New Issue
Block a user