Chunk text in preference order of para, sentence, word, character

- The previous simplistic chunking strategy of splitting text by spaces didn't handle notes that contain newlines but no spaces, e.g. in #620.
- The new strategy tries to chunk the text at more natural points first: paragraph, then sentence, then word. If none of those work, it splits at the character level to fit within the max token limit.
- Drop long words while preserving the original delimiters.

Resolves #620
2026-03-02 13:18:18 +00:00 · 2024-01-29 05:03:29 +05:30
parent a627f56a64
commit 86575b2946
3 changed files with 46 additions and 17 deletions
--- a/tests/test_org_to_entries.py
+++ b/tests/test_org_to_entries.py
@@ -54,12 +54,12 @@ def test_entry_split_when_exceeds_max_words():
    # Extract Entries from specified Org files
    entries = OrgToEntries.extract_org_entries(org_files=data)

-    # Split each entry from specified Org files by max words
-    entries = TextToEntries.split_entries_by_max_tokens(entries, max_tokens=4)
+    # Split each entry from specified Org files by max tokens
+    entries = TextToEntries.split_entries_by_max_tokens(entries, max_tokens=6)

    # Assert
    assert len(entries) == 2
-    # Ensure compiled entries split by max_words start with entry heading (for search context)
+    # Ensure compiled entries split by max tokens start with entry heading (for search context)
    assert all([entry.compiled.startswith(expected_heading) for entry in entries])


--- a/tests/test_text_search.py
+++ b/tests/test_text_search.py
@@ -192,7 +192,7 @@ def test_entry_chunking_by_max_tokens(org_config_with_only_new_file: LocalOrgCon

    # Assert
    assert (
-        "Deleted 0 entries. Created 2 new entries for user " in caplog.records[-1].message
+        "Deleted 0 entries. Created 3 new entries for user " in caplog.records[-1].message
    ), "new entry not split by max tokens"


@@ -250,7 +250,7 @@ conda activate khoj

    # Assert
    assert (
-        "Deleted 0 entries. Created 2 new entries for user " in caplog.records[-1].message
+        "Deleted 0 entries. Created 3 new entries for user " in caplog.records[-1].message
    ), "new entry not split by max tokens"