Split current section once by heading to resolve org-mode indexing bug

- Split once by heading (=first_non_empty) to extract the current section body. Otherwise, child headings with the same prefix as the current heading will cause the section split to go into an infinite loop.
- Also add a check to prevent getting into a recursive loop while trying to split an entry into sub-sections.
2026-03-02 21:19:12 +00:00 · 2024-07-06 16:58:05 +05:30
parent 6a135b1ed7
commit 010486fb36
2 changed files with 39 additions and 3 deletions
--- a/tests/test_org_to_entries.py
+++ b/tests/test_org_to_entries.py
@@ -1,5 +1,6 @@
 import os
 import re
+import time

 from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
 from khoj.processor.content.text_to_entries import TextToEntries
@@ -41,6 +42,35 @@ def test_configure_indexing_heading_only_entries(tmp_path):
            assert is_none_or_empty(entries[1])


+def test_extract_entries_when_child_headings_have_same_prefix():
+    """Extract org entries from entries having child headings with same prefix.
+    Guards against entry extraction looping forever; see the fix in PR #840.
+    """
+    # Arrange
+    # Nested headings where each child heading starts with its parent's text
+    org_file = "tests/data/org/same_prefix_headings.org"
+    entry: str = """
+** 1
+*** 1.1
+**** 1.1.2
+""".strip()
+    data = {org_file: entry}
+
+    # Act
+    # Time the extraction with a monotonic clock, immune to wall-clock changes
+    start = time.perf_counter()
+    entries = OrgToEntries.extract_org_entries(org_files=data, max_tokens=2)
+    indexing_time = time.perf_counter() - start
+
+    # Assert
+    # Fixed bound: must not scale with (or vanish for empty) extracted entries
+    explanation_msg = (
+        "It should not take more than 6 seconds to index. Entry extraction may have gone into an infinite loop."
+    )
+    assert entries is not None
+    assert indexing_time < 6, explanation_msg
+
+
 def test_entry_split_when_exceeds_max_tokens():
    "Ensure entries with compiled words exceeding max_tokens are split."
    # Arrange