mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 21:19:12 +00:00
Split current section once by heading to resolve org-mode indexing bug
- Split once by heading (=first_non_empty) to extract the current section body. Otherwise, child headings with the same prefix as the current heading will cause the section split to go into an infinite loop.
- Also add a check to prevent getting into a recursive loop while trying to split an entry into subsections.
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
|
||||
from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
|
||||
from khoj.processor.content.text_to_entries import TextToEntries
|
||||
@@ -41,6 +42,35 @@ def test_configure_indexing_heading_only_entries(tmp_path):
|
||||
assert is_none_or_empty(entries[1])
|
||||
|
||||
|
||||
def test_extract_entries_when_child_headings_have_same_prefix():
    """Extract org entries from entries having child headings with same prefix.

    Prevents regressions like the one fixed in PR #840, where child headings
    sharing a prefix with their parent heading could send the section split
    into an infinite loop.
    """
    # Arrange
    # Path is only used as the key of the in-memory file map passed below.
    org_file = "tests/data/org/same_prefix_headings.org"
    # Nested headings whose titles share the parent's prefix ("1" -> "1.1" -> "1.1.2").
    entry: str = """
** 1
*** 1.1
**** 1.1.2
""".strip()
    data = {
        org_file: entry,
    }

    # Act
    # Extract Entries from specified Org files
    # Use perf_counter rather than time.time(): the wall clock can jump
    # (e.g. on system clock adjustments) and skew the measured duration.
    start = time.perf_counter()
    entries = OrgToEntries.extract_org_entries(org_files=data, max_tokens=2)
    indexing_time = time.perf_counter() - start

    # Assert
    explanation_msg = (
        "It should not take more than 6 seconds to index. Entry extraction may have gone into an infinite loop."
    )
    # The time budget scales with the number of extracted entries.
    assert indexing_time < 6 * len(entries), explanation_msg
|
||||
|
||||
|
||||
def test_entry_split_when_exceeds_max_tokens():
|
||||
"Ensure entries with compiled words exceeding max_tokens are split."
|
||||
# Arrange
|
||||
|
||||
Reference in New Issue
Block a user