Fix passing flags to re.split to break org, md content by heading level

`re.MULTILINE' should be passed to the `flags' argument, not the `maxsplit' argument, of the `re.split' function. Passing it positionally was messing up the indexing by only allowing a maximum of re.MULTILINE (i.e. 8) splits. Fixing this restores the search quality to its previous state.
2026-03-02 21:19:12 +00:00 · 2024-04-02 21:05:06 +05:30
parent 32ac0622ff
commit 00f599ea78
2 changed files with 3 additions and 3 deletions
--- a/src/khoj/processor/content/markdown/markdown_to_entries.py
+++ b/src/khoj/processor/content/markdown/markdown_to_entries.py
@@ -87,7 +87,7 @@ class MarkdownToEntries(TextToEntries):

        # If content is small or content has no children headings, save it as a single entry
        if len(TextToEntries.tokenizer(markdown_content_with_ancestry)) <= max_tokens or not re.search(
-            rf"^#{{{len(ancestry)+1},}}\s", markdown_content, re.MULTILINE
+            rf"^#{{{len(ancestry)+1},}}\s", markdown_content, flags=re.MULTILINE
        ):
            entry_to_file_map += [(markdown_content_with_ancestry, markdown_file)]
            entries.extend([markdown_content_with_ancestry])
@@ -98,7 +98,7 @@ class MarkdownToEntries(TextToEntries):
        sections: List[str] = []
        while len(sections) < 2:
            next_heading_level += 1
-            sections = re.split(rf"(\n|^)(?=[#]{{{next_heading_level}}} .+\n?)", markdown_content, re.MULTILINE)
+            sections = re.split(rf"(\n|^)(?=[#]{{{next_heading_level}}} .+\n?)", markdown_content, flags=re.MULTILINE)

        for section in sections:
            # Skip empty sections
--- a/src/khoj/processor/content/org_mode/org_to_entries.py
+++ b/src/khoj/processor/content/org_mode/org_to_entries.py
@@ -114,7 +114,7 @@ class OrgToEntries(TextToEntries):
        sections: List[str] = []
        while len(sections) < 2:
            next_heading_level += 1
-            sections = re.split(rf"(\n|^)(?=[*]{{{next_heading_level}}} .+\n?)", org_content, re.MULTILINE)
+            sections = re.split(rf"(\n|^)(?=[*]{{{next_heading_level}}} .+\n?)", org_content, flags=re.MULTILINE)

        # Recurse down each non-empty section after parsing its body, heading and ancestry
        for section in sections: