mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-07 05:40:17 +00:00
Fix passing flags to re.split to break org, md content by heading level
`re.MULTILINE` should be passed to the `flags` argument, not the `maxsplit` argument, of the `re.split` function. Passing it positionally as the third argument was messing up indexing by capping the number of splits at the integer value of `re.MULTILINE` (8). Fixing this restores search quality to its previous state.
This commit is contained in:
@@ -87,7 +87,7 @@ class MarkdownToEntries(TextToEntries):
|
|||||||
|
|
||||||
# If content is small or content has no children headings, save it as a single entry
|
# If content is small or content has no children headings, save it as a single entry
|
||||||
if len(TextToEntries.tokenizer(markdown_content_with_ancestry)) <= max_tokens or not re.search(
|
if len(TextToEntries.tokenizer(markdown_content_with_ancestry)) <= max_tokens or not re.search(
|
||||||
rf"^#{{{len(ancestry)+1},}}\s", markdown_content, re.MULTILINE
|
rf"^#{{{len(ancestry)+1},}}\s", markdown_content, flags=re.MULTILINE
|
||||||
):
|
):
|
||||||
entry_to_file_map += [(markdown_content_with_ancestry, markdown_file)]
|
entry_to_file_map += [(markdown_content_with_ancestry, markdown_file)]
|
||||||
entries.extend([markdown_content_with_ancestry])
|
entries.extend([markdown_content_with_ancestry])
|
||||||
@@ -98,7 +98,7 @@ class MarkdownToEntries(TextToEntries):
|
|||||||
sections: List[str] = []
|
sections: List[str] = []
|
||||||
while len(sections) < 2:
|
while len(sections) < 2:
|
||||||
next_heading_level += 1
|
next_heading_level += 1
|
||||||
sections = re.split(rf"(\n|^)(?=[#]{{{next_heading_level}}} .+\n?)", markdown_content, re.MULTILINE)
|
sections = re.split(rf"(\n|^)(?=[#]{{{next_heading_level}}} .+\n?)", markdown_content, flags=re.MULTILINE)
|
||||||
|
|
||||||
for section in sections:
|
for section in sections:
|
||||||
# Skip empty sections
|
# Skip empty sections
|
||||||
|
|||||||
@@ -114,7 +114,7 @@ class OrgToEntries(TextToEntries):
|
|||||||
sections: List[str] = []
|
sections: List[str] = []
|
||||||
while len(sections) < 2:
|
while len(sections) < 2:
|
||||||
next_heading_level += 1
|
next_heading_level += 1
|
||||||
sections = re.split(rf"(\n|^)(?=[*]{{{next_heading_level}}} .+\n?)", org_content, re.MULTILINE)
|
sections = re.split(rf"(\n|^)(?=[*]{{{next_heading_level}}} .+\n?)", org_content, flags=re.MULTILINE)
|
||||||
|
|
||||||
# Recurse down each non-empty section after parsing its body, heading and ancestry
|
# Recurse down each non-empty section after parsing its body, heading and ancestry
|
||||||
for section in sections:
|
for section in sections:
|
||||||
|
|||||||
Reference in New Issue
Block a user