Fix passing flags to re.split when breaking org, md content by heading level

`re.MULTILINE' should be passed to the `flags' argument, not the
`maxsplit' argument of the `re.split' function

This was messing up the indexing by only allowing a maximum of
re.MULTILINE (i.e. 8, the flag's integer value) splits. Fixing this
restores the search quality to its previous state.
This commit is contained in:
Debanjum Singh Solanky
2024-04-02 21:05:06 +05:30
parent 32ac0622ff
commit 00f599ea78
2 changed files with 3 additions and 3 deletions

View File

@@ -87,7 +87,7 @@ class MarkdownToEntries(TextToEntries):
# If content is small or content has no children headings, save it as a single entry
if len(TextToEntries.tokenizer(markdown_content_with_ancestry)) <= max_tokens or not re.search(
rf"^#{{{len(ancestry)+1},}}\s", markdown_content, re.MULTILINE
rf"^#{{{len(ancestry)+1},}}\s", markdown_content, flags=re.MULTILINE
):
entry_to_file_map += [(markdown_content_with_ancestry, markdown_file)]
entries.extend([markdown_content_with_ancestry])
@@ -98,7 +98,7 @@ class MarkdownToEntries(TextToEntries):
sections: List[str] = []
while len(sections) < 2:
next_heading_level += 1
sections = re.split(rf"(\n|^)(?=[#]{{{next_heading_level}}} .+\n?)", markdown_content, re.MULTILINE)
sections = re.split(rf"(\n|^)(?=[#]{{{next_heading_level}}} .+\n?)", markdown_content, flags=re.MULTILINE)
for section in sections:
# Skip empty sections

View File

@@ -114,7 +114,7 @@ class OrgToEntries(TextToEntries):
sections: List[str] = []
while len(sections) < 2:
next_heading_level += 1
sections = re.split(rf"(\n|^)(?=[*]{{{next_heading_level}}} .+\n?)", org_content, re.MULTILINE)
sections = re.split(rf"(\n|^)(?=[*]{{{next_heading_level}}} .+\n?)", org_content, flags=re.MULTILINE)
# Recurse down each non-empty section after parsing its body, heading and ancestry
for section in sections: