Fix extracting Markdown Entries with Top Level Headings

- Previously, top-level headings would get stripped of the space between the heading text and the prefix # symbols. That is, `# Top Level Heading` would get converted to `#Top Level Heading`
- This would mess up their rendering as a heading in search results
- Add unit tests to text_to_jsonl processors to prevent regression
2026-03-05 21:29:11 +00:00 · 2023-01-17 12:42:36 -03:00
parent 1a296518c5
commit 7b4f78776c
3 changed files with 44 additions and 4 deletions
--- a/src/processor/markdown/markdown_to_jsonl.py
+++ b/src/processor/markdown/markdown_to_jsonl.py
@@ -98,10 +98,12 @@ class MarkdownToJsonl(TextToJsonl):
        for markdown_file in markdown_files:
            with open(markdown_file) as f:
                markdown_content = f.read()
-                markdown_entries_per_file = [f'#{entry.strip(empty_escape_sequences)}'
-                for entry
-                in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE)
-                if entry.strip(empty_escape_sequences) != '']
+                markdown_entries_per_file = []
+                for entry in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE):
+                    prefix = '#' if entry.startswith('#') else '# '
+                    if entry.strip(empty_escape_sequences) != '':
+                        markdown_entries_per_file.append(f'{prefix}{entry.strip(empty_escape_sequences)}')
+
                entry_to_file_map += zip(markdown_entries_per_file, [markdown_file]*len(markdown_entries_per_file))
                entries.extend(markdown_entries_per_file)