Fix extracting Markdown Entries with Top Level Headings

- Previously, top-level headings would get stripped of the
  space between the heading text and the prefix # symbols. That is,
  `# Top Level Heading' would get converted to `#Top Level Heading'
- This would mess up their rendering as a heading in search results

- Add unit tests to text_to_jsonl processors to prevent regression
This commit is contained in:
Debanjum Singh Solanky
2023-01-17 12:42:36 -03:00
parent 1a296518c5
commit 7b4f78776c
3 changed files with 44 additions and 4 deletions

View File

@@ -98,10 +98,12 @@ class MarkdownToJsonl(TextToJsonl):
for markdown_file in markdown_files:
with open(markdown_file) as f:
markdown_content = f.read()
markdown_entries_per_file = [f'#{entry.strip(empty_escape_sequences)}'
for entry
in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE)
if entry.strip(empty_escape_sequences) != '']
markdown_entries_per_file = []
for entry in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE):
prefix = '#' if entry.startswith('#') else '# '
if entry.strip(empty_escape_sequences) != '':
markdown_entries_per_file.append(f'{prefix}{entry.strip(empty_escape_sequences)}')
entry_to_file_map += zip(markdown_entries_per_file, [markdown_file]*len(markdown_entries_per_file))
entries.extend(markdown_entries_per_file)