mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-05 21:29:11 +00:00
Fix extracting Markdown Entries with Top Level Headings
- Previously top level headings would have get stripped of the space between heading text and the prefix # symbols. That is, `# Top Level Heading' would get converted to `#Top Level Heading' - This would mess up their rendering as a heading in search results - Add unit tests to text_to_jsonl processors to prevent regression
This commit is contained in:
@@ -98,10 +98,12 @@ class MarkdownToJsonl(TextToJsonl):
|
||||
for markdown_file in markdown_files:
|
||||
with open(markdown_file) as f:
|
||||
markdown_content = f.read()
|
||||
markdown_entries_per_file = [f'#{entry.strip(empty_escape_sequences)}'
|
||||
for entry
|
||||
in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE)
|
||||
if entry.strip(empty_escape_sequences) != '']
|
||||
markdown_entries_per_file = []
|
||||
for entry in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE):
|
||||
prefix = '#' if entry.startswith('#') else '# '
|
||||
if entry.strip(empty_escape_sequences) != '':
|
||||
markdown_entries_per_file.append(f'{prefix}{entry.strip(empty_escape_sequences)}')
|
||||
|
||||
entry_to_file_map += zip(markdown_entries_per_file, [markdown_file]*len(markdown_entries_per_file))
|
||||
entries.extend(markdown_entries_per_file)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user