mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 13:18:18 +00:00
Entries with no md headings should not get heading prefix prepended
Previously, the entry of a file with no headings would be prefixed with a markdown heading prefix (#)
This commit is contained in:
@@ -2,7 +2,6 @@
|
||||
import glob
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
@@ -110,10 +109,13 @@ class MarkdownToJsonl(TextToJsonl):
|
||||
with open(markdown_file, "r", encoding="utf8") as f:
|
||||
markdown_content = f.read()
|
||||
markdown_entries_per_file = []
|
||||
any_headings = re.search(markdown_heading_regex, markdown_content, flags=re.MULTILINE)
|
||||
for entry in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE):
|
||||
prefix = "#" if entry.startswith("#") else "# "
|
||||
if entry.strip(empty_escape_sequences) != "":
|
||||
markdown_entries_per_file.append(f"{prefix}{entry.strip(empty_escape_sequences)}")
|
||||
# Add heading level as the regex split removed it from entries with headings
|
||||
prefix = "#" if entry.startswith("#") else "# " if any_headings else ""
|
||||
stripped_entry = entry.strip(empty_escape_sequences)
|
||||
if stripped_entry != "":
|
||||
markdown_entries_per_file.append(f"{prefix}{stripped_entry}")
|
||||
|
||||
entry_to_file_map += zip(markdown_entries_per_file, [markdown_file] * len(markdown_entries_per_file))
|
||||
entries.extend(markdown_entries_per_file)
|
||||
|
||||
@@ -27,6 +27,8 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
|
||||
|
||||
# Assert
|
||||
assert len(jsonl_data) == 1
|
||||
# Ensure entries with no headings do not get heading prefix prepended
|
||||
assert not jsonl_data[0]["compiled"].startswith("#") and not jsonl_data[0]["raw"].startswith("#")
|
||||
|
||||
|
||||
def test_single_markdown_entry_to_jsonl(tmp_path):
|
||||
|
||||
Reference in New Issue
Block a user