mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-09 13:25:11 +00:00
Entries with no Markdown headings should not get a heading prefix prepended
Previously, entries from files with no headings would have a markdown heading prefix (#) prepended to them
This commit is contained in:
@@ -2,7 +2,6 @@
|
|||||||
import glob
|
import glob
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
import time
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
@@ -110,10 +109,13 @@ class MarkdownToJsonl(TextToJsonl):
|
|||||||
with open(markdown_file, "r", encoding="utf8") as f:
|
with open(markdown_file, "r", encoding="utf8") as f:
|
||||||
markdown_content = f.read()
|
markdown_content = f.read()
|
||||||
markdown_entries_per_file = []
|
markdown_entries_per_file = []
|
||||||
|
any_headings = re.search(markdown_heading_regex, markdown_content, flags=re.MULTILINE)
|
||||||
for entry in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE):
|
for entry in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE):
|
||||||
prefix = "#" if entry.startswith("#") else "# "
|
# Add heading level as the regex split removed it from entries with headings
|
||||||
if entry.strip(empty_escape_sequences) != "":
|
prefix = "#" if entry.startswith("#") else "# " if any_headings else ""
|
||||||
markdown_entries_per_file.append(f"{prefix}{entry.strip(empty_escape_sequences)}")
|
stripped_entry = entry.strip(empty_escape_sequences)
|
||||||
|
if stripped_entry != "":
|
||||||
|
markdown_entries_per_file.append(f"{prefix}{stripped_entry}")
|
||||||
|
|
||||||
entry_to_file_map += zip(markdown_entries_per_file, [markdown_file] * len(markdown_entries_per_file))
|
entry_to_file_map += zip(markdown_entries_per_file, [markdown_file] * len(markdown_entries_per_file))
|
||||||
entries.extend(markdown_entries_per_file)
|
entries.extend(markdown_entries_per_file)
|
||||||
|
|||||||
@@ -27,6 +27,8 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
|
|||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert len(jsonl_data) == 1
|
assert len(jsonl_data) == 1
|
||||||
|
# Ensure entries with no headings do not get heading prefix prepended
|
||||||
|
assert not jsonl_data[0]["compiled"].startswith("#") and not jsonl_data[0]["raw"].startswith("#")
|
||||||
|
|
||||||
|
|
||||||
def test_single_markdown_entry_to_jsonl(tmp_path):
|
def test_single_markdown_entry_to_jsonl(tmp_path):
|
||||||
|
|||||||
Reference in New Issue
Block a user