From 0e3fb59e098100765977365b566ed3d50c6a6b9c Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 3 May 2023 18:18:48 +0800 Subject: [PATCH] Entries with no md headings should not get heading prefix prepended Files with no headings would previously have their entries prefixed with a markdown heading prefix (#) --- src/khoj/processor/markdown/markdown_to_jsonl.py | 10 ++++++---- tests/test_markdown_to_jsonl.py | 2 ++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/khoj/processor/markdown/markdown_to_jsonl.py b/src/khoj/processor/markdown/markdown_to_jsonl.py index a1e4d0c1..9e08ae89 100644 --- a/src/khoj/processor/markdown/markdown_to_jsonl.py +++ b/src/khoj/processor/markdown/markdown_to_jsonl.py @@ -2,7 +2,6 @@ import glob import logging import re -import time from pathlib import Path from typing import List @@ -110,10 +109,13 @@ class MarkdownToJsonl(TextToJsonl): with open(markdown_file, "r", encoding="utf8") as f: markdown_content = f.read() markdown_entries_per_file = [] + any_headings = re.search(markdown_heading_regex, markdown_content, flags=re.MULTILINE) for entry in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE): - prefix = "#" if entry.startswith("#") else "# " - if entry.strip(empty_escape_sequences) != "": - markdown_entries_per_file.append(f"{prefix}{entry.strip(empty_escape_sequences)}") + # Add heading level as the regex split removed it from entries with headings + prefix = "#" if entry.startswith("#") else "# " if any_headings else "" + stripped_entry = entry.strip(empty_escape_sequences) + if stripped_entry != "": + markdown_entries_per_file.append(f"{prefix}{stripped_entry}") entry_to_file_map += zip(markdown_entries_per_file, [markdown_file] * len(markdown_entries_per_file)) entries.extend(markdown_entries_per_file) diff --git a/tests/test_markdown_to_jsonl.py b/tests/test_markdown_to_jsonl.py index dfb42fed..ca22f359 100644 --- a/tests/test_markdown_to_jsonl.py +++ 
b/tests/test_markdown_to_jsonl.py @@ -27,6 +27,8 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path): # Assert assert len(jsonl_data) == 1 + # Ensure entries with no headings do not get heading prefix prepended + assert not jsonl_data[0]["compiled"].startswith("#") and not jsonl_data[0]["raw"].startswith("#") def test_single_markdown_entry_to_jsonl(tmp_path):