diff --git a/src/khoj/processor/content/markdown/markdown_to_entries.py b/src/khoj/processor/content/markdown/markdown_to_entries.py index fb0fe4e5..c0fea077 100644 --- a/src/khoj/processor/content/markdown/markdown_to_entries.py +++ b/src/khoj/processor/content/markdown/markdown_to_entries.py @@ -4,11 +4,11 @@ from pathlib import Path from typing import List, Tuple import urllib3 +from langchain.text_splitter import MarkdownHeaderTextSplitter from khoj.database.models import Entry as DbEntry from khoj.database.models import KhojUser from khoj.processor.content.text_to_entries import TextToEntries -from khoj.utils.constants import empty_escape_sequences from khoj.utils.helpers import timer from khoj.utils.rawconfig import Entry @@ -76,16 +76,28 @@ class MarkdownToEntries(TextToEntries): def process_single_markdown_file( markdown_content: str, markdown_file: Path, entries: List[str], entry_to_file_map: List[Tuple[str, Path]] ): - markdown_heading_regex = r"^#" - + headers_to_split_on = [("#", "1"), ("##", "2"), ("###", "3"), ("####", "4"), ("#####", "5"), ("######", "6")] + reversed_headers_to_split_on = list(reversed(headers_to_split_on)) markdown_entries_per_file: List[str] = [] - any_headings = re.search(markdown_heading_regex, markdown_content, flags=re.MULTILINE) - for entry in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE): - # Add heading level as the regex split removed it from entries with headings - prefix = "#" if entry.startswith("#") else "# " if any_headings else "" - stripped_entry = entry.strip(empty_escape_sequences) - if stripped_entry != "": - markdown_entries_per_file.append(f"{prefix}{stripped_entry}") + previous_section_metadata, current_section_metadata = None, None + + splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False, return_each_line=True) + for section in splitter.split_text(markdown_content): + current_section_metadata = section.metadata.copy() + # Append the section's content to the 
last entry if the metadata is the same + if previous_section_metadata == current_section_metadata: + markdown_entries_per_file[-1] = f"{markdown_entries_per_file[-1]}\n{section.page_content}" + # Insert new entry with its heading ancestry, if the section is under a new heading + else: + # Drop the current heading from the metadata. It is already in the section content + if section.metadata: + section.metadata.pop(max(section.metadata)) + # Prepend the markdown section's heading ancestry + for heading in reversed_headers_to_split_on: + if heading[1] in section.metadata: + section.page_content = f"{heading[0]} {section.metadata[heading[1]]}\n{section.page_content}" + previous_section_metadata = current_section_metadata + markdown_entries_per_file += [section.page_content] entry_to_file_map += zip(markdown_entries_per_file, [markdown_file] * len(markdown_entries_per_file)) entries.extend(markdown_entries_per_file) diff --git a/tests/test_markdown_to_entries.py b/tests/test_markdown_to_entries.py index 8a086dbc..174c6c4d 100644 --- a/tests/test_markdown_to_entries.py +++ b/tests/test_markdown_to_entries.py @@ -1,4 +1,3 @@ -import json import os from pathlib import Path @@ -7,8 +6,8 @@ from khoj.utils.fs_syncer import get_markdown_files from khoj.utils.rawconfig import TextContentConfig -def test_markdown_file_with_no_headings_to_jsonl(tmp_path): - "Convert files with no heading to jsonl." +def test_extract_markdown_with_no_headings(tmp_path): + "Convert markdown file with no heading to entry format." # Arrange entry = f""" - Bullet point 1 @@ -33,8 +32,8 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path): assert str(tmp_path) in entries[0].compiled -def test_single_markdown_entry_to_jsonl(tmp_path): - "Convert markdown entry from single file to jsonl." +def test_extract_single_markdown_entry(tmp_path): + "Convert markdown from single file to entry format." 
# Arrange entry = f"""### Heading \t\r @@ -52,8 +51,8 @@ def test_single_markdown_entry_to_jsonl(tmp_path): assert len(entries) == 1 -def test_multiple_markdown_entries_to_jsonl(tmp_path): - "Convert multiple markdown entries from single file to jsonl." +def test_extract_multiple_markdown_entries(tmp_path): + "Convert multiple markdown from single file to entry format." # Arrange entry = f""" ### Heading 1 @@ -119,7 +118,8 @@ def test_extract_entries_with_different_level_headings(tmp_path): # Arrange entry = f""" # Heading 1 -## Heading 2 +## Sub-Heading 1.1 +# Heading 2 """ data = { f"{tmp_path}": entry, @@ -130,9 +130,35 @@ def test_extract_entries_with_different_level_headings(tmp_path): entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data) # Assert - assert len(entries) == 2 + assert len(entries) == 3 assert entries[0].raw == "# Heading 1" - assert entries[1].raw == "## Heading 2" + assert entries[1].raw == "# Heading 1\n## Sub-Heading 1.1", "Ensure entry includes heading ancestory" + assert entries[2].raw == "# Heading 2" + + +def test_extract_entries_with_text_before_headings(tmp_path): + "Extract markdown entries with some text before any headings." + # Arrange + entry = f""" +Text before headings +# Heading 1 +body line 1 +## Heading 2 +body line 2 +""" + data = { + f"{tmp_path}": entry, + } + + # Act + # Extract Entries from specified Markdown files + entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=3) + + # Assert + assert len(entries) == 3 + assert entries[0].raw == "Text before headings" + assert entries[1].raw == "# Heading 1\nbody line 1" + assert entries[2].raw == "# Heading 1\n## Heading 2\nbody line 2", "Ensure raw entry includes heading ancestory" # Helper Functions