Add parent heading ancestry to extracted markdown entries for context

Improve and update the markdown-to-entries extractor tests
2026-03-09 21:29:11 +00:00 · 2024-01-29 15:31:48 +05:30
parent 86575b2946
commit d8f01876e5
2 changed files with 58 additions and 20 deletions
--- a/src/khoj/processor/content/markdown/markdown_to_entries.py
+++ b/src/khoj/processor/content/markdown/markdown_to_entries.py
@@ -4,11 +4,11 @@ from pathlib import Path
 from typing import List, Tuple
 import urllib3
 from langchain.text_splitter import MarkdownHeaderTextSplitter
 from khoj.database.models import Entry as DbEntry
 from khoj.database.models import KhojUser
 from khoj.processor.content.text_to_entries import TextToEntries
 from khoj.utils.constants import empty_escape_sequences
 from khoj.utils.helpers import timer
 from khoj.utils.rawconfig import Entry
@@ -76,16 +76,28 @@ class MarkdownToEntries(TextToEntries):
    def process_single_markdown_file(
        markdown_content: str, markdown_file: Path, entries: List[str], entry_to_file_map: List[Tuple[str, Path]]
    ):
-        markdown_heading_regex = r"^#"
+        headers_to_split_on = [("#", "1"), ("##", "2"), ("###", "3"), ("####", "4"), ("#####", "5"), ("######", "6")]
-
+        reversed_headers_to_split_on = list(reversed(headers_to_split_on))
        markdown_entries_per_file: List[str] = []
-        any_headings = re.search(markdown_heading_regex, markdown_content, flags=re.MULTILINE)
+        previous_section_metadata, current_section_metadata = None, None
-        for entry in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE):
+
-            # Add heading level as the regex split removed it from entries with headings
+        splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False, return_each_line=True)
-            prefix = "#" if entry.startswith("#") else "# " if any_headings else ""
+        for section in splitter.split_text(markdown_content):
-            stripped_entry = entry.strip(empty_escape_sequences)
+            current_section_metadata = section.metadata.copy()
-            if stripped_entry != "":
+            # Append the section's content to the last entry if the metadata is the same
-                markdown_entries_per_file.append(f"{prefix}{stripped_entry}")
+            if previous_section_metadata == current_section_metadata:
                markdown_entries_per_file[-1] = f"{markdown_entries_per_file[-1]}\n{section.page_content}"
            # Insert new entry with it's heading ancestry, if the section is under a new heading
            else:
                # Drop the current heading from the metadata. It is already in the section content
                if section.metadata:
                    section.metadata.pop(max(section.metadata))
                # Prepend the markdown section's heading ancestry
                for heading in reversed_headers_to_split_on:
                    if heading[1] in section.metadata:
                        section.page_content = f"{heading[0]} {section.metadata[heading[1]]}\n{section.page_content}"
                previous_section_metadata = current_section_metadata
                markdown_entries_per_file += [section.page_content]
        entry_to_file_map += zip(markdown_entries_per_file, [markdown_file] * len(markdown_entries_per_file))
        entries.extend(markdown_entries_per_file)
--- a/tests/test_markdown_to_entries.py
+++ b/tests/test_markdown_to_entries.py
@@ -1,4 +1,3 @@
 import json
 import os
 from pathlib import Path
@@ -7,8 +6,8 @@ from khoj.utils.fs_syncer import get_markdown_files
 from khoj.utils.rawconfig import TextContentConfig
-def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
+def test_extract_markdown_with_no_headings(tmp_path):
-    "Convert files with no heading to jsonl."
+    "Convert markdown file with no heading to entry format."
    # Arrange
    entry = f"""
    - Bullet point 1
@@ -33,8 +32,8 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
    assert str(tmp_path) in entries[0].compiled
-def test_single_markdown_entry_to_jsonl(tmp_path):
+def test_extract_single_markdown_entry(tmp_path):
-    "Convert markdown entry from single file to jsonl."
+    "Convert markdown from single file to entry format."
    # Arrange
    entry = f"""### Heading
    \t\r
@@ -52,8 +51,8 @@ def test_single_markdown_entry_to_jsonl(tmp_path):
    assert len(entries) == 1
-def test_multiple_markdown_entries_to_jsonl(tmp_path):
+def test_extract_multiple_markdown_entries(tmp_path):
-    "Convert multiple markdown entries from single file to jsonl."
+    "Convert multiple markdown from single file to entry format."
    # Arrange
    entry = f"""
 ### Heading 1
@@ -119,7 +118,8 @@ def test_extract_entries_with_different_level_headings(tmp_path):
    # Arrange
    entry = f"""
 # Heading 1
-## Heading 2
+## Sub-Heading 1.1
 # Heading 2
 """
    data = {
        f"{tmp_path}": entry,
@@ -130,9 +130,35 @@ def test_extract_entries_with_different_level_headings(tmp_path):
    entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
    # Assert
-    assert len(entries) == 2
+    assert len(entries) == 3
    assert entries[0].raw == "# Heading 1"
-    assert entries[1].raw == "## Heading 2"
+    assert entries[1].raw == "# Heading 1\n## Sub-Heading 1.1", "Ensure entry includes heading ancestory"
    assert entries[2].raw == "# Heading 2"
 def test_extract_entries_with_text_before_headings(tmp_path):
    "Extract markdown entries with some text before any headings."
    # Arrange
    entry = f"""
 Text before headings
 # Heading 1
 body line 1
 ## Heading 2
 body line 2
 """
    data = {
        f"{tmp_path}": entry,
    }
    # Act
    # Extract Entries from specified Markdown files
    entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=3)
    # Assert
    assert len(entries) == 3
    assert entries[0].raw == "Text before headings"
    assert entries[1].raw == "# Heading 1\nbody line 1"
    assert entries[2].raw == "# Heading 1\n## Heading 2\nbody line 2", "Ensure raw entry includes heading ancestory"
 # Helper Functions