mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-09 21:29:11 +00:00
Add parent heading ancestry to extracted markdown entries for context
Improve and update the markdown-to-entries extractor tests
This commit is contained in:
@@ -4,11 +4,11 @@ from pathlib import Path
|
|||||||
from typing import List, Tuple
|
from typing import List, Tuple
|
||||||
|
|
||||||
import urllib3
|
import urllib3
|
||||||
|
from langchain.text_splitter import MarkdownHeaderTextSplitter
|
||||||
|
|
||||||
from khoj.database.models import Entry as DbEntry
|
from khoj.database.models import Entry as DbEntry
|
||||||
from khoj.database.models import KhojUser
|
from khoj.database.models import KhojUser
|
||||||
from khoj.processor.content.text_to_entries import TextToEntries
|
from khoj.processor.content.text_to_entries import TextToEntries
|
||||||
from khoj.utils.constants import empty_escape_sequences
|
|
||||||
from khoj.utils.helpers import timer
|
from khoj.utils.helpers import timer
|
||||||
from khoj.utils.rawconfig import Entry
|
from khoj.utils.rawconfig import Entry
|
||||||
|
|
||||||
@@ -76,16 +76,28 @@ class MarkdownToEntries(TextToEntries):
|
|||||||
def process_single_markdown_file(
|
def process_single_markdown_file(
|
||||||
markdown_content: str, markdown_file: Path, entries: List[str], entry_to_file_map: List[Tuple[str, Path]]
|
markdown_content: str, markdown_file: Path, entries: List[str], entry_to_file_map: List[Tuple[str, Path]]
|
||||||
):
|
):
|
||||||
markdown_heading_regex = r"^#"
|
headers_to_split_on = [("#", "1"), ("##", "2"), ("###", "3"), ("####", "4"), ("#####", "5"), ("######", "6")]
|
||||||
|
reversed_headers_to_split_on = list(reversed(headers_to_split_on))
|
||||||
markdown_entries_per_file: List[str] = []
|
markdown_entries_per_file: List[str] = []
|
||||||
any_headings = re.search(markdown_heading_regex, markdown_content, flags=re.MULTILINE)
|
previous_section_metadata, current_section_metadata = None, None
|
||||||
for entry in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE):
|
|
||||||
# Add heading level as the regex split removed it from entries with headings
|
splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False, return_each_line=True)
|
||||||
prefix = "#" if entry.startswith("#") else "# " if any_headings else ""
|
for section in splitter.split_text(markdown_content):
|
||||||
stripped_entry = entry.strip(empty_escape_sequences)
|
current_section_metadata = section.metadata.copy()
|
||||||
if stripped_entry != "":
|
# Append the section's content to the last entry if the metadata is the same
|
||||||
markdown_entries_per_file.append(f"{prefix}{stripped_entry}")
|
if previous_section_metadata == current_section_metadata:
|
||||||
|
markdown_entries_per_file[-1] = f"{markdown_entries_per_file[-1]}\n{section.page_content}"
|
||||||
|
# Insert new entry with its heading ancestry, if the section is under a new heading
|
||||||
|
else:
|
||||||
|
# Drop the current heading from the metadata. It is already in the section content
|
||||||
|
if section.metadata:
|
||||||
|
section.metadata.pop(max(section.metadata))
|
||||||
|
# Prepend the markdown section's heading ancestry
|
||||||
|
for heading in reversed_headers_to_split_on:
|
||||||
|
if heading[1] in section.metadata:
|
||||||
|
section.page_content = f"{heading[0]} {section.metadata[heading[1]]}\n{section.page_content}"
|
||||||
|
previous_section_metadata = current_section_metadata
|
||||||
|
markdown_entries_per_file += [section.page_content]
|
||||||
|
|
||||||
entry_to_file_map += zip(markdown_entries_per_file, [markdown_file] * len(markdown_entries_per_file))
|
entry_to_file_map += zip(markdown_entries_per_file, [markdown_file] * len(markdown_entries_per_file))
|
||||||
entries.extend(markdown_entries_per_file)
|
entries.extend(markdown_entries_per_file)
|
||||||
|
|||||||
@@ -1,4 +1,3 @@
|
|||||||
import json
|
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
@@ -7,8 +6,8 @@ from khoj.utils.fs_syncer import get_markdown_files
|
|||||||
from khoj.utils.rawconfig import TextContentConfig
|
from khoj.utils.rawconfig import TextContentConfig
|
||||||
|
|
||||||
|
|
||||||
def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
|
def test_extract_markdown_with_no_headings(tmp_path):
|
||||||
"Convert files with no heading to jsonl."
|
"Convert markdown file with no heading to entry format."
|
||||||
# Arrange
|
# Arrange
|
||||||
entry = f"""
|
entry = f"""
|
||||||
- Bullet point 1
|
- Bullet point 1
|
||||||
@@ -33,8 +32,8 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
|
|||||||
assert str(tmp_path) in entries[0].compiled
|
assert str(tmp_path) in entries[0].compiled
|
||||||
|
|
||||||
|
|
||||||
def test_single_markdown_entry_to_jsonl(tmp_path):
|
def test_extract_single_markdown_entry(tmp_path):
|
||||||
"Convert markdown entry from single file to jsonl."
|
"Convert markdown from single file to entry format."
|
||||||
# Arrange
|
# Arrange
|
||||||
entry = f"""### Heading
|
entry = f"""### Heading
|
||||||
\t\r
|
\t\r
|
||||||
@@ -52,8 +51,8 @@ def test_single_markdown_entry_to_jsonl(tmp_path):
|
|||||||
assert len(entries) == 1
|
assert len(entries) == 1
|
||||||
|
|
||||||
|
|
||||||
def test_multiple_markdown_entries_to_jsonl(tmp_path):
|
def test_extract_multiple_markdown_entries(tmp_path):
|
||||||
"Convert multiple markdown entries from single file to jsonl."
|
"Convert multiple markdown from single file to entry format."
|
||||||
# Arrange
|
# Arrange
|
||||||
entry = f"""
|
entry = f"""
|
||||||
### Heading 1
|
### Heading 1
|
||||||
@@ -119,7 +118,8 @@ def test_extract_entries_with_different_level_headings(tmp_path):
|
|||||||
# Arrange
|
# Arrange
|
||||||
entry = f"""
|
entry = f"""
|
||||||
# Heading 1
|
# Heading 1
|
||||||
## Heading 2
|
## Sub-Heading 1.1
|
||||||
|
# Heading 2
|
||||||
"""
|
"""
|
||||||
data = {
|
data = {
|
||||||
f"{tmp_path}": entry,
|
f"{tmp_path}": entry,
|
||||||
@@ -130,9 +130,35 @@ def test_extract_entries_with_different_level_headings(tmp_path):
|
|||||||
entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
|
entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert len(entries) == 2
|
assert len(entries) == 3
|
||||||
assert entries[0].raw == "# Heading 1"
|
assert entries[0].raw == "# Heading 1"
|
||||||
assert entries[1].raw == "## Heading 2"
|
assert entries[1].raw == "# Heading 1\n## Sub-Heading 1.1", "Ensure entry includes heading ancestory"
|
||||||
|
assert entries[2].raw == "# Heading 2"
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_entries_with_text_before_headings(tmp_path):
|
||||||
|
"Extract markdown entries with some text before any headings."
|
||||||
|
# Arrange
|
||||||
|
entry = f"""
|
||||||
|
Text before headings
|
||||||
|
# Heading 1
|
||||||
|
body line 1
|
||||||
|
## Heading 2
|
||||||
|
body line 2
|
||||||
|
"""
|
||||||
|
data = {
|
||||||
|
f"{tmp_path}": entry,
|
||||||
|
}
|
||||||
|
|
||||||
|
# Act
|
||||||
|
# Extract Entries from specified Markdown files
|
||||||
|
entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=3)
|
||||||
|
|
||||||
|
# Assert
|
||||||
|
assert len(entries) == 3
|
||||||
|
assert entries[0].raw == "Text before headings"
|
||||||
|
assert entries[1].raw == "# Heading 1\nbody line 1"
|
||||||
|
assert entries[2].raw == "# Heading 1\n## Heading 2\nbody line 2", "Ensure raw entry includes heading ancestory"
|
||||||
|
|
||||||
|
|
||||||
# Helper Functions
|
# Helper Functions
|
||||||
|
|||||||
Reference in New Issue
Block a user