Include Filename, Entry Heading in All Compiled Entries to Improve Search Context

Merge pull request #214 from debanjum/add-filename-heading-to-compiled-entry-for-context - Set the filename as the top-level heading in compiled org and markdown entries - Note: *Khoj was already indexing filenames in compiled markdown entries, but they were appended as bare text rather than set as top-level headings*. The updated structure should provide more schematic context for relevance - Set the entry heading as the heading for compiled org and markdown entries, even when they are split by max tokens - Snip the prepended heading to avoid crossing the model's max_token limits - Entries with no markdown headings should not get a heading prefix prepended
2026-03-06 05:39:12 +00:00 · 2023-05-03 22:59:30 +08:00
parent 3386cc92b5 6b535cc345
commit f0253e2cbb
7 changed files with 73 additions and 23 deletions
--- a/src/khoj/processor/markdown/markdown_to_jsonl.py
+++ b/src/khoj/processor/markdown/markdown_to_jsonl.py
@@ -2,7 +2,6 @@
 import glob
 import logging
 import re
-import time
 from pathlib import Path
 from typing import List

@@ -110,10 +109,13 @@ class MarkdownToJsonl(TextToJsonl):
            with open(markdown_file, "r", encoding="utf8") as f:
                markdown_content = f.read()
                markdown_entries_per_file = []
+                any_headings = re.search(markdown_heading_regex, markdown_content, flags=re.MULTILINE)
                for entry in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE):
-                    prefix = "#" if entry.startswith("#") else "# "
-                    if entry.strip(empty_escape_sequences) != "":
-                        markdown_entries_per_file.append(f"{prefix}{entry.strip(empty_escape_sequences)}")
+                    # Add heading level as the regex split removed it from entries with headings
+                    prefix = "#" if entry.startswith("#") else "# " if any_headings else ""
+                    stripped_entry = entry.strip(empty_escape_sequences)
+                    if stripped_entry != "":
+                        markdown_entries_per_file.append(f"{prefix}{stripped_entry}")

                entry_to_file_map += zip(markdown_entries_per_file, [markdown_file] * len(markdown_entries_per_file))
                entries.extend(markdown_entries_per_file)
@@ -126,9 +128,19 @@ class MarkdownToJsonl(TextToJsonl):
        entries = []
        for parsed_entry in parsed_entries:
            entry_filename = Path(entry_to_file_map[parsed_entry])
+            heading = parsed_entry.splitlines()[0] if re.search("^#+\s", parsed_entry) else ""
            # Append base filename to compiled entry for context to model
-            compiled_entry = f"{parsed_entry}\n{entry_filename.stem}"
-            entries.append(Entry(compiled=compiled_entry, raw=parsed_entry, file=f"{entry_filename}"))
+            # Increment heading level for heading entries and make filename as its top level heading
+            prefix = f"# {entry_filename.stem}\n#" if heading else f"# {entry_filename.stem}\n"
+            compiled_entry = f"{prefix}{parsed_entry}"
+            entries.append(
+                Entry(
+                    compiled=compiled_entry,
+                    raw=parsed_entry,
+                    heading=f"{prefix}{heading}",
+                    file=f"{entry_filename}",
+                )
+            )

        logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries")

--- a/src/khoj/processor/org_mode/org_to_jsonl.py
+++ b/src/khoj/processor/org_mode/org_to_jsonl.py
@@ -1,7 +1,7 @@
 # Standard Packages
 import glob
 import logging
-import time
+from pathlib import Path
 from typing import Iterable, List

 # Internal Packages
@@ -113,7 +113,11 @@ class OrgToJsonl(TextToJsonl):
                # Ignore title notes i.e notes with just headings and empty body
                continue

-            compiled = f"{parsed_entry.heading}."
+            # Prepend filename as top heading to entry
+            filename = Path(entry_to_file_map[parsed_entry]).stem
+            heading = f"* {filename}\n** {parsed_entry.heading}." if parsed_entry.heading else f"* {filename}."
+
+            compiled = heading
            if state.verbose > 2:
                logger.debug(f"Title: {parsed_entry.heading}")

@@ -139,7 +143,14 @@ class OrgToJsonl(TextToJsonl):
                    logger.debug(f"Body: {parsed_entry.body}")

            if compiled:
-                entries += [Entry(compiled=compiled, raw=f"{parsed_entry}", file=f"{entry_to_file_map[parsed_entry]}")]
+                entries.append(
+                    Entry(
+                        compiled=compiled,
+                        raw=f"{parsed_entry}",
+                        heading=f"{heading}",
+                        file=f"{entry_to_file_map[parsed_entry]}",
+                    )
+                )

        return entries

--- a/src/khoj/processor/text_to_jsonl.py
+++ b/src/khoj/processor/text_to_jsonl.py
@@ -31,14 +31,33 @@ class TextToJsonl(ABC):
        "Split entries if compiled entry length exceeds the max tokens supported by the ML model."
        chunked_entries: List[Entry] = []
        for entry in entries:
+            # Split entry into words
            compiled_entry_words = [word for word in entry.compiled.split(" ") if word != ""]
+
            # Drop long words instead of having entry truncated to maintain quality of entry processed by models
            compiled_entry_words = [word for word in compiled_entry_words if len(word) <= max_word_length]
+
+            # Split entry into chunks of max tokens
            for chunk_index in range(0, len(compiled_entry_words), max_tokens):
                compiled_entry_words_chunk = compiled_entry_words[chunk_index : chunk_index + max_tokens]
                compiled_entry_chunk = " ".join(compiled_entry_words_chunk)
-                entry_chunk = Entry(compiled=compiled_entry_chunk, raw=entry.raw, file=entry.file)
-                chunked_entries.append(entry_chunk)
+
+                # Prepend heading to all other chunks, the first chunk already has heading from original entry
+                if chunk_index > 0:
+                    # Snip heading to avoid crossing max_tokens limit
+                    # Keep last 100 characters of heading as entry heading more important than filename
+                    snipped_heading = entry.heading[-100:]
+                    compiled_entry_chunk = f"{snipped_heading}.\n{compiled_entry_chunk}"
+
+                chunked_entries.append(
+                    Entry(
+                        compiled=compiled_entry_chunk,
+                        raw=entry.raw,
+                        heading=entry.heading,
+                        file=entry.file,
+                    )
+                )
+
        return chunked_entries

    def mark_entries_for_update(
--- a/src/khoj/utils/rawconfig.py
+++ b/src/khoj/utils/rawconfig.py
@@ -103,11 +103,15 @@ class SearchResponse(ConfigBase):
 class Entry:
    raw: str
    compiled: str
+    heading: Optional[str]
    file: Optional[str]

-    def __init__(self, raw: str = None, compiled: str = None, file: Optional[str] = None):
+    def __init__(
+        self, raw: str = None, compiled: str = None, heading: Optional[str] = None, file: Optional[str] = None
+    ):
        self.raw = raw
        self.compiled = compiled
+        self.heading = heading
        self.file = file

    def to_json(self) -> str:
--- a/tests/test_jsonl_to_jsonl.py
+++ b/tests/test_jsonl_to_jsonl.py
@@ -1,17 +1,13 @@
-# Standard Packages
-import json
-
 # Internal Packages
 from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl
-from khoj.utils.jsonl import load_jsonl
 from khoj.utils.rawconfig import Entry


 def test_process_entries_from_single_input_jsonl(tmp_path):
    "Convert multiple jsonl entries from single file to entries."
    # Arrange
-    input_jsonl = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "file": "source/file/path1"}
-{"raw": "raw input data 2", "compiled": "compiled input data 2", "file": "source/file/path2"}
+    input_jsonl = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "heading": null, "file": "source/file/path1"}
+{"raw": "raw input data 2", "compiled": "compiled input data 2", "heading": null, "file": "source/file/path2"}
 """
    input_jsonl_file = create_file(tmp_path, input_jsonl)

@@ -29,8 +25,8 @@ def test_process_entries_from_single_input_jsonl(tmp_path):
 def test_process_entries_from_multiple_input_jsonls(tmp_path):
    "Convert multiple jsonl entries from single file to entries."
    # Arrange
-    input_jsonl_1 = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "file": "source/file/path1"}"""
-    input_jsonl_2 = """{"raw": "raw input data 2", "compiled": "compiled input data 2", "file": "source/file/path2"}"""
+    input_jsonl_1 = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "heading": null, "file": "source/file/path1"}"""
+    input_jsonl_2 = """{"raw": "raw input data 2", "compiled": "compiled input data 2", "heading": null, "file": "source/file/path2"}"""
    input_jsonl_file_1 = create_file(tmp_path, input_jsonl_1, filename="input1.jsonl")
    input_jsonl_file_2 = create_file(tmp_path, input_jsonl_2, filename="input2.jsonl")

--- a/tests/test_markdown_to_jsonl.py
+++ b/tests/test_markdown_to_jsonl.py
@@ -14,6 +14,7 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
    - Bullet point 2
    """
    markdownfile = create_file(tmp_path, entry)
+    expected_heading = "# " + markdownfile.stem

    # Act
    # Extract Entries from specified Markdown files
@@ -27,6 +28,10 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path):

    # Assert
    assert len(jsonl_data) == 1
+    # Ensure raw entry with no headings do not get heading prefix prepended
+    assert not jsonl_data[0]["raw"].startswith("#")
+    # Ensure compiled entry has filename prepended as top level heading
+    assert jsonl_data[0]["compiled"].startswith(expected_heading)


 def test_single_markdown_entry_to_jsonl(tmp_path):
@@ -128,7 +133,7 @@ def test_extract_entries_with_different_level_headings(tmp_path):


 # Helper Functions
-def create_file(tmp_path, entry=None, filename="test.md"):
+def create_file(tmp_path: Path, entry=None, filename="test.md"):
    markdown_file = tmp_path / filename
    markdown_file.touch()
    if entry:
--- a/tests/test_org_to_jsonl.py
+++ b/tests/test_org_to_jsonl.py
@@ -47,6 +47,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
    Body Line
    """
    orgfile = create_file(tmp_path, entry)
+    expected_heading = f"* {orgfile.stem}\n** Heading"

    # Act
    # Extract Entries from specified Org files
@@ -55,16 +56,18 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
    # Split each entry from specified Org files by max words
    jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
        TextToJsonl.split_entries_by_max_tokens(
-            OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=2
+            OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=4
        )
    )
    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

    # Assert
    assert len(jsonl_data) == 2
+    # Ensure compiled entries split by max_words start with entry heading (for search context)
+    assert all([entry["compiled"].startswith(expected_heading) for entry in jsonl_data])


-def test_entry_split_drops_large_words(tmp_path):
+def test_entry_split_drops_large_words():
    "Ensure entries drops words larger than specified max word length from compiled version."
    # Arrange
    entry_text = f"""*** Heading