mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-05 21:29:11 +00:00
Prepend entry heading to all compiled org snippets to improve search context
All compiled snippets split by max tokens (apart from the first) do not get the heading as context. This limits the search context available for retrieving these continuation entries
This commit is contained in:
@@ -1,7 +1,6 @@
|
||||
# Standard Packages
|
||||
import glob
|
||||
import logging
|
||||
import time
|
||||
from typing import Iterable, List
|
||||
|
||||
# Internal Packages
|
||||
@@ -139,7 +138,14 @@ class OrgToJsonl(TextToJsonl):
|
||||
logger.debug(f"Body: {parsed_entry.body}")
|
||||
|
||||
if compiled:
|
||||
entries += [Entry(compiled=compiled, raw=f"{parsed_entry}", file=f"{entry_to_file_map[parsed_entry]}")]
|
||||
entries.append(
|
||||
Entry(
|
||||
compiled=compiled,
|
||||
raw=f"{parsed_entry}",
|
||||
heading=f"{parsed_entry.heading}",
|
||||
file=f"{entry_to_file_map[parsed_entry]}",
|
||||
)
|
||||
)
|
||||
|
||||
return entries
|
||||
|
||||
|
||||
@@ -31,14 +31,30 @@ class TextToJsonl(ABC):
|
||||
"Split entries if compiled entry length exceeds the max tokens supported by the ML model."
|
||||
chunked_entries: List[Entry] = []
|
||||
for entry in entries:
|
||||
# Split entry into words
|
||||
compiled_entry_words = [word for word in entry.compiled.split(" ") if word != ""]
|
||||
|
||||
# Drop long words instead of having entry truncated to maintain quality of entry processed by models
|
||||
compiled_entry_words = [word for word in compiled_entry_words if len(word) <= max_word_length]
|
||||
|
||||
# Split entry into chunks of max tokens
|
||||
for chunk_index in range(0, len(compiled_entry_words), max_tokens):
|
||||
compiled_entry_words_chunk = compiled_entry_words[chunk_index : chunk_index + max_tokens]
|
||||
compiled_entry_chunk = " ".join(compiled_entry_words_chunk)
|
||||
entry_chunk = Entry(compiled=compiled_entry_chunk, raw=entry.raw, file=entry.file)
|
||||
chunked_entries.append(entry_chunk)
|
||||
|
||||
# Prepend heading to all other chunks, the first chunk already has heading from original entry
|
||||
if chunk_index > 0:
|
||||
compiled_entry_chunk = f"{entry.heading}.\n{compiled_entry_chunk}"
|
||||
|
||||
chunked_entries.append(
|
||||
Entry(
|
||||
compiled=compiled_entry_chunk,
|
||||
raw=entry.raw,
|
||||
heading=entry.heading,
|
||||
file=entry.file,
|
||||
)
|
||||
)
|
||||
|
||||
return chunked_entries
|
||||
|
||||
def mark_entries_for_update(
|
||||
|
||||
@@ -103,11 +103,15 @@ class SearchResponse(ConfigBase):
|
||||
class Entry:
|
||||
raw: str
|
||||
compiled: str
|
||||
heading: Optional[str]
|
||||
file: Optional[str]
|
||||
|
||||
def __init__(self, raw: str = None, compiled: str = None, file: Optional[str] = None):
|
||||
def __init__(
|
||||
self, raw: str = None, compiled: str = None, heading: Optional[str] = None, file: Optional[str] = None
|
||||
):
|
||||
self.raw = raw
|
||||
self.compiled = compiled
|
||||
self.heading = heading
|
||||
self.file = file
|
||||
|
||||
def to_json(self) -> str:
|
||||
|
||||
Reference in New Issue
Block a user