From 45a991d75cfb9eef2c244045113649aeefc0819e Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky
Date: Wed, 3 May 2023 17:47:33 +0800
Subject: [PATCH] Prepend entry heading to all compiled org snippets to
 improve search context

All compiled snippets split by max tokens (apart from the first) do not
get the heading as context. This limits the search context available to
retrieve these continuation entries.
---
 src/khoj/processor/org_mode/org_to_jsonl.py | 10 ++++++++--
 src/khoj/processor/text_to_jsonl.py         | 20 ++++++++++++++++++--
 src/khoj/utils/rawconfig.py                 |  6 +++++-
 tests/test_jsonl_to_jsonl.py                | 12 ++++--------
 tests/test_org_to_jsonl.py                  |  4 +++-
 5 files changed, 38 insertions(+), 14 deletions(-)

diff --git a/src/khoj/processor/org_mode/org_to_jsonl.py b/src/khoj/processor/org_mode/org_to_jsonl.py
index 0950a089..ed3be1d0 100644
--- a/src/khoj/processor/org_mode/org_to_jsonl.py
+++ b/src/khoj/processor/org_mode/org_to_jsonl.py
@@ -1,7 +1,6 @@
 # Standard Packages
 import glob
 import logging
-import time
 from typing import Iterable, List

 # Internal Packages
@@ -139,7 +138,14 @@ class OrgToJsonl(TextToJsonl):
                 logger.debug(f"Body: {parsed_entry.body}")

             if compiled:
-                entries += [Entry(compiled=compiled, raw=f"{parsed_entry}", file=f"{entry_to_file_map[parsed_entry]}")]
+                entries.append(
+                    Entry(
+                        compiled=compiled,
+                        raw=f"{parsed_entry}",
+                        heading=f"{parsed_entry.heading}",
+                        file=f"{entry_to_file_map[parsed_entry]}",
+                    )
+                )

         return entries

diff --git a/src/khoj/processor/text_to_jsonl.py b/src/khoj/processor/text_to_jsonl.py
index 22de2c01..e440af90 100644
--- a/src/khoj/processor/text_to_jsonl.py
+++ b/src/khoj/processor/text_to_jsonl.py
@@ -31,14 +31,30 @@ class TextToJsonl(ABC):
         "Split entries if compiled entry length exceeds the max tokens supported by the ML model."
         chunked_entries: List[Entry] = []
         for entry in entries:
+            # Split entry into words
             compiled_entry_words = [word for word in entry.compiled.split(" ") if word != ""]
+            # Drop long words instead of having entry truncated to maintain quality of entry processed by models
             compiled_entry_words = [word for word in compiled_entry_words if len(word) <= max_word_length]
+
+            # Split entry into chunks of max tokens
             for chunk_index in range(0, len(compiled_entry_words), max_tokens):
                 compiled_entry_words_chunk = compiled_entry_words[chunk_index : chunk_index + max_tokens]
                 compiled_entry_chunk = " ".join(compiled_entry_words_chunk)
-                entry_chunk = Entry(compiled=compiled_entry_chunk, raw=entry.raw, file=entry.file)
-                chunked_entries.append(entry_chunk)
+
+                # Prepend heading to all other chunks, the first chunk already has heading from original entry
+                if chunk_index > 0:
+                    compiled_entry_chunk = f"{entry.heading}.\n{compiled_entry_chunk}"
+
+                chunked_entries.append(
+                    Entry(
+                        compiled=compiled_entry_chunk,
+                        raw=entry.raw,
+                        heading=entry.heading,
+                        file=entry.file,
+                    )
+                )
+
         return chunked_entries

     def mark_entries_for_update(

diff --git a/src/khoj/utils/rawconfig.py b/src/khoj/utils/rawconfig.py
index 389e80f6..6b87c220 100644
--- a/src/khoj/utils/rawconfig.py
+++ b/src/khoj/utils/rawconfig.py
@@ -103,11 +103,15 @@ class SearchResponse(ConfigBase):
 class Entry:
     raw: str
     compiled: str
+    heading: Optional[str]
     file: Optional[str]

-    def __init__(self, raw: str = None, compiled: str = None, file: Optional[str] = None):
+    def __init__(
+        self, raw: str = None, compiled: str = None, heading: Optional[str] = None, file: Optional[str] = None
+    ):
         self.raw = raw
         self.compiled = compiled
+        self.heading = heading
         self.file = file

     def to_json(self) -> str:

diff --git a/tests/test_jsonl_to_jsonl.py b/tests/test_jsonl_to_jsonl.py
index eb25d579..b52b5fc9 100644
--- a/tests/test_jsonl_to_jsonl.py
+++ b/tests/test_jsonl_to_jsonl.py
@@ -1,17 +1,13 @@
-# Standard Packages
-import json
-
 # Internal Packages
 from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl
-from khoj.utils.jsonl import load_jsonl
 from khoj.utils.rawconfig import Entry


 def test_process_entries_from_single_input_jsonl(tmp_path):
     "Convert multiple jsonl entries from single file to entries."
     # Arrange
-    input_jsonl = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "file": "source/file/path1"}
-{"raw": "raw input data 2", "compiled": "compiled input data 2", "file": "source/file/path2"}
+    input_jsonl = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "heading": null, "file": "source/file/path1"}
+{"raw": "raw input data 2", "compiled": "compiled input data 2", "heading": null, "file": "source/file/path2"}
 """
     input_jsonl_file = create_file(tmp_path, input_jsonl)

@@ -29,8 +25,8 @@ def test_process_entries_from_single_input_jsonl(tmp_path):

 def test_process_entries_from_multiple_input_jsonls(tmp_path):
     "Convert multiple jsonl entries from single file to entries."
     # Arrange
-    input_jsonl_1 = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "file": "source/file/path1"}"""
-    input_jsonl_2 = """{"raw": "raw input data 2", "compiled": "compiled input data 2", "file": "source/file/path2"}"""
+    input_jsonl_1 = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "heading": null, "file": "source/file/path1"}"""
+    input_jsonl_2 = """{"raw": "raw input data 2", "compiled": "compiled input data 2", "heading": null, "file": "source/file/path2"}"""
     input_jsonl_file_1 = create_file(tmp_path, input_jsonl_1, filename="input1.jsonl")
     input_jsonl_file_2 = create_file(tmp_path, input_jsonl_2, filename="input2.jsonl")

diff --git a/tests/test_org_to_jsonl.py b/tests/test_org_to_jsonl.py
index aed4983f..15dd368a 100644
--- a/tests/test_org_to_jsonl.py
+++ b/tests/test_org_to_jsonl.py
@@ -62,9 +62,11 @@ def test_entry_split_when_exceeds_max_words(tmp_path):

     # Assert
     assert len(jsonl_data) == 2
+    # Ensure compiled entries split by max_words start with entry heading (for search context)
+    assert all(entry["compiled"].startswith("Heading") for entry in jsonl_data)


-def test_entry_split_drops_large_words(tmp_path):
+def test_entry_split_drops_large_words():
     "Ensure entries drops words larger than specified max word length from compiled version."
     # Arrange
     entry_text = f"""*** Heading
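
For reference, below is a minimal, runnable sketch of the chunk-splitting behavior this patch introduces. It mirrors the modified logic in src/khoj/processor/text_to_jsonl.py and the Entry class from src/khoj/utils/rawconfig.py. The standalone function form, the function name split_entries_by_max_tokens, and the default values for max_tokens and max_word_length are assumptions for illustration, not taken verbatim from the repository.

    from typing import List, Optional


    class Entry:
        "Sketch of khoj.utils.rawconfig.Entry as extended by this patch"

        def __init__(
            self, raw: str = None, compiled: str = None, heading: Optional[str] = None, file: Optional[str] = None
        ):
            self.raw = raw
            self.compiled = compiled
            self.heading = heading
            self.file = file


    def split_entries_by_max_tokens(
        entries: List[Entry], max_tokens: int = 256, max_word_length: int = 500
    ) -> List[Entry]:
        "Split entries if compiled entry length exceeds the max tokens supported by the ML model."
        chunked_entries: List[Entry] = []
        for entry in entries:
            # Split entry into words, dropping empty strings and words over
            # max_word_length to maintain quality of entry processed by models
            words = [word for word in entry.compiled.split(" ") if word and len(word) <= max_word_length]

            # Split entry into chunks of max_tokens words each
            for chunk_index in range(0, len(words), max_tokens):
                chunk = " ".join(words[chunk_index : chunk_index + max_tokens])

                # Prepend heading to every chunk after the first; continuation
                # chunks would otherwise lack any heading for search context
                if chunk_index > 0:
                    chunk = f"{entry.heading}.\n{chunk}"

                chunked_entries.append(Entry(compiled=chunk, raw=entry.raw, heading=entry.heading, file=entry.file))

        return chunked_entries


    # Usage: a 301-word compiled entry split at 256 tokens yields two chunks,
    # and the continuation chunk now starts with the entry heading
    entry = Entry(compiled="Heading " + "word " * 300, raw="* Heading", heading="Heading", file="notes.org")
    chunks = split_entries_by_max_tokens([entry], max_tokens=256)
    assert len(chunks) == 2
    assert chunks[1].compiled.startswith("Heading")

This is the behavior the new assertion in tests/test_org_to_jsonl.py checks: every compiled chunk produced by the split, not just the first, begins with the entry heading.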