From 45a991d75cfb9eef2c244045113649aeefc0819e Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Wed, 3 May 2023 17:47:33 +0800
Subject: [PATCH 1/6] Prepend entry heading to all compiled org snippets to
 improve search context

Previously, compiled snippets split by max tokens (apart from the
first) did not get the entry heading as context.

This limited the search context available to retrieve these
continuation entries.
---
 src/khoj/processor/org_mode/org_to_jsonl.py | 10 ++++++++--
 src/khoj/processor/text_to_jsonl.py         | 20 ++++++++++++++++++--
 src/khoj/utils/rawconfig.py                 |  6 +++++-
 tests/test_jsonl_to_jsonl.py                | 12 ++++--------
 tests/test_org_to_jsonl.py                  |  4 +++-
 5 files changed, 38 insertions(+), 14 deletions(-)

diff --git a/src/khoj/processor/org_mode/org_to_jsonl.py b/src/khoj/processor/org_mode/org_to_jsonl.py
index 0950a089..ed3be1d0 100644
--- a/src/khoj/processor/org_mode/org_to_jsonl.py
+++ b/src/khoj/processor/org_mode/org_to_jsonl.py
@@ -1,7 +1,6 @@
 # Standard Packages
 import glob
 import logging
-import time
 from typing import Iterable, List
 
 # Internal Packages
@@ -139,7 +138,14 @@ class OrgToJsonl(TextToJsonl):
                     logger.debug(f"Body: {parsed_entry.body}")
 
             if compiled:
-                entries += [Entry(compiled=compiled, raw=f"{parsed_entry}", file=f"{entry_to_file_map[parsed_entry]}")]
+                entries.append(
+                    Entry(
+                        compiled=compiled,
+                        raw=f"{parsed_entry}",
+                        heading=f"{parsed_entry.heading}",
+                        file=f"{entry_to_file_map[parsed_entry]}",
+                    )
+                )
 
         return entries
 
diff --git a/src/khoj/processor/text_to_jsonl.py b/src/khoj/processor/text_to_jsonl.py
index 22de2c01..e440af90 100644
--- a/src/khoj/processor/text_to_jsonl.py
+++ b/src/khoj/processor/text_to_jsonl.py
@@ -31,14 +31,30 @@ class TextToJsonl(ABC):
         "Split entries if compiled entry length exceeds the max tokens supported by the ML model."
         chunked_entries: List[Entry] = []
         for entry in entries:
+            # Split entry into words
             compiled_entry_words = [word for word in entry.compiled.split(" ") if word != ""]
+
             # Drop long words instead of having entry truncated to maintain quality of entry processed by models
             compiled_entry_words = [word for word in compiled_entry_words if len(word) <= max_word_length]
+
+            # Split entry into chunks of max tokens
             for chunk_index in range(0, len(compiled_entry_words), max_tokens):
                 compiled_entry_words_chunk = compiled_entry_words[chunk_index : chunk_index + max_tokens]
                 compiled_entry_chunk = " ".join(compiled_entry_words_chunk)
-                entry_chunk = Entry(compiled=compiled_entry_chunk, raw=entry.raw, file=entry.file)
-                chunked_entries.append(entry_chunk)
+
+                # Prepend heading to all other chunks, the first chunk already has heading from original entry
+                if chunk_index > 0:
+                    compiled_entry_chunk = f"{entry.heading}.\n{compiled_entry_chunk}"
+
+                chunked_entries.append(
+                    Entry(
+                        compiled=compiled_entry_chunk,
+                        raw=entry.raw,
+                        heading=entry.heading,
+                        file=entry.file,
+                    )
+                )
+
         return chunked_entries
 
     def mark_entries_for_update(
diff --git a/src/khoj/utils/rawconfig.py b/src/khoj/utils/rawconfig.py
index 389e80f6..6b87c220 100644
--- a/src/khoj/utils/rawconfig.py
+++ b/src/khoj/utils/rawconfig.py
@@ -103,11 +103,15 @@ class SearchResponse(ConfigBase):
 class Entry:
     raw: str
     compiled: str
+    heading: Optional[str]
     file: Optional[str]
 
-    def __init__(self, raw: str = None, compiled: str = None, file: Optional[str] = None):
+    def __init__(
+        self, raw: str = None, compiled: str = None, heading: Optional[str] = None, file: Optional[str] = None
+    ):
         self.raw = raw
         self.compiled = compiled
+        self.heading = heading
         self.file = file
 
     def to_json(self) -> str:
diff --git a/tests/test_jsonl_to_jsonl.py b/tests/test_jsonl_to_jsonl.py
index eb25d579..b52b5fc9 100644
--- a/tests/test_jsonl_to_jsonl.py
+++ b/tests/test_jsonl_to_jsonl.py
@@ -1,17 +1,13 @@
-# Standard Packages
-import json
-
 # Internal Packages
 from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl
-from khoj.utils.jsonl import load_jsonl
 from khoj.utils.rawconfig import Entry
 
 
 def test_process_entries_from_single_input_jsonl(tmp_path):
     "Convert multiple jsonl entries from single file to entries."
     # Arrange
-    input_jsonl = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "file": "source/file/path1"}
-{"raw": "raw input data 2", "compiled": "compiled input data 2", "file": "source/file/path2"}
+    input_jsonl = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "heading": null, "file": "source/file/path1"}
+{"raw": "raw input data 2", "compiled": "compiled input data 2", "heading": null, "file": "source/file/path2"}
 """
     input_jsonl_file = create_file(tmp_path, input_jsonl)
 
@@ -29,8 +25,8 @@ def test_process_entries_from_single_input_jsonl(tmp_path):
 def test_process_entries_from_multiple_input_jsonls(tmp_path):
     "Convert multiple jsonl entries from single file to entries."
     # Arrange
-    input_jsonl_1 = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "file": "source/file/path1"}"""
-    input_jsonl_2 = """{"raw": "raw input data 2", "compiled": "compiled input data 2", "file": "source/file/path2"}"""
+    input_jsonl_1 = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "heading": null, "file": "source/file/path1"}"""
+    input_jsonl_2 = """{"raw": "raw input data 2", "compiled": "compiled input data 2", "heading": null, "file": "source/file/path2"}"""
     input_jsonl_file_1 = create_file(tmp_path, input_jsonl_1, filename="input1.jsonl")
     input_jsonl_file_2 = create_file(tmp_path, input_jsonl_2, filename="input2.jsonl")
 
diff --git a/tests/test_org_to_jsonl.py b/tests/test_org_to_jsonl.py
index aed4983f..15dd368a 100644
--- a/tests/test_org_to_jsonl.py
+++ b/tests/test_org_to_jsonl.py
@@ -62,9 +62,11 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
 
     # Assert
     assert len(jsonl_data) == 2
+    # Ensure compiled entries split by max_words start with entry heading (for search context)
+    assert all(entry["compiled"].startswith("Heading") for entry in jsonl_data)
 
 
-def test_entry_split_drops_large_words(tmp_path):
+def test_entry_split_drops_large_words():
     "Ensure entries drops words larger than specified max word length from compiled version."
     # Arrange
     entry_text = f"""*** Heading

From 0e3fb59e098100765977365b566ed3d50c6a6b9c Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Wed, 3 May 2023 18:18:48 +0800
Subject: [PATCH 2/6] Entries with no md headings should not get heading prefix
 prepended

Files with no headings would previously have their entry prefixed
with a markdown heading prefix (#).
---
 src/khoj/processor/markdown/markdown_to_jsonl.py | 10 ++++++----
 tests/test_markdown_to_jsonl.py                  |  2 ++
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/khoj/processor/markdown/markdown_to_jsonl.py b/src/khoj/processor/markdown/markdown_to_jsonl.py
index a1e4d0c1..9e08ae89 100644
--- a/src/khoj/processor/markdown/markdown_to_jsonl.py
+++ b/src/khoj/processor/markdown/markdown_to_jsonl.py
@@ -2,7 +2,6 @@
 import glob
 import logging
 import re
-import time
 from pathlib import Path
 from typing import List
 
@@ -110,10 +109,13 @@ class MarkdownToJsonl(TextToJsonl):
             with open(markdown_file, "r", encoding="utf8") as f:
                 markdown_content = f.read()
                 markdown_entries_per_file = []
+                any_headings = re.search(markdown_heading_regex, markdown_content, flags=re.MULTILINE)
                 for entry in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE):
-                    prefix = "#" if entry.startswith("#") else "# "
-                    if entry.strip(empty_escape_sequences) != "":
-                        markdown_entries_per_file.append(f"{prefix}{entry.strip(empty_escape_sequences)}")
+                    # Add heading level as the regex split removed it from entries with headings
+                    prefix = "#" if entry.startswith("#") else "# " if any_headings else ""
+                    stripped_entry = entry.strip(empty_escape_sequences)
+                    if stripped_entry != "":
+                        markdown_entries_per_file.append(f"{prefix}{stripped_entry}")
 
                 entry_to_file_map += zip(markdown_entries_per_file, [markdown_file] * len(markdown_entries_per_file))
                 entries.extend(markdown_entries_per_file)
diff --git a/tests/test_markdown_to_jsonl.py b/tests/test_markdown_to_jsonl.py
index dfb42fed..ca22f359 100644
--- a/tests/test_markdown_to_jsonl.py
+++ b/tests/test_markdown_to_jsonl.py
@@ -27,6 +27,8 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
 
     # Assert
     assert len(jsonl_data) == 1
+    # Ensure entries with no headings do not get heading prefix prepended
+    assert not jsonl_data[0]["compiled"].startswith("#") and not jsonl_data[0]["raw"].startswith("#")
 
 
 def test_single_markdown_entry_to_jsonl(tmp_path):

From 5de04621b5f038be59f869ddecd7dd9fce3fd89c Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Wed, 3 May 2023 18:55:56 +0800
Subject: [PATCH 3/6] Set filename as top heading of md entries for better
 search context

Previously, the filename was appended to the end of the compiled entry.
This didn't provide appropriately structured context.

Test filename getting prepended as heading to compiled entry
---
 src/khoj/processor/markdown/markdown_to_jsonl.py | 5 +++--
 tests/test_markdown_to_jsonl.py                  | 9 ++++++---
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/khoj/processor/markdown/markdown_to_jsonl.py b/src/khoj/processor/markdown/markdown_to_jsonl.py
index 9e08ae89..20cf9b2c 100644
--- a/src/khoj/processor/markdown/markdown_to_jsonl.py
+++ b/src/khoj/processor/markdown/markdown_to_jsonl.py
@@ -129,8 +129,9 @@ class MarkdownToJsonl(TextToJsonl):
         for parsed_entry in parsed_entries:
             entry_filename = Path(entry_to_file_map[parsed_entry])
             # Append base filename to compiled entry for context to model
-            compiled_entry = f"{parsed_entry}\n{entry_filename.stem}"
-            entries.append(Entry(compiled=compiled_entry, raw=parsed_entry, file=f"{entry_filename}"))
+            # Increment heading level for heading entries and make filename as its top level heading
+            prefix = f"# {entry_filename.stem}\n#" if heading else f"# {entry_filename.stem}\n"
+            compiled_entry = f"{prefix}{parsed_entry}"
 
         logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries")
 
diff --git a/tests/test_markdown_to_jsonl.py b/tests/test_markdown_to_jsonl.py
index ca22f359..87a1a07e 100644
--- a/tests/test_markdown_to_jsonl.py
+++ b/tests/test_markdown_to_jsonl.py
@@ -14,6 +14,7 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
     - Bullet point 2
     """
     markdownfile = create_file(tmp_path, entry)
+    expected_heading = "# " + markdownfile.stem
 
     # Act
     # Extract Entries from specified Markdown files
@@ -27,8 +28,10 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
 
     # Assert
     assert len(jsonl_data) == 1
-    # Ensure entries with no headings do not get heading prefix prepended
-    assert not jsonl_data[0]["compiled"].startswith("#") and not jsonl_data[0]["raw"].startswith("#")
+    # Ensure raw entry with no headings do not get heading prefix prepended
+    assert not jsonl_data[0]["raw"].startswith("#")
+    # Ensure compiled entry has filename prepended as top level heading
+    assert jsonl_data[0]["compiled"].startswith(expected_heading)
 
 
 def test_single_markdown_entry_to_jsonl(tmp_path):
@@ -130,7 +133,7 @@ def test_extract_entries_with_different_level_headings(tmp_path):
 
 
 # Helper Functions
-def create_file(tmp_path, entry=None, filename="test.md"):
+def create_file(tmp_path: Path, entry=None, filename="test.md"):
     markdown_file = tmp_path / filename
     markdown_file.touch()
     if entry:

From 94825a70b9f03ed2fdcc8b2968a8a7d5cef439e0 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Wed, 3 May 2023 18:58:37 +0800
Subject: [PATCH 4/6] Set heading of md entries to improve search context for
 long entries

Otherwise, if a markdown entry is longer than max_tokens, the split
entries (apart from the first one) do not get their heading context set.
---
 src/khoj/processor/markdown/markdown_to_jsonl.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/khoj/processor/markdown/markdown_to_jsonl.py b/src/khoj/processor/markdown/markdown_to_jsonl.py
index 20cf9b2c..0179e05e 100644
--- a/src/khoj/processor/markdown/markdown_to_jsonl.py
+++ b/src/khoj/processor/markdown/markdown_to_jsonl.py
@@ -128,10 +128,19 @@ class MarkdownToJsonl(TextToJsonl):
         entries = []
         for parsed_entry in parsed_entries:
             entry_filename = Path(entry_to_file_map[parsed_entry])
+            heading = parsed_entry.splitlines()[0] if re.search("^#+\s", parsed_entry) else ""
             # Append base filename to compiled entry for context to model
             # Increment heading level for heading entries and make filename as its top level heading
             prefix = f"# {entry_filename.stem}\n#" if heading else f"# {entry_filename.stem}\n"
             compiled_entry = f"{prefix}{parsed_entry}"
+            entries.append(
+                Entry(
+                    compiled=compiled_entry,
+                    raw=parsed_entry,
+                    heading=f"{prefix}{heading}",
+                    file=f"{entry_filename}",
+                )
+            )
 
         logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries")
 

From 02aeee60aaea3ce828be80e7a2732a2c0d28ecb8 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Wed, 3 May 2023 19:51:25 +0800
Subject: [PATCH 5/6] Set filename as top heading of org entries for better
 search context

Previously filename was only being appended to markdown entries.

Test filename getting prepended to compiled entry as heading
---
 src/khoj/processor/org_mode/org_to_jsonl.py | 9 +++++++--
 tests/test_org_to_jsonl.py                  | 5 +++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/khoj/processor/org_mode/org_to_jsonl.py b/src/khoj/processor/org_mode/org_to_jsonl.py
index ed3be1d0..e5ec7cc6 100644
--- a/src/khoj/processor/org_mode/org_to_jsonl.py
+++ b/src/khoj/processor/org_mode/org_to_jsonl.py
@@ -1,6 +1,7 @@
 # Standard Packages
 import glob
 import logging
+from pathlib import Path
 from typing import Iterable, List
 
 # Internal Packages
@@ -112,7 +113,11 @@ class OrgToJsonl(TextToJsonl):
                 # Ignore title notes i.e notes with just headings and empty body
                 continue
 
-            compiled = f"{parsed_entry.heading}."
+            # Prepend filename as top heading to entry
+            filename = Path(entry_to_file_map[parsed_entry]).stem
+            heading = f"* {filename}\n** {parsed_entry.heading}." if parsed_entry.heading else f"* {filename}."
+
+            compiled = heading
             if state.verbose > 2:
                 logger.debug(f"Title: {parsed_entry.heading}")
 
@@ -142,7 +147,7 @@ class OrgToJsonl(TextToJsonl):
                     Entry(
                         compiled=compiled,
                         raw=f"{parsed_entry}",
-                        heading=f"{parsed_entry.heading}",
+                        heading=f"{heading}",
                         file=f"{entry_to_file_map[parsed_entry]}",
                     )
                 )
diff --git a/tests/test_org_to_jsonl.py b/tests/test_org_to_jsonl.py
index 15dd368a..171037c0 100644
--- a/tests/test_org_to_jsonl.py
+++ b/tests/test_org_to_jsonl.py
@@ -47,6 +47,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
     Body Line
     """
     orgfile = create_file(tmp_path, entry)
+    expected_heading = f"* {orgfile.stem}\n** Heading"
 
     # Act
     # Extract Entries from specified Org files
@@ -55,7 +56,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
     # Split each entry from specified Org files by max words
     jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
         TextToJsonl.split_entries_by_max_tokens(
-            OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=2
+            OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=4
         )
     )
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
@@ -63,7 +64,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
     # Assert
     assert len(jsonl_data) == 2
     # Ensure compiled entries split by max_words start with entry heading (for search context)
-    assert all(entry["compiled"].startswith("Heading") for entry in jsonl_data)
+    assert all([entry["compiled"].startswith(expected_heading) for entry in jsonl_data])
 
 
 def test_entry_split_drops_large_words():

From 6b535cc3457072b40f37b31de684bbf76b48be6b Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Wed, 3 May 2023 22:08:41 +0800
Subject: [PATCH 6/6] Snip prepended heading to avoid crossing model max_token
 limits

Otherwise, if heading > max_tokens, then the search models will just
see a heading (with repeated filename) for each compiled entry and not
the actual content.

100 characters should be sufficient to include the filename (not path)
and entry heading. If the heading is longer, truncate it so that the
entry's unique text is still passed to the model for search context.
---
 src/khoj/processor/text_to_jsonl.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/khoj/processor/text_to_jsonl.py b/src/khoj/processor/text_to_jsonl.py
index e440af90..3dd0d1b5 100644
--- a/src/khoj/processor/text_to_jsonl.py
+++ b/src/khoj/processor/text_to_jsonl.py
@@ -44,7 +44,10 @@ class TextToJsonl(ABC):
 
                 # Prepend heading to all other chunks, the first chunk already has heading from original entry
                 if chunk_index > 0:
-                    compiled_entry_chunk = f"{entry.heading}.\n{compiled_entry_chunk}"
+                    # Snip heading to avoid crossing max_tokens limit
+                    # Keep last 100 characters of heading as entry heading more important than filename
+                    snipped_heading = entry.heading[-100:]
+                    compiled_entry_chunk = f"{snipped_heading}.\n{compiled_entry_chunk}"
 
                 chunked_entries.append(
                     Entry(