From 6b535cc3457072b40f37b31de684bbf76b48be6b Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Wed, 3 May 2023 22:08:41 +0800
Subject: [PATCH] Snip prepended heading to avoid crossing model max_token
 limits

Otherwise, if the heading exceeds max_tokens, the search models will
just see a heading (with repeated filename) for each compiled entry and
not the actual content.

100 characters should be sufficient to include the filename (not path)
and the entry heading. If the heading is longer, truncate it so that the
entry's unique text is passed to the model for search context.
---
 src/khoj/processor/text_to_jsonl.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/khoj/processor/text_to_jsonl.py b/src/khoj/processor/text_to_jsonl.py
index e440af90..3dd0d1b5 100644
--- a/src/khoj/processor/text_to_jsonl.py
+++ b/src/khoj/processor/text_to_jsonl.py
@@ -44,7 +44,10 @@ class TextToJsonl(ABC):
 
                 # Prepend heading to all other chunks, the first chunk already has heading from original entry
                 if chunk_index > 0:
-                    compiled_entry_chunk = f"{entry.heading}.\n{compiled_entry_chunk}"
+                    # Snip heading to avoid crossing max_tokens limit
+                    # Keep last 100 characters of heading as entry heading more important than filename
+                    snipped_heading = entry.heading[-100:]
+                    compiled_entry_chunk = f"{snipped_heading}.\n{compiled_entry_chunk}"
 
                 chunked_entries.append(
                     Entry(