From 6b535cc3457072b40f37b31de684bbf76b48be6b Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 3 May 2023 22:08:41 +0800 Subject: [PATCH] Snip prepended heading to avoid crossing model max_token limits Otherwise, if heading > max_tokens, then the search models will just see a heading (with repeated filename) for each compiled entry and not actual content. 100 characters should be sufficient to include filename (not path) and entry heading. If longer, truncate it instead, so that text unique to the entry is passed to the model for search context. --- src/khoj/processor/text_to_jsonl.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/khoj/processor/text_to_jsonl.py b/src/khoj/processor/text_to_jsonl.py index e440af90..3dd0d1b5 100644 --- a/src/khoj/processor/text_to_jsonl.py +++ b/src/khoj/processor/text_to_jsonl.py @@ -44,7 +44,10 @@ class TextToJsonl(ABC): # Prepend heading to all other chunks, the first chunk already has heading from original entry if chunk_index > 0: - compiled_entry_chunk = f"{entry.heading}.\n{compiled_entry_chunk}" + # Snip heading to avoid crossing max_tokens limit + # Keep last 100 characters of heading as entry heading more important than filename + snipped_heading = entry.heading[-100:] + compiled_entry_chunk = f"{snipped_heading}.\n{compiled_entry_chunk}" chunked_entries.append( Entry(