From 94825a70b9f03ed2fdcc8b2968a8a7d5cef439e0 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 3 May 2023 18:58:37 +0800 Subject: [PATCH] Set heading of md entries to improve search context for long entries. Otherwise, if a markdown entry is longer than max_tokens, the split entries (apart from the first one) do not get their heading context set. --- src/khoj/processor/markdown/markdown_to_jsonl.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/khoj/processor/markdown/markdown_to_jsonl.py b/src/khoj/processor/markdown/markdown_to_jsonl.py index 20cf9b2c..0179e05e 100644 --- a/src/khoj/processor/markdown/markdown_to_jsonl.py +++ b/src/khoj/processor/markdown/markdown_to_jsonl.py @@ -128,10 +128,19 @@ class MarkdownToJsonl(TextToJsonl): entries = [] for parsed_entry in parsed_entries: entry_filename = Path(entry_to_file_map[parsed_entry]) + heading = parsed_entry.splitlines()[0] if re.search("^#+\s", parsed_entry) else "" # Append base filename to compiled entry for context to model # Increment heading level for heading entries and make filename as its top level heading prefix = f"# {entry_filename.stem}\n#" if heading else f"# {entry_filename.stem}\n" compiled_entry = f"{prefix}{parsed_entry}" + entries.append( + Entry( + compiled=compiled_entry, + raw=parsed_entry, + heading=f"{prefix}{heading}", + file=f"{entry_filename}", + ) + ) logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries")