From 45a991d75cfb9eef2c244045113649aeefc0819e Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 3 May 2023 17:47:33 +0800 Subject: [PATCH 1/6] Prepend entry heading to all compiled org snippets to improve search context All compiled snippets split by max tokens (apart from first) do not get the heading as context. This limits search context required to retrieve these continuation entries --- src/khoj/processor/org_mode/org_to_jsonl.py | 10 ++++++++-- src/khoj/processor/text_to_jsonl.py | 20 ++++++++++++++++++-- src/khoj/utils/rawconfig.py | 6 +++++- tests/test_jsonl_to_jsonl.py | 12 ++++-------- tests/test_org_to_jsonl.py | 4 +++- 5 files changed, 38 insertions(+), 14 deletions(-) diff --git a/src/khoj/processor/org_mode/org_to_jsonl.py b/src/khoj/processor/org_mode/org_to_jsonl.py index 0950a089..ed3be1d0 100644 --- a/src/khoj/processor/org_mode/org_to_jsonl.py +++ b/src/khoj/processor/org_mode/org_to_jsonl.py @@ -1,7 +1,6 @@ # Standard Packages import glob import logging -import time from typing import Iterable, List # Internal Packages @@ -139,7 +138,14 @@ class OrgToJsonl(TextToJsonl): logger.debug(f"Body: {parsed_entry.body}") if compiled: - entries += [Entry(compiled=compiled, raw=f"{parsed_entry}", file=f"{entry_to_file_map[parsed_entry]}")] + entries.append( + Entry( + compiled=compiled, + raw=f"{parsed_entry}", + heading=f"{parsed_entry.heading}", + file=f"{entry_to_file_map[parsed_entry]}", + ) + ) return entries diff --git a/src/khoj/processor/text_to_jsonl.py b/src/khoj/processor/text_to_jsonl.py index 22de2c01..e440af90 100644 --- a/src/khoj/processor/text_to_jsonl.py +++ b/src/khoj/processor/text_to_jsonl.py @@ -31,14 +31,30 @@ class TextToJsonl(ABC): "Split entries if compiled entry length exceeds the max tokens supported by the ML model." 
chunked_entries: List[Entry] = [] for entry in entries: + # Split entry into words compiled_entry_words = [word for word in entry.compiled.split(" ") if word != ""] + # Drop long words instead of having entry truncated to maintain quality of entry processed by models compiled_entry_words = [word for word in compiled_entry_words if len(word) <= max_word_length] + + # Split entry into chunks of max tokens for chunk_index in range(0, len(compiled_entry_words), max_tokens): compiled_entry_words_chunk = compiled_entry_words[chunk_index : chunk_index + max_tokens] compiled_entry_chunk = " ".join(compiled_entry_words_chunk) - entry_chunk = Entry(compiled=compiled_entry_chunk, raw=entry.raw, file=entry.file) - chunked_entries.append(entry_chunk) + + # Prepend heading to all other chunks, the first chunk already has heading from original entry + if chunk_index > 0: + compiled_entry_chunk = f"{entry.heading}.\n{compiled_entry_chunk}" + + chunked_entries.append( + Entry( + compiled=compiled_entry_chunk, + raw=entry.raw, + heading=entry.heading, + file=entry.file, + ) + ) + return chunked_entries def mark_entries_for_update( diff --git a/src/khoj/utils/rawconfig.py b/src/khoj/utils/rawconfig.py index 389e80f6..6b87c220 100644 --- a/src/khoj/utils/rawconfig.py +++ b/src/khoj/utils/rawconfig.py @@ -103,11 +103,15 @@ class SearchResponse(ConfigBase): class Entry: raw: str compiled: str + heading: Optional[str] file: Optional[str] - def __init__(self, raw: str = None, compiled: str = None, file: Optional[str] = None): + def __init__( + self, raw: str = None, compiled: str = None, heading: Optional[str] = None, file: Optional[str] = None + ): self.raw = raw self.compiled = compiled + self.heading = heading self.file = file def to_json(self) -> str: diff --git a/tests/test_jsonl_to_jsonl.py b/tests/test_jsonl_to_jsonl.py index eb25d579..b52b5fc9 100644 --- a/tests/test_jsonl_to_jsonl.py +++ b/tests/test_jsonl_to_jsonl.py @@ -1,17 +1,13 @@ -# Standard Packages -import json - # 
Internal Packages from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl -from khoj.utils.jsonl import load_jsonl from khoj.utils.rawconfig import Entry def test_process_entries_from_single_input_jsonl(tmp_path): "Convert multiple jsonl entries from single file to entries." # Arrange - input_jsonl = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "file": "source/file/path1"} -{"raw": "raw input data 2", "compiled": "compiled input data 2", "file": "source/file/path2"} + input_jsonl = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "heading": null, "file": "source/file/path1"} +{"raw": "raw input data 2", "compiled": "compiled input data 2", "heading": null, "file": "source/file/path2"} """ input_jsonl_file = create_file(tmp_path, input_jsonl) @@ -29,8 +25,8 @@ def test_process_entries_from_single_input_jsonl(tmp_path): def test_process_entries_from_multiple_input_jsonls(tmp_path): "Convert multiple jsonl entries from single file to entries." # Arrange - input_jsonl_1 = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "file": "source/file/path1"}""" - input_jsonl_2 = """{"raw": "raw input data 2", "compiled": "compiled input data 2", "file": "source/file/path2"}""" + input_jsonl_1 = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "heading": null, "file": "source/file/path1"}""" + input_jsonl_2 = """{"raw": "raw input data 2", "compiled": "compiled input data 2", "heading": null, "file": "source/file/path2"}""" input_jsonl_file_1 = create_file(tmp_path, input_jsonl_1, filename="input1.jsonl") input_jsonl_file_2 = create_file(tmp_path, input_jsonl_2, filename="input2.jsonl") diff --git a/tests/test_org_to_jsonl.py b/tests/test_org_to_jsonl.py index aed4983f..15dd368a 100644 --- a/tests/test_org_to_jsonl.py +++ b/tests/test_org_to_jsonl.py @@ -62,9 +62,11 @@ def test_entry_split_when_exceeds_max_words(tmp_path): # Assert assert len(jsonl_data) == 2 + # Ensure compiled entries split by 
max_words start with entry heading (for search context) + assert all(entry["compiled"].startswith("Heading") for entry in jsonl_data) -def test_entry_split_drops_large_words(tmp_path): +def test_entry_split_drops_large_words(): "Ensure entries drops words larger than specified max word length from compiled version." # Arrange entry_text = f"""*** Heading From 0e3fb59e098100765977365b566ed3d50c6a6b9c Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 3 May 2023 18:18:48 +0800 Subject: [PATCH 2/6] Entries with no md headings should not get heading prefix prepended Files with no headings would previously get their entry be prefixed with a markdown heading prefix (#) --- src/khoj/processor/markdown/markdown_to_jsonl.py | 10 ++++++---- tests/test_markdown_to_jsonl.py | 2 ++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/khoj/processor/markdown/markdown_to_jsonl.py b/src/khoj/processor/markdown/markdown_to_jsonl.py index a1e4d0c1..9e08ae89 100644 --- a/src/khoj/processor/markdown/markdown_to_jsonl.py +++ b/src/khoj/processor/markdown/markdown_to_jsonl.py @@ -2,7 +2,6 @@ import glob import logging import re -import time from pathlib import Path from typing import List @@ -110,10 +109,13 @@ class MarkdownToJsonl(TextToJsonl): with open(markdown_file, "r", encoding="utf8") as f: markdown_content = f.read() markdown_entries_per_file = [] + any_headings = re.search(markdown_heading_regex, markdown_content, flags=re.MULTILINE) for entry in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE): - prefix = "#" if entry.startswith("#") else "# " - if entry.strip(empty_escape_sequences) != "": - markdown_entries_per_file.append(f"{prefix}{entry.strip(empty_escape_sequences)}") + # Add heading level as the regex split removed it from entries with headings + prefix = "#" if entry.startswith("#") else "# " if any_headings else "" + stripped_entry = entry.strip(empty_escape_sequences) + if stripped_entry != "": + 
markdown_entries_per_file.append(f"{prefix}{stripped_entry}") entry_to_file_map += zip(markdown_entries_per_file, [markdown_file] * len(markdown_entries_per_file)) entries.extend(markdown_entries_per_file) diff --git a/tests/test_markdown_to_jsonl.py b/tests/test_markdown_to_jsonl.py index dfb42fed..ca22f359 100644 --- a/tests/test_markdown_to_jsonl.py +++ b/tests/test_markdown_to_jsonl.py @@ -27,6 +27,8 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path): # Assert assert len(jsonl_data) == 1 + # Ensure entries with no headings do not get heading prefix prepended + assert not jsonl_data[0]["compiled"].startswith("#") and not jsonl_data[0]["raw"].startswith("#") def test_single_markdown_entry_to_jsonl(tmp_path): From 5de04621b5f038be59f869ddecd7dd9fce3fd89c Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 3 May 2023 18:55:56 +0800 Subject: [PATCH 3/6] Set filename as top heading of md entries for better search context Previously filename was appended to the end of the compiled entry. 
This didn't provide appropriate structured context Test filename getting prepended as heading to compiled entry --- src/khoj/processor/markdown/markdown_to_jsonl.py | 5 +++-- tests/test_markdown_to_jsonl.py | 9 ++++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/khoj/processor/markdown/markdown_to_jsonl.py b/src/khoj/processor/markdown/markdown_to_jsonl.py index 9e08ae89..20cf9b2c 100644 --- a/src/khoj/processor/markdown/markdown_to_jsonl.py +++ b/src/khoj/processor/markdown/markdown_to_jsonl.py @@ -129,8 +129,9 @@ class MarkdownToJsonl(TextToJsonl): for parsed_entry in parsed_entries: entry_filename = Path(entry_to_file_map[parsed_entry]) # Append base filename to compiled entry for context to model - compiled_entry = f"{parsed_entry}\n{entry_filename.stem}" - entries.append(Entry(compiled=compiled_entry, raw=parsed_entry, file=f"{entry_filename}")) + # Increment heading level for heading entries and make filename as its top level heading + prefix = f"# {entry_filename.stem}\n#" if heading else f"# {entry_filename.stem}\n" + compiled_entry = f"{prefix}{parsed_entry}" logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries") diff --git a/tests/test_markdown_to_jsonl.py b/tests/test_markdown_to_jsonl.py index ca22f359..87a1a07e 100644 --- a/tests/test_markdown_to_jsonl.py +++ b/tests/test_markdown_to_jsonl.py @@ -14,6 +14,7 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path): - Bullet point 2 """ markdownfile = create_file(tmp_path, entry) + expected_heading = "# " + markdownfile.stem # Act # Extract Entries from specified Markdown files @@ -27,8 +28,10 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path): # Assert assert len(jsonl_data) == 1 - # Ensure entries with no headings do not get heading prefix prepended - assert not jsonl_data[0]["compiled"].startswith("#") and not jsonl_data[0]["raw"].startswith("#") + # Ensure raw entry with no headings do not get heading prefix prepended + assert not 
jsonl_data[0]["raw"].startswith("#") + # Ensure compiled entry has filename prepended as top level heading + assert jsonl_data[0]["compiled"].startswith(expected_heading) def test_single_markdown_entry_to_jsonl(tmp_path): @@ -130,7 +133,7 @@ def test_extract_entries_with_different_level_headings(tmp_path): # Helper Functions -def create_file(tmp_path, entry=None, filename="test.md"): +def create_file(tmp_path: Path, entry=None, filename="test.md"): markdown_file = tmp_path / filename markdown_file.touch() if entry: From 94825a70b9f03ed2fdcc8b2968a8a7d5cef439e0 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 3 May 2023 18:58:37 +0800 Subject: [PATCH 4/6] Set heading of md entries to improve search context for long entries Otherwise if a markdown entry is longer than max_tokens, the split entries (apart from first one) do not get their heading context set --- src/khoj/processor/markdown/markdown_to_jsonl.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/khoj/processor/markdown/markdown_to_jsonl.py b/src/khoj/processor/markdown/markdown_to_jsonl.py index 20cf9b2c..0179e05e 100644 --- a/src/khoj/processor/markdown/markdown_to_jsonl.py +++ b/src/khoj/processor/markdown/markdown_to_jsonl.py @@ -128,10 +128,19 @@ class MarkdownToJsonl(TextToJsonl): entries = [] for parsed_entry in parsed_entries: entry_filename = Path(entry_to_file_map[parsed_entry]) + heading = parsed_entry.splitlines()[0] if re.search("^#+\s", parsed_entry) else "" # Append base filename to compiled entry for context to model # Increment heading level for heading entries and make filename as its top level heading prefix = f"# {entry_filename.stem}\n#" if heading else f"# {entry_filename.stem}\n" compiled_entry = f"{prefix}{parsed_entry}" + entries.append( + Entry( + compiled=compiled_entry, + raw=parsed_entry, + heading=f"{prefix}{heading}", + file=f"{entry_filename}", + ) + ) logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries") From 
02aeee60aaea3ce828be80e7a2732a2c0d28ecb8 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 3 May 2023 19:51:25 +0800 Subject: [PATCH 5/6] Set filename as top heading of org entries for better search context Previously filename was only being appended to markdown entries. Test filename getting prepended to compiled entry as heading --- src/khoj/processor/org_mode/org_to_jsonl.py | 9 +++++++-- tests/test_org_to_jsonl.py | 5 +++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/khoj/processor/org_mode/org_to_jsonl.py b/src/khoj/processor/org_mode/org_to_jsonl.py index ed3be1d0..e5ec7cc6 100644 --- a/src/khoj/processor/org_mode/org_to_jsonl.py +++ b/src/khoj/processor/org_mode/org_to_jsonl.py @@ -1,6 +1,7 @@ # Standard Packages import glob import logging +from pathlib import Path from typing import Iterable, List # Internal Packages @@ -112,7 +113,11 @@ class OrgToJsonl(TextToJsonl): # Ignore title notes i.e notes with just headings and empty body continue - compiled = f"{parsed_entry.heading}." + # Prepend filename as top heading to entry + filename = Path(entry_to_file_map[parsed_entry]).stem + heading = f"* {filename}\n** {parsed_entry.heading}." if parsed_entry.heading else f"* {filename}." 
+ + compiled = heading if state.verbose > 2: logger.debug(f"Title: {parsed_entry.heading}") @@ -142,7 +147,7 @@ class OrgToJsonl(TextToJsonl): Entry( compiled=compiled, raw=f"{parsed_entry}", - heading=f"{parsed_entry.heading}", + heading=f"{heading}", file=f"{entry_to_file_map[parsed_entry]}", ) ) diff --git a/tests/test_org_to_jsonl.py b/tests/test_org_to_jsonl.py index 15dd368a..171037c0 100644 --- a/tests/test_org_to_jsonl.py +++ b/tests/test_org_to_jsonl.py @@ -47,6 +47,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path): Body Line """ orgfile = create_file(tmp_path, entry) + expected_heading = f"* {orgfile.stem}\n** Heading" # Act # Extract Entries from specified Org files @@ -55,7 +56,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path): # Split each entry from specified Org files by max words jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl( TextToJsonl.split_entries_by_max_tokens( - OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=2 + OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=4 ) ) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] @@ -63,7 +64,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path): # Assert assert len(jsonl_data) == 2 # Ensure compiled entries split by max_words start with entry heading (for search context) - assert all(entry["compiled"].startswith("Heading") for entry in jsonl_data) + assert all([entry["compiled"].startswith(expected_heading) for entry in jsonl_data]) def test_entry_split_drops_large_words(): From 6b535cc3457072b40f37b31de684bbf76b48be6b Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 3 May 2023 22:08:41 +0800 Subject: [PATCH 6/6] Snip prepended heading to avoid crossing model max_token limits Otherwise if heading > max_tokens than the search models will just see a heading (with repeated filename) for each compiled entry and not actual content. 
100 characters should be sufficient to include filename (not path) and entry heading. If the heading is longer, truncate it instead, so that the entry's unique text is still passed to the model for search context --- src/khoj/processor/text_to_jsonl.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/khoj/processor/text_to_jsonl.py b/src/khoj/processor/text_to_jsonl.py index e440af90..3dd0d1b5 100644 --- a/src/khoj/processor/text_to_jsonl.py +++ b/src/khoj/processor/text_to_jsonl.py @@ -44,7 +44,10 @@ class TextToJsonl(ABC): # Prepend heading to all other chunks, the first chunk already has heading from original entry if chunk_index > 0: - compiled_entry_chunk = f"{entry.heading}.\n{compiled_entry_chunk}" + # Snip heading to avoid crossing max_tokens limit + # Keep last 100 characters of heading as entry heading more important than filename + snipped_heading = entry.heading[-100:] + compiled_entry_chunk = f"{snipped_heading}.\n{compiled_entry_chunk}" chunked_entries.append( Entry(