From 28105ee027b44d152d2d11deda8b7fc708e136d9 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Fri, 9 Feb 2024 16:04:41 +0530 Subject: [PATCH 01/17] Create wrapper function to get entries from org, md, pdf & text files - Convert extract_org_entries function to actually extract org entries Previously it was extracting intermediary org-node objects instead Now it extracts the org-node objects from files and converts them into entries - Create separate, new function to extract_org_nodes from files - Similarly create wrapper funcs for md, pdf, plaintext to entries - Update org, md, pdf, plaintext to entries tests to use the new simplified wrapper function to extract org entries --- .../content/markdown/markdown_to_entries.py | 28 ++++++++--------- .../content/org_mode/org_to_entries.py | 30 +++++++++++-------- .../processor/content/pdf/pdf_to_entries.py | 12 ++++---- .../content/plaintext/plaintext_to_entries.py | 6 ++-- tests/test_markdown_to_entries.py | 21 +++++-------- tests/test_org_to_entries.py | 29 ++++++------------ tests/test_pdf_to_entries.py | 17 ++++------- tests/test_plaintext_to_entries.py | 22 +++++++------- 8 files changed, 71 insertions(+), 94 deletions(-) diff --git a/src/khoj/processor/content/markdown/markdown_to_entries.py b/src/khoj/processor/content/markdown/markdown_to_entries.py index 7274cf1c..a3cb75f9 100644 --- a/src/khoj/processor/content/markdown/markdown_to_entries.py +++ b/src/khoj/processor/content/markdown/markdown_to_entries.py @@ -32,10 +32,8 @@ class MarkdownToEntries(TextToEntries): deletion_file_names = None # Extract Entries from specified Markdown files - with timer("Parse entries from Markdown files into dictionaries", logger): - current_entries = MarkdownToEntries.convert_markdown_entries_to_maps( - *MarkdownToEntries.extract_markdown_entries(files) - ) + with timer("Extract entries from specified Markdown files", logger): + current_entries = MarkdownToEntries.extract_markdown_entries(files) # Split entries by max tokens supported by model with timer("Split entries by max token size supported by model", logger): @@ -57,13 +55,10 @@ class MarkdownToEntries(TextToEntries): return num_new_embeddings, num_deleted_embeddings @staticmethod - def extract_markdown_entries(markdown_files): + def extract_markdown_entries(markdown_files) -> List[Entry]: "Extract entries by heading from specified Markdown files" - - # Regex to extract Markdown Entries by Heading - - entries = [] - entry_to_file_map = [] + entries: List[str] = [] + entry_to_file_map: List[Tuple[str, Path]] = [] for markdown_file in markdown_files: try: markdown_content = markdown_files[markdown_file] @@ -71,18 +66,19 @@ class MarkdownToEntries(TextToEntries): markdown_content, markdown_file, entries, entry_to_file_map ) except Exception as e: - logger.warning(f"Unable to process file: {markdown_file}. This file will not be indexed.") - logger.warning(e, exc_info=True) + logger.warning( + f"Unable to process file: {markdown_file}. This file will not be indexed.\n{e}", exc_info=True + ) - return entries, dict(entry_to_file_map) + return MarkdownToEntries.convert_markdown_entries_to_maps(entries, dict(entry_to_file_map)) @staticmethod def process_single_markdown_file( - markdown_content: str, markdown_file: Path, entries: List, entry_to_file_map: List + markdown_content: str, markdown_file: Path, entries: List[str], entry_to_file_map: List[Tuple[str, Path]] ): markdown_heading_regex = r"^#" - markdown_entries_per_file = [] + markdown_entries_per_file: List[str] = [] any_headings = re.search(markdown_heading_regex, markdown_content, flags=re.MULTILINE) for entry in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE): # Add heading level as the regex split removed it from entries with headings @@ -98,7 +94,7 @@ class MarkdownToEntries(TextToEntries): @staticmethod def convert_markdown_entries_to_maps(parsed_entries: List[str], entry_to_file_map) -> List[Entry]: "Convert each Markdown entries into a dictionary" - entries = [] + entries: List[Entry] = [] for parsed_entry in parsed_entries: raw_filename = entry_to_file_map[parsed_entry] diff --git a/src/khoj/processor/content/org_mode/org_to_entries.py b/src/khoj/processor/content/org_mode/org_to_entries.py index 0e115f78..989d3501 100644 --- a/src/khoj/processor/content/org_mode/org_to_entries.py +++ b/src/khoj/processor/content/org_mode/org_to_entries.py @@ -21,9 +21,6 @@ class OrgToEntries(TextToEntries): def process( self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False ) -> Tuple[int, int]: - # Extract required fields from config - index_heading_entries = False - if not full_corpus: deletion_file_names = set([file for file in files if files[file] == ""]) files_to_process = set(files) - deletion_file_names @@ -32,11 +29,8 @@ class OrgToEntries(TextToEntries): deletion_file_names = None # Extract Entries from specified Org files - with timer("Parse entries from org files into OrgNode objects", logger): - entry_nodes, file_to_entries = self.extract_org_entries(files) - - with timer("Convert OrgNodes into list of entries", logger): - current_entries = self.convert_org_nodes_to_entries(entry_nodes, file_to_entries, index_heading_entries) + with timer("Extract entries from specified Org files", logger): + current_entries = self.extract_org_entries(files) with timer("Split entries by max token size supported by model", logger): current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256) @@ -57,9 +51,18 @@ class OrgToEntries(TextToEntries): return num_new_embeddings, num_deleted_embeddings @staticmethod - def extract_org_entries(org_files: dict[str, str]): + def extract_org_entries(org_files: dict[str, str], index_heading_entries: bool = False): "Extract entries from specified Org files" - entries = [] + with timer("Parse entries from org files into OrgNode objects", logger): + entry_nodes, file_to_entries = OrgToEntries.extract_org_nodes(org_files) + + with timer("Convert OrgNodes into list of entries", logger): + return OrgToEntries.convert_org_nodes_to_entries(entry_nodes, file_to_entries, index_heading_entries) + + @staticmethod + def extract_org_nodes(org_files: dict[str, str]): + "Extract org nodes from specified org files" + entry_nodes = [] entry_to_file_map: List[Tuple[orgnode.Orgnode, str]] = [] for org_file in org_files: filename = org_file @@ -67,16 +70,17 @@ class OrgToEntries(TextToEntries): try: org_file_entries = orgnode.makelist(file, filename) entry_to_file_map += zip(org_file_entries, [org_file] * len(org_file_entries)) - entries.extend(org_file_entries) + entry_nodes.extend(org_file_entries) except Exception as e: logger.warning(f"Unable to process file: {org_file}. This file will not be indexed.") logger.warning(e, exc_info=True) - return entries, dict(entry_to_file_map) + return entry_nodes, dict(entry_to_file_map) @staticmethod def process_single_org_file(org_content: str, org_file: str, entries: List, entry_to_file_map: List): - # Process single org file. The org parser assumes that the file is a single org file and reads it from a buffer. We'll split the raw conetnt of this file by new line to mimic the same behavior. + # Process single org file. The org parser assumes that the file is a single org file and reads it from a buffer. + # We'll split the raw content of this file by new line to mimic the same behavior. try: org_file_entries = orgnode.makelist(org_content, org_file) entry_to_file_map += zip(org_file_entries, [org_file] * len(org_file_entries)) diff --git a/src/khoj/processor/content/pdf/pdf_to_entries.py b/src/khoj/processor/content/pdf/pdf_to_entries.py index 3582cbe0..008b4cce 100644 --- a/src/khoj/processor/content/pdf/pdf_to_entries.py +++ b/src/khoj/processor/content/pdf/pdf_to_entries.py @@ -32,8 +32,8 @@ class PdfToEntries(TextToEntries): deletion_file_names = None # Extract Entries from specified Pdf files - with timer("Parse entries from PDF files into dictionaries", logger): - current_entries = PdfToEntries.convert_pdf_entries_to_maps(*PdfToEntries.extract_pdf_entries(files)) + with timer("Extract entries from specified PDF files", logger): + current_entries = PdfToEntries.extract_pdf_entries(files) # Split entries by max tokens supported by model with timer("Split entries by max token size supported by model", logger): @@ -55,11 +55,11 @@ class PdfToEntries(TextToEntries): return num_new_embeddings, num_deleted_embeddings @staticmethod - def extract_pdf_entries(pdf_files): + def extract_pdf_entries(pdf_files) -> List[Entry]: """Extract entries by page from specified PDF files""" - entries = [] - entry_to_location_map = [] + entries: List[str] = [] + entry_to_location_map: List[Tuple[str, str]] = [] for pdf_file in pdf_files: try: # Write the PDF file to a temporary file, as it is stored in byte format in the pdf_file object and the PDF Loader expects a file path @@ -83,7 +83,7 @@ class PdfToEntries(TextToEntries): if os.path.exists(f"{tmp_file}"): os.remove(f"{tmp_file}") - return entries, dict(entry_to_location_map) + return PdfToEntries.convert_pdf_entries_to_maps(entries, dict(entry_to_location_map)) @staticmethod def convert_pdf_entries_to_maps(parsed_entries: List[str], entry_to_file_map) -> List[Entry]: diff --git a/src/khoj/processor/content/plaintext/plaintext_to_entries.py b/src/khoj/processor/content/plaintext/plaintext_to_entries.py index 9ea8c654..67604ad7 100644 --- a/src/khoj/processor/content/plaintext/plaintext_to_entries.py +++ b/src/khoj/processor/content/plaintext/plaintext_to_entries.py @@ -42,8 +42,8 @@ class PlaintextToEntries(TextToEntries): logger.warning(e, exc_info=True) # Extract Entries from specified plaintext files - with timer("Parse entries from plaintext files", logger): - current_entries = PlaintextToEntries.convert_plaintext_entries_to_maps(files) + with timer("Parse entries from specified Plaintext files", logger): + current_entries = PlaintextToEntries.extract_plaintext_entries(files) # Split entries by max tokens supported by model with timer("Split entries by max token size supported by model", logger): @@ -74,7 +74,7 @@ class PlaintextToEntries(TextToEntries): return soup.get_text(strip=True, separator="\n") @staticmethod - def convert_plaintext_entries_to_maps(entry_to_file_map: dict) -> List[Entry]: + def extract_plaintext_entries(entry_to_file_map: dict[str, str]) -> List[Entry]: "Convert each plaintext entries into a dictionary" entries = [] for file, entry in entry_to_file_map.items(): diff --git a/tests/test_markdown_to_entries.py b/tests/test_markdown_to_entries.py index 12ea238e..6851a7ed 100644 --- a/tests/test_markdown_to_entries.py +++ b/tests/test_markdown_to_entries.py @@ -21,12 +21,10 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path): # Act # Extract Entries from specified Markdown files - entry_nodes, file_to_entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data) + entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data) # Process Each Entry from All Notes Files - jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl( - MarkdownToEntries.convert_markdown_entries_to_maps(entry_nodes, file_to_entries) - ) + jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(entries) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] # Assert @@ -52,12 +50,10 @@ def test_single_markdown_entry_to_jsonl(tmp_path): # Act # Extract Entries from specified Markdown files - entries, entry_to_file_map = MarkdownToEntries.extract_markdown_entries(markdown_files=data) + entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data) # Process Each Entry from All Notes Files - jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl( - MarkdownToEntries.convert_markdown_entries_to_maps(entries, entry_to_file_map) - ) + jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(entries) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] # Assert @@ -81,8 +77,7 @@ def test_multiple_markdown_entries_to_jsonl(tmp_path): # Act # Extract Entries from specified Markdown files - entry_strings, entry_to_file_map = MarkdownToEntries.extract_markdown_entries(markdown_files=data) - entries = MarkdownToEntries.convert_markdown_entries_to_maps(entry_strings, entry_to_file_map) + entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data) # Process Each Entry from All Notes Files jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(entries) @@ -144,12 +139,12 @@ def test_extract_entries_with_different_level_headings(tmp_path): # Act # Extract Entries from specified Markdown files - entries, _ = MarkdownToEntries.extract_markdown_entries(markdown_files=data) + entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data) # Assert assert len(entries) == 2 - assert entries[0] == "# Heading 1" - assert entries[1] == "## Heading 2" + assert entries[0].raw == "# Heading 1" + assert entries[1].raw == "## Heading 2" # Helper Functions diff --git a/tests/test_org_to_entries.py b/tests/test_org_to_entries.py index f7c19b79..e1051d25 100644 --- a/tests/test_org_to_entries.py +++ b/tests/test_org_to_entries.py @@ -27,9 +27,7 @@ def test_configure_heading_entry_to_jsonl(tmp_path): # Act # Extract entries into jsonl from specified Org files jsonl_string = OrgToEntries.convert_org_entries_to_jsonl( - OrgToEntries.convert_org_nodes_to_entries( - *OrgToEntries.extract_org_entries(org_files=data), index_heading_entries=index_heading_entries - ) + OrgToEntries.extract_org_entries(org_files=data, index_heading_entries=index_heading_entries) ) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] @@ -57,13 +55,11 @@ def test_entry_split_when_exceeds_max_words(): # Act # Extract Entries from specified Org files - entries, entry_to_file_map = OrgToEntries.extract_org_entries(org_files=data) + entries = OrgToEntries.extract_org_entries(org_files=data) # Split each entry from specified Org files by max words jsonl_string = OrgToEntries.convert_org_entries_to_jsonl( - TextToEntries.split_entries_by_max_tokens( - OrgToEntries.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=4 - ) + TextToEntries.split_entries_by_max_tokens(entries, max_tokens=4) ) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] @@ -107,12 +103,7 @@ def test_entry_with_body_to_jsonl(tmp_path): # Act # Extract Entries from specified Org files - entries, entry_to_file_map = OrgToEntries.extract_org_entries(org_files=data) - - # Process Each Entry from All Notes Files - jsonl_string = OrgToEntries.convert_org_entries_to_jsonl( - OrgToEntries.convert_org_nodes_to_entries(entries, entry_to_file_map) - ) + jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(OrgToEntries.extract_org_entries(org_files=data)) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] # Assert @@ -134,10 +125,9 @@ Intro text # Act # Extract Entries from specified Org files - entry_nodes, file_to_entries = OrgToEntries.extract_org_entries(org_files=data) + entries = OrgToEntries.extract_org_entries(org_files=data) # Process Each Entry from All Notes Files - entries = OrgToEntries.convert_org_nodes_to_entries(entry_nodes, file_to_entries) jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(entries) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] @@ -158,10 +148,9 @@ def test_file_with_no_headings_to_jsonl(tmp_path): # Act # Extract Entries from specified Org files - entry_nodes, file_to_entries = OrgToEntries.extract_org_entries(org_files=data) + entries = OrgToEntries.extract_org_entries(org_files=data) # Process Each Entry from All Notes Files - entries = OrgToEntries.convert_org_nodes_to_entries(entry_nodes, file_to_entries) jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(entries) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] @@ -222,12 +211,12 @@ def test_extract_entries_with_different_level_headings(tmp_path): # Act # Extract Entries from specified Org files - entries, _ = OrgToEntries.extract_org_entries(org_files=data) + entries = OrgToEntries.extract_org_entries(org_files=data, index_heading_entries=True) # Assert assert len(entries) == 2 - assert f"{entries[0]}".startswith("* Heading 1") - assert f"{entries[1]}".startswith("** Heading 2") + assert f"{entries[0].raw}".startswith("* Heading 1") + assert f"{entries[1].raw}".startswith("** Heading 2") # Helper Functions diff --git a/tests/test_pdf_to_entries.py b/tests/test_pdf_to_entries.py index 62decdd7..1e3e2783 100644 --- a/tests/test_pdf_to_entries.py +++ b/tests/test_pdf_to_entries.py @@ -15,12 +15,10 @@ def test_single_page_pdf_to_jsonl(): pdf_bytes = f.read() data = {"tests/data/pdf/singlepage.pdf": pdf_bytes} - entries, entry_to_file_map = PdfToEntries.extract_pdf_entries(pdf_files=data) + entries = PdfToEntries.extract_pdf_entries(pdf_files=data) # Process Each Entry from All Pdf Files - jsonl_string = PdfToEntries.convert_pdf_maps_to_jsonl( - PdfToEntries.convert_pdf_entries_to_maps(entries, entry_to_file_map) - ) + jsonl_string = PdfToEntries.convert_pdf_maps_to_jsonl(entries) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] # Assert @@ -35,12 +33,10 @@ def test_multi_page_pdf_to_jsonl(): pdf_bytes = f.read() data = {"tests/data/pdf/multipage.pdf": pdf_bytes} - entries, entry_to_file_map = PdfToEntries.extract_pdf_entries(pdf_files=data) + entries = PdfToEntries.extract_pdf_entries(pdf_files=data) # Process Each Entry from All Pdf Files - jsonl_string = PdfToEntries.convert_pdf_maps_to_jsonl( - PdfToEntries.convert_pdf_entries_to_maps(entries, entry_to_file_map) - ) + jsonl_string = PdfToEntries.convert_pdf_maps_to_jsonl(entries) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] # Assert @@ -55,10 +51,7 @@ def test_ocr_page_pdf_to_jsonl(): pdf_bytes = f.read() data = {"tests/data/pdf/ocr_samples.pdf": pdf_bytes} - entries, entry_to_file_map = PdfToEntries.extract_pdf_entries(pdf_files=data) - - # Process Each Entry from All Pdf Files - entries = PdfToEntries.convert_pdf_entries_to_maps(entries, entry_to_file_map) + entries = PdfToEntries.extract_pdf_entries(pdf_files=data) assert len(entries) == 1 assert "playing on a strip of marsh" in entries[0].raw diff --git a/tests/test_plaintext_to_entries.py b/tests/test_plaintext_to_entries.py index 53585177..679892dc 100644 --- a/tests/test_plaintext_to_entries.py +++ b/tests/test_plaintext_to_entries.py @@ -11,10 +11,10 @@ from khoj.utils.rawconfig import TextContentConfig def test_plaintext_file(tmp_path): "Convert files with no heading to jsonl." # Arrange - entry = f""" + raw_entry = f""" Hi, I am a plaintext file and I have some plaintext words. """ - plaintextfile = create_file(tmp_path, entry) + plaintextfile = create_file(tmp_path, raw_entry) filename = plaintextfile.stem @@ -22,17 +22,17 @@ def test_plaintext_file(tmp_path): # Extract Entries from specified plaintext files data = { - f"{plaintextfile}": entry, + f"{plaintextfile}": raw_entry, } - maps = PlaintextToEntries.convert_plaintext_entries_to_maps(entry_to_file_map=data) + entries = PlaintextToEntries.extract_plaintext_entries(entry_to_file_map=data) # Convert each entry.file to absolute path to make them JSON serializable - for map in maps: - map.file = str(Path(map.file).absolute()) + for entry in entries: + entry.file = str(Path(entry.file).absolute()) # Process Each Entry from All Notes Files - jsonl_string = PlaintextToEntries.convert_entries_to_jsonl(maps) + jsonl_string = PlaintextToEntries.convert_entries_to_jsonl(entries) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] # Assert @@ -40,7 +40,7 @@ def test_plaintext_file(tmp_path): # Ensure raw entry with no headings do not get heading prefix prepended assert not jsonl_data[0]["raw"].startswith("#") # Ensure compiled entry has filename prepended as top level heading - assert jsonl_data[0]["compiled"] == f"{filename}\n{entry}" + assert jsonl_data[0]["compiled"] == f"{filename}\n{raw_entry}" def test_get_plaintext_files(tmp_path): @@ -98,11 +98,11 @@ def test_parse_html_plaintext_file(content_config, default_user: KhojUser): extracted_plaintext_files = get_plaintext_files(config=config) # Act - maps = PlaintextToEntries.convert_plaintext_entries_to_maps(extracted_plaintext_files) + entries = PlaintextToEntries.extract_plaintext_entries(extracted_plaintext_files) # Assert - assert len(maps) == 1 - assert "
" not in maps[0].raw + assert len(entries) == 1 + assert "
" not in entries[0].raw # Helper Functions From a627f56a64f01f6adafabd58d5ca7b6a6ebd2872 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Fri, 9 Feb 2024 17:03:36 +0530 Subject: [PATCH 02/17] Remove unused Entry to Jsonl converter from text to entry class, tests This was earlier used when the index was plaintext jsonl file. Now that documents are indexed in a DB this func is not required. Simplify org,md,pdf,plaintext to entries tests by removing the entry to jsonl conversion step --- .../content/markdown/markdown_to_entries.py | 5 --- .../content/org_mode/org_to_entries.py | 5 --- .../processor/content/pdf/pdf_to_entries.py | 5 --- .../content/plaintext/plaintext_to_entries.py | 5 --- src/khoj/processor/content/text_to_entries.py | 5 --- tests/test_markdown_to_entries.py | 24 ++++--------- tests/test_org_to_entries.py | 35 ++++++------------- tests/test_pdf_to_entries.py | 13 ++----- tests/test_plaintext_to_entries.py | 11 ++---- 9 files changed, 21 insertions(+), 87 deletions(-) diff --git a/src/khoj/processor/content/markdown/markdown_to_entries.py b/src/khoj/processor/content/markdown/markdown_to_entries.py index a3cb75f9..fb0fe4e5 100644 --- a/src/khoj/processor/content/markdown/markdown_to_entries.py +++ b/src/khoj/processor/content/markdown/markdown_to_entries.py @@ -123,8 +123,3 @@ class MarkdownToEntries(TextToEntries): logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries") return entries - - @staticmethod - def convert_markdown_maps_to_jsonl(entries: List[Entry]): - "Convert each Markdown entry to JSON and collate as JSONL" - return "".join([f"{entry.to_json()}\n" for entry in entries]) diff --git a/src/khoj/processor/content/org_mode/org_to_entries.py b/src/khoj/processor/content/org_mode/org_to_entries.py index 989d3501..60226946 100644 --- a/src/khoj/processor/content/org_mode/org_to_entries.py +++ b/src/khoj/processor/content/org_mode/org_to_entries.py @@ -146,8 +146,3 @@ class OrgToEntries(TextToEntries): ) return entries - - @staticmethod - def convert_org_entries_to_jsonl(entries: Iterable[Entry]) -> str: - "Convert each Org-Mode entry to JSON and collate as JSONL" - return "".join([f"{entry_dict.to_json()}\n" for entry_dict in entries]) diff --git a/src/khoj/processor/content/pdf/pdf_to_entries.py b/src/khoj/processor/content/pdf/pdf_to_entries.py index 008b4cce..c59b305c 100644 --- a/src/khoj/processor/content/pdf/pdf_to_entries.py +++ b/src/khoj/processor/content/pdf/pdf_to_entries.py @@ -106,8 +106,3 @@ class PdfToEntries(TextToEntries): logger.debug(f"Converted {len(parsed_entries)} PDF entries to dictionaries") return entries - - @staticmethod - def convert_pdf_maps_to_jsonl(entries: List[Entry]): - "Convert each PDF entry to JSON and collate as JSONL" - return "".join([f"{entry.to_json()}\n" for entry in entries]) diff --git a/src/khoj/processor/content/plaintext/plaintext_to_entries.py b/src/khoj/processor/content/plaintext/plaintext_to_entries.py index 67604ad7..4fb0dd2e 100644 --- a/src/khoj/processor/content/plaintext/plaintext_to_entries.py +++ b/src/khoj/processor/content/plaintext/plaintext_to_entries.py @@ -87,8 +87,3 @@ class PlaintextToEntries(TextToEntries): ) ) return entries - - @staticmethod - def convert_entries_to_jsonl(entries: List[Entry]): - "Convert each entry to JSON and collate as JSONL" - return "".join([f"{entry.to_json()}\n" for entry in entries]) diff --git a/src/khoj/processor/content/text_to_entries.py b/src/khoj/processor/content/text_to_entries.py index f8bf30dc..8ebc6604 100644 --- a/src/khoj/processor/content/text_to_entries.py +++ b/src/khoj/processor/content/text_to_entries.py @@ -244,11 +244,6 @@ class TextToEntries(ABC): return entries_with_ids - @staticmethod - def convert_text_maps_to_jsonl(entries: List[Entry]) -> str: - # Convert each entry to JSON and write to JSONL file - return "".join([f"{entry.to_json()}\n" for entry in entries]) - @staticmethod def clean_field(field: str) -> str: return field.replace("\0", "") if not is_none_or_empty(field) else "" diff --git a/tests/test_markdown_to_entries.py b/tests/test_markdown_to_entries.py index 6851a7ed..8a086dbc 100644 --- a/tests/test_markdown_to_entries.py +++ b/tests/test_markdown_to_entries.py @@ -23,18 +23,14 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path): # Extract Entries from specified Markdown files entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data) - # Process Each Entry from All Notes Files - jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(entries) - jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] - # Assert - assert len(jsonl_data) == 1 + assert len(entries) == 1 # Ensure raw entry with no headings do not get heading prefix prepended - assert not jsonl_data[0]["raw"].startswith("#") + assert not entries[0].raw.startswith("#") # Ensure compiled entry has filename prepended as top level heading - assert expected_heading in jsonl_data[0]["compiled"] + assert entries[0].compiled.startswith(expected_heading) # Ensure compiled entry also includes the file name - assert str(tmp_path) in jsonl_data[0]["compiled"] + assert str(tmp_path) in entries[0].compiled def test_single_markdown_entry_to_jsonl(tmp_path): @@ -52,12 +48,8 @@ def test_single_markdown_entry_to_jsonl(tmp_path): # Extract Entries from specified Markdown files entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data) - # Process Each Entry from All Notes Files - jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(entries) - jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] - # Assert - assert len(jsonl_data) == 1 + assert len(entries) == 1 def test_multiple_markdown_entries_to_jsonl(tmp_path): @@ -79,12 +71,8 @@ def test_multiple_markdown_entries_to_jsonl(tmp_path): # Extract Entries from specified Markdown files entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data) - # Process Each Entry from All Notes Files - jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(entries) - jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] - # Assert - assert len(jsonl_data) == 2 + assert len(entries) == 2 # Ensure entry compiled strings include the markdown files they originate from assert all([tmp_path.stem in entry.compiled for entry in entries]) diff --git a/tests/test_org_to_entries.py b/tests/test_org_to_entries.py index e1051d25..66371e5c 100644 --- a/tests/test_org_to_entries.py +++ b/tests/test_org_to_entries.py @@ -26,18 +26,15 @@ def test_configure_heading_entry_to_jsonl(tmp_path): for index_heading_entries in [True, False]: # Act # Extract entries into jsonl from specified Org files - jsonl_string = OrgToEntries.convert_org_entries_to_jsonl( - OrgToEntries.extract_org_entries(org_files=data, index_heading_entries=index_heading_entries) - ) - jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] + entries = OrgToEntries.extract_org_entries(org_files=data, index_heading_entries=index_heading_entries) # Assert if index_heading_entries: # Entry with empty body indexed when index_heading_entries set to True - assert len(jsonl_data) == 1 + assert len(entries) == 1 else: # Entry with empty body ignored when index_heading_entries set to False - assert is_none_or_empty(jsonl_data) + assert is_none_or_empty(entries) def test_entry_split_when_exceeds_max_words(): @@ -58,15 +55,12 @@ def test_entry_split_when_exceeds_max_words(): entries = OrgToEntries.extract_org_entries(org_files=data) # Split each entry from specified Org files by max words - jsonl_string = OrgToEntries.convert_org_entries_to_jsonl( - TextToEntries.split_entries_by_max_tokens(entries, max_tokens=4) - ) - jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] + entries = TextToEntries.split_entries_by_max_tokens(entries, max_tokens=4) # Assert - assert len(jsonl_data) == 2 + assert len(entries) == 2 # Ensure compiled entries split by max_words start with entry heading (for search context) - assert all([entry["compiled"].startswith(expected_heading) for entry in jsonl_data]) + assert all([entry.compiled.startswith(expected_heading) for entry in entries]) def test_entry_split_drops_large_words(): @@ -103,11 +97,10 @@ def test_entry_with_body_to_jsonl(tmp_path): # Act # Extract Entries from specified Org files - jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(OrgToEntries.extract_org_entries(org_files=data)) - jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] + entries = OrgToEntries.extract_org_entries(org_files=data) # Assert - assert len(jsonl_data) == 1 + assert len(entries) == 1 def test_file_with_entry_after_intro_text_to_jsonl(tmp_path): @@ -127,12 +120,8 @@ Intro text # Extract Entries from specified Org files entries = OrgToEntries.extract_org_entries(org_files=data) - # Process Each Entry from All Notes Files - jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(entries) - jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] - # Assert - assert len(jsonl_data) == 2 + assert len(entries) == 2 def test_file_with_no_headings_to_jsonl(tmp_path): @@ -150,12 +139,8 @@ def test_file_with_no_headings_to_jsonl(tmp_path): # Extract Entries from specified Org files entries = OrgToEntries.extract_org_entries(org_files=data) - # Process Each Entry from All Notes Files - jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(entries) - jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] - # Assert - assert len(jsonl_data) == 1 + assert len(entries) == 1 def test_get_org_files(tmp_path): diff --git a/tests/test_pdf_to_entries.py b/tests/test_pdf_to_entries.py index 1e3e2783..a8c6aa43 100644 --- a/tests/test_pdf_to_entries.py +++ b/tests/test_pdf_to_entries.py @@ -1,4 +1,3 @@ -import json import os from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries @@ -17,12 +16,8 @@ def test_single_page_pdf_to_jsonl(): data = {"tests/data/pdf/singlepage.pdf": pdf_bytes} entries = PdfToEntries.extract_pdf_entries(pdf_files=data) - # Process Each Entry from All Pdf Files - jsonl_string = PdfToEntries.convert_pdf_maps_to_jsonl(entries) - jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] - # Assert - assert len(jsonl_data) == 1 + assert len(entries) == 1 def test_multi_page_pdf_to_jsonl(): @@ -35,12 +30,8 @@ def test_multi_page_pdf_to_jsonl(): data = {"tests/data/pdf/multipage.pdf": pdf_bytes} entries = PdfToEntries.extract_pdf_entries(pdf_files=data) - # Process Each Entry from All Pdf Files - jsonl_string = PdfToEntries.convert_pdf_maps_to_jsonl(entries) - jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] - # Assert - assert len(jsonl_data) == 6 + assert len(entries) == 6 def test_ocr_page_pdf_to_jsonl(): diff --git a/tests/test_plaintext_to_entries.py b/tests/test_plaintext_to_entries.py index 679892dc..41d841fc 100644 --- a/tests/test_plaintext_to_entries.py +++ b/tests/test_plaintext_to_entries.py @@ -1,4 +1,3 @@ -import json import os from pathlib import Path @@ -31,16 +30,12 @@ def test_plaintext_file(tmp_path): for entry in entries: entry.file = str(Path(entry.file).absolute()) - # Process Each Entry from All Notes Files - jsonl_string = PlaintextToEntries.convert_entries_to_jsonl(entries) - jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] - # Assert - assert len(jsonl_data) == 1 + assert len(entries) == 1 # Ensure raw entry with no headings do not get heading prefix prepended - assert not jsonl_data[0]["raw"].startswith("#") + assert not entries[0].raw.startswith("#") # Ensure compiled entry has filename prepended as top level heading - assert jsonl_data[0]["compiled"] == f"{filename}\n{raw_entry}" + assert entries[0].compiled == f"{filename}\n{raw_entry}" def test_get_plaintext_files(tmp_path): From 86575b29467f2f1a928f5e022b5213c5bfe6aa4a Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Mon, 29 Jan 2024 05:03:29 +0530 Subject: [PATCH 03/17] Chunk text in preference order of para, sentence, word, character - Previous simplistic chunking strategy of splitting text by space didn't capture notes with newlines, no spaces. For e.g in #620 - New strategy will try chunk the text at more natural points like paragraph, sentence, word first. If none of those work it'll split at character to fit within max token limit - Drop long words while preserving original delimiters Resolves #620 --- src/khoj/processor/content/text_to_entries.py | 53 ++++++++++++++----- tests/test_org_to_entries.py | 6 +-- tests/test_text_search.py | 4 +- 3 files changed, 46 insertions(+), 17 deletions(-) diff --git a/src/khoj/processor/content/text_to_entries.py b/src/khoj/processor/content/text_to_entries.py index 8ebc6604..8b7df3e9 100644 --- a/src/khoj/processor/content/text_to_entries.py +++ b/src/khoj/processor/content/text_to_entries.py @@ -1,10 +1,12 @@ import hashlib import logging +import re import uuid from abc import ABC, abstractmethod from itertools import repeat from typing import Any, Callable, List, Set, Tuple +from langchain.text_splitter import RecursiveCharacterTextSplitter from tqdm import tqdm from khoj.database.adapters import EntryAdapters, get_user_search_model_or_default @@ -34,6 +36,27 @@ class TextToEntries(ABC): def hash_func(key: str) -> Callable: return lambda entry: hashlib.md5(bytes(getattr(entry, key), encoding="utf-8")).hexdigest() + @staticmethod + def remove_long_words(text: str, max_word_length: int = 500) -> str: + "Remove words longer than max_word_length from text." + # Split the string by words, keeping the delimiters + splits = re.split(r"(\s+)", text) + [""] + words_with_delimiters = list(zip(splits[::2], splits[1::2])) + + # Filter out long words while preserving delimiters in text + filtered_text = [ + f"{word}{delimiter}" + for word, delimiter in words_with_delimiters + if not word.strip() or len(word.strip()) <= max_word_length + ] + + return "".join(filtered_text) + + @staticmethod + def tokenizer(text: str) -> List[str]: + "Tokenize text into words." + return text.split() + @staticmethod def split_entries_by_max_tokens( entries: List[Entry], max_tokens: int = 256, max_word_length: int = 500 @@ -44,24 +67,30 @@ class TextToEntries(ABC): if is_none_or_empty(entry.compiled): continue - # Split entry into words - compiled_entry_words = [word for word in entry.compiled.split(" ") if word != ""] - - # Drop long words instead of having entry truncated to maintain quality of entry processed by models - compiled_entry_words = [word for word in compiled_entry_words if len(word) <= max_word_length] + # Split entry into chunks of max_tokens + # Use chunking preference order: paragraphs > sentences > words > characters + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=max_tokens, + separators=["\n\n", "\n", "!", "?", ".", " ", "\t", ""], + keep_separator=True, + length_function=lambda chunk: len(TextToEntries.tokenizer(chunk)), + chunk_overlap=0, + ) + chunked_entry_chunks = text_splitter.split_text(entry.compiled) corpus_id = uuid.uuid4() - # Split entry into chunks of max tokens - for chunk_index in range(0, len(compiled_entry_words), max_tokens): - compiled_entry_words_chunk = compiled_entry_words[chunk_index : chunk_index + max_tokens] - compiled_entry_chunk = " ".join(compiled_entry_words_chunk) - + # Create heading prefixed entry from each chunk + for chunk_index, compiled_entry_chunk in enumerate(chunked_entry_chunks): # Prepend heading to all other chunks, the first chunk already has heading from original entry - if chunk_index > 0: + if chunk_index > 0 and entry.heading: # Snip heading to avoid crossing max_tokens limit # Keep last 100 characters of heading as entry heading more important than filename snipped_heading = entry.heading[-100:] - compiled_entry_chunk = f"{snipped_heading}.\n{compiled_entry_chunk}" + # Prepend snipped heading + compiled_entry_chunk = f"{snipped_heading}\n{compiled_entry_chunk}" + + # Drop long words instead of having entry truncated to maintain quality of entry processed by models + compiled_entry_chunk = TextToEntries.remove_long_words(compiled_entry_chunk, max_word_length) # Clean entry of unwanted characters like \0 character compiled_entry_chunk = TextToEntries.clean_field(compiled_entry_chunk) diff --git a/tests/test_org_to_entries.py b/tests/test_org_to_entries.py index 66371e5c..b97e62e9 100644 --- a/tests/test_org_to_entries.py +++ b/tests/test_org_to_entries.py @@ -54,12 +54,12 @@ def test_entry_split_when_exceeds_max_words(): # Extract Entries from specified Org files entries = OrgToEntries.extract_org_entries(org_files=data) - # Split each entry from specified Org files by max words - entries = TextToEntries.split_entries_by_max_tokens(entries, max_tokens=4) + # Split each entry from specified Org files by max tokens + entries = TextToEntries.split_entries_by_max_tokens(entries, max_tokens=6) # Assert assert len(entries) == 2 - # Ensure compiled entries split by max_words start with entry heading (for search context) + # Ensure compiled entries split by max tokens start with entry heading (for search context) assert all([entry.compiled.startswith(expected_heading) for entry in entries]) diff --git a/tests/test_text_search.py b/tests/test_text_search.py index 791ce91b..9a41430f 100644 --- a/tests/test_text_search.py +++ b/tests/test_text_search.py @@ -192,7 +192,7 @@ def test_entry_chunking_by_max_tokens(org_config_with_only_new_file: LocalOrgCon # Assert assert ( - "Deleted 0 entries. Created 2 new entries for user " in caplog.records[-1].message + "Deleted 0 entries. Created 3 new entries for user " in caplog.records[-1].message ), "new entry not split by max tokens" @@ -250,7 +250,7 @@ conda activate khoj # Assert assert ( - "Deleted 0 entries. Created 2 new entries for user " in caplog.records[-1].message + "Deleted 0 entries. Created 3 new entries for user " in caplog.records[-1].message ), "new entry not split by max tokens" From d8f01876e5f26913cbbe1c8224a4c97562999fc8 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Mon, 29 Jan 2024 15:31:48 +0530 Subject: [PATCH 04/17] Add parent heading ancestory to extracted markdown entries for context Improve, update the markdown to entries extractor tests --- .../content/markdown/markdown_to_entries.py | 32 +++++++++---- tests/test_markdown_to_entries.py | 46 +++++++++++++++---- 2 files changed, 58 insertions(+), 20 deletions(-) diff --git a/src/khoj/processor/content/markdown/markdown_to_entries.py b/src/khoj/processor/content/markdown/markdown_to_entries.py index fb0fe4e5..c0fea077 100644 --- a/src/khoj/processor/content/markdown/markdown_to_entries.py +++ b/src/khoj/processor/content/markdown/markdown_to_entries.py @@ -4,11 +4,11 @@ from pathlib import Path from typing import List, Tuple import urllib3 +from langchain.text_splitter import MarkdownHeaderTextSplitter from khoj.database.models import Entry as DbEntry from khoj.database.models import KhojUser from khoj.processor.content.text_to_entries import TextToEntries -from khoj.utils.constants import empty_escape_sequences from khoj.utils.helpers import timer from khoj.utils.rawconfig import Entry @@ -76,16 +76,28 @@ class MarkdownToEntries(TextToEntries): def process_single_markdown_file( markdown_content: str, markdown_file: Path, entries: List[str], entry_to_file_map: List[Tuple[str, Path]] ): - markdown_heading_regex = r"^#" - + headers_to_split_on = [("#", "1"), ("##", "2"), ("###", "3"), ("####", "4"), ("#####", "5"), ("######", "6")] + reversed_headers_to_split_on = list(reversed(headers_to_split_on)) markdown_entries_per_file: List[str] = [] - any_headings = re.search(markdown_heading_regex, markdown_content, flags=re.MULTILINE) - for entry in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE): - # Add heading level as the regex split removed it from entries with headings - prefix = "#" if entry.startswith("#") else "# " if any_headings else "" - stripped_entry = entry.strip(empty_escape_sequences) - if stripped_entry != "": - markdown_entries_per_file.append(f"{prefix}{stripped_entry}") + previous_section_metadata, current_section_metadata = None, None + + splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False, return_each_line=True) + for section in splitter.split_text(markdown_content): + current_section_metadata = section.metadata.copy() + # Append the section's content to the last entry if the metadata is the same + if previous_section_metadata == current_section_metadata: + markdown_entries_per_file[-1] = f"{markdown_entries_per_file[-1]}\n{section.page_content}" + # Insert new entry with it's heading ancestry, if the section is under a new heading + else: + # Drop the current heading from the metadata. It is already in the section content + if section.metadata: + section.metadata.pop(max(section.metadata)) + # Prepend the markdown section's heading ancestry + for heading in reversed_headers_to_split_on: + if heading[1] in section.metadata: + section.page_content = f"{heading[0]} {section.metadata[heading[1]]}\n{section.page_content}" + previous_section_metadata = current_section_metadata + markdown_entries_per_file += [section.page_content] entry_to_file_map += zip(markdown_entries_per_file, [markdown_file] * len(markdown_entries_per_file)) entries.extend(markdown_entries_per_file) diff --git a/tests/test_markdown_to_entries.py b/tests/test_markdown_to_entries.py index 8a086dbc..174c6c4d 100644 --- a/tests/test_markdown_to_entries.py +++ b/tests/test_markdown_to_entries.py @@ -1,4 +1,3 @@ -import json import os from pathlib import Path @@ -7,8 +6,8 @@ from khoj.utils.fs_syncer import get_markdown_files from khoj.utils.rawconfig import TextContentConfig -def test_markdown_file_with_no_headings_to_jsonl(tmp_path): - "Convert files with no heading to jsonl." +def test_extract_markdown_with_no_headings(tmp_path): + "Convert markdown file with no heading to entry format." # Arrange entry = f""" - Bullet point 1 @@ -33,8 +32,8 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path): assert str(tmp_path) in entries[0].compiled -def test_single_markdown_entry_to_jsonl(tmp_path): - "Convert markdown entry from single file to jsonl." +def test_extract_single_markdown_entry(tmp_path): + "Convert markdown from single file to entry format." # Arrange entry = f"""### Heading \t\r @@ -52,8 +51,8 @@ def test_single_markdown_entry_to_jsonl(tmp_path): assert len(entries) == 1 -def test_multiple_markdown_entries_to_jsonl(tmp_path): - "Convert multiple markdown entries from single file to jsonl." +def test_extract_multiple_markdown_entries(tmp_path): + "Convert multiple markdown from single file to entry format." # Arrange entry = f""" ### Heading 1 @@ -119,7 +118,8 @@ def test_extract_entries_with_different_level_headings(tmp_path): # Arrange entry = f""" # Heading 1 -## Heading 2 +## Sub-Heading 1.1 +# Heading 2 """ data = { f"{tmp_path}": entry, @@ -130,9 +130,35 @@ def test_extract_entries_with_different_level_headings(tmp_path): entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data) # Assert - assert len(entries) == 2 + assert len(entries) == 3 assert entries[0].raw == "# Heading 1" - assert entries[1].raw == "## Heading 2" + assert entries[1].raw == "# Heading 1\n## Sub-Heading 1.1", "Ensure entry includes heading ancestory" + assert entries[2].raw == "# Heading 2" + + +def test_extract_entries_with_text_before_headings(tmp_path): + "Extract markdown entries with some text before any headings." + # Arrange + entry = f""" +Text before headings +# Heading 1 +body line 1 +## Heading 2 +body line 2 +""" + data = { + f"{tmp_path}": entry, + } + + # Act + # Extract Entries from specified Markdown files + entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=3) + + # Assert + assert len(entries) == 3 + assert entries[0].raw == "Text before headings" + assert entries[1].raw == "# Heading 1\nbody line 1" + assert entries[2].raw == "# Heading 1\n## Heading 2\nbody line 2", "Ensure raw entry includes heading ancestory" # Helper Functions From 982ac1859cbd00cebd841b26d447016e2b7da0a8 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 10 Feb 2024 14:34:09 +0530 Subject: [PATCH 05/17] Parse markdown file as single entry if it fits with max token limits These changes improve entry context available to the search model Specifically this should improve entry context from short knowledge trees, that is knowledge bases with small files Previously we split all markdown files by their headings, even if the file was small enough to fit entirely within the max token limits of the search model. This used to reduce the context available to select the appropriate entries for a given query for the search model, especially from short knowledge trees --- .../content/markdown/markdown_to_entries.py | 20 +++++++++---- tests/test_markdown_to_entries.py | 30 ++++++++++++++++--- 2 files changed, 41 insertions(+), 9 deletions(-) diff --git a/src/khoj/processor/content/markdown/markdown_to_entries.py b/src/khoj/processor/content/markdown/markdown_to_entries.py index c0fea077..986d4911 100644 --- a/src/khoj/processor/content/markdown/markdown_to_entries.py +++ b/src/khoj/processor/content/markdown/markdown_to_entries.py @@ -31,13 +31,14 @@ class MarkdownToEntries(TextToEntries): else: deletion_file_names = None + max_tokens = 256 # Extract Entries from specified Markdown files with timer("Extract entries from specified Markdown files", logger): - current_entries = MarkdownToEntries.extract_markdown_entries(files) + current_entries = MarkdownToEntries.extract_markdown_entries(files, max_tokens) # Split entries by max tokens supported by model with timer("Split entries by max token size supported by model", logger): - current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256) + current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens) # Identify, mark and merge any new entries with previous entries with timer("Identify new or updated entries", logger): @@ -55,7 +56,7 @@ class MarkdownToEntries(TextToEntries): return num_new_embeddings, num_deleted_embeddings @staticmethod - def extract_markdown_entries(markdown_files) -> List[Entry]: + def extract_markdown_entries(markdown_files, max_tokens=256) -> List[Entry]: "Extract entries by heading from specified Markdown files" entries: List[str] = [] entry_to_file_map: List[Tuple[str, Path]] = [] @@ -63,7 +64,7 @@ class MarkdownToEntries(TextToEntries): try: markdown_content = markdown_files[markdown_file] entries, entry_to_file_map = MarkdownToEntries.process_single_markdown_file( - markdown_content, markdown_file, entries, entry_to_file_map + markdown_content, markdown_file, entries, entry_to_file_map, max_tokens ) except Exception as e: logger.warning( @@ -74,8 +75,17 @@ class MarkdownToEntries(TextToEntries): @staticmethod def process_single_markdown_file( - markdown_content: str, markdown_file: Path, entries: List[str], entry_to_file_map: List[Tuple[str, Path]] + markdown_content: str, + markdown_file: Path, + entries: List[str], + entry_to_file_map: List[Tuple[str, Path]], + max_tokens=256, ): + if len(TextToEntries.tokenizer(markdown_content)) <= max_tokens: + entry_to_file_map += [(markdown_content, markdown_file)] + entries.extend([markdown_content]) + return entries, entry_to_file_map + headers_to_split_on = [("#", "1"), ("##", "2"), ("###", "3"), ("####", "4"), ("#####", "5"), ("######", "6")] reversed_headers_to_split_on = list(reversed(headers_to_split_on)) markdown_entries_per_file: List[str] = [] diff --git a/tests/test_markdown_to_entries.py b/tests/test_markdown_to_entries.py index 174c6c4d..18d43791 100644 --- a/tests/test_markdown_to_entries.py +++ b/tests/test_markdown_to_entries.py @@ -20,7 +20,7 @@ def test_extract_markdown_with_no_headings(tmp_path): # Act # Extract Entries from specified Markdown files - entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data) + entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=3) # Assert assert len(entries) == 1 @@ -45,7 +45,7 @@ def test_extract_single_markdown_entry(tmp_path): # Act # Extract Entries from specified Markdown files - entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data) + entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=3) # Assert assert len(entries) == 1 @@ -68,7 +68,7 @@ def test_extract_multiple_markdown_entries(tmp_path): # Act # Extract Entries from specified Markdown files - entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data) + entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=3) # Assert assert len(entries) == 2 @@ -127,7 +127,7 @@ def test_extract_entries_with_different_level_headings(tmp_path): # Act # Extract Entries from specified Markdown files - entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data) + entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=3) # Assert assert len(entries) == 3 @@ -161,6 +161,28 @@ body line 2 assert entries[2].raw == "# Heading 1\n## Heading 2\nbody line 2", "Ensure raw entry includes heading ancestory" +def test_parse_markdown_file_into_single_entry_if_small(tmp_path): + "Parse markdown file into single entry if it fits within the token limits." + # Arrange + entry = f""" +# Heading 1 +body line 1 +## Subheading 1.1 +body line 1.1 +""" + data = { + f"{tmp_path}": entry, + } + + # Act + # Extract Entries from specified Markdown files + entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=12) + + # Assert + assert len(entries) == 1 + assert entries[0].raw == entry + + # Helper Functions def create_file(tmp_path: Path, entry=None, filename="test.md"): markdown_file = tmp_path / filename From db2581459f5e8ae5bfa35de95830bf78367e1a7d Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 10 Feb 2024 23:03:30 +0530 Subject: [PATCH 06/17] Parse markdown parent entries as single entry if fit within max tokens These changes improve context available to the search model. Specifically this should improve entry context from short knowledge trees, that is knowledge bases with sparse, short heading/entry trees Previously we'd always split markdown files by headings, even if a parent entry was small enough to fit entirely within the max token limits of the search model. This used to reduce the context available to the search model to select appropriate entries for a query, especially from short entry trees Revert back to using regex to parse through markdown file instead of using MarkdownHeaderTextSplitter. It was easier to implement the logical split using regexes rather than bend MarkdowHeaderTextSplitter to implement it. - DFS traverse the markdown knowledge tree, prefix ancestry to each entry --- .../content/markdown/markdown_to_entries.py | 70 ++++--- tests/test_markdown_to_entries.py | 195 +++++++++++------- 2 files changed, 168 insertions(+), 97 deletions(-) diff --git a/src/khoj/processor/content/markdown/markdown_to_entries.py b/src/khoj/processor/content/markdown/markdown_to_entries.py index 986d4911..e36d5ee1 100644 --- a/src/khoj/processor/content/markdown/markdown_to_entries.py +++ b/src/khoj/processor/content/markdown/markdown_to_entries.py @@ -1,10 +1,9 @@ import logging import re from pathlib import Path -from typing import List, Tuple +from typing import Dict, List, Tuple import urllib3 -from langchain.text_splitter import MarkdownHeaderTextSplitter from khoj.database.models import Entry as DbEntry from khoj.database.models import KhojUser @@ -80,37 +79,54 @@ class MarkdownToEntries(TextToEntries): entries: List[str], entry_to_file_map: List[Tuple[str, Path]], max_tokens=256, + ancestry: Dict[int, str] = {}, ): - if len(TextToEntries.tokenizer(markdown_content)) <= max_tokens: - entry_to_file_map += [(markdown_content, markdown_file)] - entries.extend([markdown_content]) + # Prepend the markdown section's heading ancestry + ancestry_string = "\n".join([f"{'#' * key} {ancestry[key]}" for key in sorted(ancestry.keys())]) + markdown_content_with_ancestry = f"{ancestry_string}{markdown_content}" + + # If content is small or content has no children headings, save it as a single entry + if len(TextToEntries.tokenizer(markdown_content_with_ancestry)) <= max_tokens or not re.search( + rf"^#{{{len(ancestry)+1},}}\s", markdown_content, re.MULTILINE + ): + entry_to_file_map += [(markdown_content_with_ancestry, markdown_file)] + entries.extend([markdown_content_with_ancestry]) return entries, entry_to_file_map - headers_to_split_on = [("#", "1"), ("##", "2"), ("###", "3"), ("####", "4"), ("#####", "5"), ("######", "6")] - reversed_headers_to_split_on = list(reversed(headers_to_split_on)) - markdown_entries_per_file: List[str] = [] - previous_section_metadata, current_section_metadata = None, None + # Split by next heading level present in the entry + next_heading_level = len(ancestry) + sections: List[str] = [] + while len(sections) < 2: + next_heading_level += 1 + sections = re.split(rf"(\n|^)(?=[#]{{{next_heading_level}}} .+\n?)", markdown_content, re.MULTILINE) - splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False, return_each_line=True) - for section in splitter.split_text(markdown_content): - current_section_metadata = section.metadata.copy() - # Append the section's content to the last entry if the metadata is the same - if previous_section_metadata == current_section_metadata: - markdown_entries_per_file[-1] = f"{markdown_entries_per_file[-1]}\n{section.page_content}" - # Insert new entry with it's heading ancestry, if the section is under a new heading + for section in sections: + # Skip empty sections + if section.strip() == "": + continue + + # Extract the section body and (when present) the heading + current_ancestry = ancestry.copy() + first_line = [line for line in section.split("\n") if line.strip() != ""][0] + if re.search(rf"^#{{{next_heading_level}}} ", first_line): + # Extract the section body without the heading + current_section_body = "\n".join(section.split(first_line)[1:]) + # Parse the section heading into current section ancestry + current_section_title = first_line[next_heading_level:].strip() + current_ancestry[next_heading_level] = current_section_title else: - # Drop the current heading from the metadata. It is already in the section content - if section.metadata: - section.metadata.pop(max(section.metadata)) - # Prepend the markdown section's heading ancestry - for heading in reversed_headers_to_split_on: - if heading[1] in section.metadata: - section.page_content = f"{heading[0]} {section.metadata[heading[1]]}\n{section.page_content}" - previous_section_metadata = current_section_metadata - markdown_entries_per_file += [section.page_content] + current_section_body = section + + # Recurse down children of the current entry + MarkdownToEntries.process_single_markdown_file( + current_section_body, + markdown_file, + entries, + entry_to_file_map, + max_tokens, + current_ancestry, + ) - entry_to_file_map += zip(markdown_entries_per_file, [markdown_file] * len(markdown_entries_per_file)) - entries.extend(markdown_entries_per_file) return entries, entry_to_file_map @staticmethod diff --git a/tests/test_markdown_to_entries.py b/tests/test_markdown_to_entries.py index 18d43791..68b6589e 100644 --- a/tests/test_markdown_to_entries.py +++ b/tests/test_markdown_to_entries.py @@ -76,6 +76,131 @@ def test_extract_multiple_markdown_entries(tmp_path): assert all([tmp_path.stem in entry.compiled for entry in entries]) +def test_extract_entries_with_different_level_headings(tmp_path): + "Extract markdown entries with different level headings." + # Arrange + entry = f""" +# Heading 1 +## Sub-Heading 1.1 +# Heading 2 +""" + data = { + f"{tmp_path}": entry, + } + + # Act + # Extract Entries from specified Markdown files + entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=3) + + # Assert + assert len(entries) == 2 + assert entries[0].raw == "# Heading 1\n## Sub-Heading 1.1", "Ensure entry includes heading ancestory" + assert entries[1].raw == "# Heading 2\n" + + +def test_extract_entries_with_non_incremental_heading_levels(tmp_path): + "Extract markdown entries when deeper child level before shallower child level." + # Arrange + entry = f""" +# Heading 1 +#### Sub-Heading 1.1 +## Sub-Heading 1.2 +# Heading 2 +""" + data = { + f"{tmp_path}": entry, + } + + # Act + # Extract Entries from specified Markdown files + entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=3) + + # Assert + assert len(entries) == 3 + assert entries[0].raw == "# Heading 1\n#### Sub-Heading 1.1", "Ensure entry includes heading ancestory" + assert entries[1].raw == "# Heading 1\n## Sub-Heading 1.2", "Ensure entry includes heading ancestory" + assert entries[2].raw == "# Heading 2\n" + + +def test_extract_entries_with_text_before_headings(tmp_path): + "Extract markdown entries with some text before any headings." + # Arrange + entry = f""" +Text before headings +# Heading 1 +body line 1 +## Heading 2 +body line 2 +""" + data = { + f"{tmp_path}": entry, + } + + # Act + # Extract Entries from specified Markdown files + entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=3) + + # Assert + assert len(entries) == 3 + assert entries[0].raw == "\nText before headings" + assert entries[1].raw == "# Heading 1\nbody line 1" + assert entries[2].raw == "# Heading 1\n## Heading 2\nbody line 2\n", "Ensure raw entry includes heading ancestory" + + +def test_parse_markdown_file_into_single_entry_if_small(tmp_path): + "Parse markdown file into single entry if it fits within the token limits." + # Arrange + entry = f""" +# Heading 1 +body line 1 +## Subheading 1.1 +body line 1.1 +""" + data = { + f"{tmp_path}": entry, + } + + # Act + # Extract Entries from specified Markdown files + entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=12) + + # Assert + assert len(entries) == 1 + assert entries[0].raw == entry + + +def test_parse_markdown_entry_with_children_as_single_entry_if_small(tmp_path): + "Parse markdown entry with child headings as single entry if it fits within the tokens limits." + # Arrange + entry = f""" +# Heading 1 +body line 1 +## Subheading 1.1 +body line 1.1 +# Heading 2 +body line 2 +## Subheading 2.1 +longer body line 2.1 +""" + data = { + f"{tmp_path}": entry, + } + + # Act + # Extract Entries from specified Markdown files + entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=12) + + # Assert + assert len(entries) == 3 + assert ( + entries[0].raw == "# Heading 1\nbody line 1\n## Subheading 1.1\nbody line 1.1" + ), "First entry includes children headings" + assert entries[1].raw == "# Heading 2\nbody line 2", "Second entry does not include children headings" + assert ( + entries[2].raw == "# Heading 2\n## Subheading 2.1\nlonger body line 2.1\n" + ), "Third entry is second entries child heading" + + def test_get_markdown_files(tmp_path): "Ensure Markdown files specified via input-filter, input-files extracted" # Arrange @@ -113,76 +238,6 @@ def test_get_markdown_files(tmp_path): assert set(extracted_org_files.keys()) == expected_files -def test_extract_entries_with_different_level_headings(tmp_path): - "Extract markdown entries with different level headings." - # Arrange - entry = f""" -# Heading 1 -## Sub-Heading 1.1 -# Heading 2 -""" - data = { - f"{tmp_path}": entry, - } - - # Act - # Extract Entries from specified Markdown files - entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=3) - - # Assert - assert len(entries) == 3 - assert entries[0].raw == "# Heading 1" - assert entries[1].raw == "# Heading 1\n## Sub-Heading 1.1", "Ensure entry includes heading ancestory" - assert entries[2].raw == "# Heading 2" - - -def test_extract_entries_with_text_before_headings(tmp_path): - "Extract markdown entries with some text before any headings." - # Arrange - entry = f""" -Text before headings -# Heading 1 -body line 1 -## Heading 2 -body line 2 -""" - data = { - f"{tmp_path}": entry, - } - - # Act - # Extract Entries from specified Markdown files - entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=3) - - # Assert - assert len(entries) == 3 - assert entries[0].raw == "Text before headings" - assert entries[1].raw == "# Heading 1\nbody line 1" - assert entries[2].raw == "# Heading 1\n## Heading 2\nbody line 2", "Ensure raw entry includes heading ancestory" - - -def test_parse_markdown_file_into_single_entry_if_small(tmp_path): - "Parse markdown file into single entry if it fits within the token limits." - # Arrange - entry = f""" -# Heading 1 -body line 1 -## Subheading 1.1 -body line 1.1 -""" - data = { - f"{tmp_path}": entry, - } - - # Act - # Extract Entries from specified Markdown files - entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data, max_tokens=12) - - # Assert - assert len(entries) == 1 - assert entries[0].raw == entry - - # Helper Functions def create_file(tmp_path: Path, entry=None, filename="test.md"): markdown_file = tmp_path / filename From 44eab74888f29a52fa9a4a53f057cb3896b8b3bc Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 11 Feb 2024 00:34:04 +0530 Subject: [PATCH 07/17] Dedupe code by using single func to process an org file into entries Add type hints to orgnode and org-to-entries packages --- .../content/org_mode/org_to_entries.py | 37 ++++++++++--------- .../processor/content/org_mode/orgnode.py | 26 ++++++------- tests/test_org_to_entries.py | 18 +++++---- 3 files changed, 42 insertions(+), 39 deletions(-) diff --git a/src/khoj/processor/content/org_mode/org_to_entries.py b/src/khoj/processor/content/org_mode/org_to_entries.py index 60226946..06a3c453 100644 --- a/src/khoj/processor/content/org_mode/org_to_entries.py +++ b/src/khoj/processor/content/org_mode/org_to_entries.py @@ -1,10 +1,11 @@ import logging from pathlib import Path -from typing import Iterable, List, Tuple +from typing import Dict, List, Tuple from khoj.database.models import Entry as DbEntry from khoj.database.models import KhojUser from khoj.processor.content.org_mode import orgnode +from khoj.processor.content.org_mode.orgnode import Orgnode from khoj.processor.content.text_to_entries import TextToEntries from khoj.utils import state from khoj.utils.helpers import timer @@ -51,7 +52,7 @@ class OrgToEntries(TextToEntries): return num_new_embeddings, num_deleted_embeddings @staticmethod - def extract_org_entries(org_files: dict[str, str], index_heading_entries: bool = False): + def extract_org_entries(org_files: dict[str, str], index_heading_entries: bool = False) -> List[Entry]: "Extract entries from specified Org files" with timer("Parse entries from org files into OrgNode objects", logger): entry_nodes, file_to_entries = OrgToEntries.extract_org_nodes(org_files) @@ -60,35 +61,35 @@ class OrgToEntries(TextToEntries): return OrgToEntries.convert_org_nodes_to_entries(entry_nodes, file_to_entries, index_heading_entries) @staticmethod - def extract_org_nodes(org_files: dict[str, str]): + def extract_org_nodes(org_files: dict[str, str]) -> Tuple[List[Orgnode], Dict[Orgnode, str]]: "Extract org nodes from specified org files" - entry_nodes = [] - entry_to_file_map: List[Tuple[orgnode.Orgnode, str]] = [] + entry_nodes: List[Orgnode] = [] + entry_to_file_map: List[Tuple[Orgnode, str]] = [] for org_file in org_files: - filename = org_file - file = org_files[org_file] - try: - org_file_entries = orgnode.makelist(file, filename) - entry_to_file_map += zip(org_file_entries, [org_file] * len(org_file_entries)) - entry_nodes.extend(org_file_entries) - except Exception as e: - logger.warning(f"Unable to process file: {org_file}. This file will not be indexed.") - logger.warning(e, exc_info=True) + org_content = org_files[org_file] + entry_nodes, entry_to_file_map = OrgToEntries.process_single_org_file( + org_content, org_file, entry_nodes, entry_to_file_map + ) return entry_nodes, dict(entry_to_file_map) @staticmethod - def process_single_org_file(org_content: str, org_file: str, entries: List, entry_to_file_map: List): + def process_single_org_file( + org_content: str, + org_file: str, + entries: List[Orgnode], + entry_to_file_map: List[Tuple[Orgnode, str]], + ) -> Tuple[List[Orgnode], List[Tuple[Orgnode, str]]]: # Process single org file. The org parser assumes that the file is a single org file and reads it from a buffer. # We'll split the raw content of this file by new line to mimic the same behavior. try: org_file_entries = orgnode.makelist(org_content, org_file) entry_to_file_map += zip(org_file_entries, [org_file] * len(org_file_entries)) entries.extend(org_file_entries) - return entries, entry_to_file_map except Exception as e: - logger.error(f"Error processing file: {org_file} with error: {e}", exc_info=True) - return entries, entry_to_file_map + logger.error(f"Unable to process file: {org_file}. Skipped indexing it.\nError; {e}", exc_info=True) + + return entries, entry_to_file_map @staticmethod def convert_org_nodes_to_entries( diff --git a/src/khoj/processor/content/org_mode/orgnode.py b/src/khoj/processor/content/org_mode/orgnode.py index 0c9bcb6e..63d36b70 100644 --- a/src/khoj/processor/content/org_mode/orgnode.py +++ b/src/khoj/processor/content/org_mode/orgnode.py @@ -37,7 +37,7 @@ import datetime import re from os.path import relpath from pathlib import Path -from typing import List +from typing import Dict, List, Tuple indent_regex = re.compile(r"^ *") @@ -58,7 +58,7 @@ def makelist_with_filepath(filename): return makelist(f, filename) -def makelist(file, filename): +def makelist(file, filename) -> List["Orgnode"]: """ Read an org-mode file and return a list of Orgnode objects created from this file. @@ -80,16 +80,16 @@ def makelist(file, filename): } # populated from #+SEQ_TODO line level = "" heading = "" - ancestor_headings = [] + ancestor_headings: List[str] = [] bodytext = "" introtext = "" - tags = list() # set of all tags in headline - closed_date = "" - sched_date = "" - deadline_date = "" - logbook = list() + tags: List[str] = list() # set of all tags in headline + closed_date: datetime.date = None + sched_date: datetime.date = None + deadline_date: datetime.date = None + logbook: List[Tuple[datetime.datetime, datetime.datetime]] = list() nodelist: List[Orgnode] = list() - property_map = dict() + property_map: Dict[str, str] = dict() in_properties_drawer = False in_logbook_drawer = False file_title = f"{filename}" @@ -102,13 +102,13 @@ def makelist(file, filename): thisNode = Orgnode(level, heading, bodytext, tags, ancestor_headings) if closed_date: thisNode.closed = closed_date - closed_date = "" + closed_date = None if sched_date: thisNode.scheduled = sched_date - sched_date = "" + sched_date = None if deadline_date: thisNode.deadline = deadline_date - deadline_date = "" + deadline_date = None if logbook: thisNode.logbook = logbook logbook = list() @@ -116,7 +116,7 @@ def makelist(file, filename): nodelist.append(thisNode) property_map = {"LINE": f"file:{normalize_filename(filename)}::{ctr}"} previous_level = level - previous_heading = heading + previous_heading: str = heading level = heading_search.group(1) heading = heading_search.group(2) bodytext = "" diff --git a/tests/test_org_to_entries.py b/tests/test_org_to_entries.py index b97e62e9..a80ff2e1 100644 --- a/tests/test_org_to_entries.py +++ b/tests/test_org_to_entries.py @@ -37,8 +37,8 @@ def test_configure_heading_entry_to_jsonl(tmp_path): assert is_none_or_empty(entries) -def test_entry_split_when_exceeds_max_words(): - "Ensure entries with compiled words exceeding max_words are split." +def test_entry_split_when_exceeds_max_tokens(): + "Ensure entries with compiled words exceeding max_tokens are split." # Arrange tmp_path = "/tmp/test.org" entry = f"""*** Heading @@ -81,7 +81,7 @@ def test_entry_split_drops_large_words(): assert len(processed_entry.compiled.split()) == len(entry_text.split()) - 1 -def test_entry_with_body_to_jsonl(tmp_path): +def test_entry_with_body_to_entry(tmp_path): "Ensure entries with valid body text are loaded." # Arrange entry = f"""*** Heading @@ -97,13 +97,13 @@ def test_entry_with_body_to_jsonl(tmp_path): # Act # Extract Entries from specified Org files - entries = OrgToEntries.extract_org_entries(org_files=data) + entries = OrgToEntries.extract_org_entries(org_files=data, max_tokens=3) # Assert assert len(entries) == 1 -def test_file_with_entry_after_intro_text_to_jsonl(tmp_path): +def test_file_with_entry_after_intro_text_to_entry(tmp_path): "Ensure intro text before any headings is indexed." # Arrange entry = f""" @@ -188,7 +188,8 @@ def test_extract_entries_with_different_level_headings(tmp_path): # Arrange entry = f""" * Heading 1 -** Heading 2 +** Sub-Heading 1.1 +* Heading 2 """ data = { f"{tmp_path}": entry, @@ -199,9 +200,10 @@ def test_extract_entries_with_different_level_headings(tmp_path): entries = OrgToEntries.extract_org_entries(org_files=data, index_heading_entries=True) # Assert - assert len(entries) == 2 + assert len(entries) == 3 assert f"{entries[0].raw}".startswith("* Heading 1") - assert f"{entries[1].raw}".startswith("** Heading 2") + assert f"{entries[1].raw}".startswith("** Sub-Heading 1.1") + assert f"{entries[2].raw}".startswith("* Heading 2") # Helper Functions From 2ea8a832a0ac46c38aac59ede51526ec9b2dd5c5 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 11 Feb 2024 20:40:31 +0530 Subject: [PATCH 08/17] Log error when fail to index md file. Fix, improve typing in md_to_entries --- .../processor/content/markdown/markdown_to_entries.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/khoj/processor/content/markdown/markdown_to_entries.py b/src/khoj/processor/content/markdown/markdown_to_entries.py index e36d5ee1..300b543e 100644 --- a/src/khoj/processor/content/markdown/markdown_to_entries.py +++ b/src/khoj/processor/content/markdown/markdown_to_entries.py @@ -58,7 +58,7 @@ class MarkdownToEntries(TextToEntries): def extract_markdown_entries(markdown_files, max_tokens=256) -> List[Entry]: "Extract entries by heading from specified Markdown files" entries: List[str] = [] - entry_to_file_map: List[Tuple[str, Path]] = [] + entry_to_file_map: List[Tuple[str, str]] = [] for markdown_file in markdown_files: try: markdown_content = markdown_files[markdown_file] @@ -66,7 +66,7 @@ class MarkdownToEntries(TextToEntries): markdown_content, markdown_file, entries, entry_to_file_map, max_tokens ) except Exception as e: - logger.warning( + logger.error( f"Unable to process file: {markdown_file}. This file will not be indexed.\n{e}", exc_info=True ) @@ -75,12 +75,12 @@ class MarkdownToEntries(TextToEntries): @staticmethod def process_single_markdown_file( markdown_content: str, - markdown_file: Path, + markdown_file: str, entries: List[str], - entry_to_file_map: List[Tuple[str, Path]], + entry_to_file_map: List[Tuple[str, str]], max_tokens=256, ancestry: Dict[int, str] = {}, - ): + ) -> Tuple[List[str], List[Tuple[str, str]]]: # Prepend the markdown section's heading ancestry ancestry_string = "\n".join([f"{'#' * key} {ancestry[key]}" for key in sorted(ancestry.keys())]) markdown_content_with_ancestry = f"{ancestry_string}{markdown_content}" From eaa27ca841d57271b4ea9fb50f5774f6871b0215 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Mon, 12 Feb 2024 10:14:37 +0530 Subject: [PATCH 09/17] Only add spaces after heading if any tags in orgnode raw entry repr --- src/khoj/processor/content/org_mode/orgnode.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/khoj/processor/content/org_mode/orgnode.py b/src/khoj/processor/content/org_mode/orgnode.py index 63d36b70..8449cc9d 100644 --- a/src/khoj/processor/content/org_mode/orgnode.py +++ b/src/khoj/processor/content/org_mode/orgnode.py @@ -495,12 +495,13 @@ class Orgnode(object): if self._priority: n = n + "[#" + self._priority + "] " n = n + self._heading - n = "%-60s " % n # hack - tags will start in column 62 - closecolon = "" - for t in self._tags: - n = n + ":" + t - closecolon = ":" - n = n + closecolon + if self._tags: + n = "%-60s " % n # hack - tags will start in column 62 + closecolon = "" + for t in self._tags: + n = n + ":" + t + closecolon = ":" + n = n + closecolon n = n + "\n" # Get body indentation from first line of body From 44b3247869be1cc3764ba3e3cf23dd08e67b83cc Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Mon, 12 Feb 2024 11:27:35 +0530 Subject: [PATCH 10/17] Update logical splitting of org-mode text into entries - Major - Do not split org file, entry if it fits within the max token limits - Recurse down org file entries, one heading level at a time until reach leaf node or the current parent tree fits context window - Update `process_single_org_file' func logic to do this recursion - Convert extracted org nodes with children into entries - Previously org node to entry code just had to handle leaf entries - Now it recieve list of org node trees - Only add ancestor path to root org-node of each tree - Indent each entry trees headings by +1 level from base level (=2) - Minor - Stop timing org-node parsing vs org-node to entry conversion Just time the wrapping function for org-mode entry extraction This standardizes what is being timed across at md, org etc. - Move try/catch to `extract_org_nodes' from `parse_single_org_file' func to standardize this also across md, org --- .../content/org_mode/org_to_entries.py | 201 ++++++++++++------ tests/test_org_to_entries.py | 175 +++++++++++++-- 2 files changed, 303 insertions(+), 73 deletions(-) diff --git a/src/khoj/processor/content/org_mode/org_to_entries.py b/src/khoj/processor/content/org_mode/org_to_entries.py index 06a3c453..953e918d 100644 --- a/src/khoj/processor/content/org_mode/org_to_entries.py +++ b/src/khoj/processor/content/org_mode/org_to_entries.py @@ -1,4 +1,5 @@ import logging +import re from pathlib import Path from typing import Dict, List, Tuple @@ -30,11 +31,12 @@ class OrgToEntries(TextToEntries): deletion_file_names = None # Extract Entries from specified Org files + max_tokens = 256 with timer("Extract entries from specified Org files", logger): - current_entries = self.extract_org_entries(files) + current_entries = self.extract_org_entries(files, max_tokens=max_tokens) with timer("Split entries by max token size supported by model", logger): - current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256) + current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=max_tokens) # Identify, mark and merge any new entries with previous entries with timer("Identify new or updated entries", logger): @@ -52,96 +54,173 @@ class OrgToEntries(TextToEntries): return num_new_embeddings, num_deleted_embeddings @staticmethod - def extract_org_entries(org_files: dict[str, str], index_heading_entries: bool = False) -> List[Entry]: + def extract_org_entries( + org_files: dict[str, str], index_heading_entries: bool = False, max_tokens=256 + ) -> List[Entry]: "Extract entries from specified Org files" - with timer("Parse entries from org files into OrgNode objects", logger): - entry_nodes, file_to_entries = OrgToEntries.extract_org_nodes(org_files) - - with timer("Convert OrgNodes into list of entries", logger): - return OrgToEntries.convert_org_nodes_to_entries(entry_nodes, file_to_entries, index_heading_entries) + entries, entry_to_file_map = OrgToEntries.extract_org_nodes(org_files, max_tokens) + return OrgToEntries.convert_org_nodes_to_entries(entries, entry_to_file_map, index_heading_entries) @staticmethod - def extract_org_nodes(org_files: dict[str, str]) -> Tuple[List[Orgnode], Dict[Orgnode, str]]: + def extract_org_nodes(org_files: dict[str, str], max_tokens) -> Tuple[List[List[Orgnode]], Dict[Orgnode, str]]: "Extract org nodes from specified org files" - entry_nodes: List[Orgnode] = [] + entries: List[List[Orgnode]] = [] entry_to_file_map: List[Tuple[Orgnode, str]] = [] for org_file in org_files: - org_content = org_files[org_file] - entry_nodes, entry_to_file_map = OrgToEntries.process_single_org_file( - org_content, org_file, entry_nodes, entry_to_file_map - ) + try: + org_content = org_files[org_file] + entries, entry_to_file_map = OrgToEntries.process_single_org_file( + org_content, org_file, entries, entry_to_file_map, max_tokens + ) + except Exception as e: + logger.error(f"Unable to process file: {org_file}. Skipped indexing it.\nError; {e}", exc_info=True) - return entry_nodes, dict(entry_to_file_map) + return entries, dict(entry_to_file_map) @staticmethod def process_single_org_file( org_content: str, org_file: str, - entries: List[Orgnode], + entries: List[List[Orgnode]], entry_to_file_map: List[Tuple[Orgnode, str]], - ) -> Tuple[List[Orgnode], List[Tuple[Orgnode, str]]]: - # Process single org file. The org parser assumes that the file is a single org file and reads it from a buffer. - # We'll split the raw content of this file by new line to mimic the same behavior. - try: - org_file_entries = orgnode.makelist(org_content, org_file) - entry_to_file_map += zip(org_file_entries, [org_file] * len(org_file_entries)) - entries.extend(org_file_entries) - except Exception as e: - logger.error(f"Unable to process file: {org_file}. Skipped indexing it.\nError; {e}", exc_info=True) + max_tokens=256, + ancestry: Dict[int, str] = {}, + ) -> Tuple[List[List[Orgnode]], List[Tuple[Orgnode, str]]]: + """Parse org_content from org_file into OrgNode entries + + Recurse down org file entries, one heading level at a time, + until reach a leaf entry or the current entry tree fits max_tokens. + + Parse recursion terminating entry (trees) into (a list of) OrgNode objects. + """ + # Prepend the org section's heading ancestry + ancestry_string = "\n".join([f"{'*' * key} {ancestry[key]}" for key in sorted(ancestry.keys())]) + org_content_with_ancestry = f"{ancestry_string}{org_content}" + + # If content is small or content has no children headings, save it as a single entry + # Note: This is the terminating condition for this recursive function + if len(TextToEntries.tokenizer(org_content_with_ancestry)) <= max_tokens or not re.search( + rf"^\*{{{len(ancestry)+1},}}\s", org_content, re.MULTILINE + ): + orgnode_content_with_ancestry = orgnode.makelist(org_content_with_ancestry, org_file) + entry_to_file_map += zip(orgnode_content_with_ancestry, [org_file] * len(orgnode_content_with_ancestry)) + entries.extend([orgnode_content_with_ancestry]) + return entries, entry_to_file_map + + # Split this entry tree into sections by the next heading level in it + # Increment heading level until able to split entry into sections + # A successful split will result in at least 2 sections + next_heading_level = len(ancestry) + sections: List[str] = [] + while len(sections) < 2: + next_heading_level += 1 + sections = re.split(rf"(\n|^)(?=[*]{{{next_heading_level}}} .+\n?)", org_content, re.MULTILINE) + + # Recurse down each non-empty section after parsing its body, heading and ancestry + for section in sections: + # Skip empty sections + if section.strip() == "": + continue + + # Extract the section body and (when present) the heading + current_ancestry = ancestry.copy() + first_non_empty_line = [line for line in section.split("\n") if line.strip() != ""][0] + # If first non-empty line is a heading with expected heading level + if re.search(rf"^\*{{{next_heading_level}}}\s", first_non_empty_line): + # Extract the section body without the heading + current_section_body = "\n".join(section.split(first_non_empty_line)[1:]) + # Parse the section heading into current section ancestry + current_section_title = first_non_empty_line[next_heading_level:].strip() + current_ancestry[next_heading_level] = current_section_title + # Else process the section as just body text + else: + current_section_body = section + + # Recurse down children of the current entry + OrgToEntries.process_single_org_file( + current_section_body, + org_file, + entries, + entry_to_file_map, + max_tokens, + current_ancestry, + ) return entries, entry_to_file_map @staticmethod def convert_org_nodes_to_entries( - parsed_entries: List[orgnode.Orgnode], entry_to_file_map, index_heading_entries=False + parsed_entries: List[List[Orgnode]], + entry_to_file_map: Dict[Orgnode, str], + index_heading_entries: bool = False, ) -> List[Entry]: - "Convert Org-Mode nodes into list of Entry objects" + """ + Convert OrgNode lists into list of Entry objects + + Each list of OrgNodes is a parsed parent org tree or leaf node. + Convert each list of these OrgNodes into a single Entry. + """ entries: List[Entry] = [] - for parsed_entry in parsed_entries: - if not parsed_entry.hasBody and not index_heading_entries: - # Ignore title notes i.e notes with just headings and empty body - continue + for entry_group in parsed_entries: + entry_heading, entry_compiled, entry_raw = "", "", "" + for parsed_entry in entry_group: + if not parsed_entry.hasBody and not index_heading_entries: + # Ignore title notes i.e notes with just headings and empty body + continue - todo_str = f"{parsed_entry.todo} " if parsed_entry.todo else "" + todo_str = f"{parsed_entry.todo} " if parsed_entry.todo else "" - # Prepend ancestor headings, filename as top heading to entry for context - ancestors_trail = " / ".join(parsed_entry.ancestors) or Path(entry_to_file_map[parsed_entry]) - if parsed_entry.heading: - heading = f"* Path: {ancestors_trail}\n** {todo_str}{parsed_entry.heading}." - else: - heading = f"* Path: {ancestors_trail}." + # Set base level to current org-node tree's root heading level + if not entry_heading and parsed_entry.level > 0: + base_level = parsed_entry.level + # Indent entry by 1 heading level as ancestry is prepended as top level heading + heading = f"{'*' * (parsed_entry.level-base_level+2)} {todo_str}" if parsed_entry.level > 0 else "" + if parsed_entry.heading: + heading += f"{parsed_entry.heading}." - compiled = heading - if state.verbose > 2: - logger.debug(f"Title: {heading}") + # Prepend ancestor headings, filename as top heading to root parent entry for context + # Children nodes do not need ancestors trail as root parent node will have it + if not entry_heading: + ancestors_trail = " / ".join(parsed_entry.ancestors) or Path(entry_to_file_map[parsed_entry]) + heading = f"* Path: {ancestors_trail}\n{heading}" if heading else f"* Path: {ancestors_trail}." - if parsed_entry.tags: - tags_str = " ".join(parsed_entry.tags) - compiled += f"\t {tags_str}." + compiled = heading if state.verbose > 2: - logger.debug(f"Tags: {tags_str}") + logger.debug(f"Title: {heading}") - if parsed_entry.closed: - compiled += f'\n Closed on {parsed_entry.closed.strftime("%Y-%m-%d")}.' - if state.verbose > 2: - logger.debug(f'Closed: {parsed_entry.closed.strftime("%Y-%m-%d")}') + if parsed_entry.tags: + tags_str = " ".join(parsed_entry.tags) + compiled += f"\t {tags_str}." + if state.verbose > 2: + logger.debug(f"Tags: {tags_str}") - if parsed_entry.scheduled: - compiled += f'\n Scheduled for {parsed_entry.scheduled.strftime("%Y-%m-%d")}.' - if state.verbose > 2: - logger.debug(f'Scheduled: {parsed_entry.scheduled.strftime("%Y-%m-%d")}') + if parsed_entry.closed: + compiled += f'\n Closed on {parsed_entry.closed.strftime("%Y-%m-%d")}.' + if state.verbose > 2: + logger.debug(f'Closed: {parsed_entry.closed.strftime("%Y-%m-%d")}') - if parsed_entry.hasBody: - compiled += f"\n {parsed_entry.body}" - if state.verbose > 2: - logger.debug(f"Body: {parsed_entry.body}") + if parsed_entry.scheduled: + compiled += f'\n Scheduled for {parsed_entry.scheduled.strftime("%Y-%m-%d")}.' + if state.verbose > 2: + logger.debug(f'Scheduled: {parsed_entry.scheduled.strftime("%Y-%m-%d")}') - if compiled: + if parsed_entry.hasBody: + compiled += f"\n {parsed_entry.body}" + if state.verbose > 2: + logger.debug(f"Body: {parsed_entry.body}") + + # Add the sub-entry contents to the entry + entry_compiled += f"{compiled}" + entry_raw += f"{parsed_entry}" + if not entry_heading: + entry_heading = heading + + if entry_compiled: entries.append( Entry( - compiled=compiled, - raw=f"{parsed_entry}", - heading=f"{heading}", + compiled=entry_compiled, + raw=entry_raw, + heading=f"{entry_heading}", file=f"{entry_to_file_map[parsed_entry]}", ) ) diff --git a/tests/test_org_to_entries.py b/tests/test_org_to_entries.py index a80ff2e1..41c25226 100644 --- a/tests/test_org_to_entries.py +++ b/tests/test_org_to_entries.py @@ -1,5 +1,5 @@ -import json import os +import re from khoj.processor.content.org_mode.org_to_entries import OrgToEntries from khoj.processor.content.text_to_entries import TextToEntries @@ -8,7 +8,7 @@ from khoj.utils.helpers import is_none_or_empty from khoj.utils.rawconfig import Entry, TextContentConfig -def test_configure_heading_entry_to_jsonl(tmp_path): +def test_configure_indexing_heading_only_entries(tmp_path): """Ensure entries with empty body are ignored, unless explicitly configured to index heading entries. Property drawers not considered Body. Ignore control characters for evaluating if Body empty.""" # Arrange @@ -26,7 +26,9 @@ def test_configure_heading_entry_to_jsonl(tmp_path): for index_heading_entries in [True, False]: # Act # Extract entries into jsonl from specified Org files - entries = OrgToEntries.extract_org_entries(org_files=data, index_heading_entries=index_heading_entries) + entries = OrgToEntries.extract_org_entries( + org_files=data, index_heading_entries=index_heading_entries, max_tokens=3 + ) # Assert if index_heading_entries: @@ -77,10 +79,153 @@ def test_entry_split_drops_large_words(): processed_entry = TextToEntries.split_entries_by_max_tokens([entry], max_word_length=5)[0] # Assert - # "Heading" dropped from compiled version because its over the set max word limit + # (Only) "Heading" dropped from compiled version because its over the set max word limit + assert "Heading" not in processed_entry.compiled assert len(processed_entry.compiled.split()) == len(entry_text.split()) - 1 +def test_parse_org_file_into_single_entry_if_small(tmp_path): + "Parse org file into single entry if it fits within the token limits." + # Arrange + original_entry = f""" +* Heading 1 +body line 1 +** Subheading 1.1 +body line 1.1 +""" + data = { + f"{tmp_path}": original_entry, + } + expected_entry = f""" +* Heading 1 +body line 1 + +** Subheading 1.1 +body line 1.1 + +""".lstrip() + + # Act + # Extract Entries from specified Org files + extracted_entries = OrgToEntries.extract_org_entries(org_files=data, max_tokens=12) + for entry in extracted_entries: + entry.raw = clean(entry.raw) + + # Assert + assert len(extracted_entries) == 1 + assert entry.raw == expected_entry + + +def test_parse_org_entry_with_children_as_single_entry_if_small(tmp_path): + "Parse org entry with child headings as single entry only if it fits within the tokens limits." + # Arrange + entry = f""" +* Heading 1 +body line 1 +** Subheading 1.1 +body line 1.1 +* Heading 2 +body line 2 +** Subheading 2.1 +longer body line 2.1 +""" + data = { + f"{tmp_path}": entry, + } + first_expected_entry = f""" +* Path: {tmp_path} +** Heading 1. + body line 1 + +*** Subheading 1.1. + body line 1.1 + +""".lstrip() + second_expected_entry = f""" +* Path: {tmp_path} +** Heading 2. + body line 2 + +""".lstrip() + third_expected_entry = f""" +* Path: {tmp_path} / Heading 2 +** Subheading 2.1. + longer body line 2.1 + +""".lstrip() + + # Act + # Extract Entries from specified Org files + extracted_entries = OrgToEntries.extract_org_entries(org_files=data, max_tokens=12) + + # Assert + assert len(extracted_entries) == 3 + assert extracted_entries[0].compiled == first_expected_entry, "First entry includes children headings" + assert extracted_entries[1].compiled == second_expected_entry, "Second entry does not include children headings" + assert extracted_entries[2].compiled == third_expected_entry, "Third entry is second entries child heading" + + +def test_separate_sibling_org_entries_if_all_cannot_fit_in_token_limit(tmp_path): + "Parse org sibling entries as separate entries only if it fits within the tokens limits." + # Arrange + entry = f""" +* Heading 1 +body line 1 +** Subheading 1.1 +body line 1.1 +* Heading 2 +body line 2 +** Subheading 2.1 +body line 2.1 +* Heading 3 +body line 3 +** Subheading 3.1 +body line 3.1 +""" + data = { + f"{tmp_path}": entry, + } + first_expected_entry = f""" +* Path: {tmp_path} +** Heading 1. + body line 1 + +*** Subheading 1.1. + body line 1.1 + +""".lstrip() + second_expected_entry = f""" +* Path: {tmp_path} +** Heading 2. + body line 2 + +*** Subheading 2.1. + body line 2.1 + +""".lstrip() + third_expected_entry = f""" +* Path: {tmp_path} +** Heading 3. + body line 3 + +*** Subheading 3.1. + body line 3.1 + +""".lstrip() + + # Act + # Extract Entries from specified Org files + # Max tokens = 30 is in the middle of 2 entry (24 tokens) and 3 entry (36 tokens) tokens boundary + # Where each sibling entry contains 12 tokens per sibling entry * 3 entries = 36 tokens + extracted_entries = OrgToEntries.extract_org_entries(org_files=data, max_tokens=30) + + # Assert + assert len(extracted_entries) == 3 + assert extracted_entries[0].compiled == first_expected_entry, "First entry includes children headings" + assert extracted_entries[1].compiled == second_expected_entry, "Second entry includes children headings" + assert extracted_entries[2].compiled == third_expected_entry, "Third entry includes children headings" + + def test_entry_with_body_to_entry(tmp_path): "Ensure entries with valid body text are loaded." # Arrange @@ -118,13 +263,13 @@ Intro text # Act # Extract Entries from specified Org files - entries = OrgToEntries.extract_org_entries(org_files=data) + entries = OrgToEntries.extract_org_entries(org_files=data, max_tokens=3) # Assert assert len(entries) == 2 -def test_file_with_no_headings_to_jsonl(tmp_path): +def test_file_with_no_headings_to_entry(tmp_path): "Ensure files with no heading, only body text are loaded." # Arrange entry = f""" @@ -137,7 +282,7 @@ def test_file_with_no_headings_to_jsonl(tmp_path): # Act # Extract Entries from specified Org files - entries = OrgToEntries.extract_org_entries(org_files=data) + entries = OrgToEntries.extract_org_entries(org_files=data, max_tokens=3) # Assert assert len(entries) == 1 @@ -197,13 +342,14 @@ def test_extract_entries_with_different_level_headings(tmp_path): # Act # Extract Entries from specified Org files - entries = OrgToEntries.extract_org_entries(org_files=data, index_heading_entries=True) + entries = OrgToEntries.extract_org_entries(org_files=data, index_heading_entries=True, max_tokens=3) + for entry in entries: + entry.raw = clean(f"{entry.raw}") # Assert - assert len(entries) == 3 - assert f"{entries[0].raw}".startswith("* Heading 1") - assert f"{entries[1].raw}".startswith("** Sub-Heading 1.1") - assert f"{entries[2].raw}".startswith("* Heading 2") + assert len(entries) == 2 + assert entries[0].raw == "* Heading 1\n** Sub-Heading 1.1\n", "Ensure entry includes heading ancestory" + assert entries[1].raw == "* Heading 2\n" # Helper Functions @@ -213,3 +359,8 @@ def create_file(tmp_path, entry=None, filename="test.org"): if entry: org_file.write_text(entry) return org_file + + +def clean(entry): + "Remove properties from entry for easier comparison." + return re.sub(r"\n:PROPERTIES:(.*?):END:", "", entry, flags=re.DOTALL) From 720139c3c19dea041edad25b1bf50ebad50c64dc Mon Sep 17 00:00:00 2001 From: sabaimran Date: Fri, 15 Mar 2024 16:39:03 +0530 Subject: [PATCH 11/17] Fix all unit tests for test_text_search --- tests/test_text_search.py | 137 ++++++++++++++++++++------------------ 1 file changed, 71 insertions(+), 66 deletions(-) diff --git a/tests/test_text_search.py b/tests/test_text_search.py index 9a41430f..915425bf 100644 --- a/tests/test_text_search.py +++ b/tests/test_text_search.py @@ -57,18 +57,21 @@ def test_get_org_files_with_org_suffixed_dir_doesnt_raise_error(tmp_path, defaul # ---------------------------------------------------------------------------------------------------- @pytest.mark.django_db def test_text_search_setup_with_empty_file_creates_no_entries( - org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser, caplog + org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser ): # Arrange + existing_entries = Entry.objects.filter(user=default_user).count() data = get_org_files(org_config_with_only_new_file) # Act # Generate notes embeddings during asymmetric setup - with caplog.at_level(logging.INFO): - text_search.setup(OrgToEntries, data, regenerate=True, user=default_user) + text_search.setup(OrgToEntries, data, regenerate=True, user=default_user) # Assert - assert "Deleted 8 entries. Created 0 new entries for user " in caplog.records[-1].message + updated_entries = Entry.objects.filter(user=default_user).count() + + assert existing_entries == 2 + assert updated_entries == 0 verify_embeddings(0, default_user) @@ -78,6 +81,7 @@ def test_text_indexer_deletes_embedding_before_regenerate( content_config: ContentConfig, default_user: KhojUser, caplog ): # Arrange + existing_entries = Entry.objects.filter(user=default_user).count() org_config = LocalOrgConfig.objects.filter(user=default_user).first() data = get_org_files(org_config) @@ -87,30 +91,18 @@ def test_text_indexer_deletes_embedding_before_regenerate( text_search.setup(OrgToEntries, data, regenerate=True, user=default_user) # Assert + updated_entries = Entry.objects.filter(user=default_user).count() + assert existing_entries == 2 + assert updated_entries == 2 assert "Deleting all entries for file type org" in caplog.text - assert "Deleted 8 entries. Created 13 new entries for user " in caplog.records[-1].message - - -# ---------------------------------------------------------------------------------------------------- -@pytest.mark.django_db -def test_text_search_setup_batch_processes(content_config: ContentConfig, default_user: KhojUser, caplog): - # Arrange - org_config = LocalOrgConfig.objects.filter(user=default_user).first() - data = get_org_files(org_config) - - # Act - # Generate notes embeddings during asymmetric setup - with caplog.at_level(logging.DEBUG): - text_search.setup(OrgToEntries, data, regenerate=True, user=default_user) - - # Assert - assert "Deleted 8 entries. Created 13 new entries for user " in caplog.records[-1].message + assert "Deleted 2 entries. Created 2 new entries for user " in caplog.records[-1].message # ---------------------------------------------------------------------------------------------------- @pytest.mark.django_db def test_text_index_same_if_content_unchanged(content_config: ContentConfig, default_user: KhojUser, caplog): # Arrange + existing_entries = Entry.objects.filter(user=default_user) org_config = LocalOrgConfig.objects.filter(user=default_user).first() data = get_org_files(org_config) @@ -127,6 +119,10 @@ def test_text_index_same_if_content_unchanged(content_config: ContentConfig, def final_logs = caplog.text # Assert + updated_entries = Entry.objects.filter(user=default_user) + for entry in updated_entries: + assert entry in existing_entries + assert len(existing_entries) == len(updated_entries) assert "Deleting all entries for file type org" in initial_logs assert "Deleting all entries for file type org" not in final_logs @@ -256,10 +252,9 @@ conda activate khoj # ---------------------------------------------------------------------------------------------------- @pytest.mark.django_db -def test_regenerate_index_with_new_entry( - content_config: ContentConfig, new_org_file: Path, default_user: KhojUser, caplog -): +def test_regenerate_index_with_new_entry(content_config: ContentConfig, new_org_file: Path, default_user: KhojUser): # Arrange + existing_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True)) org_config = LocalOrgConfig.objects.filter(user=default_user).first() initial_data = get_org_files(org_config) @@ -271,28 +266,34 @@ def test_regenerate_index_with_new_entry( final_data = get_org_files(org_config) # Act - with caplog.at_level(logging.INFO): - text_search.setup(OrgToEntries, initial_data, regenerate=True, user=default_user) - initial_logs = caplog.text - caplog.clear() # Clear logs + text_search.setup(OrgToEntries, initial_data, regenerate=True, user=default_user) + updated_entries1 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True)) # regenerate notes jsonl, model embeddings and model to include entry from new file - with caplog.at_level(logging.INFO): - text_search.setup(OrgToEntries, final_data, regenerate=True, user=default_user) - final_logs = caplog.text + text_search.setup(OrgToEntries, final_data, regenerate=True, user=default_user) + updated_entries2 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True)) # Assert - assert "Deleted 8 entries. Created 13 new entries for user " in initial_logs - assert "Deleted 13 entries. Created 14 new entries for user " in final_logs - verify_embeddings(14, default_user) + for entry in updated_entries1: + assert entry in updated_entries2 + + assert not any([new_org_file.name in entry for entry in updated_entries1]) + assert not any([new_org_file.name in entry for entry in existing_entries]) + assert any([new_org_file.name in entry for entry in updated_entries2]) + + assert any( + ["Saw a super cute video of a chihuahua doing the Tango on Youtube" in entry for entry in updated_entries2] + ) + verify_embeddings(3, default_user) # ---------------------------------------------------------------------------------------------------- @pytest.mark.django_db def test_update_index_with_duplicate_entries_in_stable_order( - org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser, caplog + org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser ): # Arrange + existing_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True)) new_file_to_index = Path(org_config_with_only_new_file.input_files[0]) # Insert org-mode entries with same compiled form into new org file @@ -304,30 +305,33 @@ def test_update_index_with_duplicate_entries_in_stable_order( # Act # generate embeddings, entries, notes model from scratch after adding new org-mode file - with caplog.at_level(logging.INFO): - text_search.setup(OrgToEntries, data, regenerate=True, user=default_user) - initial_logs = caplog.text - caplog.clear() # Clear logs + text_search.setup(OrgToEntries, data, regenerate=True, user=default_user) + updated_entries1 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True)) data = get_org_files(org_config_with_only_new_file) # update embeddings, entries, notes model with no new changes - with caplog.at_level(logging.INFO): - text_search.setup(OrgToEntries, data, regenerate=False, user=default_user) - final_logs = caplog.text + text_search.setup(OrgToEntries, data, regenerate=False, user=default_user) + updated_entries2 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True)) # Assert # verify only 1 entry added even if there are multiple duplicate entries - assert "Deleted 8 entries. Created 1 new entries for user " in initial_logs - assert "Deleted 0 entries. Created 0 new entries for user " in final_logs + for entry in existing_entries: + assert entry not in updated_entries1 + for entry in updated_entries1: + assert entry in updated_entries2 + + assert len(existing_entries) == 2 + assert len(updated_entries1) == len(updated_entries2) verify_embeddings(1, default_user) # ---------------------------------------------------------------------------------------------------- @pytest.mark.django_db -def test_update_index_with_deleted_entry(org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser, caplog): +def test_update_index_with_deleted_entry(org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser): # Arrange + existing_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True)) new_file_to_index = Path(org_config_with_only_new_file.input_files[0]) # Insert org-mode entries with same compiled form into new org file @@ -344,33 +348,34 @@ def test_update_index_with_deleted_entry(org_config_with_only_new_file: LocalOrg # Act # load embeddings, entries, notes model after adding new org file with 2 entries - with caplog.at_level(logging.INFO): - text_search.setup(OrgToEntries, initial_data, regenerate=True, user=default_user) - initial_logs = caplog.text - caplog.clear() # Clear logs + text_search.setup(OrgToEntries, initial_data, regenerate=True, user=default_user) + updated_entries1 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True)) - with caplog.at_level(logging.INFO): - text_search.setup(OrgToEntries, final_data, regenerate=False, user=default_user) - final_logs = caplog.text + text_search.setup(OrgToEntries, final_data, regenerate=False, user=default_user) + updated_entries2 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True)) # Assert - # verify only 1 entry added even if there are multiple duplicate entries - assert "Deleted 8 entries. Created 2 new entries for user " in initial_logs - assert "Deleted 1 entries. Created 0 new entries for user " in final_logs + for entry in existing_entries: + assert entry not in updated_entries1 + + # verify the entry in updated_entries2 is a subset of updated_entries1 + for entry in updated_entries1: + assert entry not in updated_entries2 + + for entry in updated_entries2: + assert entry in updated_entries1[0] verify_embeddings(1, default_user) # ---------------------------------------------------------------------------------------------------- @pytest.mark.django_db -def test_update_index_with_new_entry(content_config: ContentConfig, new_org_file: Path, default_user: KhojUser, caplog): +def test_update_index_with_new_entry(content_config: ContentConfig, new_org_file: Path, default_user: KhojUser): # Arrange + existing_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True)) org_config = LocalOrgConfig.objects.filter(user=default_user).first() data = get_org_files(org_config) - with caplog.at_level(logging.INFO): - text_search.setup(OrgToEntries, data, regenerate=True, user=default_user) - initial_logs = caplog.text - caplog.clear() # Clear logs + text_search.setup(OrgToEntries, data, regenerate=True, user=default_user) # append org-mode entry to first org input file in config with open(new_org_file, "w") as f: @@ -381,14 +386,14 @@ def test_update_index_with_new_entry(content_config: ContentConfig, new_org_file # Act # update embeddings, entries with the newly added note - with caplog.at_level(logging.INFO): - text_search.setup(OrgToEntries, data, regenerate=False, user=default_user) - final_logs = caplog.text + text_search.setup(OrgToEntries, data, regenerate=False, user=default_user) + updated_entries1 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True)) # Assert - assert "Deleted 8 entries. Created 13 new entries for user " in initial_logs - assert "Deleted 0 entries. Created 1 new entries for user " in final_logs - verify_embeddings(14, default_user) + for entry in existing_entries: + assert entry not in updated_entries1 + assert len(updated_entries1) == len(existing_entries) + 1 + verify_embeddings(3, default_user) # ---------------------------------------------------------------------------------------------------- From ad4fa4b2f4063f41b2a886e4ac2aed24b750553a Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 31 Mar 2024 18:03:17 +0530 Subject: [PATCH 12/17] Fix adding file path instead of stem to markdown entries --- src/khoj/processor/content/markdown/markdown_to_entries.py | 5 ++--- tests/test_markdown_to_entries.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/khoj/processor/content/markdown/markdown_to_entries.py b/src/khoj/processor/content/markdown/markdown_to_entries.py index 300b543e..b00ac325 100644 --- a/src/khoj/processor/content/markdown/markdown_to_entries.py +++ b/src/khoj/processor/content/markdown/markdown_to_entries.py @@ -142,13 +142,12 @@ class MarkdownToEntries(TextToEntries): entry_filename = urllib3.util.parse_url(raw_filename).url else: entry_filename = str(Path(raw_filename)) - stem = Path(raw_filename).stem heading = parsed_entry.splitlines()[0] if re.search("^#+\s", parsed_entry) else "" # Append base filename to compiled entry for context to model # Increment heading level for heading entries and make filename as its top level heading - prefix = f"# {stem}\n#" if heading else f"# {stem}\n" - compiled_entry = f"{entry_filename}\n{prefix}{parsed_entry}" + prefix = f"# {entry_filename}\n#" if heading else f"# {entry_filename}\n" + compiled_entry = f"{prefix}{parsed_entry}" entries.append( Entry( compiled=compiled_entry, diff --git a/tests/test_markdown_to_entries.py b/tests/test_markdown_to_entries.py index 68b6589e..d63f026a 100644 --- a/tests/test_markdown_to_entries.py +++ b/tests/test_markdown_to_entries.py @@ -16,7 +16,7 @@ def test_extract_markdown_with_no_headings(tmp_path): data = { f"{tmp_path}": entry, } - expected_heading = f"# {tmp_path.stem}" + expected_heading = f"# {tmp_path}" # Act # Extract Entries from specified Markdown files From 29c1c180429fea515fb8654edf1befdc7e6905db Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 31 Mar 2024 18:05:22 +0530 Subject: [PATCH 13/17] Increase search distance to get relevant content for chat post indexer update More content indexed per entry would result in an overall scores lowering effect. Increase default search distance threshold to counter that - Details - Fix expected results post indexing updates - Fix search with max distance post indexing updates - Minor - Remove openai chat actor test for after: operator as it's not expected anymore --- src/khoj/routers/api_chat.py | 2 +- tests/test_client.py | 6 +++--- tests/test_multiple_users.py | 2 +- tests/test_openai_chat_actors.py | 4 ---- 4 files changed, 5 insertions(+), 9 deletions(-) diff --git a/src/khoj/routers/api_chat.py b/src/khoj/routers/api_chat.py index 9336fd6f..5ded7bff 100644 --- a/src/khoj/routers/api_chat.py +++ b/src/khoj/routers/api_chat.py @@ -475,7 +475,7 @@ async def chat( common: CommonQueryParams, q: str, n: Optional[int] = 5, - d: Optional[float] = 0.18, + d: Optional[float] = 0.22, stream: Optional[bool] = False, title: Optional[str] = None, conversation_id: Optional[int] = None, diff --git a/tests/test_client.py b/tests/test_client.py index abbd1fec..3cd2eee4 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -335,7 +335,7 @@ def test_notes_search(client, search_config: SearchConfig, sample_org_data, defa user_query = quote("How to git install application?") # Act - response = client.get(f"/api/search?q={user_query}&n=1&t=org&r=true&max_distance=0.18", headers=headers) + response = client.get(f"/api/search?q={user_query}&n=1&t=org&r=true&max_distance=0.22", headers=headers) # Assert assert response.status_code == 200 @@ -354,7 +354,7 @@ def test_notes_search_no_results(client, search_config: SearchConfig, sample_org user_query = quote("How to find my goat?") # Act - response = client.get(f"/api/search?q={user_query}&n=1&t=org&r=true&max_distance=0.18", headers=headers) + response = client.get(f"/api/search?q={user_query}&n=1&t=org&r=true&max_distance=0.22", headers=headers) # Assert assert response.status_code == 200 @@ -438,7 +438,7 @@ def test_notes_search_requires_parent_context( user_query = quote("Install Khoj on Emacs") # Act - response = client.get(f"/api/search?q={user_query}&n=1&t=org&r=true&max_distance=0.18", headers=headers) + response = client.get(f"/api/search?q={user_query}&n=1&t=org&r=true&max_distance=0.22", headers=headers) # Assert assert response.status_code == 200 diff --git a/tests/test_multiple_users.py b/tests/test_multiple_users.py index bb0f99d8..4e8e456a 100644 --- a/tests/test_multiple_users.py +++ b/tests/test_multiple_users.py @@ -56,7 +56,7 @@ def test_index_update_with_user2_inaccessible_user1(client, api_user2: KhojApiUs # Assert assert update_response.status_code == 200 - assert len(results) == 5 + assert len(results) == 3 for result in results: assert result["additional"]["file"] not in source_file_symbol diff --git a/tests/test_openai_chat_actors.py b/tests/test_openai_chat_actors.py index 5c2855b2..df9d8f07 100644 --- a/tests/test_openai_chat_actors.py +++ b/tests/test_openai_chat_actors.py @@ -470,10 +470,6 @@ async def test_websearch_with_operators(chat_client): ["site:reddit.com" in response for response in responses] ), "Expected a search query to include site:reddit.com but got: " + str(responses) - assert any( - ["after:2024/04/01" in response for response in responses] - ), "Expected a search query to include after:2024/04/01 but got: " + str(responses) - # ---------------------------------------------------------------------------------------------------- @pytest.mark.anyio From 32ac0622ff070ada66bf072e9366f6d6bc1fc90f Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 31 Mar 2024 18:09:54 +0530 Subject: [PATCH 14/17] Extract dates from compiled text entries --- src/khoj/processor/content/text_to_entries.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/khoj/processor/content/text_to_entries.py b/src/khoj/processor/content/text_to_entries.py index 8b7df3e9..edd814f6 100644 --- a/src/khoj/processor/content/text_to_entries.py +++ b/src/khoj/processor/content/text_to_entries.py @@ -189,7 +189,7 @@ class TextToEntries(ABC): new_dates = [] with timer("Indexed dates from added entries in", logger): for added_entry in added_entries: - dates_in_entries = zip(self.date_filter.extract_dates(added_entry.raw), repeat(added_entry)) + dates_in_entries = zip(self.date_filter.extract_dates(added_entry.compiled), repeat(added_entry)) dates_to_create = [ EntryDates(date=date, entry=added_entry) for date, added_entry in dates_in_entries From 00f599ea783c4aa7d59e1fefa320fc4c8379c46c Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 2 Apr 2024 21:05:06 +0530 Subject: [PATCH 15/17] Fix passing flags to re.split to break org, md content by heading level `re.MULTILINE' should be passed to the `flags' argument, not the `max_splits' argument of the `re.split' func This was messing up the indexing by only allowing a maximum of re.MULTILINE splits. Fixing this improves the search quality to previous state --- src/khoj/processor/content/markdown/markdown_to_entries.py | 4 ++-- src/khoj/processor/content/org_mode/org_to_entries.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/khoj/processor/content/markdown/markdown_to_entries.py b/src/khoj/processor/content/markdown/markdown_to_entries.py index b00ac325..73e5bf47 100644 --- a/src/khoj/processor/content/markdown/markdown_to_entries.py +++ b/src/khoj/processor/content/markdown/markdown_to_entries.py @@ -87,7 +87,7 @@ class MarkdownToEntries(TextToEntries): # If content is small or content has no children headings, save it as a single entry if len(TextToEntries.tokenizer(markdown_content_with_ancestry)) <= max_tokens or not re.search( - rf"^#{{{len(ancestry)+1},}}\s", markdown_content, re.MULTILINE + rf"^#{{{len(ancestry)+1},}}\s", markdown_content, flags=re.MULTILINE ): entry_to_file_map += [(markdown_content_with_ancestry, markdown_file)] entries.extend([markdown_content_with_ancestry]) @@ -98,7 +98,7 @@ class MarkdownToEntries(TextToEntries): sections: List[str] = [] while len(sections) < 2: next_heading_level += 1 - sections = re.split(rf"(\n|^)(?=[#]{{{next_heading_level}}} .+\n?)", markdown_content, re.MULTILINE) + sections = re.split(rf"(\n|^)(?=[#]{{{next_heading_level}}} .+\n?)", markdown_content, flags=re.MULTILINE) for section in sections: # Skip empty sections diff --git a/src/khoj/processor/content/org_mode/org_to_entries.py b/src/khoj/processor/content/org_mode/org_to_entries.py index 953e918d..59a4ff45 100644 --- a/src/khoj/processor/content/org_mode/org_to_entries.py +++ b/src/khoj/processor/content/org_mode/org_to_entries.py @@ -114,7 +114,7 @@ class OrgToEntries(TextToEntries): sections: List[str] = [] while len(sections) < 2: next_heading_level += 1 - sections = re.split(rf"(\n|^)(?=[*]{{{next_heading_level}}} .+\n?)", org_content, re.MULTILINE) + sections = re.split(rf"(\n|^)(?=[*]{{{next_heading_level}}} .+\n?)", org_content, flags=re.MULTILINE) # Recurse down each non-empty section after parsing its body, heading and ancestry for section in sections: From 67b1178aec5c420d3a4dc9c3c96cceeaaab92e54 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Mon, 8 Apr 2024 13:01:24 +0530 Subject: [PATCH 16/17] Remove debug logs generated while compiling org-mode entries --- src/khoj/processor/content/org_mode/org_to_entries.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/khoj/processor/content/org_mode/org_to_entries.py b/src/khoj/processor/content/org_mode/org_to_entries.py index 59a4ff45..2dcaa744 100644 --- a/src/khoj/processor/content/org_mode/org_to_entries.py +++ b/src/khoj/processor/content/org_mode/org_to_entries.py @@ -185,29 +185,19 @@ class OrgToEntries(TextToEntries): heading = f"* Path: {ancestors_trail}\n{heading}" if heading else f"* Path: {ancestors_trail}." compiled = heading - if state.verbose > 2: - logger.debug(f"Title: {heading}") if parsed_entry.tags: tags_str = " ".join(parsed_entry.tags) compiled += f"\t {tags_str}." - if state.verbose > 2: - logger.debug(f"Tags: {tags_str}") if parsed_entry.closed: compiled += f'\n Closed on {parsed_entry.closed.strftime("%Y-%m-%d")}.' - if state.verbose > 2: - logger.debug(f'Closed: {parsed_entry.closed.strftime("%Y-%m-%d")}') if parsed_entry.scheduled: compiled += f'\n Scheduled for {parsed_entry.scheduled.strftime("%Y-%m-%d")}.' - if state.verbose > 2: - logger.debug(f'Scheduled: {parsed_entry.scheduled.strftime("%Y-%m-%d")}') if parsed_entry.hasBody: compiled += f"\n {parsed_entry.body}" - if state.verbose > 2: - logger.debug(f"Body: {parsed_entry.body}") # Add the sub-entry contents to the entry entry_compiled += f"{compiled}" From 9239c2c2ed7a83ea5b5ab3ee749a279608d911c8 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Mon, 8 Apr 2024 13:38:08 +0530 Subject: [PATCH 17/17] Update drop large words test to ensure newlines considerd word boundary Prevent regression to #620 --- tests/test_org_to_entries.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/tests/test_org_to_entries.py b/tests/test_org_to_entries.py index 41c25226..f01f50f3 100644 --- a/tests/test_org_to_entries.py +++ b/tests/test_org_to_entries.py @@ -68,10 +68,12 @@ def test_entry_split_when_exceeds_max_tokens(): def test_entry_split_drops_large_words(): "Ensure entries drops words larger than specified max word length from compiled version." # Arrange - entry_text = f"""*** Heading - \t\r - Body Line 1 - """ + entry_text = f"""First Line +dog=1\n\r\t +cat=10 +car=4 +book=2 +""" entry = Entry(raw=entry_text, compiled=entry_text) # Act @@ -79,9 +81,13 @@ def test_entry_split_drops_large_words(): processed_entry = TextToEntries.split_entries_by_max_tokens([entry], max_word_length=5)[0] # Assert - # (Only) "Heading" dropped from compiled version because its over the set max word limit - assert "Heading" not in processed_entry.compiled - assert len(processed_entry.compiled.split()) == len(entry_text.split()) - 1 + # Ensure words larger than max word length are dropped + # Ensure newline characters are considered as word boundaries for splitting words. See #620 + words_to_keep = ["First", "Line", "dog=1", "car=4"] + words_to_drop = ["cat=10", "book=2"] + assert all([word for word in words_to_keep if word in processed_entry.compiled]) + assert not any([word for word in words_to_drop if word in processed_entry.compiled]) + assert len(processed_entry.compiled.split()) == len(entry_text.split()) - 2 def test_parse_org_file_into_single_entry_if_small(tmp_path):