diff --git a/src/khoj/processor/markdown/markdown_to_jsonl.py b/src/khoj/processor/markdown/markdown_to_jsonl.py index a1e4d0c1..0179e05e 100644 --- a/src/khoj/processor/markdown/markdown_to_jsonl.py +++ b/src/khoj/processor/markdown/markdown_to_jsonl.py @@ -2,7 +2,6 @@ import glob import logging import re -import time from pathlib import Path from typing import List @@ -110,10 +109,13 @@ class MarkdownToJsonl(TextToJsonl): with open(markdown_file, "r", encoding="utf8") as f: markdown_content = f.read() markdown_entries_per_file = [] + any_headings = re.search(markdown_heading_regex, markdown_content, flags=re.MULTILINE) for entry in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE): - prefix = "#" if entry.startswith("#") else "# " - if entry.strip(empty_escape_sequences) != "": - markdown_entries_per_file.append(f"{prefix}{entry.strip(empty_escape_sequences)}") + # Add heading level as the regex split removed it from entries with headings + prefix = "#" if entry.startswith("#") else "# " if any_headings else "" + stripped_entry = entry.strip(empty_escape_sequences) + if stripped_entry != "": + markdown_entries_per_file.append(f"{prefix}{stripped_entry}") entry_to_file_map += zip(markdown_entries_per_file, [markdown_file] * len(markdown_entries_per_file)) entries.extend(markdown_entries_per_file) @@ -126,9 +128,19 @@ class MarkdownToJsonl(TextToJsonl): entries = [] for parsed_entry in parsed_entries: entry_filename = Path(entry_to_file_map[parsed_entry]) + heading = parsed_entry.splitlines()[0] if re.search("^#+\s", parsed_entry) else "" # Append base filename to compiled entry for context to model - compiled_entry = f"{parsed_entry}\n{entry_filename.stem}" - entries.append(Entry(compiled=compiled_entry, raw=parsed_entry, file=f"{entry_filename}")) + # Increment heading level for heading entries and make filename as its top level heading + prefix = f"# {entry_filename.stem}\n#" if heading else f"# {entry_filename.stem}\n" + compiled_entry = f"{prefix}{parsed_entry}" + entries.append( + Entry( + compiled=compiled_entry, + raw=parsed_entry, + heading=f"{prefix}{heading}", + file=f"{entry_filename}", + ) + ) logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries") diff --git a/src/khoj/processor/org_mode/org_to_jsonl.py b/src/khoj/processor/org_mode/org_to_jsonl.py index 0950a089..e5ec7cc6 100644 --- a/src/khoj/processor/org_mode/org_to_jsonl.py +++ b/src/khoj/processor/org_mode/org_to_jsonl.py @@ -1,7 +1,7 @@ # Standard Packages import glob import logging -import time +from pathlib import Path from typing import Iterable, List # Internal Packages @@ -113,7 +113,11 @@ class OrgToJsonl(TextToJsonl): # Ignore title notes i.e notes with just headings and empty body continue - compiled = f"{parsed_entry.heading}." + # Prepend filename as top heading to entry + filename = Path(entry_to_file_map[parsed_entry]).stem + heading = f"* {filename}\n** {parsed_entry.heading}." if parsed_entry.heading else f"* {filename}." + + compiled = heading if state.verbose > 2: logger.debug(f"Title: {parsed_entry.heading}") @@ -139,7 +143,14 @@ class OrgToJsonl(TextToJsonl): logger.debug(f"Body: {parsed_entry.body}") if compiled: - entries += [Entry(compiled=compiled, raw=f"{parsed_entry}", file=f"{entry_to_file_map[parsed_entry]}")] + entries.append( + Entry( + compiled=compiled, + raw=f"{parsed_entry}", + heading=f"{heading}", + file=f"{entry_to_file_map[parsed_entry]}", + ) + ) return entries diff --git a/src/khoj/processor/text_to_jsonl.py b/src/khoj/processor/text_to_jsonl.py index 22de2c01..3dd0d1b5 100644 --- a/src/khoj/processor/text_to_jsonl.py +++ b/src/khoj/processor/text_to_jsonl.py @@ -31,14 +31,33 @@ class TextToJsonl(ABC): "Split entries if compiled entry length exceeds the max tokens supported by the ML model." chunked_entries: List[Entry] = [] for entry in entries: + # Split entry into words compiled_entry_words = [word for word in entry.compiled.split(" ") if word != ""] + # Drop long words instead of having entry truncated to maintain quality of entry processed by models compiled_entry_words = [word for word in compiled_entry_words if len(word) <= max_word_length] + + # Split entry into chunks of max tokens for chunk_index in range(0, len(compiled_entry_words), max_tokens): compiled_entry_words_chunk = compiled_entry_words[chunk_index : chunk_index + max_tokens] compiled_entry_chunk = " ".join(compiled_entry_words_chunk) - entry_chunk = Entry(compiled=compiled_entry_chunk, raw=entry.raw, file=entry.file) - chunked_entries.append(entry_chunk) + + # Prepend heading to all other chunks, the first chunk already has heading from original entry + if chunk_index > 0: + # Snip heading to avoid crossing max_tokens limit + # Keep last 100 characters of heading as entry heading more important than filename + snipped_heading = entry.heading[-100:] + compiled_entry_chunk = f"{snipped_heading}.\n{compiled_entry_chunk}" + + chunked_entries.append( + Entry( + compiled=compiled_entry_chunk, + raw=entry.raw, + heading=entry.heading, + file=entry.file, + ) + ) + return chunked_entries def mark_entries_for_update( diff --git a/src/khoj/utils/rawconfig.py b/src/khoj/utils/rawconfig.py index 389e80f6..6b87c220 100644 --- a/src/khoj/utils/rawconfig.py +++ b/src/khoj/utils/rawconfig.py @@ -103,11 +103,15 @@ class SearchResponse(ConfigBase): class Entry: raw: str compiled: str + heading: Optional[str] file: Optional[str] - def __init__(self, raw: str = None, compiled: str = None, file: Optional[str] = None): + def __init__( + self, raw: str = None, compiled: str = None, heading: Optional[str] = None, file: Optional[str] = None + ): self.raw = raw self.compiled = compiled + self.heading = heading self.file = file def to_json(self) -> str: diff --git a/tests/test_jsonl_to_jsonl.py b/tests/test_jsonl_to_jsonl.py index eb25d579..b52b5fc9 100644 --- a/tests/test_jsonl_to_jsonl.py +++ b/tests/test_jsonl_to_jsonl.py @@ -1,17 +1,13 @@ -# Standard Packages -import json - # Internal Packages from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl -from khoj.utils.jsonl import load_jsonl from khoj.utils.rawconfig import Entry def test_process_entries_from_single_input_jsonl(tmp_path): "Convert multiple jsonl entries from single file to entries." # Arrange - input_jsonl = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "file": "source/file/path1"} -{"raw": "raw input data 2", "compiled": "compiled input data 2", "file": "source/file/path2"} + input_jsonl = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "heading": null, "file": "source/file/path1"} +{"raw": "raw input data 2", "compiled": "compiled input data 2", "heading": null, "file": "source/file/path2"} """ input_jsonl_file = create_file(tmp_path, input_jsonl) @@ -29,8 +25,8 @@ def test_process_entries_from_single_input_jsonl(tmp_path): def test_process_entries_from_multiple_input_jsonls(tmp_path): "Convert multiple jsonl entries from single file to entries." # Arrange - input_jsonl_1 = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "file": "source/file/path1"}""" - input_jsonl_2 = """{"raw": "raw input data 2", "compiled": "compiled input data 2", "file": "source/file/path2"}""" + input_jsonl_1 = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "heading": null, "file": "source/file/path1"}""" + input_jsonl_2 = """{"raw": "raw input data 2", "compiled": "compiled input data 2", "heading": null, "file": "source/file/path2"}""" input_jsonl_file_1 = create_file(tmp_path, input_jsonl_1, filename="input1.jsonl") input_jsonl_file_2 = create_file(tmp_path, input_jsonl_2, filename="input2.jsonl") diff --git a/tests/test_markdown_to_jsonl.py b/tests/test_markdown_to_jsonl.py index dfb42fed..87a1a07e 100644 --- a/tests/test_markdown_to_jsonl.py +++ b/tests/test_markdown_to_jsonl.py @@ -14,6 +14,7 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path): - Bullet point 2 """ markdownfile = create_file(tmp_path, entry) + expected_heading = "# " + markdownfile.stem # Act # Extract Entries from specified Markdown files @@ -27,6 +28,10 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path): # Assert assert len(jsonl_data) == 1 + # Ensure raw entry with no headings do not get heading prefix prepended + assert not jsonl_data[0]["raw"].startswith("#") + # Ensure compiled entry has filename prepended as top level heading + assert jsonl_data[0]["compiled"].startswith(expected_heading) def test_single_markdown_entry_to_jsonl(tmp_path): @@ -128,7 +133,7 @@ def test_extract_entries_with_different_level_headings(tmp_path): # Helper Functions -def create_file(tmp_path, entry=None, filename="test.md"): +def create_file(tmp_path: Path, entry=None, filename="test.md"): markdown_file = tmp_path / filename markdown_file.touch() if entry: diff --git a/tests/test_org_to_jsonl.py b/tests/test_org_to_jsonl.py index aed4983f..171037c0 100644 --- a/tests/test_org_to_jsonl.py +++ b/tests/test_org_to_jsonl.py @@ -47,6 +47,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path): Body Line """ orgfile = create_file(tmp_path, entry) + expected_heading = f"* {orgfile.stem}\n** Heading" # Act # Extract Entries from specified Org files @@ -55,16 +56,18 @@ def test_entry_split_when_exceeds_max_words(tmp_path): # Split each entry from specified Org files by max words jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl( TextToJsonl.split_entries_by_max_tokens( - OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=2 + OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=4 ) ) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] # Assert assert len(jsonl_data) == 2 + # Ensure compiled entries split by max_words start with entry heading (for search context) + assert all([entry["compiled"].startswith(expected_heading) for entry in jsonl_data]) -def test_entry_split_drops_large_words(tmp_path): +def test_entry_split_drops_large_words(): "Ensure entries drops words larger than specified max word length from compiled version." # Arrange entry_text = f"""*** Heading