Include Filename, Entry Heading in All Compiled Entries to Improve Search Context

Merge pull request #214 from debanjum/add-filename-heading-to-compiled-entry-for-context

- Set filename as top heading in compiled org, markdown entries
  - Note: *Khoj was already indexing filenames in compiled markdown entries, but they were appended as bare text rather than set as top-level headings*. The updated structure should provide more schematic context of relevance.
- Set entry heading as heading for compiled org, md entries, even if split by max tokens
- Snip prepended heading to avoid crossing model max_token limits
- Entries with no md headings should not get a heading prefix prepended
This commit is contained in:
Debanjum
2023-05-03 22:59:30 +08:00
committed by GitHub
7 changed files with 73 additions and 23 deletions

View File

@@ -2,7 +2,6 @@
import glob import glob
import logging import logging
import re import re
import time
from pathlib import Path from pathlib import Path
from typing import List from typing import List
@@ -110,10 +109,13 @@ class MarkdownToJsonl(TextToJsonl):
with open(markdown_file, "r", encoding="utf8") as f: with open(markdown_file, "r", encoding="utf8") as f:
markdown_content = f.read() markdown_content = f.read()
markdown_entries_per_file = [] markdown_entries_per_file = []
any_headings = re.search(markdown_heading_regex, markdown_content, flags=re.MULTILINE)
for entry in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE): for entry in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE):
prefix = "#" if entry.startswith("#") else "# " # Add heading level as the regex split removed it from entries with headings
if entry.strip(empty_escape_sequences) != "": prefix = "#" if entry.startswith("#") else "# " if any_headings else ""
markdown_entries_per_file.append(f"{prefix}{entry.strip(empty_escape_sequences)}") stripped_entry = entry.strip(empty_escape_sequences)
if stripped_entry != "":
markdown_entries_per_file.append(f"{prefix}{stripped_entry}")
entry_to_file_map += zip(markdown_entries_per_file, [markdown_file] * len(markdown_entries_per_file)) entry_to_file_map += zip(markdown_entries_per_file, [markdown_file] * len(markdown_entries_per_file))
entries.extend(markdown_entries_per_file) entries.extend(markdown_entries_per_file)
@@ -126,9 +128,19 @@ class MarkdownToJsonl(TextToJsonl):
entries = [] entries = []
for parsed_entry in parsed_entries: for parsed_entry in parsed_entries:
entry_filename = Path(entry_to_file_map[parsed_entry]) entry_filename = Path(entry_to_file_map[parsed_entry])
heading = parsed_entry.splitlines()[0] if re.search("^#+\s", parsed_entry) else ""
# Append base filename to compiled entry for context to model # Append base filename to compiled entry for context to model
compiled_entry = f"{parsed_entry}\n{entry_filename.stem}" # Increment heading level for heading entries and make filename as its top level heading
entries.append(Entry(compiled=compiled_entry, raw=parsed_entry, file=f"{entry_filename}")) prefix = f"# {entry_filename.stem}\n#" if heading else f"# {entry_filename.stem}\n"
compiled_entry = f"{prefix}{parsed_entry}"
entries.append(
Entry(
compiled=compiled_entry,
raw=parsed_entry,
heading=f"{prefix}{heading}",
file=f"{entry_filename}",
)
)
logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries") logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries")

View File

@@ -1,7 +1,7 @@
# Standard Packages # Standard Packages
import glob import glob
import logging import logging
import time from pathlib import Path
from typing import Iterable, List from typing import Iterable, List
# Internal Packages # Internal Packages
@@ -113,7 +113,11 @@ class OrgToJsonl(TextToJsonl):
# Ignore title notes i.e notes with just headings and empty body # Ignore title notes i.e notes with just headings and empty body
continue continue
compiled = f"{parsed_entry.heading}." # Prepend filename as top heading to entry
filename = Path(entry_to_file_map[parsed_entry]).stem
heading = f"* {filename}\n** {parsed_entry.heading}." if parsed_entry.heading else f"* {filename}."
compiled = heading
if state.verbose > 2: if state.verbose > 2:
logger.debug(f"Title: {parsed_entry.heading}") logger.debug(f"Title: {parsed_entry.heading}")
@@ -139,7 +143,14 @@ class OrgToJsonl(TextToJsonl):
logger.debug(f"Body: {parsed_entry.body}") logger.debug(f"Body: {parsed_entry.body}")
if compiled: if compiled:
entries += [Entry(compiled=compiled, raw=f"{parsed_entry}", file=f"{entry_to_file_map[parsed_entry]}")] entries.append(
Entry(
compiled=compiled,
raw=f"{parsed_entry}",
heading=f"{heading}",
file=f"{entry_to_file_map[parsed_entry]}",
)
)
return entries return entries

View File

@@ -31,14 +31,33 @@ class TextToJsonl(ABC):
"Split entries if compiled entry length exceeds the max tokens supported by the ML model." "Split entries if compiled entry length exceeds the max tokens supported by the ML model."
chunked_entries: List[Entry] = [] chunked_entries: List[Entry] = []
for entry in entries: for entry in entries:
# Split entry into words
compiled_entry_words = [word for word in entry.compiled.split(" ") if word != ""] compiled_entry_words = [word for word in entry.compiled.split(" ") if word != ""]
# Drop long words instead of having entry truncated to maintain quality of entry processed by models # Drop long words instead of having entry truncated to maintain quality of entry processed by models
compiled_entry_words = [word for word in compiled_entry_words if len(word) <= max_word_length] compiled_entry_words = [word for word in compiled_entry_words if len(word) <= max_word_length]
# Split entry into chunks of max tokens
for chunk_index in range(0, len(compiled_entry_words), max_tokens): for chunk_index in range(0, len(compiled_entry_words), max_tokens):
compiled_entry_words_chunk = compiled_entry_words[chunk_index : chunk_index + max_tokens] compiled_entry_words_chunk = compiled_entry_words[chunk_index : chunk_index + max_tokens]
compiled_entry_chunk = " ".join(compiled_entry_words_chunk) compiled_entry_chunk = " ".join(compiled_entry_words_chunk)
entry_chunk = Entry(compiled=compiled_entry_chunk, raw=entry.raw, file=entry.file)
chunked_entries.append(entry_chunk) # Prepend heading to all other chunks, the first chunk already has heading from original entry
if chunk_index > 0:
# Snip heading to avoid crossing max_tokens limit
# Keep last 100 characters of heading as entry heading more important than filename
snipped_heading = entry.heading[-100:]
compiled_entry_chunk = f"{snipped_heading}.\n{compiled_entry_chunk}"
chunked_entries.append(
Entry(
compiled=compiled_entry_chunk,
raw=entry.raw,
heading=entry.heading,
file=entry.file,
)
)
return chunked_entries return chunked_entries
def mark_entries_for_update( def mark_entries_for_update(

View File

@@ -103,11 +103,15 @@ class SearchResponse(ConfigBase):
class Entry: class Entry:
raw: str raw: str
compiled: str compiled: str
heading: Optional[str]
file: Optional[str] file: Optional[str]
def __init__(self, raw: str = None, compiled: str = None, file: Optional[str] = None): def __init__(
self, raw: str = None, compiled: str = None, heading: Optional[str] = None, file: Optional[str] = None
):
self.raw = raw self.raw = raw
self.compiled = compiled self.compiled = compiled
self.heading = heading
self.file = file self.file = file
def to_json(self) -> str: def to_json(self) -> str:

View File

@@ -1,17 +1,13 @@
# Standard Packages
import json
# Internal Packages # Internal Packages
from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl
from khoj.utils.jsonl import load_jsonl
from khoj.utils.rawconfig import Entry from khoj.utils.rawconfig import Entry
def test_process_entries_from_single_input_jsonl(tmp_path): def test_process_entries_from_single_input_jsonl(tmp_path):
"Convert multiple jsonl entries from single file to entries." "Convert multiple jsonl entries from single file to entries."
# Arrange # Arrange
input_jsonl = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "file": "source/file/path1"} input_jsonl = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "heading": null, "file": "source/file/path1"}
{"raw": "raw input data 2", "compiled": "compiled input data 2", "file": "source/file/path2"} {"raw": "raw input data 2", "compiled": "compiled input data 2", "heading": null, "file": "source/file/path2"}
""" """
input_jsonl_file = create_file(tmp_path, input_jsonl) input_jsonl_file = create_file(tmp_path, input_jsonl)
@@ -29,8 +25,8 @@ def test_process_entries_from_single_input_jsonl(tmp_path):
def test_process_entries_from_multiple_input_jsonls(tmp_path): def test_process_entries_from_multiple_input_jsonls(tmp_path):
"Convert multiple jsonl entries from single file to entries." "Convert multiple jsonl entries from single file to entries."
# Arrange # Arrange
input_jsonl_1 = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "file": "source/file/path1"}""" input_jsonl_1 = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "heading": null, "file": "source/file/path1"}"""
input_jsonl_2 = """{"raw": "raw input data 2", "compiled": "compiled input data 2", "file": "source/file/path2"}""" input_jsonl_2 = """{"raw": "raw input data 2", "compiled": "compiled input data 2", "heading": null, "file": "source/file/path2"}"""
input_jsonl_file_1 = create_file(tmp_path, input_jsonl_1, filename="input1.jsonl") input_jsonl_file_1 = create_file(tmp_path, input_jsonl_1, filename="input1.jsonl")
input_jsonl_file_2 = create_file(tmp_path, input_jsonl_2, filename="input2.jsonl") input_jsonl_file_2 = create_file(tmp_path, input_jsonl_2, filename="input2.jsonl")

View File

@@ -14,6 +14,7 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
- Bullet point 2 - Bullet point 2
""" """
markdownfile = create_file(tmp_path, entry) markdownfile = create_file(tmp_path, entry)
expected_heading = "# " + markdownfile.stem
# Act # Act
# Extract Entries from specified Markdown files # Extract Entries from specified Markdown files
@@ -27,6 +28,10 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
# Assert # Assert
assert len(jsonl_data) == 1 assert len(jsonl_data) == 1
# Ensure raw entry with no headings do not get heading prefix prepended
assert not jsonl_data[0]["raw"].startswith("#")
# Ensure compiled entry has filename prepended as top level heading
assert jsonl_data[0]["compiled"].startswith(expected_heading)
def test_single_markdown_entry_to_jsonl(tmp_path): def test_single_markdown_entry_to_jsonl(tmp_path):
@@ -128,7 +133,7 @@ def test_extract_entries_with_different_level_headings(tmp_path):
# Helper Functions # Helper Functions
def create_file(tmp_path, entry=None, filename="test.md"): def create_file(tmp_path: Path, entry=None, filename="test.md"):
markdown_file = tmp_path / filename markdown_file = tmp_path / filename
markdown_file.touch() markdown_file.touch()
if entry: if entry:

View File

@@ -47,6 +47,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
Body Line Body Line
""" """
orgfile = create_file(tmp_path, entry) orgfile = create_file(tmp_path, entry)
expected_heading = f"* {orgfile.stem}\n** Heading"
# Act # Act
# Extract Entries from specified Org files # Extract Entries from specified Org files
@@ -55,16 +56,18 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
# Split each entry from specified Org files by max words # Split each entry from specified Org files by max words
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl( jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
TextToJsonl.split_entries_by_max_tokens( TextToJsonl.split_entries_by_max_tokens(
OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=2 OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=4
) )
) )
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
# Assert # Assert
assert len(jsonl_data) == 2 assert len(jsonl_data) == 2
# Ensure compiled entries split by max_words start with entry heading (for search context)
assert all([entry["compiled"].startswith(expected_heading) for entry in jsonl_data])
def test_entry_split_drops_large_words(tmp_path): def test_entry_split_drops_large_words():
"Ensure entries drops words larger than specified max word length from compiled version." "Ensure entries drops words larger than specified max word length from compiled version."
# Arrange # Arrange
entry_text = f"""*** Heading entry_text = f"""*** Heading