mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-09 21:29:11 +00:00
Include Filename, Entry Heading in All Compiled Entries to Improve Search Context
Merge pull request #214 from debanjum/add-filename-heading-to-compiled-entry-for-context
- Set the filename as the top-level heading in compiled org and markdown entries
  - Note: *Khoj was already indexing filenames in compiled markdown entries, but they were appended as bare text rather than set as top-level headings*. The updated structure should provide more structured context for relevance
- Set the entry heading as the heading for compiled org and markdown entries, even when an entry is split by max tokens
- Snip the prepended heading to avoid crossing the model's max_tokens limit
- Entries with no markdown headings should not get a heading prefix prepended
This commit is contained in:
@@ -2,7 +2,6 @@
|
|||||||
import glob
|
import glob
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
import time
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
@@ -110,10 +109,13 @@ class MarkdownToJsonl(TextToJsonl):
|
|||||||
with open(markdown_file, "r", encoding="utf8") as f:
|
with open(markdown_file, "r", encoding="utf8") as f:
|
||||||
markdown_content = f.read()
|
markdown_content = f.read()
|
||||||
markdown_entries_per_file = []
|
markdown_entries_per_file = []
|
||||||
|
any_headings = re.search(markdown_heading_regex, markdown_content, flags=re.MULTILINE)
|
||||||
for entry in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE):
|
for entry in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE):
|
||||||
prefix = "#" if entry.startswith("#") else "# "
|
# Add heading level as the regex split removed it from entries with headings
|
||||||
if entry.strip(empty_escape_sequences) != "":
|
prefix = "#" if entry.startswith("#") else "# " if any_headings else ""
|
||||||
markdown_entries_per_file.append(f"{prefix}{entry.strip(empty_escape_sequences)}")
|
stripped_entry = entry.strip(empty_escape_sequences)
|
||||||
|
if stripped_entry != "":
|
||||||
|
markdown_entries_per_file.append(f"{prefix}{stripped_entry}")
|
||||||
|
|
||||||
entry_to_file_map += zip(markdown_entries_per_file, [markdown_file] * len(markdown_entries_per_file))
|
entry_to_file_map += zip(markdown_entries_per_file, [markdown_file] * len(markdown_entries_per_file))
|
||||||
entries.extend(markdown_entries_per_file)
|
entries.extend(markdown_entries_per_file)
|
||||||
@@ -126,9 +128,19 @@ class MarkdownToJsonl(TextToJsonl):
|
|||||||
entries = []
|
entries = []
|
||||||
for parsed_entry in parsed_entries:
|
for parsed_entry in parsed_entries:
|
||||||
entry_filename = Path(entry_to_file_map[parsed_entry])
|
entry_filename = Path(entry_to_file_map[parsed_entry])
|
||||||
|
heading = parsed_entry.splitlines()[0] if re.search("^#+\s", parsed_entry) else ""
|
||||||
# Append base filename to compiled entry for context to model
|
# Append base filename to compiled entry for context to model
|
||||||
compiled_entry = f"{parsed_entry}\n{entry_filename.stem}"
|
# Increment heading level for heading entries and make filename as its top level heading
|
||||||
entries.append(Entry(compiled=compiled_entry, raw=parsed_entry, file=f"{entry_filename}"))
|
prefix = f"# {entry_filename.stem}\n#" if heading else f"# {entry_filename.stem}\n"
|
||||||
|
compiled_entry = f"{prefix}{parsed_entry}"
|
||||||
|
entries.append(
|
||||||
|
Entry(
|
||||||
|
compiled=compiled_entry,
|
||||||
|
raw=parsed_entry,
|
||||||
|
heading=f"{prefix}{heading}",
|
||||||
|
file=f"{entry_filename}",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries")
|
logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries")
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
# Standard Packages
|
# Standard Packages
|
||||||
import glob
|
import glob
|
||||||
import logging
|
import logging
|
||||||
import time
|
from pathlib import Path
|
||||||
from typing import Iterable, List
|
from typing import Iterable, List
|
||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
@@ -113,7 +113,11 @@ class OrgToJsonl(TextToJsonl):
|
|||||||
# Ignore title notes i.e notes with just headings and empty body
|
# Ignore title notes i.e notes with just headings and empty body
|
||||||
continue
|
continue
|
||||||
|
|
||||||
compiled = f"{parsed_entry.heading}."
|
# Prepend filename as top heading to entry
|
||||||
|
filename = Path(entry_to_file_map[parsed_entry]).stem
|
||||||
|
heading = f"* {filename}\n** {parsed_entry.heading}." if parsed_entry.heading else f"* {filename}."
|
||||||
|
|
||||||
|
compiled = heading
|
||||||
if state.verbose > 2:
|
if state.verbose > 2:
|
||||||
logger.debug(f"Title: {parsed_entry.heading}")
|
logger.debug(f"Title: {parsed_entry.heading}")
|
||||||
|
|
||||||
@@ -139,7 +143,14 @@ class OrgToJsonl(TextToJsonl):
|
|||||||
logger.debug(f"Body: {parsed_entry.body}")
|
logger.debug(f"Body: {parsed_entry.body}")
|
||||||
|
|
||||||
if compiled:
|
if compiled:
|
||||||
entries += [Entry(compiled=compiled, raw=f"{parsed_entry}", file=f"{entry_to_file_map[parsed_entry]}")]
|
entries.append(
|
||||||
|
Entry(
|
||||||
|
compiled=compiled,
|
||||||
|
raw=f"{parsed_entry}",
|
||||||
|
heading=f"{heading}",
|
||||||
|
file=f"{entry_to_file_map[parsed_entry]}",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
return entries
|
return entries
|
||||||
|
|
||||||
|
|||||||
@@ -31,14 +31,33 @@ class TextToJsonl(ABC):
|
|||||||
"Split entries if compiled entry length exceeds the max tokens supported by the ML model."
|
"Split entries if compiled entry length exceeds the max tokens supported by the ML model."
|
||||||
chunked_entries: List[Entry] = []
|
chunked_entries: List[Entry] = []
|
||||||
for entry in entries:
|
for entry in entries:
|
||||||
|
# Split entry into words
|
||||||
compiled_entry_words = [word for word in entry.compiled.split(" ") if word != ""]
|
compiled_entry_words = [word for word in entry.compiled.split(" ") if word != ""]
|
||||||
|
|
||||||
# Drop long words instead of having entry truncated to maintain quality of entry processed by models
|
# Drop long words instead of having entry truncated to maintain quality of entry processed by models
|
||||||
compiled_entry_words = [word for word in compiled_entry_words if len(word) <= max_word_length]
|
compiled_entry_words = [word for word in compiled_entry_words if len(word) <= max_word_length]
|
||||||
|
|
||||||
|
# Split entry into chunks of max tokens
|
||||||
for chunk_index in range(0, len(compiled_entry_words), max_tokens):
|
for chunk_index in range(0, len(compiled_entry_words), max_tokens):
|
||||||
compiled_entry_words_chunk = compiled_entry_words[chunk_index : chunk_index + max_tokens]
|
compiled_entry_words_chunk = compiled_entry_words[chunk_index : chunk_index + max_tokens]
|
||||||
compiled_entry_chunk = " ".join(compiled_entry_words_chunk)
|
compiled_entry_chunk = " ".join(compiled_entry_words_chunk)
|
||||||
entry_chunk = Entry(compiled=compiled_entry_chunk, raw=entry.raw, file=entry.file)
|
|
||||||
chunked_entries.append(entry_chunk)
|
# Prepend heading to all other chunks, the first chunk already has heading from original entry
|
||||||
|
if chunk_index > 0:
|
||||||
|
# Snip heading to avoid crossing max_tokens limit
|
||||||
|
# Keep last 100 characters of heading as entry heading more important than filename
|
||||||
|
snipped_heading = entry.heading[-100:]
|
||||||
|
compiled_entry_chunk = f"{snipped_heading}.\n{compiled_entry_chunk}"
|
||||||
|
|
||||||
|
chunked_entries.append(
|
||||||
|
Entry(
|
||||||
|
compiled=compiled_entry_chunk,
|
||||||
|
raw=entry.raw,
|
||||||
|
heading=entry.heading,
|
||||||
|
file=entry.file,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
return chunked_entries
|
return chunked_entries
|
||||||
|
|
||||||
def mark_entries_for_update(
|
def mark_entries_for_update(
|
||||||
|
|||||||
@@ -103,11 +103,15 @@ class SearchResponse(ConfigBase):
|
|||||||
class Entry:
|
class Entry:
|
||||||
raw: str
|
raw: str
|
||||||
compiled: str
|
compiled: str
|
||||||
|
heading: Optional[str]
|
||||||
file: Optional[str]
|
file: Optional[str]
|
||||||
|
|
||||||
def __init__(self, raw: str = None, compiled: str = None, file: Optional[str] = None):
|
def __init__(
|
||||||
|
self, raw: str = None, compiled: str = None, heading: Optional[str] = None, file: Optional[str] = None
|
||||||
|
):
|
||||||
self.raw = raw
|
self.raw = raw
|
||||||
self.compiled = compiled
|
self.compiled = compiled
|
||||||
|
self.heading = heading
|
||||||
self.file = file
|
self.file = file
|
||||||
|
|
||||||
def to_json(self) -> str:
|
def to_json(self) -> str:
|
||||||
|
|||||||
@@ -1,17 +1,13 @@
|
|||||||
# Standard Packages
|
|
||||||
import json
|
|
||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl
|
from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl
|
||||||
from khoj.utils.jsonl import load_jsonl
|
|
||||||
from khoj.utils.rawconfig import Entry
|
from khoj.utils.rawconfig import Entry
|
||||||
|
|
||||||
|
|
||||||
def test_process_entries_from_single_input_jsonl(tmp_path):
|
def test_process_entries_from_single_input_jsonl(tmp_path):
|
||||||
"Convert multiple jsonl entries from single file to entries."
|
"Convert multiple jsonl entries from single file to entries."
|
||||||
# Arrange
|
# Arrange
|
||||||
input_jsonl = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "file": "source/file/path1"}
|
input_jsonl = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "heading": null, "file": "source/file/path1"}
|
||||||
{"raw": "raw input data 2", "compiled": "compiled input data 2", "file": "source/file/path2"}
|
{"raw": "raw input data 2", "compiled": "compiled input data 2", "heading": null, "file": "source/file/path2"}
|
||||||
"""
|
"""
|
||||||
input_jsonl_file = create_file(tmp_path, input_jsonl)
|
input_jsonl_file = create_file(tmp_path, input_jsonl)
|
||||||
|
|
||||||
@@ -29,8 +25,8 @@ def test_process_entries_from_single_input_jsonl(tmp_path):
|
|||||||
def test_process_entries_from_multiple_input_jsonls(tmp_path):
|
def test_process_entries_from_multiple_input_jsonls(tmp_path):
|
||||||
"Convert multiple jsonl entries from single file to entries."
|
"Convert multiple jsonl entries from single file to entries."
|
||||||
# Arrange
|
# Arrange
|
||||||
input_jsonl_1 = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "file": "source/file/path1"}"""
|
input_jsonl_1 = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "heading": null, "file": "source/file/path1"}"""
|
||||||
input_jsonl_2 = """{"raw": "raw input data 2", "compiled": "compiled input data 2", "file": "source/file/path2"}"""
|
input_jsonl_2 = """{"raw": "raw input data 2", "compiled": "compiled input data 2", "heading": null, "file": "source/file/path2"}"""
|
||||||
input_jsonl_file_1 = create_file(tmp_path, input_jsonl_1, filename="input1.jsonl")
|
input_jsonl_file_1 = create_file(tmp_path, input_jsonl_1, filename="input1.jsonl")
|
||||||
input_jsonl_file_2 = create_file(tmp_path, input_jsonl_2, filename="input2.jsonl")
|
input_jsonl_file_2 = create_file(tmp_path, input_jsonl_2, filename="input2.jsonl")
|
||||||
|
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
|
|||||||
- Bullet point 2
|
- Bullet point 2
|
||||||
"""
|
"""
|
||||||
markdownfile = create_file(tmp_path, entry)
|
markdownfile = create_file(tmp_path, entry)
|
||||||
|
expected_heading = "# " + markdownfile.stem
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
# Extract Entries from specified Markdown files
|
# Extract Entries from specified Markdown files
|
||||||
@@ -27,6 +28,10 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
|
|||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert len(jsonl_data) == 1
|
assert len(jsonl_data) == 1
|
||||||
|
# Ensure raw entry with no headings do not get heading prefix prepended
|
||||||
|
assert not jsonl_data[0]["raw"].startswith("#")
|
||||||
|
# Ensure compiled entry has filename prepended as top level heading
|
||||||
|
assert jsonl_data[0]["compiled"].startswith(expected_heading)
|
||||||
|
|
||||||
|
|
||||||
def test_single_markdown_entry_to_jsonl(tmp_path):
|
def test_single_markdown_entry_to_jsonl(tmp_path):
|
||||||
@@ -128,7 +133,7 @@ def test_extract_entries_with_different_level_headings(tmp_path):
|
|||||||
|
|
||||||
|
|
||||||
# Helper Functions
|
# Helper Functions
|
||||||
def create_file(tmp_path, entry=None, filename="test.md"):
|
def create_file(tmp_path: Path, entry=None, filename="test.md"):
|
||||||
markdown_file = tmp_path / filename
|
markdown_file = tmp_path / filename
|
||||||
markdown_file.touch()
|
markdown_file.touch()
|
||||||
if entry:
|
if entry:
|
||||||
|
|||||||
@@ -47,6 +47,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
|
|||||||
Body Line
|
Body Line
|
||||||
"""
|
"""
|
||||||
orgfile = create_file(tmp_path, entry)
|
orgfile = create_file(tmp_path, entry)
|
||||||
|
expected_heading = f"* {orgfile.stem}\n** Heading"
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
# Extract Entries from specified Org files
|
# Extract Entries from specified Org files
|
||||||
@@ -55,16 +56,18 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
|
|||||||
# Split each entry from specified Org files by max words
|
# Split each entry from specified Org files by max words
|
||||||
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
|
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
|
||||||
TextToJsonl.split_entries_by_max_tokens(
|
TextToJsonl.split_entries_by_max_tokens(
|
||||||
OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=2
|
OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=4
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert len(jsonl_data) == 2
|
assert len(jsonl_data) == 2
|
||||||
|
# Ensure compiled entries split by max_words start with entry heading (for search context)
|
||||||
|
assert all([entry["compiled"].startswith(expected_heading) for entry in jsonl_data])
|
||||||
|
|
||||||
|
|
||||||
def test_entry_split_drops_large_words(tmp_path):
|
def test_entry_split_drops_large_words():
|
||||||
"Ensure entries drops words larger than specified max word length from compiled version."
|
"Ensure entries drops words larger than specified max word length from compiled version."
|
||||||
# Arrange
|
# Arrange
|
||||||
entry_text = f"""*** Heading
|
entry_text = f"""*** Heading
|
||||||
|
|||||||
Reference in New Issue
Block a user