Prepend entry heading to all compiled org snippets to improve search context
All compiled snippets split by max tokens (apart from the first) do not get the heading as context. This deprives these continuation entries of the search context needed to retrieve them.
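In short, chunking now works as in this minimal standalone sketch (the heading, sample text, and max_tokens value are illustrative, not taken from the diff):

    # Sketch of the chunking behavior after this change (illustrative values).
    heading = "* My Org Heading"
    compiled = heading + " " + "word " * 20  # stand-in for a long compiled entry
    words = [word for word in compiled.split(" ") if word != ""]
    max_tokens = 8

    chunks = []
    for chunk_index in range(0, len(words), max_tokens):
        chunk = " ".join(words[chunk_index : chunk_index + max_tokens])
        # Continuation chunks would otherwise lose the heading as search context
        if chunk_index > 0:
            chunk = f"{heading}.\n{chunk}"
        chunks.append(chunk)

    # Every chunk, not just the first, now starts with the heading
    assert all(chunk.startswith(heading) for chunk in chunks)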
@@ -1,7 +1,6 @@
 # Standard Packages
 import glob
 import logging
-import time
 from typing import Iterable, List
 
 # Internal Packages
@@ -139,7 +138,14 @@ class OrgToJsonl(TextToJsonl):
                 logger.debug(f"Body: {parsed_entry.body}")
 
             if compiled:
-                entries += [Entry(compiled=compiled, raw=f"{parsed_entry}", file=f"{entry_to_file_map[parsed_entry]}")]
+                entries.append(
+                    Entry(
+                        compiled=compiled,
+                        raw=f"{parsed_entry}",
+                        heading=f"{parsed_entry.heading}",
+                        file=f"{entry_to_file_map[parsed_entry]}",
+                    )
+                )
 
         return entries
 
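To see what the new heading argument captures, here is a tiny stand-in for a parsed org entry; the real org parser is not part of this diff, so OrgNodeStub and its fields are illustrative, mimicking only the two attributes the hunk above relies on:

    class OrgNodeStub:
        def __init__(self, heading: str, body: str):
            self.heading = heading
            self.body = body

        def __str__(self):
            return f"* {self.heading}\n{self.body}"

    parsed_entry = OrgNodeStub(heading="Project Notes", body="Details about the project.")
    raw = f"{parsed_entry}"              # full raw entry text, as before
    heading = f"{parsed_entry.heading}"  # heading alone, now stored on the Entry
    assert raw.startswith("* Project Notes")
    assert heading == "Project Notes"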
@@ -31,14 +31,30 @@ class TextToJsonl(ABC):
         "Split entries if compiled entry length exceeds the max tokens supported by the ML model."
         chunked_entries: List[Entry] = []
         for entry in entries:
+            # Split entry into words
             compiled_entry_words = [word for word in entry.compiled.split(" ") if word != ""]
 
             # Drop long words instead of having entry truncated to maintain quality of entry processed by models
             compiled_entry_words = [word for word in compiled_entry_words if len(word) <= max_word_length]
 
+            # Split entry into chunks of max tokens
             for chunk_index in range(0, len(compiled_entry_words), max_tokens):
                 compiled_entry_words_chunk = compiled_entry_words[chunk_index : chunk_index + max_tokens]
                 compiled_entry_chunk = " ".join(compiled_entry_words_chunk)
-                entry_chunk = Entry(compiled=compiled_entry_chunk, raw=entry.raw, file=entry.file)
-                chunked_entries.append(entry_chunk)
+                # Prepend heading to all other chunks, the first chunk already has heading from original entry
+                if chunk_index > 0:
+                    compiled_entry_chunk = f"{entry.heading}.\n{compiled_entry_chunk}"
+
+                chunked_entries.append(
+                    Entry(
+                        compiled=compiled_entry_chunk,
+                        raw=entry.raw,
+                        heading=entry.heading,
+                        file=entry.file,
+                    )
+                )
 
         return chunked_entries
 
     def mark_entries_for_update(
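A hedged usage sketch of the updated splitting method. Assumptions not confirmed by this diff: that the method is named split_entries_by_max_tokens, that it is callable as a static method, that the module path is khoj.processor.text_to_jsonl, and that max_tokens is a keyword parameter:

    from khoj.processor.text_to_jsonl import TextToJsonl  # assumed module path
    from khoj.utils.rawconfig import Entry

    entry = Entry(
        raw="* Heading\nlong body ...",
        compiled="Heading " + "word " * 300,  # longer than one chunk
        heading="Heading",
        file="notes.org",
    )
    # Assumed name and signature; the hunk above only shows the method body
    chunks = TextToJsonl.split_entries_by_max_tokens([entry], max_tokens=256)

    # The first chunk starts with the heading from the original compiled entry;
    # every later chunk gets "<heading>.\n" prepended, so this holds for all:
    assert all(chunk.compiled.startswith("Heading") for chunk in chunks)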
@@ -103,11 +103,15 @@ class SearchResponse(ConfigBase):
 class Entry:
     raw: str
     compiled: str
+    heading: Optional[str]
     file: Optional[str]
 
-    def __init__(self, raw: str = None, compiled: str = None, file: Optional[str] = None):
+    def __init__(
+        self, raw: str = None, compiled: str = None, heading: Optional[str] = None, file: Optional[str] = None
+    ):
         self.raw = raw
         self.compiled = compiled
+        self.heading = heading
         self.file = file
 
     def to_json(self) -> str:
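With this change the heading travels with each Entry. A small sketch of constructing one; to_json's body is not shown in this hunk, so the output below is hypothetical, inferred from the shape of the test fixtures later in this commit:

    from khoj.utils.rawconfig import Entry

    entry = Entry(
        raw="* Heading\nbody",
        compiled="Heading body",
        heading="Heading",
        file="notes.org",
    )
    print(entry.to_json())
    # Hypothetical output, matching the test fixtures below:
    # {"raw": "* Heading\nbody", "compiled": "Heading body", "heading": "Heading", "file": "notes.org"}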
@@ -1,17 +1,13 @@
-# Standard Packages
-import json
-
 # Internal Packages
 from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl
-from khoj.utils.jsonl import load_jsonl
 from khoj.utils.rawconfig import Entry
 
 
 def test_process_entries_from_single_input_jsonl(tmp_path):
     "Convert multiple jsonl entries from single file to entries."
     # Arrange
-    input_jsonl = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "file": "source/file/path1"}
-{"raw": "raw input data 2", "compiled": "compiled input data 2", "file": "source/file/path2"}
+    input_jsonl = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "heading": null, "file": "source/file/path1"}
+{"raw": "raw input data 2", "compiled": "compiled input data 2", "heading": null, "file": "source/file/path2"}
 """
     input_jsonl_file = create_file(tmp_path, input_jsonl)
 
@@ -29,8 +25,8 @@ def test_process_entries_from_single_input_jsonl(tmp_path):
 def test_process_entries_from_multiple_input_jsonls(tmp_path):
     "Convert multiple jsonl entries from single file to entries."
     # Arrange
-    input_jsonl_1 = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "file": "source/file/path1"}"""
-    input_jsonl_2 = """{"raw": "raw input data 2", "compiled": "compiled input data 2", "file": "source/file/path2"}"""
+    input_jsonl_1 = """{"raw": "raw input data 1", "compiled": "compiled input data 1", "heading": null, "file": "source/file/path1"}"""
+    input_jsonl_2 = """{"raw": "raw input data 2", "compiled": "compiled input data 2", "heading": null, "file": "source/file/path2"}"""
     input_jsonl_file_1 = create_file(tmp_path, input_jsonl_1, filename="input1.jsonl")
     input_jsonl_file_2 = create_file(tmp_path, input_jsonl_2, filename="input2.jsonl")
 
@@ -62,9 +62,11 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
 
     # Assert
     assert len(jsonl_data) == 2
+    # Ensure compiled entries split by max_words start with entry heading (for search context)
+    assert all(entry["compiled"].startswith("Heading") for entry in jsonl_data)
 
 
-def test_entry_split_drops_large_words(tmp_path):
+def test_entry_split_drops_large_words():
     "Ensure entries drops words larger than specified max word length from compiled version."
     # Arrange
     entry_text = f"""*** Heading