mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-06 05:39:12 +00:00
Set index_heading_entries field in config to index entries with no body
- Previously, heading entries were not indexed, to maintain search quality
- However, there are use-cases for indexing entries with no body
- Add a configurable `index_heading_entries` field to index heading entries
- This `TextContentConfig` field is currently only used for OrgMode content
This commit is contained in:
@@ -24,6 +24,7 @@ logger = logging.getLogger(__name__)
|
|||||||
def org_to_jsonl(config: TextContentConfig, previous_entries=None):
|
def org_to_jsonl(config: TextContentConfig, previous_entries=None):
|
||||||
# Extract required fields from config
|
# Extract required fields from config
|
||||||
org_files, org_file_filter, output_file = config.input_files, config.input_filter, config.compressed_jsonl
|
org_files, org_file_filter, output_file = config.input_files, config.input_filter, config.compressed_jsonl
|
||||||
|
index_heading_entries = config.index_heading_entries
|
||||||
|
|
||||||
# Input Validation
|
# Input Validation
|
||||||
if is_none_or_empty(org_files) and is_none_or_empty(org_file_filter):
|
if is_none_or_empty(org_files) and is_none_or_empty(org_file_filter):
|
||||||
@@ -41,7 +42,7 @@ def org_to_jsonl(config: TextContentConfig, previous_entries=None):
|
|||||||
logger.debug(f"Parse entries from org files into OrgNode objects: {end - start} seconds")
|
logger.debug(f"Parse entries from org files into OrgNode objects: {end - start} seconds")
|
||||||
|
|
||||||
start = time.time()
|
start = time.time()
|
||||||
current_entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries)
|
current_entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries, index_heading_entries)
|
||||||
end = time.time()
|
end = time.time()
|
||||||
logger.debug(f"Convert OrgNodes into entry dictionaries: {end - start} seconds")
|
logger.debug(f"Convert OrgNodes into entry dictionaries: {end - start} seconds")
|
||||||
|
|
||||||
@@ -100,13 +101,13 @@ def extract_org_entries(org_files):
|
|||||||
return entries, dict(entry_to_file_map)
|
return entries, dict(entry_to_file_map)
|
||||||
|
|
||||||
|
|
||||||
def convert_org_nodes_to_entries(entries: list[orgnode.Orgnode], entry_to_file_map) -> list[dict]:
|
def convert_org_nodes_to_entries(entries: list[orgnode.Orgnode], entry_to_file_map, index_heading_entries=False) -> list[dict]:
|
||||||
"Convert Org-Mode entries into list of dictionary"
|
"Convert Org-Mode entries into list of dictionary"
|
||||||
entry_maps = []
|
entry_maps = []
|
||||||
for entry in entries:
|
for entry in entries:
|
||||||
entry_dict = dict()
|
entry_dict = dict()
|
||||||
|
|
||||||
if not entry.hasBody:
|
if not entry.hasBody and not index_heading_entries:
|
||||||
# Ignore title notes i.e notes with just headings and empty body
|
# Ignore title notes i.e notes with just headings and empty body
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ class TextContentConfig(ConfigBase):
|
|||||||
input_filter: Optional[str]
|
input_filter: Optional[str]
|
||||||
compressed_jsonl: Path
|
compressed_jsonl: Path
|
||||||
embeddings_file: Path
|
embeddings_file: Path
|
||||||
|
index_heading_entries: Optional[bool] = False
|
||||||
|
|
||||||
@validator('input_filter')
|
@validator('input_filter')
|
||||||
def input_filter_or_files_required(cls, input_filter, values, **kwargs):
|
def input_filter_or_files_required(cls, input_filter, values, **kwargs):
|
||||||
|
|||||||
@@ -6,8 +6,8 @@ from src.processor.org_mode.org_to_jsonl import convert_org_entries_to_jsonl, co
|
|||||||
from src.utils.helpers import is_none_or_empty
|
from src.utils.helpers import is_none_or_empty
|
||||||
|
|
||||||
|
|
||||||
def test_entry_with_empty_body_line_to_jsonl(tmp_path):
|
def test_configure_heading_entry_to_jsonl(tmp_path):
|
||||||
'''Ensure entries with empty body are ignored.
|
'''Ensure entries with empty body are ignored, unless explicitly configured to index heading entries.
|
||||||
Property drawers not considered Body. Ignore control characters for evaluating if Body empty.'''
|
Property drawers not considered Body. Ignore control characters for evaluating if Body empty.'''
|
||||||
# Arrange
|
# Arrange
|
||||||
entry = f'''*** Heading
|
entry = f'''*** Heading
|
||||||
@@ -18,15 +18,20 @@ def test_entry_with_empty_body_line_to_jsonl(tmp_path):
|
|||||||
'''
|
'''
|
||||||
orgfile = create_file(tmp_path, entry)
|
orgfile = create_file(tmp_path, entry)
|
||||||
|
|
||||||
|
for index_heading_entries in [True, False]:
|
||||||
# Act
|
# Act
|
||||||
# Extract Entries from specified Org files
|
# Extract entries into jsonl from specified Org files
|
||||||
entry_nodes, file_to_entries = extract_org_entries(org_files=[orgfile])
|
jsonl_string = convert_org_entries_to_jsonl(convert_org_nodes_to_entries(
|
||||||
|
*extract_org_entries(org_files=[orgfile]),
|
||||||
# Process Each Entry from All Notes Files
|
index_heading_entries=index_heading_entries))
|
||||||
entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries)
|
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||||
jsonl_data = convert_org_entries_to_jsonl(entries)
|
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
|
if index_heading_entries:
|
||||||
|
# Entry with empty body indexed when index_heading_entries set to True
|
||||||
|
assert len(jsonl_data) == 1
|
||||||
|
else:
|
||||||
|
# Entry with empty body ignored when index_heading_entries set to False
|
||||||
assert is_none_or_empty(jsonl_data)
|
assert is_none_or_empty(jsonl_data)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user