Set index_heading_entries field in config to index entries with no body

- Previously, heading entries were not indexed, in order to maintain search quality
- However, there are use-cases for indexing entries with no body
- Add a configurable `index_heading_entries` field to index heading entries
- This `TextContentConfig` field is currently only used for OrgMode content
This commit is contained in:
Debanjum Singh Solanky
2022-09-11 12:40:58 +03:00
parent 1d3b3d5f39
commit 253c9eae9a
3 changed files with 22 additions and 15 deletions

View File

@@ -24,6 +24,7 @@ logger = logging.getLogger(__name__)
def org_to_jsonl(config: TextContentConfig, previous_entries=None): def org_to_jsonl(config: TextContentConfig, previous_entries=None):
# Extract required fields from config # Extract required fields from config
org_files, org_file_filter, output_file = config.input_files, config.input_filter, config.compressed_jsonl org_files, org_file_filter, output_file = config.input_files, config.input_filter, config.compressed_jsonl
index_heading_entries = config.index_heading_entries
# Input Validation # Input Validation
if is_none_or_empty(org_files) and is_none_or_empty(org_file_filter): if is_none_or_empty(org_files) and is_none_or_empty(org_file_filter):
@@ -41,7 +42,7 @@ def org_to_jsonl(config: TextContentConfig, previous_entries=None):
logger.debug(f"Parse entries from org files into OrgNode objects: {end - start} seconds") logger.debug(f"Parse entries from org files into OrgNode objects: {end - start} seconds")
start = time.time() start = time.time()
current_entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries) current_entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries, index_heading_entries)
end = time.time() end = time.time()
logger.debug(f"Convert OrgNodes into entry dictionaries: {end - start} seconds") logger.debug(f"Convert OrgNodes into entry dictionaries: {end - start} seconds")
@@ -100,13 +101,13 @@ def extract_org_entries(org_files):
return entries, dict(entry_to_file_map) return entries, dict(entry_to_file_map)
def convert_org_nodes_to_entries(entries: list[orgnode.Orgnode], entry_to_file_map) -> list[dict]: def convert_org_nodes_to_entries(entries: list[orgnode.Orgnode], entry_to_file_map, index_heading_entries=False) -> list[dict]:
"Convert Org-Mode entries into list of dictionary" "Convert Org-Mode entries into list of dictionary"
entry_maps = [] entry_maps = []
for entry in entries: for entry in entries:
entry_dict = dict() entry_dict = dict()
if not entry.hasBody: if not entry.hasBody and not index_heading_entries:
# Ignore title notes i.e notes with just headings and empty body # Ignore title notes i.e notes with just headings and empty body
continue continue

View File

@@ -18,6 +18,7 @@ class TextContentConfig(ConfigBase):
input_filter: Optional[str] input_filter: Optional[str]
compressed_jsonl: Path compressed_jsonl: Path
embeddings_file: Path embeddings_file: Path
index_heading_entries: Optional[bool] = False
@validator('input_filter') @validator('input_filter')
def input_filter_or_files_required(cls, input_filter, values, **kwargs): def input_filter_or_files_required(cls, input_filter, values, **kwargs):

View File

@@ -6,8 +6,8 @@ from src.processor.org_mode.org_to_jsonl import convert_org_entries_to_jsonl, co
from src.utils.helpers import is_none_or_empty from src.utils.helpers import is_none_or_empty
def test_entry_with_empty_body_line_to_jsonl(tmp_path): def test_configure_heading_entry_to_jsonl(tmp_path):
'''Ensure entries with empty body are ignored. '''Ensure entries with empty body are ignored, unless explicitly configured to index heading entries.
Property drawers not considered Body. Ignore control characters for evaluating if Body empty.''' Property drawers not considered Body. Ignore control characters for evaluating if Body empty.'''
# Arrange # Arrange
entry = f'''*** Heading entry = f'''*** Heading
@@ -18,15 +18,20 @@ def test_entry_with_empty_body_line_to_jsonl(tmp_path):
''' '''
orgfile = create_file(tmp_path, entry) orgfile = create_file(tmp_path, entry)
for index_heading_entries in [True, False]:
# Act # Act
# Extract Entries from specified Org files # Extract entries into jsonl from specified Org files
entry_nodes, file_to_entries = extract_org_entries(org_files=[orgfile]) jsonl_string = convert_org_entries_to_jsonl(convert_org_nodes_to_entries(
*extract_org_entries(org_files=[orgfile]),
# Process Each Entry from All Notes Files index_heading_entries=index_heading_entries))
entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
jsonl_data = convert_org_entries_to_jsonl(entries)
# Assert # Assert
if index_heading_entries:
# Entry with empty body indexed when index_heading_entries set to True
assert len(jsonl_data) == 1
else:
# Entry with empty body ignored when index_heading_entries set to False
assert is_none_or_empty(jsonl_data) assert is_none_or_empty(jsonl_data)