diff --git a/src/processor/org_mode/org_to_jsonl.py b/src/processor/org_mode/org_to_jsonl.py index d03f569f..532b9b4c 100644 --- a/src/processor/org_mode/org_to_jsonl.py +++ b/src/processor/org_mode/org_to_jsonl.py @@ -24,6 +24,7 @@ logger = logging.getLogger(__name__) def org_to_jsonl(config: TextContentConfig, previous_entries=None): # Extract required fields from config org_files, org_file_filter, output_file = config.input_files, config.input_filter, config.compressed_jsonl + index_heading_entries = config.index_heading_entries # Input Validation if is_none_or_empty(org_files) and is_none_or_empty(org_file_filter): @@ -41,7 +42,7 @@ def org_to_jsonl(config: TextContentConfig, previous_entries=None): logger.debug(f"Parse entries from org files into OrgNode objects: {end - start} seconds") start = time.time() - current_entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries) + current_entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries, index_heading_entries) end = time.time() logger.debug(f"Convert OrgNodes into entry dictionaries: {end - start} seconds") @@ -100,13 +101,13 @@ def extract_org_entries(org_files): return entries, dict(entry_to_file_map) -def convert_org_nodes_to_entries(entries: list[orgnode.Orgnode], entry_to_file_map) -> list[dict]: +def convert_org_nodes_to_entries(entries: list[orgnode.Orgnode], entry_to_file_map, index_heading_entries=False) -> list[dict]: "Convert Org-Mode entries into list of dictionary" entry_maps = [] for entry in entries: entry_dict = dict() - if not entry.hasBody: + if not entry.hasBody and not index_heading_entries: # Ignore title notes i.e notes with just headings and empty body continue diff --git a/src/utils/rawconfig.py b/src/utils/rawconfig.py index 4fd4be96..f9c19900 100644 --- a/src/utils/rawconfig.py +++ b/src/utils/rawconfig.py @@ -18,6 +18,7 @@ class TextContentConfig(ConfigBase): input_filter: Optional[str] compressed_jsonl: Path embeddings_file: Path + index_heading_entries: Optional[bool] = False @validator('input_filter') def input_filter_or_files_required(cls, input_filter, values, **kwargs): diff --git a/tests/test_org_to_jsonl.py b/tests/test_org_to_jsonl.py index eaac5ef8..04e7199e 100644 --- a/tests/test_org_to_jsonl.py +++ b/tests/test_org_to_jsonl.py @@ -6,28 +6,33 @@ from src.processor.org_mode.org_to_jsonl import convert_org_entries_to_jsonl, co from src.utils.helpers import is_none_or_empty -def test_entry_with_empty_body_line_to_jsonl(tmp_path): - '''Ensure entries with empty body are ignored. +def test_configure_heading_entry_to_jsonl(tmp_path): + '''Ensure entries with empty body are ignored, unless explicitly configured to index heading entries. Property drawers not considered Body. Ignore control characters for evaluating if Body empty.''' # Arrange entry = f'''*** Heading :PROPERTIES: :ID: 42-42-42 :END: - \t\r + \t \r ''' orgfile = create_file(tmp_path, entry) - # Act - # Extract Entries from specified Org files - entry_nodes, file_to_entries = extract_org_entries(org_files=[orgfile]) + for index_heading_entries in [True, False]: + # Act + # Extract entries into jsonl from specified Org files + jsonl_string = convert_org_entries_to_jsonl(convert_org_nodes_to_entries( + *extract_org_entries(org_files=[orgfile]), + index_heading_entries=index_heading_entries)) + jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] - # Process Each Entry from All Notes Files - entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries) - jsonl_data = convert_org_entries_to_jsonl(entries) - - # Assert - assert is_none_or_empty(jsonl_data) + # Assert + if index_heading_entries: + # Entry with empty body indexed when index_heading_entries set to True + assert len(jsonl_data) == 1 + else: + # Entry with empty body ignored when index_heading_entries set to False + assert is_none_or_empty(jsonl_data) def test_entry_with_body_to_jsonl(tmp_path):