From fe03ba3dcee9cefbc4bba076009ba1a711f68880 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 1 Mar 2023 12:11:33 -0600 Subject: [PATCH] Index intro text before headings in org files - Text before headings was not being indexed due to buggy orgnode parsing logic - Resolved indexing intro text from files with and without headings in them - Ensure intro text node has heading set to all title lines collected from the file Resolves #165 --- src/khoj/processor/org_mode/orgnode.py | 42 +++++++++++++++-------- tests/test_org_to_jsonl.py | 24 ++++++++++++++ tests/test_orgnode.py | 46 +++++++++++++++++++++++++- 3 files changed, 97 insertions(+), 15 deletions(-) diff --git a/src/khoj/processor/org_mode/orgnode.py b/src/khoj/processor/org_mode/orgnode.py index e352ecf2..c4b0afa6 100644 --- a/src/khoj/processor/org_mode/orgnode.py +++ b/src/khoj/processor/org_mode/orgnode.py @@ -73,6 +73,7 @@ def makelist(filename): level = "" heading = "" bodytext = "" + introtext = "" tags = list() # set of all tags in headline closed_date = "" sched_date = "" @@ -133,7 +134,7 @@ def makelist(filename): file_title += f" {title_text}" continue - # Ignore Properties Drawers Completely + # Ignore Properties Drawer Start, End Lines if re.search(":PROPERTIES:", line): in_properties_drawer = True continue @@ -190,20 +191,33 @@ def makelist(filename): and not clocked_re and line[:1] != "#" ): - bodytext = bodytext + line + # if we are in a heading + if heading: + # add the line to the bodytext + bodytext += line + # else we are in the pre heading portion of the file + elif line.strip(): + # so add the line to the introtext + introtext += line - # write out last node - thisNode = Orgnode(level, heading or file_title, bodytext, tags) - thisNode.properties = property_map - if sched_date: - thisNode.scheduled = sched_date - if deadline_date: - thisNode.deadline = deadline_date - if closed_date: - thisNode.closed = closed_date - if logbook: - thisNode.logbook = logbook - nodelist.append(thisNode) + # write out intro node before headings + # this is done at the end to allow collating all title lines + if introtext: + thisNode = Orgnode(level, file_title, introtext, tags) + nodelist = [thisNode] + nodelist + # write out last heading node + if heading: + thisNode = Orgnode(level, heading, bodytext, tags) + thisNode.properties = property_map + if sched_date: + thisNode.scheduled = sched_date + if deadline_date: + thisNode.deadline = deadline_date + if closed_date: + thisNode.closed = closed_date + if logbook: + thisNode.logbook = logbook + nodelist.append(thisNode) # using the list of TODO keywords found in the file # process the headings searching for TODO keywords diff --git a/tests/test_org_to_jsonl.py b/tests/test_org_to_jsonl.py index 89a82a4d..b8803772 100644 --- a/tests/test_org_to_jsonl.py +++ b/tests/test_org_to_jsonl.py @@ -108,6 +108,30 @@ def test_entry_with_body_to_jsonl(tmp_path): assert len(jsonl_data) == 1 +def test_file_with_entry_after_intro_text_to_jsonl(tmp_path): + "Ensure intro text before any headings is indexed." + # Arrange + entry = f""" +Intro text + +* Entry Heading + entry body +""" + orgfile = create_file(tmp_path, entry) + + # Act + # Extract Entries from specified Org files + entry_nodes, file_to_entries = OrgToJsonl.extract_org_entries(org_files=[orgfile]) + + # Process Each Entry from All Notes Files + entries = OrgToJsonl.convert_org_nodes_to_entries(entry_nodes, file_to_entries) + jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(entries) + jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] + + # Assert + assert len(jsonl_data) == 2 + + def test_file_with_no_headings_to_jsonl(tmp_path): "Ensure files with no heading, only body text are loaded." # Arrange diff --git a/tests/test_orgnode.py b/tests/test_orgnode.py index 8102aef4..9232210c 100644 --- a/tests/test_orgnode.py +++ b/tests/test_orgnode.py @@ -268,7 +268,7 @@ def test_parse_entry_with_multiple_titles_and_no_headings(tmp_path): # Arrange entry = f"""#+TITLE: title1 Body Line 1 -#+TITLE: title2 """ +#+TITLE: title2 """ orgfile = create_file(tmp_path, entry) # Act @@ -286,6 +286,50 @@ Body Line 1 assert entries[0].deadline == "" +# ---------------------------------------------------------------------------------------------------- +def test_parse_org_with_intro_text_before_heading(tmp_path): + "Test parsing of org file with intro text before heading" + # Arrange + body = f"""#+TITLE: Title +intro body +* Entry Heading +entry body +""" + orgfile = create_file(tmp_path, body) + + # Act + entries = orgnode.makelist(orgfile) + + # Assert + assert len(entries) == 2 + assert entries[0].heading == "Title" + assert entries[0].body == "intro body\n" + assert entries[1].heading == "Entry Heading" + assert entries[1].body == "entry body\n" + + +# ---------------------------------------------------------------------------------------------------- +def test_parse_org_with_intro_text_multiple_titles_and_heading(tmp_path): + "Test parsing of org file with intro text, multiple titles and heading entry" + # Arrange + body = f"""#+TITLE: Title1 +intro body +* Entry Heading +entry body +#+TITLE: Title2 """ + orgfile = create_file(tmp_path, body) + + # Act + entries = orgnode.makelist(orgfile) + + # Assert + assert len(entries) == 2 + assert entries[0].heading == "Title1 Title2" + assert entries[0].body == "intro body\n" + assert entries[1].heading == "Entry Heading" + assert entries[1].body == "entry body\n" + + # Helper Functions def create_file(tmp_path, entry, filename="test.org"): org_file = tmp_path / f"notes/{filename}"