From fe03ba3dcee9cefbc4bba076009ba1a711f68880 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Wed, 1 Mar 2023 12:11:33 -0600
Subject: [PATCH] Index intro text before headings in org files

- Text before the first heading was not being indexed due to a bug in
  the orgnode parsing logic
- Fix indexing of intro text in files both with and without headings
- Ensure the intro text node's heading is set to all the title lines
  collected from the file

Resolves #165
---
 src/khoj/processor/org_mode/orgnode.py | 42 +++++++++++++++--------
 tests/test_org_to_jsonl.py             | 24 ++++++++++++++
 tests/test_orgnode.py                  | 46 +++++++++++++++++++++++++-
 3 files changed, 97 insertions(+), 15 deletions(-)

diff --git a/src/khoj/processor/org_mode/orgnode.py b/src/khoj/processor/org_mode/orgnode.py
index e352ecf2..c4b0afa6 100644
--- a/src/khoj/processor/org_mode/orgnode.py
+++ b/src/khoj/processor/org_mode/orgnode.py
@@ -73,6 +73,7 @@ def makelist(filename):
     level = ""
     heading = ""
     bodytext = ""
+    introtext = ""
     tags = list()  # set of all tags in headline
     closed_date = ""
     sched_date = ""
@@ -133,7 +134,7 @@ def makelist(filename):
                     file_title += f" {title_text}"
                 continue
 
-            # Ignore Properties Drawers Completely
+            # Ignore Properties Drawer Start, End Lines
             if re.search(":PROPERTIES:", line):
                 in_properties_drawer = True
                 continue
@@ -190,20 +191,33 @@ def makelist(filename):
                 and not clocked_re
                 and line[:1] != "#"
             ):
-                bodytext = bodytext + line
+                # if we are in a heading
+                if heading:
+                    # add the line to the bodytext
+                    bodytext += line
+                # else we are in the pre heading portion of the file
+                elif line.strip():
+                    # so add the line to the introtext
+                    introtext += line
 
-    # write out last node
-    thisNode = Orgnode(level, heading or file_title, bodytext, tags)
-    thisNode.properties = property_map
-    if sched_date:
-        thisNode.scheduled = sched_date
-    if deadline_date:
-        thisNode.deadline = deadline_date
-    if closed_date:
-        thisNode.closed = closed_date
-    if logbook:
-        thisNode.logbook = logbook
-    nodelist.append(thisNode)
+    # write out intro node before headings
+    # this is done at the end to allow collating all title lines
+    if introtext:
+        thisNode = Orgnode(level, file_title, introtext, tags)
+        nodelist = [thisNode] + nodelist
+    # write out last heading node
+    if heading:
+        thisNode = Orgnode(level, heading, bodytext, tags)
+        thisNode.properties = property_map
+        if sched_date:
+            thisNode.scheduled = sched_date
+        if deadline_date:
+            thisNode.deadline = deadline_date
+        if closed_date:
+            thisNode.closed = closed_date
+        if logbook:
+            thisNode.logbook = logbook
+        nodelist.append(thisNode)
 
     # using the list of TODO keywords found in the file
     # process the headings searching for TODO keywords
diff --git a/tests/test_org_to_jsonl.py b/tests/test_org_to_jsonl.py
index 89a82a4d..b8803772 100644
--- a/tests/test_org_to_jsonl.py
+++ b/tests/test_org_to_jsonl.py
@@ -108,6 +108,30 @@ def test_entry_with_body_to_jsonl(tmp_path):
     assert len(jsonl_data) == 1
 
 
+def test_file_with_entry_after_intro_text_to_jsonl(tmp_path):
+    "Ensure intro text before any headings is indexed."
+    # Arrange
+    entry = f"""
+Intro text
+
+* Entry Heading
+  entry body
+"""
+    orgfile = create_file(tmp_path, entry)
+
+    # Act
+    # Extract Entries from specified Org files
+    entry_nodes, file_to_entries = OrgToJsonl.extract_org_entries(org_files=[orgfile])
+
+    # Process Each Entry from All Notes Files
+    entries = OrgToJsonl.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
+    jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(entries)
+    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
+
+    # Assert
+    assert len(jsonl_data) == 2
+
+
 def test_file_with_no_headings_to_jsonl(tmp_path):
     "Ensure files with no heading, only body text are loaded."
     # Arrange
diff --git a/tests/test_orgnode.py b/tests/test_orgnode.py
index 8102aef4..9232210c 100644
--- a/tests/test_orgnode.py
+++ b/tests/test_orgnode.py
@@ -268,7 +268,7 @@ def test_parse_entry_with_multiple_titles_and_no_headings(tmp_path):
     # Arrange
     entry = f"""#+TITLE: title1
 Body Line 1
-#+TITLE:  title2  """
+#+TITLE:  title2 """
     orgfile = create_file(tmp_path, entry)
 
     # Act
@@ -286,6 +286,50 @@ Body Line 1
     assert entries[0].deadline == ""
 
 
+# ----------------------------------------------------------------------------------------------------
+def test_parse_org_with_intro_text_before_heading(tmp_path):
+    "Test parsing of org file with intro text before heading"
+    # Arrange
+    body = f"""#+TITLE: Title
+intro body
+* Entry Heading
+entry body
+"""
+    orgfile = create_file(tmp_path, body)
+
+    # Act
+    entries = orgnode.makelist(orgfile)
+
+    # Assert
+    assert len(entries) == 2
+    assert entries[0].heading == "Title"
+    assert entries[0].body == "intro body\n"
+    assert entries[1].heading == "Entry Heading"
+    assert entries[1].body == "entry body\n"
+
+
+# ----------------------------------------------------------------------------------------------------
+def test_parse_org_with_intro_text_multiple_titles_and_heading(tmp_path):
+    "Test parsing of org file with intro text, multiple titles and heading entry"
+    # Arrange
+    body = f"""#+TITLE: Title1
+intro body
+* Entry Heading
+entry body
+#+TITLE: Title2 """
+    orgfile = create_file(tmp_path, body)
+
+    # Act
+    entries = orgnode.makelist(orgfile)
+
+    # Assert
+    assert len(entries) == 2
+    assert entries[0].heading == "Title1 Title2"
+    assert entries[0].body == "intro body\n"
+    assert entries[1].heading == "Entry Heading"
+    assert entries[1].body == "entry body\n"
+
+
 # Helper Functions
 def create_file(tmp_path, entry, filename="test.org"):
     org_file = tmp_path / f"notes/{filename}"