mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 21:19:12 +00:00
Index intro text before headings in org files
- Text before headings was not being indexed because of buggy orgnode parsing logic
- Resolved by indexing intro text from files both with and without headings in them
- Ensure the intro text node has its heading set to all title lines collected from the file

Resolves #165
This commit is contained in:
@@ -73,6 +73,7 @@ def makelist(filename):
|
|||||||
level = ""
|
level = ""
|
||||||
heading = ""
|
heading = ""
|
||||||
bodytext = ""
|
bodytext = ""
|
||||||
|
introtext = ""
|
||||||
tags = list() # set of all tags in headline
|
tags = list() # set of all tags in headline
|
||||||
closed_date = ""
|
closed_date = ""
|
||||||
sched_date = ""
|
sched_date = ""
|
||||||
@@ -133,7 +134,7 @@ def makelist(filename):
|
|||||||
file_title += f" {title_text}"
|
file_title += f" {title_text}"
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Ignore Properties Drawers Completely
|
# Ignore Properties Drawer Start, End Lines
|
||||||
if re.search(":PROPERTIES:", line):
|
if re.search(":PROPERTIES:", line):
|
||||||
in_properties_drawer = True
|
in_properties_drawer = True
|
||||||
continue
|
continue
|
||||||
@@ -190,20 +191,33 @@ def makelist(filename):
|
|||||||
and not clocked_re
|
and not clocked_re
|
||||||
and line[:1] != "#"
|
and line[:1] != "#"
|
||||||
):
|
):
|
||||||
bodytext = bodytext + line
|
# if we are in a heading
|
||||||
|
if heading:
|
||||||
|
# add the line to the bodytext
|
||||||
|
bodytext += line
|
||||||
|
# else we are in the pre heading portion of the file
|
||||||
|
elif line.strip():
|
||||||
|
# so add the line to the introtext
|
||||||
|
introtext += line
|
||||||
|
|
||||||
# write out last node
|
# write out intro node before headings
|
||||||
thisNode = Orgnode(level, heading or file_title, bodytext, tags)
|
# this is done at the end to allow collating all title lines
|
||||||
thisNode.properties = property_map
|
if introtext:
|
||||||
if sched_date:
|
thisNode = Orgnode(level, file_title, introtext, tags)
|
||||||
thisNode.scheduled = sched_date
|
nodelist = [thisNode] + nodelist
|
||||||
if deadline_date:
|
# write out last heading node
|
||||||
thisNode.deadline = deadline_date
|
if heading:
|
||||||
if closed_date:
|
thisNode = Orgnode(level, heading, bodytext, tags)
|
||||||
thisNode.closed = closed_date
|
thisNode.properties = property_map
|
||||||
if logbook:
|
if sched_date:
|
||||||
thisNode.logbook = logbook
|
thisNode.scheduled = sched_date
|
||||||
nodelist.append(thisNode)
|
if deadline_date:
|
||||||
|
thisNode.deadline = deadline_date
|
||||||
|
if closed_date:
|
||||||
|
thisNode.closed = closed_date
|
||||||
|
if logbook:
|
||||||
|
thisNode.logbook = logbook
|
||||||
|
nodelist.append(thisNode)
|
||||||
|
|
||||||
# using the list of TODO keywords found in the file
|
# using the list of TODO keywords found in the file
|
||||||
# process the headings searching for TODO keywords
|
# process the headings searching for TODO keywords
|
||||||
|
|||||||
@@ -108,6 +108,30 @@ def test_entry_with_body_to_jsonl(tmp_path):
|
|||||||
assert len(jsonl_data) == 1
|
assert len(jsonl_data) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_file_with_entry_after_intro_text_to_jsonl(tmp_path):
|
||||||
|
"Ensure intro text before any headings is indexed."
|
||||||
|
# Arrange
|
||||||
|
entry = f"""
|
||||||
|
Intro text
|
||||||
|
|
||||||
|
* Entry Heading
|
||||||
|
entry body
|
||||||
|
"""
|
||||||
|
orgfile = create_file(tmp_path, entry)
|
||||||
|
|
||||||
|
# Act
|
||||||
|
# Extract Entries from specified Org files
|
||||||
|
entry_nodes, file_to_entries = OrgToJsonl.extract_org_entries(org_files=[orgfile])
|
||||||
|
|
||||||
|
# Process Each Entry from All Notes Files
|
||||||
|
entries = OrgToJsonl.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
|
||||||
|
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(entries)
|
||||||
|
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||||
|
|
||||||
|
# Assert
|
||||||
|
assert len(jsonl_data) == 2
|
||||||
|
|
||||||
|
|
||||||
def test_file_with_no_headings_to_jsonl(tmp_path):
|
def test_file_with_no_headings_to_jsonl(tmp_path):
|
||||||
"Ensure files with no heading, only body text are loaded."
|
"Ensure files with no heading, only body text are loaded."
|
||||||
# Arrange
|
# Arrange
|
||||||
|
|||||||
@@ -268,7 +268,7 @@ def test_parse_entry_with_multiple_titles_and_no_headings(tmp_path):
|
|||||||
# Arrange
|
# Arrange
|
||||||
entry = f"""#+TITLE: title1
|
entry = f"""#+TITLE: title1
|
||||||
Body Line 1
|
Body Line 1
|
||||||
#+TITLE: title2 """
|
#+TITLE: title2 """
|
||||||
orgfile = create_file(tmp_path, entry)
|
orgfile = create_file(tmp_path, entry)
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
@@ -286,6 +286,50 @@ Body Line 1
|
|||||||
assert entries[0].deadline == ""
|
assert entries[0].deadline == ""
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------------------------------------
|
||||||
|
def test_parse_org_with_intro_text_before_heading(tmp_path):
|
||||||
|
"Test parsing of org file with intro text before heading"
|
||||||
|
# Arrange
|
||||||
|
body = f"""#+TITLE: Title
|
||||||
|
intro body
|
||||||
|
* Entry Heading
|
||||||
|
entry body
|
||||||
|
"""
|
||||||
|
orgfile = create_file(tmp_path, body)
|
||||||
|
|
||||||
|
# Act
|
||||||
|
entries = orgnode.makelist(orgfile)
|
||||||
|
|
||||||
|
# Assert
|
||||||
|
assert len(entries) == 2
|
||||||
|
assert entries[0].heading == "Title"
|
||||||
|
assert entries[0].body == "intro body\n"
|
||||||
|
assert entries[1].heading == "Entry Heading"
|
||||||
|
assert entries[1].body == "entry body\n"
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------------------------------------
|
||||||
|
def test_parse_org_with_intro_text_multiple_titles_and_heading(tmp_path):
|
||||||
|
"Test parsing of org file with intro text, multiple titles and heading entry"
|
||||||
|
# Arrange
|
||||||
|
body = f"""#+TITLE: Title1
|
||||||
|
intro body
|
||||||
|
* Entry Heading
|
||||||
|
entry body
|
||||||
|
#+TITLE: Title2 """
|
||||||
|
orgfile = create_file(tmp_path, body)
|
||||||
|
|
||||||
|
# Act
|
||||||
|
entries = orgnode.makelist(orgfile)
|
||||||
|
|
||||||
|
# Assert
|
||||||
|
assert len(entries) == 2
|
||||||
|
assert entries[0].heading == "Title1 Title2"
|
||||||
|
assert entries[0].body == "intro body\n"
|
||||||
|
assert entries[1].heading == "Entry Heading"
|
||||||
|
assert entries[1].body == "entry body\n"
|
||||||
|
|
||||||
|
|
||||||
# Helper Functions
|
# Helper Functions
|
||||||
def create_file(tmp_path, entry, filename="test.org"):
|
def create_file(tmp_path, entry, filename="test.org"):
|
||||||
org_file = tmp_path / f"notes/{filename}"
|
org_file = tmp_path / f"notes/{filename}"
|
||||||
|
|||||||
Reference in New Issue
Block a user