From e951ba37adff328b7e22920c95b668d737d903db Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 11 Sep 2022 01:09:24 +0300 Subject: [PATCH 1/7] Raise exception when org file not found - No need to catch the IOError in OrgNode --- src/processor/org_mode/orgnode.py | 7 +------ tests/test_text_search.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/processor/org_mode/orgnode.py b/src/processor/org_mode/orgnode.py index 31cedbb9..5f47a448 100644 --- a/src/processor/org_mode/orgnode.py +++ b/src/processor/org_mode/orgnode.py @@ -57,12 +57,7 @@ def makelist(filename): """ ctr = 0 - try: - f = open(filename, 'r') - except IOError: - print(f"Unable to open file {filename}") - print("Program terminating.") - sys.exit(1) + f = open(filename, 'r') todos = { "TODO": "", "WAITING": "", "ACTIVE": "", "DONE": "", "CANCELLED": "", "FAILED": ""} # populated from #+SEQ_TODO line diff --git a/tests/test_text_search.py b/tests/test_text_search.py index dce1070a..20e0aea0 100644 --- a/tests/test_text_search.py +++ b/tests/test_text_search.py @@ -13,6 +13,20 @@ from src.processor.org_mode.org_to_jsonl import org_to_jsonl # Test +# ---------------------------------------------------------------------------------------------------- +def test_asymmetric_setup_with_missing_file_raises_error(content_config: ContentConfig, search_config: SearchConfig): + # Arrange + file_to_index = Path(content_config.org.input_filter).parent / "new_file_to_index.org" + new_org_content_config = deepcopy(content_config.org) + new_org_content_config.input_files = [f'{file_to_index}'] + new_org_content_config.input_filter = None + + # Act + # Generate notes embeddings during asymmetric setup + with pytest.raises(FileNotFoundError): + text_search.setup(org_to_jsonl, new_org_content_config, search_config.asymmetric, regenerate=True) + + # ---------------------------------------------------------------------------------------------------- def test_asymmetric_setup_with_empty_file_raises_error(content_config: ContentConfig, search_config: SearchConfig): # Arrange From 52e3dd98356c12cdf90dc418e6cd4c25b1f40d88 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 11 Sep 2022 10:09:17 +0300 Subject: [PATCH 2/7] Pass the whole TextContentConfig as argument to text_to_jsonl methods - Let the specific text_to_jsonl method decide which of the TextContentConfig fields it needs to convert type to jsonl - This simplifies extending TextContentConfig for a specific type without modifying all text_to_jsonl methods - It keeps the number of args being passed to the `text_to_jsonl' methods in check --- src/processor/ledger/beancount_to_jsonl.py | 6 +++++- src/processor/markdown/markdown_to_jsonl.py | 6 +++++- src/processor/org_mode/org_to_jsonl.py | 6 +++++- src/search_type/text_search.py | 2 +- 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/processor/ledger/beancount_to_jsonl.py b/src/processor/ledger/beancount_to_jsonl.py index 3af45c7a..d0d43f1a 100644 --- a/src/processor/ledger/beancount_to_jsonl.py +++ b/src/processor/ledger/beancount_to_jsonl.py @@ -13,13 +13,17 @@ import time from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_for_update from src.utils.constants import empty_escape_sequences from src.utils.jsonl import dump_jsonl, compress_jsonl_data +from src.utils.rawconfig import TextContentConfig logger = logging.getLogger(__name__) # Define Functions -def beancount_to_jsonl(beancount_files, beancount_file_filter, output_file, previous_entries=None): +def beancount_to_jsonl(config: TextContentConfig, previous_entries=None): + # Extract required fields from config + beancount_files, beancount_file_filter, output_file = config.input_files, config.input_filter, config.compressed_jsonl + # Input Validation if is_none_or_empty(beancount_files) and is_none_or_empty(beancount_file_filter): print("At least one of beancount-files or beancount-file-filter is required to be specified") diff --git a/src/processor/markdown/markdown_to_jsonl.py b/src/processor/markdown/markdown_to_jsonl.py index e7fc2779..9856c502 100644 --- a/src/processor/markdown/markdown_to_jsonl.py +++ b/src/processor/markdown/markdown_to_jsonl.py @@ -13,13 +13,17 @@ import time from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_for_update from src.utils.constants import empty_escape_sequences from src.utils.jsonl import dump_jsonl, compress_jsonl_data +from src.utils.rawconfig import TextContentConfig logger = logging.getLogger(__name__) # Define Functions -def markdown_to_jsonl(markdown_files, markdown_file_filter, output_file, previous_entries=None): +def markdown_to_jsonl(config: TextContentConfig, previous_entries=None): + # Extract required fields from config + markdown_files, markdown_file_filter, output_file = config.input_files, config.input_filter, config.compressed_jsonl + # Input Validation if is_none_or_empty(markdown_files) and is_none_or_empty(markdown_file_filter): print("At least one of markdown-files or markdown-file-filter is required to be specified") diff --git a/src/processor/org_mode/org_to_jsonl.py b/src/processor/org_mode/org_to_jsonl.py index f166810f..25ab1d9b 100644 --- a/src/processor/org_mode/org_to_jsonl.py +++ b/src/processor/org_mode/org_to_jsonl.py @@ -14,13 +14,17 @@ from src.processor.org_mode import orgnode from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_for_update from src.utils.jsonl import dump_jsonl, compress_jsonl_data from src.utils import state +from src.utils.rawconfig import TextContentConfig logger = logging.getLogger(__name__) # Define Functions -def org_to_jsonl(org_files, org_file_filter, output_file, previous_entries=None): +def org_to_jsonl(config: TextContentConfig, previous_entries=None): + # Extract required fields from config + org_files, org_file_filter, output_file = config.input_files, config.input_filter, config.compressed_jsonl + # Input Validation if is_none_or_empty(org_files) and is_none_or_empty(org_file_filter): print("At least one of org-files or org-file-filter is required to be specified") diff --git a/src/search_type/text_search.py b/src/search_type/text_search.py index 238c4736..e48b8803 100644 --- a/src/search_type/text_search.py +++ b/src/search_type/text_search.py @@ -183,7 +183,7 @@ def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchCon # Map notes in text files to (compressed) JSONL formatted file config.compressed_jsonl = resolve_absolute_path(config.compressed_jsonl) previous_entries = extract_entries(config.compressed_jsonl) if config.compressed_jsonl.exists() else None - entries_with_indices = text_to_jsonl(config.input_files, config.input_filter, config.compressed_jsonl, previous_entries) + entries_with_indices = text_to_jsonl(config, previous_entries) # Extract Updated Entries entries = extract_entries(config.compressed_jsonl) From b4878d76eab0eb7899fb570f67780f012874dc3d Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 11 Sep 2022 10:14:08 +0300 Subject: [PATCH 3/7] Extract entries from scratch when regenerate requested - Do not rely on previously extracted entries to find new entries in regenerate scenario --- src/search_type/text_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/search_type/text_search.py b/src/search_type/text_search.py index e48b8803..cd669094 100644 --- a/src/search_type/text_search.py +++ b/src/search_type/text_search.py @@ -182,7 +182,7 @@ def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchCon # Map notes in text files to (compressed) JSONL formatted file config.compressed_jsonl = resolve_absolute_path(config.compressed_jsonl) - previous_entries = extract_entries(config.compressed_jsonl) if config.compressed_jsonl.exists() else None + previous_entries = extract_entries(config.compressed_jsonl) if config.compressed_jsonl.exists() and not regenerate else None entries_with_indices = text_to_jsonl(config, previous_entries) # Extract Updated Entries From db37e38df7c968f95a5947287a31935617e9a341 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 11 Sep 2022 10:47:44 +0300 Subject: [PATCH 4/7] Create OrgNode hasBody method. Use it in org_to_jsonl checks --- src/processor/org_mode/org_to_jsonl.py | 6 +++--- src/processor/org_mode/orgnode.py | 6 ++++++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/processor/org_mode/org_to_jsonl.py b/src/processor/org_mode/org_to_jsonl.py index 25ab1d9b..93955659 100644 --- a/src/processor/org_mode/org_to_jsonl.py +++ b/src/processor/org_mode/org_to_jsonl.py @@ -106,8 +106,8 @@ def convert_org_nodes_to_entries(entries: list[orgnode.Orgnode], entry_to_file_m for entry in entries: entry_dict = dict() - # Ignore title notes i.e notes with just headings and empty body - if not entry.Body() or re.sub(r'\n|\t|\r| ', '', entry.Body()) == "": + if not entry.hasBody(): + # Ignore title notes i.e notes with just headings and empty body continue entry_dict["compiled"] = f'{entry.Heading()}.' @@ -130,7 +130,7 @@ def convert_org_nodes_to_entries(entries: list[orgnode.Orgnode], entry_to_file_m if state.verbose > 2: logger.debug(f'Scheduled: {entry.Scheduled().strftime("%Y-%m-%d")}') - if entry.Body(): + if entry.hasBody(): entry_dict["compiled"] += f'\n {entry.Body()}' if state.verbose > 2: logger.debug(f"Body: {entry.Body()}") diff --git a/src/processor/org_mode/orgnode.py b/src/processor/org_mode/orgnode.py index 5f47a448..5ad28d88 100644 --- a/src/processor/org_mode/orgnode.py +++ b/src/processor/org_mode/orgnode.py @@ -262,6 +262,12 @@ class Orgnode(object): """ return self.body + def hasBody(self): + """ + Returns True if node has non empty body, else False + """ + return self.body and re.sub(r'\n|\t|\r| ', '', self.body) != '' + def Level(self): """ Returns an integer corresponding to the level of the node. From 1d3b3d5f3993349c5607f164fcbda1f5e50ebf99 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 11 Sep 2022 12:25:26 +0300 Subject: [PATCH 5/7] Convert field get/set methods in OrgNode class to @property - Use more descriptive variable names in OrgNode parser and class - Convert OrgNode fields to private/protected, use property methods to get/set them --- src/processor/org_mode/org_to_jsonl.py | 28 +-- src/processor/org_mode/orgnode.py | 271 ++++++++++++++----------- tests/test_orgnode.py | 110 +++++----- 3 files changed, 218 insertions(+), 191 deletions(-) diff --git a/src/processor/org_mode/org_to_jsonl.py b/src/processor/org_mode/org_to_jsonl.py index 93955659..d03f569f 100644 --- a/src/processor/org_mode/org_to_jsonl.py +++ b/src/processor/org_mode/org_to_jsonl.py @@ -106,34 +106,34 @@ def convert_org_nodes_to_entries(entries: list[orgnode.Orgnode], entry_to_file_m for entry in entries: entry_dict = dict() - if not entry.hasBody(): + if not entry.hasBody: # Ignore title notes i.e notes with just headings and empty body continue - entry_dict["compiled"] = f'{entry.Heading()}.' + entry_dict["compiled"] = f'{entry.heading}.' if state.verbose > 2: - logger.debug(f"Title: {entry.Heading()}") + logger.debug(f"Title: {entry.heading}") - if entry.Tags(): - tags_str = " ".join(entry.Tags()) + if entry.tags: + tags_str = " ".join(entry.tags) entry_dict["compiled"] += f'\t {tags_str}.' if state.verbose > 2: logger.debug(f"Tags: {tags_str}") - if entry.Closed(): - entry_dict["compiled"] += f'\n Closed on {entry.Closed().strftime("%Y-%m-%d")}.' + if entry.closed: + entry_dict["compiled"] += f'\n Closed on {entry.closed.strftime("%Y-%m-%d")}.' if state.verbose > 2: - logger.debug(f'Closed: {entry.Closed().strftime("%Y-%m-%d")}') + logger.debug(f'Closed: {entry.closed.strftime("%Y-%m-%d")}') - if entry.Scheduled(): - entry_dict["compiled"] += f'\n Scheduled for {entry.Scheduled().strftime("%Y-%m-%d")}.' + if entry.scheduled: + entry_dict["compiled"] += f'\n Scheduled for {entry.scheduled.strftime("%Y-%m-%d")}.' if state.verbose > 2: - logger.debug(f'Scheduled: {entry.Scheduled().strftime("%Y-%m-%d")}') + logger.debug(f'Scheduled: {entry.scheduled.strftime("%Y-%m-%d")}') - if entry.hasBody(): - entry_dict["compiled"] += f'\n {entry.Body()}' + if entry.hasBody: + entry_dict["compiled"] += f'\n {entry.body}' if state.verbose > 2: - logger.debug(f"Body: {entry.Body()}") + logger.debug(f"Body: {entry.body}") if entry_dict: entry_dict["raw"] = f'{entry}' diff --git a/src/processor/org_mode/orgnode.py b/src/processor/org_mode/orgnode.py index 5ad28d88..bed27975 100644 --- a/src/processor/org_mode/orgnode.py +++ b/src/processor/org_mode/orgnode.py @@ -33,7 +33,7 @@ headline and associated text from an org-mode file, and routines for constructing data structures of these classes. """ -import re, sys +import re import datetime from pathlib import Path from os.path import relpath @@ -69,41 +69,41 @@ def makelist(filename): sched_date = '' deadline_date = '' logbook = list() - nodelist = [] - propdict = dict() + nodelist: list[Orgnode] = list() + property_map = dict() in_properties_drawer = False in_logbook_drawer = False file_title = f'{filename}' for line in f: ctr += 1 - hdng = re.search(r'^(\*+)\s(.*?)\s*$', line) - if hdng: # we are processing a heading line + heading_search = re.search(r'^(\*+)\s(.*?)\s*$', line) + if heading_search: # we are processing a heading line if heading: # if we have are on second heading, append first heading to headings list thisNode = Orgnode(level, heading, bodytext, tags) if closed_date: - thisNode.setClosed(closed_date) + thisNode.closed = closed_date closed_date = '' if sched_date: - thisNode.setScheduled(sched_date) + thisNode.scheduled = sched_date sched_date = "" if deadline_date: - thisNode.setDeadline(deadline_date) + thisNode.deadline = deadline_date deadline_date = '' if logbook: - thisNode.setLogbook(logbook) + thisNode.logbook = logbook logbook = list() - thisNode.setProperties(propdict) + thisNode.properties = property_map nodelist.append( thisNode ) - propdict = {'LINE': f'file:{normalize_filename(filename)}::{ctr}'} - level = hdng.group(1) - heading = hdng.group(2) + property_map = {'LINE': f'file:{normalize_filename(filename)}::{ctr}'} + level = heading_search.group(1) + heading = heading_search.group(2) bodytext = "" tags = list() # set of all tags in headline - tagsrch = re.search(r'(.*?)\s*:([a-zA-Z0-9].*?):$',heading) - if tagsrch: - heading = tagsrch.group(1) - parsedtags = tagsrch.group(2) + tag_search = re.search(r'(.*?)\s*:([a-zA-Z0-9].*?):$',heading) + if tag_search: + heading = tag_search.group(1) + parsedtags = tag_search.group(2) if parsedtags: for parsedtag in parsedtags.split(':'): if parsedtag != '': tags.append(parsedtag) @@ -148,13 +148,13 @@ def makelist(filename): logbook += [(clocked_in, clocked_out)] line = "" - prop_srch = re.search(r'^\s*:([a-zA-Z0-9]+):\s*(.*?)\s*$', line) - if prop_srch: + property_search = re.search(r'^\s*:([a-zA-Z0-9]+):\s*(.*?)\s*$', line) + if property_search: # Set ID property to an id based org-mode link to the entry - if prop_srch.group(1) == 'ID': - propdict['ID'] = f'id:{prop_srch.group(2)}' + if property_search.group(1) == 'ID': + property_map['ID'] = f'id:{property_search.group(2)}' else: - propdict[prop_srch.group(1)] = prop_srch.group(2) + property_map[property_search.group(1)] = property_search.group(2) continue cd_re = re.search(r'CLOSED:\s*\[([0-9]{4})-([0-9]{2})-([0-9]{2})', line) @@ -179,39 +179,38 @@ def makelist(filename): # write out last node thisNode = Orgnode(level, heading or file_title, bodytext, tags) - thisNode.setProperties(propdict) + thisNode.properties = property_map if sched_date: - thisNode.setScheduled(sched_date) + thisNode.scheduled = sched_date if deadline_date: - thisNode.setDeadline(deadline_date) + thisNode.deadline = deadline_date if closed_date: - thisNode.setClosed(closed_date) + thisNode.closed = closed_date if logbook: - thisNode.setLogbook(logbook) + thisNode.logbook = logbook nodelist.append( thisNode ) # using the list of TODO keywords found in the file # process the headings searching for TODO keywords for n in nodelist: - h = n.Heading() - todoSrch = re.search(r'([A-Z]+)\s(.*?)$', h) - if todoSrch: - if todoSrch.group(1) in todos: - n.setHeading( todoSrch.group(2) ) - n.setTodo ( todoSrch.group(1) ) + todo_search = re.search(r'([A-Z]+)\s(.*?)$', n.heading) + if todo_search: + if todo_search.group(1) in todos: + n.heading = todo_search.group(2) + n.todo = todo_search.group(1) # extract, set priority from heading, update heading if necessary - prtysrch = re.search(r'^\[\#(A|B|C)\] (.*?)$', n.Heading()) - if prtysrch: - n.setPriority(prtysrch.group(1)) - n.setHeading(prtysrch.group(2)) + priority_search = re.search(r'^\[\#(A|B|C)\] (.*?)$', n.heading) + if priority_search: + n.priority = priority_search.group(1) + n.heading = priority_search.group(2) # Set SOURCE property to a file+heading based org-mode link to the entry - if n.Level() == 0: + if n.level == 0: n.properties['LINE'] = f'file:{normalize_filename(filename)}::0' n.properties['SOURCE'] = f'[[file:{normalize_filename(filename)}]]' else: - escaped_heading = n.Heading().replace("[","\\[").replace("]","\\]") + escaped_heading = n.heading.replace("[","\\[").replace("]","\\]") n.properties['SOURCE'] = f'[[file:{normalize_filename(filename)}::*{escaped_heading}]]' return nodelist @@ -229,205 +228,233 @@ class Orgnode(object): first tag. The makelist routine postprocesses the list to identify TODO tags and updates headline and todo fields. """ - self.level = len(level) - self.headline = headline - self.body = body - self.tags = tags # All tags in the headline - self.todo = "" - self.prty = "" # empty of A, B or C - self.scheduled = "" # Scheduled date - self.deadline = "" # Deadline date - self.closed = "" # Closed date - self.properties = dict() - self.logbook = list() # List of clock-in, clock-out tuples representing logbook entries + self._level = len(level) + self._heading = headline + self._body = body + self._tags = tags # All tags in the headline + self._todo = "" + self._priority = "" # empty of A, B or C + self._scheduled = "" # Scheduled date + self._deadline = "" # Deadline date + self._closed = "" # Closed date + self._properties = dict() + self._logbook = list() # List of clock-in, clock-out tuples representing logbook entries # Look for priority in headline and transfer to prty field - def Heading(self): + @property + def heading(self): """ Return the Heading text of the node without the TODO tag """ - return self.headline + return self._heading - def setHeading(self, newhdng): + @heading.setter + def heading(self, newhdng): """ Change the heading to the supplied string """ - self.headline = newhdng + self._heading = newhdng - def Body(self): + @property + def body(self): """ Returns all lines of text of the body of this node except the Property Drawer """ - return self.body + return self._body + @property def hasBody(self): """ Returns True if node has non empty body, else False """ - return self.body and re.sub(r'\n|\t|\r| ', '', self.body) != '' + return self._body and re.sub(r'\n|\t|\r| ', '', self._body) != '' - def Level(self): + @property + def level(self): """ Returns an integer corresponding to the level of the node. Top level (one asterisk) has a level of 1. """ - return self.level + return self._level - def Priority(self): + @property + def priority(self): """ Returns the priority of this headline: 'A', 'B', 'C' or empty string if priority has not been set. """ - return self.prty + return self._priority - def setPriority(self, newprty): + @priority.setter + def priority(self, new_priority): """ Change the value of the priority of this headline. Values values are '', 'A', 'B', 'C' """ - self.prty = newprty + self._priority = new_priority - def Tags(self): + @property + def tags(self): """ Returns the list of all tags For example, :HOME:COMPUTER: would return ['HOME', 'COMPUTER'] """ - return self.tags + return self._tags - def hasTag(self, srch): + @property + def hasTag(self, tag): """ Returns True if the supplied tag is present in this headline For example, hasTag('COMPUTER') on headling containing :HOME:COMPUTER: would return True. """ - return srch in self.tags + return tag in self._tags - def setTags(self, newtags): + @tags.setter + def tags(self, newtags): """ Store all the tags found in the headline. """ - self.tags = newtags + self._tags = newtags - def Todo(self): + @property + def todo(self): """ Return the value of the TODO tag """ - return self.todo + return self._todo - def setTodo(self, value): + @todo.setter + def todo(self, new_todo): """ Set the value of the TODO tag to the supplied string """ - self.todo = value + self._todo = new_todo - def setProperties(self, dictval): + @property + def properties(self): + """ + Return the dictionary of properties + """ + return self._properties + + @properties.setter + def properties(self, new_properties): """ Sets all properties using the supplied dictionary of name/value pairs """ - self.properties = dictval + self._properties = new_properties - def Property(self, keyval): + def Property(self, property_key): """ Returns the value of the requested property or null if the property does not exist. """ - return self.properties.get(keyval, "") + return self._properties.get(property_key, "") - def setScheduled(self, dateval): + @property + def scheduled(self): """ - Set the scheduled date using the supplied date object + Return the scheduled date """ - self.scheduled = dateval + return self._scheduled - def Scheduled(self): + @scheduled.setter + def scheduled(self, new_scheduled): """ - Return the scheduled date object or null if nonexistent + Set the scheduled date to the scheduled date """ - return self.scheduled + self._scheduled = new_scheduled - def setDeadline(self, dateval): + @property + def deadline(self): """ - Set the deadline (due) date using the supplied date object + Return the deadline date """ - self.deadline = dateval + return self._deadline - def Deadline(self): + @deadline.setter + def deadline(self, new_deadline): """ - Return the deadline date object or null if nonexistent + Set the deadline (due) date to the new deadline date """ - return self.deadline + self._deadline = new_deadline - def setClosed(self, dateval): + @property + def closed(self): """ - Set the closed date using the supplied date object + Return the closed date """ - self.closed = dateval + return self._closed - def Closed(self): + @closed.setter + def closed(self, new_closed): """ - Return the closed date object or null if nonexistent + Set the closed date to the new closed date """ - return self.closed + self._closed = new_closed - def setLogbook(self, logbook): - """ - Set the logbook with list of clocked-in, clocked-out tuples for the entry - """ - self.logbook = logbook - - def Logbook(self): + @property + def logbook(self): """ Return the logbook with all clocked-in, clocked-out date object pairs or empty list if nonexistent """ - return self.logbook + return self._logbook + + @logbook.setter + def logbook(self, new_logbook): + """ + Set the logbook with list of clocked-in, clocked-out tuples for the entry + """ + self._logbook = new_logbook def __repr__(self): """ Print the level, heading text and tag of a node and the body text as used to construct the node. """ - # This method is not completed yet. + # Output heading line n = '' - for _ in range(0, self.level): + for _ in range(0, self._level): n = n + '*' n = n + ' ' - if self.todo: - n = n + self.todo + ' ' - if self.prty: - n = n + '[#' + self.prty + '] ' - n = n + self.headline + if self._todo: + n = n + self._todo + ' ' + if self._priority: + n = n + '[#' + self._priority + '] ' + n = n + self._heading n = "%-60s " % n # hack - tags will start in column 62 closecolon = '' - for t in self.tags: + for t in self._tags: n = n + ':' + t closecolon = ':' n = n + closecolon n = n + "\n" # Get body indentation from first line of body - indent = indent_regex.match(self.body).group() + indent = indent_regex.match(self._body).group() # Output Closed Date, Scheduled Date, Deadline Date - if self.closed or self.scheduled or self.deadline: + if self._closed or self._scheduled or self._deadline: n = n + indent - if self.closed: - n = n + f'CLOSED: [{self.closed.strftime("%Y-%m-%d %a")}] ' - if self.scheduled: - n = n + f'SCHEDULED: <{self.scheduled.strftime("%Y-%m-%d %a")}> ' - if self.deadline: - n = n + f'DEADLINE: <{self.deadline.strftime("%Y-%m-%d %a")}> ' - if self.closed or self.scheduled or self.deadline: + if self._closed: + n = n + f'CLOSED: [{self._closed.strftime("%Y-%m-%d %a")}] ' + if self._scheduled: + n = n + f'SCHEDULED: <{self._scheduled.strftime("%Y-%m-%d %a")}> ' + if self._deadline: + n = n + f'DEADLINE: <{self._deadline.strftime("%Y-%m-%d %a")}> ' + if self._closed or self._scheduled or self._deadline: n = n + '\n' # Ouput Property Drawer n = n + indent + ":PROPERTIES:\n" - for key, value in self.properties.items(): + for key, value in self._properties.items(): n = n + indent + f":{key}: {value}\n" n = n + indent + ":END:\n" - n = n + self.body + n = n + self._body return n diff --git a/tests/test_orgnode.py b/tests/test_orgnode.py index c1e0aaa9..372969ab 100644 --- a/tests/test_orgnode.py +++ b/tests/test_orgnode.py @@ -20,14 +20,14 @@ def test_parse_entry_with_no_headings(tmp_path): # Assert assert len(entries) == 1 - assert entries[0].Heading() == f'{orgfile}' - assert entries[0].Tags() == list() - assert entries[0].Body() == "Body Line 1" - assert entries[0].Priority() == "" + assert entries[0].heading == f'{orgfile}' + assert entries[0].tags == list() + assert entries[0].body == "Body Line 1" + assert entries[0].priority == "" assert entries[0].Property("ID") == "" - assert entries[0].Closed() == "" - assert entries[0].Scheduled() == "" - assert entries[0].Deadline() == "" + assert entries[0].closed == "" + assert entries[0].scheduled == "" + assert entries[0].deadline == "" # ---------------------------------------------------------------------------------------------------- @@ -44,14 +44,14 @@ Body Line 1''' # Assert assert len(entries) == 1 - assert entries[0].Heading() == "Heading" - assert entries[0].Tags() == list() - assert entries[0].Body() == "Body Line 1" - assert entries[0].Priority() == "" + assert entries[0].heading == "Heading" + assert entries[0].tags == list() + assert entries[0].body == "Body Line 1" + assert entries[0].priority == "" assert entries[0].Property("ID") == "" - assert entries[0].Closed() == "" - assert entries[0].Scheduled() == "" - assert entries[0].Deadline() == "" + assert entries[0].closed == "" + assert entries[0].scheduled == "" + assert entries[0].deadline == "" # ---------------------------------------------------------------------------------------------------- @@ -77,16 +77,16 @@ Body Line 2''' # Assert assert len(entries) == 1 - assert entries[0].Heading() == "Heading" - assert entries[0].Todo() == "DONE" - assert entries[0].Tags() == ["Tag1", "TAG2", "tag3"] - assert entries[0].Body() == "- Clocked Log 1\nBody Line 1\nBody Line 2" - assert entries[0].Priority() == "A" + assert entries[0].heading == "Heading" + assert entries[0].todo == "DONE" + assert entries[0].tags == ["Tag1", "TAG2", "tag3"] + assert entries[0].body == "- Clocked Log 1\nBody Line 1\nBody Line 2" + assert entries[0].priority == "A" assert entries[0].Property("ID") == "id:123-456-789-4234-1231" - assert entries[0].Closed() == datetime.date(1984,4,1) - assert entries[0].Scheduled() == datetime.date(1984,4,1) - assert entries[0].Deadline() == datetime.date(1984,4,1) - assert entries[0].Logbook() == [(datetime.datetime(1984,4,1,9,0,0), datetime.datetime(1984,4,1,12,0,0))] + assert entries[0].closed == datetime.date(1984,4,1) + assert entries[0].scheduled == datetime.date(1984,4,1) + assert entries[0].deadline == datetime.date(1984,4,1) + assert entries[0].logbook == [(datetime.datetime(1984,4,1,9,0,0), datetime.datetime(1984,4,1,12,0,0))] # ---------------------------------------------------------------------------------------------------- @@ -109,7 +109,7 @@ Body Line 2 # Assert # SOURCE link rendered with Heading - assert f':SOURCE: [[file:{orgfile}::*{entries[0].Heading()}]]' in f'{entries[0]}' + assert f':SOURCE: [[file:{orgfile}::*{entries[0].heading}]]' in f'{entries[0]}' # ID link rendered with ID assert f':ID: id:123-456-789-4234-1231' in f'{entries[0]}' # LINE link rendered with line number @@ -134,7 +134,7 @@ Body Line 1''' # Assert assert len(entries) == 1 # parsed heading from entry - assert entries[0].Heading() == "Heading[1]" + assert entries[0].heading == "Heading[1]" # ensure SOURCE link has square brackets in filename, heading escaped in rendered entries escaped_orgfile = f'{orgfile}'.replace("[1]", "\\[1\\]") assert f':SOURCE: [[file:{escaped_orgfile}::*Heading\[1\]' in f'{entries[0]}' @@ -176,16 +176,16 @@ Body 2 # Assert assert len(entries) == 2 for index, entry in enumerate(entries): - assert entry.Heading() == f"Heading{index+1}" - assert entry.Todo() == "FAILED" if index == 0 else "CANCELLED" - assert entry.Tags() == [f"tag{index+1}"] - assert entry.Body() == f"- Clocked Log {index+1}\nBody {index+1}\n\n" - assert entry.Priority() == "A" + assert entry.heading == f"Heading{index+1}" + assert entry.todo == "FAILED" if index == 0 else "CANCELLED" + assert entry.tags == [f"tag{index+1}"] + assert entry.body == f"- Clocked Log {index+1}\nBody {index+1}\n\n" + assert entry.priority == "A" assert entry.Property("ID") == f"id:123-456-789-4234-000{index+1}" - assert entry.Closed() == datetime.date(1984,4,index+1) - assert entry.Scheduled() == datetime.date(1984,4,index+1) - assert entry.Deadline() == datetime.date(1984,4,index+1) - assert entry.Logbook() == [(datetime.datetime(1984,4,index+1,9,0,0), datetime.datetime(1984,4,index+1,12,0,0))] + assert entry.closed == datetime.date(1984,4,index+1) + assert entry.scheduled == datetime.date(1984,4,index+1) + assert entry.deadline == datetime.date(1984,4,index+1) + assert entry.logbook == [(datetime.datetime(1984,4,index+1,9,0,0), datetime.datetime(1984,4,index+1,12,0,0))] # ---------------------------------------------------------------------------------------------------- @@ -201,14 +201,14 @@ Body Line 1''' # Assert assert len(entries) == 1 - assert entries[0].Heading() == f'{orgfile}' - assert entries[0].Tags() == list() - assert entries[0].Body() == "Body Line 1" - assert entries[0].Priority() == "" + assert entries[0].heading == f'{orgfile}' + assert entries[0].tags == list() + assert entries[0].body == "Body Line 1" + assert entries[0].priority == "" assert entries[0].Property("ID") == "" - assert entries[0].Closed() == "" - assert entries[0].Scheduled() == "" - assert entries[0].Deadline() == "" + assert entries[0].closed == "" + assert entries[0].scheduled == "" + assert entries[0].deadline == "" # ---------------------------------------------------------------------------------------------------- @@ -224,14 +224,14 @@ Body Line 1''' # Assert assert len(entries) == 1 - assert entries[0].Heading() == 'test' - assert entries[0].Tags() == list() - assert entries[0].Body() == "Body Line 1" - assert entries[0].Priority() == "" + assert entries[0].heading == 'test' + assert entries[0].tags == list() + assert entries[0].body == "Body Line 1" + assert entries[0].priority == "" assert entries[0].Property("ID") == "" - assert entries[0].Closed() == "" - assert entries[0].Scheduled() == "" - assert entries[0].Deadline() == "" + assert entries[0].closed == "" + assert entries[0].scheduled == "" + assert entries[0].deadline == "" # ---------------------------------------------------------------------------------------------------- @@ -248,14 +248,14 @@ Body Line 1 # Assert assert len(entries) == 1 - assert entries[0].Heading() == 'title1 title2' - assert entries[0].Tags() == list() - assert entries[0].Body() == "Body Line 1\n" - assert entries[0].Priority() == "" + assert entries[0].heading == 'title1 title2' + assert entries[0].tags == list() + assert entries[0].body == "Body Line 1\n" + assert entries[0].priority == "" assert entries[0].Property("ID") == "" - assert entries[0].Closed() == "" - assert entries[0].Scheduled() == "" - assert entries[0].Deadline() == "" + assert entries[0].closed == "" + assert entries[0].scheduled == "" + assert entries[0].deadline == "" # Helper Functions From 253c9eae9afe81ac7a9f16166b2820634ce50c39 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 11 Sep 2022 12:40:58 +0300 Subject: [PATCH 6/7] Set index_heading_entries field in config to index entries with no body - Previously heading entries were not indexed to maintain search quality - But given that there are use-cases for indexing entries with no body - Add a configurable `index_heading_entries' field to index heading entries - This `TextContentConfig' field is currently only used for OrgMode content --- src/processor/org_mode/org_to_jsonl.py | 7 ++++--- src/utils/rawconfig.py | 1 + tests/test_org_to_jsonl.py | 29 +++++++++++++++----------- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/src/processor/org_mode/org_to_jsonl.py b/src/processor/org_mode/org_to_jsonl.py index d03f569f..532b9b4c 100644 --- a/src/processor/org_mode/org_to_jsonl.py +++ b/src/processor/org_mode/org_to_jsonl.py @@ -24,6 +24,7 @@ logger = logging.getLogger(__name__) def org_to_jsonl(config: TextContentConfig, previous_entries=None): # Extract required fields from config org_files, org_file_filter, output_file = config.input_files, config.input_filter, config.compressed_jsonl + index_heading_entries = config.index_heading_entries # Input Validation if is_none_or_empty(org_files) and is_none_or_empty(org_file_filter): @@ -41,7 +42,7 @@ def org_to_jsonl(config: TextContentConfig, previous_entries=None): logger.debug(f"Parse entries from org files into OrgNode objects: {end - start} seconds") start = time.time() - current_entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries) + current_entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries, index_heading_entries) end = time.time() logger.debug(f"Convert OrgNodes into entry dictionaries: {end - start} seconds") @@ -100,13 +101,13 @@ def extract_org_entries(org_files): return entries, dict(entry_to_file_map) -def convert_org_nodes_to_entries(entries: list[orgnode.Orgnode], entry_to_file_map) -> list[dict]: +def convert_org_nodes_to_entries(entries: list[orgnode.Orgnode], entry_to_file_map, index_heading_entries=False) -> list[dict]: "Convert Org-Mode entries into list of dictionary" entry_maps = [] for entry in entries: entry_dict = dict() - if not entry.hasBody: + if not entry.hasBody and not index_heading_entries: # Ignore title notes i.e notes with just headings and empty body continue diff --git a/src/utils/rawconfig.py b/src/utils/rawconfig.py index 4fd4be96..f9c19900 100644 --- a/src/utils/rawconfig.py +++ b/src/utils/rawconfig.py @@ -18,6 +18,7 @@ class TextContentConfig(ConfigBase): input_filter: Optional[str] compressed_jsonl: Path embeddings_file: Path + index_heading_entries: Optional[bool] = False @validator('input_filter') def input_filter_or_files_required(cls, input_filter, values, **kwargs): diff --git a/tests/test_org_to_jsonl.py b/tests/test_org_to_jsonl.py index eaac5ef8..04e7199e 100644 --- a/tests/test_org_to_jsonl.py +++ b/tests/test_org_to_jsonl.py @@ -6,28 +6,33 @@ from src.processor.org_mode.org_to_jsonl import convert_org_entries_to_jsonl, co from src.utils.helpers import is_none_or_empty -def test_entry_with_empty_body_line_to_jsonl(tmp_path): - '''Ensure entries with empty body are ignored. +def test_configure_heading_entry_to_jsonl(tmp_path): + '''Ensure entries with empty body are ignored, unless explicitly configured to index heading entries. Property drawers not considered Body. Ignore control characters for evaluating if Body empty.''' # Arrange entry = f'''*** Heading :PROPERTIES: :ID: 42-42-42 :END: - \t\r + \t \r ''' orgfile = create_file(tmp_path, entry) - # Act - # Extract Entries from specified Org files - entry_nodes, file_to_entries = extract_org_entries(org_files=[orgfile]) + for index_heading_entries in [True, False]: + # Act + # Extract entries into jsonl from specified Org files + jsonl_string = convert_org_entries_to_jsonl(convert_org_nodes_to_entries( + *extract_org_entries(org_files=[orgfile]), + index_heading_entries=index_heading_entries)) + jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] - # Process Each Entry from All Notes Files - entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries) - jsonl_data = convert_org_entries_to_jsonl(entries) - - # Assert - assert is_none_or_empty(jsonl_data) + # Assert + if index_heading_entries: + # Entry with empty body indexed when index_heading_entries set to True + assert len(jsonl_data) == 1 + else: + # Entry with empty body ignored when index_heading_entries set to False + assert is_none_or_empty(jsonl_data) def test_entry_with_body_to_jsonl(tmp_path): From 9d369ae4df4936b8aa3ecccc3e3be0cc31746413 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 11 Sep 2022 15:54:26 +0300 Subject: [PATCH 7/7] Fix OrgNode render of entries with property drawers and empty body - Issue - Indent regex was previously catching escape sequences like newlines - This was resulting in entries with only escape sequences in body to be prepended to property drawers etc during rendering - Fix - Update indent regex to only look for spaces in each line - Only render body when body contains non-escape characters - Create test to prevent this regression from silently resurfacing --- src/processor/org_mode/orgnode.py | 6 ++++-- tests/test_orgnode.py | 30 ++++++++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/src/processor/org_mode/orgnode.py b/src/processor/org_mode/orgnode.py index bed27975..1853c84b 100644 --- a/src/processor/org_mode/orgnode.py +++ b/src/processor/org_mode/orgnode.py @@ -38,7 +38,7 @@ import datetime from pathlib import Path from os.path import relpath -indent_regex = re.compile(r'^\s*') +indent_regex = re.compile(r'^ *') def normalize_filename(filename): "Normalize and escape filename for rendering" @@ -455,6 +455,8 @@ class Orgnode(object): n = n + indent + f":{key}: {value}\n" n = n + indent + ":END:\n" - n = n + self._body + # Output Body + if self.hasBody: + n = n + self._body return n diff --git a/tests/test_orgnode.py b/tests/test_orgnode.py index 372969ab..d36cca79 100644 --- a/tests/test_orgnode.py +++ b/tests/test_orgnode.py @@ -1,7 +1,5 @@ # Standard Packages import datetime -from os.path import relpath -from pathlib import Path # Internal Packages from src.processor.org_mode import orgnode @@ -89,6 +87,34 @@ Body Line 2''' assert entries[0].logbook == [(datetime.datetime(1984,4,1,9,0,0), datetime.datetime(1984,4,1,12,0,0))] +# ---------------------------------------------------------------------------------------------------- +def test_render_entry_with_property_drawer_and_empty_body(tmp_path): + "Render heading entry with property drawer" + # Arrange + entry_to_render = f''' +*** [#A] Heading1 :tag1: + :PROPERTIES: + :ID: 111-111-111-1111-1111 + :END: +\t\r \n +''' + orgfile = create_file(tmp_path, entry_to_render) + + expected_entry = f'''*** [#A] Heading1 :tag1: +:PROPERTIES: +:LINE: file:{orgfile}::2 +:ID: id:111-111-111-1111-1111 +:SOURCE: [[file:{orgfile}::*Heading1]] +:END: +''' + + # Act + parsed_entries = orgnode.makelist(orgfile) + + # Assert + assert f'{parsed_entries[0]}' == expected_entry + + # ---------------------------------------------------------------------------------------------------- def test_all_links_to_entry_rendered(tmp_path): "Ensure all links to entry rendered in property drawer from entry"