Handle Empty Org Files or Org Files with No Headings

### Main Changes
- bf01a4f Use filename or "#+TITLE" as heading for 0th level content in org files
- d6bd7bf Fix initializing `OrgNode` `level` to string to parse org files with no headings
- d835467 Throw exception if no valid entries found in specified content files

### Miscellaneous Improvements
- 7df39e5 Reuse search models across `pytest` sessions. Merge unused pytest fixtures
- 2dc0588 Do not normalize absolute filenames for entry links in `OrgNode`
- e00bb53 Init word filter dictionary with default value as set to simplify code

Resolves #83
2026-03-07 21:29:13 +00:00 · 2022-09-10 12:42:07 +00:00
parent 4d776d9c7a 976397bd82
commit 372dcd2dbc
7 changed files with 190 additions and 50 deletions
--- a/src/processor/org_mode/orgnode.py
+++ b/src/processor/org_mode/orgnode.py
@@ -41,8 +41,13 @@ from os.path import relpath
 indent_regex = re.compile(r'^\s*')
 def normalize_filename(filename):
-   file_relative_to_home = f'~/{relpath(filename, start=Path.home())}'
+   "Normalize and escape filename for rendering"
-   escaped_filename = f'{file_relative_to_home}'.replace("[","\[").replace("]","\]")
+   if not Path(filename).is_absolute():
      # Normalize relative filename to be relative to current directory
      normalized_filename = f'~/{relpath(filename, start=Path.home())}'
   else:
      normalized_filename = filename
   escaped_filename = f'{normalized_filename}'.replace("[","\[").replace("]","\]")
   return escaped_filename
 def makelist(filename):
@@ -61,7 +66,7 @@ def makelist(filename):
   todos         = { "TODO": "", "WAITING": "", "ACTIVE": "",
                     "DONE": "", "CANCELLED": "", "FAILED": ""} # populated from #+SEQ_TODO line
-   level         = 0
+   level         = ""
   heading       = ""
   bodytext      = ""
   tags          = set()      # set of all tags in headline
@@ -73,6 +78,7 @@ def makelist(filename):
   propdict      = dict()
   in_properties_drawer = False
   in_logbook_drawer = False
   file_title = f'{filename}'
   for line in f:
       ctr += 1
@@ -111,6 +117,16 @@ def makelist(filename):
              kwlist = re.findall(r'([A-Z]+)\(', line)
              for kw in kwlist: todos[kw] = ""
           # Set file title to TITLE property, if it exists
           title_search = re.search(r'^#\+TITLE:\s*(.*)$', line)
           if title_search and title_search.group(1).strip() != '':
               title_text = title_search.group(1).strip()
               if file_title == f'{filename}':
                  file_title = title_text
               else:
                  file_title += f' {title_text}'
               continue
           # Ignore Properties Drawers Completely
           if re.search(':PROPERTIES:', line):
              in_properties_drawer=True
@@ -167,7 +183,7 @@ def makelist(filename):
               bodytext = bodytext + line
   # write out last node
-   thisNode = Orgnode(level, heading, bodytext, tags)
+   thisNode = Orgnode(level, heading or file_title, bodytext, tags)
   thisNode.setProperties(propdict)
   if sched_date:
      thisNode.setScheduled(sched_date)
@@ -196,6 +212,10 @@ def makelist(filename):
          n.setHeading(prtysrch.group(2))
       # Set SOURCE property to a file+heading based org-mode link to the entry
       if n.Level() == 0:
         n.properties['LINE'] = f'file:{normalize_filename(filename)}::0'
         n.properties['SOURCE'] = f'[[file:{normalize_filename(filename)}]]'
       else:
         escaped_heading = n.Heading().replace("[","\\[").replace("]","\\]")
         n.properties['SOURCE'] = f'[[file:{normalize_filename(filename)}::*{escaped_heading}]]'
--- a/src/search_filter/word_filter.py
+++ b/src/search_filter/word_filter.py
@@ -3,6 +3,7 @@ import re
 import time
 import pickle
 import logging
 from collections import defaultdict
 # Internal Packages
 from src.search_filter.base_filter import BaseFilter
@@ -37,19 +38,18 @@ class WordFilter(BaseFilter):
            start = time.time()
            self.cache = {}  # Clear cache on (re-)generating entries_by_word_set
            entry_splitter = r',|\.| |\]|\[\(|\)|\{|\}|\t|\n|\:'
            self.word_to_entry_index = defaultdict(set)
            # Create map of words to entries they exist in
            for entry_index, entry in enumerate(entries):
                for word in re.split(entry_splitter, entry[self.entry_key].lower()):
                    if word == '':
                        continue
                    if word not in self.word_to_entry_index:
                        self.word_to_entry_index[word] = set()
                    self.word_to_entry_index[word].add(entry_index)
            with self.filter_file.open('wb') as f:
                pickle.dump(self.word_to_entry_index, f)
            end = time.time()
-            logger.debug(f"Index {self.search_type} for word filter to {self.filter_file}: {end - start} seconds")
+            logger.debug(f"Indexed {len(self.word_to_entry_index)} words of {self.search_type} type for word filter to {self.filter_file}: {end - start} seconds")
        return self.word_to_entry_index
--- a/src/search_type/text_search.py
+++ b/src/search_type/text_search.py
@@ -11,7 +11,7 @@ from src.search_filter.base_filter import BaseFilter
 # Internal Packages
 from src.utils import state
-from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model
+from src.utils.helpers import get_absolute_path, is_none_or_empty, resolve_absolute_path, load_model
 from src.utils.config import TextSearchModel
 from src.utils.rawconfig import TextSearchConfig, TextContentConfig
 from src.utils.jsonl import load_jsonl
@@ -174,6 +174,8 @@ def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchCon
    # Extract Entries
    entries = extract_entries(config.compressed_jsonl)
    if is_none_or_empty(entries):
        raise ValueError(f"No valid entries found in specified files: {config.input_files} or {config.input_filter}")
    top_k = min(len(entries), top_k)  # top_k hits can't be more than the total entries in corpus
    # Compute or Load Embeddings
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -4,6 +4,7 @@ import pytest
 # Internal Packages
 from src.search_type import image_search, text_search
 from src.utils.config import SearchType
 from src.utils.helpers import resolve_absolute_path
 from src.utils.rawconfig import ContentConfig, TextContentConfig, ImageContentConfig, SearchConfig, TextSearchConfig, ImageSearchConfig
 from src.processor.org_mode.org_to_jsonl import org_to_jsonl
 from src.search_filter.date_filter import DateFilter
@@ -12,41 +13,41 @@ from src.search_filter.file_filter import FileFilter
@pytest.fixture(scope='session')
-def search_config(tmp_path_factory) -> SearchConfig:
+def search_config() -> SearchConfig:
-    model_dir = tmp_path_factory.mktemp('data')
+    model_dir = resolve_absolute_path('~/.khoj/search')
-
+    model_dir.mkdir(parents=True, exist_ok=True)
    search_config = SearchConfig()
    search_config.symmetric = TextSearchConfig(
        encoder = "sentence-transformers/all-MiniLM-L6-v2",
        cross_encoder = "cross-encoder/ms-marco-MiniLM-L-6-v2",
-        model_directory = model_dir
+        model_directory = model_dir / 'symmetric/'
    )
    search_config.asymmetric = TextSearchConfig(
        encoder = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
        cross_encoder = "cross-encoder/ms-marco-MiniLM-L-6-v2",
-        model_directory = model_dir
+        model_directory = model_dir / 'asymmetric/'
    )
    search_config.image = ImageSearchConfig(
        encoder = "sentence-transformers/clip-ViT-B-32",
-        model_directory = model_dir
+        model_directory = model_dir / 'image/'
    )
    return search_config
@pytest.fixture(scope='session')
-def model_dir(search_config: SearchConfig):
+def content_config(tmp_path_factory, search_config: SearchConfig):
-    model_dir = search_config.asymmetric.model_directory
+    content_dir = tmp_path_factory.mktemp('content')
    # Generate Image Embeddings from Test Images
    content_config = ContentConfig()
    content_config.image = ImageContentConfig(
        input_directories = ['tests/data/images'],
-        embeddings_file = model_dir.joinpath('image_embeddings.pt'),
+        embeddings_file = content_dir.joinpath('image_embeddings.pt'),
-        batch_size = 10,
+        batch_size = 1,
        use_xmp_metadata = False)
    image_search.setup(content_config.image, search_config.image, regenerate=False)
@@ -55,28 +56,10 @@ def model_dir(search_config: SearchConfig):
    content_config.org = TextContentConfig(
        input_files = None,
        input_filter = 'tests/data/org/*.org',
-        compressed_jsonl = model_dir.joinpath('notes.jsonl.gz'),
+        compressed_jsonl = content_dir.joinpath('notes.jsonl.gz'),
-        embeddings_file = model_dir.joinpath('note_embeddings.pt'))
+        embeddings_file = content_dir.joinpath('note_embeddings.pt'))
-    filters = [DateFilter(), WordFilter(model_dir, search_type=SearchType.Org), FileFilter()]
+    filters = [DateFilter(), WordFilter(content_dir, search_type=SearchType.Org), FileFilter()]
    text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
    return model_dir
@pytest.fixture(scope='session')
 def content_config(model_dir) -> ContentConfig:
    content_config = ContentConfig()
    content_config.org = TextContentConfig(
        input_files = None,
        input_filter = 'tests/data/org/*.org',
        compressed_jsonl = model_dir.joinpath('notes.jsonl.gz'),
        embeddings_file = model_dir.joinpath('note_embeddings.pt'))
    content_config.image = ImageContentConfig(
        input_directories = ['tests/data/images'],
        embeddings_file = model_dir.joinpath('image_embeddings.pt'),
        batch_size = 1,
        use_xmp_metadata = False)
    return content_config
--- a/tests/test_org_to_jsonl.py
+++ b/tests/test_org_to_jsonl.py
@@ -15,7 +15,7 @@ def test_entry_with_empty_body_line_to_jsonl(tmp_path):
    :PROPERTIES:
    :ID:       42-42-42
    :END:
-    \t\r\n 
+    \t\r 
    '''
    orgfile = create_file(tmp_path, entry)
@@ -37,7 +37,29 @@ def test_entry_with_body_to_jsonl(tmp_path):
    :PROPERTIES:
    :ID:       42-42-42
    :END:
-    \t\r\nBody Line 1\n
+    \t\r
    Body Line 1
    '''
    orgfile = create_file(tmp_path, entry)
    # Act
    # Extract Entries from specified Org files
    entries, entry_to_file_map = extract_org_entries(org_files=[orgfile])
    # Process Each Entry from All Notes Files
    jsonl_string = convert_org_entries_to_jsonl(entries, entry_to_file_map)
    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
    # Assert
    assert len(jsonl_data) == 1
 def test_file_with_no_headings_to_jsonl(tmp_path):
    "Ensure files with no heading, only body text are loaded."
    # Arrange
    entry = f'''
    - Bullet point 1
    - Bullet point 2
    '''
    orgfile = create_file(tmp_path, entry)
--- a/tests/test_orgnode.py
+++ b/tests/test_orgnode.py
@@ -8,6 +8,28 @@ from src.processor.org_mode import orgnode
 # Test
 # ----------------------------------------------------------------------------------------------------
 def test_parse_entry_with_no_headings(tmp_path):
    "Test parsing of entry with minimal fields"
    # Arrange
    entry = f'''Body Line 1'''
    orgfile = create_file(tmp_path, entry)
    # Act
    entries = orgnode.makelist(orgfile)
    # Assert
    assert len(entries) == 1
    assert entries[0].Heading() == f'{orgfile}'
    assert entries[0].Tags() == set()
    assert entries[0].Body() == "Body Line 1"
    assert entries[0].Priority() == ""
    assert entries[0].Property("ID") == ""
    assert entries[0].Closed() == ""
    assert entries[0].Scheduled() == ""
    assert entries[0].Deadline() == ""
 # ----------------------------------------------------------------------------------------------------
 def test_parse_minimal_entry(tmp_path):
    "Test parsing of entry with minimal fields"
@@ -81,18 +103,17 @@ Body Line 1
 Body Line 2
 '''
    orgfile = create_file(tmp_path, entry)
    normalized_orgfile = f'~/{relpath(orgfile, start=Path.home())}'
    # Act
    entries = orgnode.makelist(orgfile)
    # Assert
    # SOURCE link rendered with Heading
-    assert f':SOURCE: [[file:{normalized_orgfile}::*{entries[0].Heading()}]]' in f'{entries[0]}'
+    assert f':SOURCE: [[file:{orgfile}::*{entries[0].Heading()}]]' in f'{entries[0]}'
    # ID link rendered with ID
    assert f':ID: id:123-456-789-4234-1231' in f'{entries[0]}'
    # LINE link rendered with line number
-    assert f':LINE: file:{normalized_orgfile}::2' in f'{entries[0]}'
+    assert f':LINE: file:{orgfile}::2' in f'{entries[0]}'
 # ----------------------------------------------------------------------------------------------------
@@ -115,8 +136,7 @@ Body Line 1'''
    # parsed heading from entry
    assert entries[0].Heading() == "Heading[1]"
    # ensure SOURCE link has square brackets in filename, heading escaped in rendered entries
-    normalized_orgfile = f'~/{relpath(orgfile, start=Path.home())}'
+    escaped_orgfile = f'{orgfile}'.replace("[1]", "\\[1\\]")
    escaped_orgfile = f'{normalized_orgfile}'.replace("[1]", "\\[1\\]")
    assert f':SOURCE: [[file:{escaped_orgfile}::*Heading\[1\]' in f'{entries[0]}'
@@ -168,6 +188,76 @@ Body 2
        assert entry.Logbook() == [(datetime.datetime(1984,4,index+1,9,0,0), datetime.datetime(1984,4,index+1,12,0,0))]
 # ----------------------------------------------------------------------------------------------------
 def test_parse_entry_with_empty_title(tmp_path):
    "Test parsing of entry with minimal fields"
    # Arrange
    entry = f'''#+TITLE: 
 Body Line 1'''
    orgfile = create_file(tmp_path, entry)
    # Act
    entries = orgnode.makelist(orgfile)
    # Assert
    assert len(entries) == 1
    assert entries[0].Heading() == f'{orgfile}'
    assert entries[0].Tags() == set()
    assert entries[0].Body() == "Body Line 1"
    assert entries[0].Priority() == ""
    assert entries[0].Property("ID") == ""
    assert entries[0].Closed() == ""
    assert entries[0].Scheduled() == ""
    assert entries[0].Deadline() == ""
 # ----------------------------------------------------------------------------------------------------
 def test_parse_entry_with_title_and_no_headings(tmp_path):
    "Test parsing of entry with minimal fields"
    # Arrange
    entry = f'''#+TITLE: test
 Body Line 1'''
    orgfile = create_file(tmp_path, entry)
    # Act
    entries = orgnode.makelist(orgfile)
    # Assert
    assert len(entries) == 1
    assert entries[0].Heading() == 'test'
    assert entries[0].Tags() == set()
    assert entries[0].Body() == "Body Line 1"
    assert entries[0].Priority() == ""
    assert entries[0].Property("ID") == ""
    assert entries[0].Closed() == ""
    assert entries[0].Scheduled() == ""
    assert entries[0].Deadline() == ""
 # ----------------------------------------------------------------------------------------------------
 def test_parse_entry_with_multiple_titles_and_no_headings(tmp_path):
    "Test parsing of entry with minimal fields"
    # Arrange
    entry = f'''#+TITLE: title1 
 Body Line 1
 #+TITLE:  title2  '''
    orgfile = create_file(tmp_path, entry)
    # Act
    entries = orgnode.makelist(orgfile)
    # Assert
    assert len(entries) == 1
    assert entries[0].Heading() == 'title1 title2'
    assert entries[0].Tags() == set()
    assert entries[0].Body() == "Body Line 1\n"
    assert entries[0].Priority() == ""
    assert entries[0].Property("ID") == ""
    assert entries[0].Closed() == ""
    assert entries[0].Scheduled() == ""
    assert entries[0].Deadline() == ""
 # Helper Functions
 def create_file(tmp_path, entry, filename="test.org"):
    org_file = tmp_path / f"notes/{filename}"
--- a/tests/test_text_search.py
+++ b/tests/test_text_search.py
@@ -1,6 +1,10 @@
 # System Packages
 from copy import deepcopy
 from pathlib import Path
 # External Packages
 import pytest
 # Internal Packages
 from src.utils.state import model
 from src.search_type import text_search
@@ -9,6 +13,25 @@ from src.processor.org_mode.org_to_jsonl import org_to_jsonl
 # Test
 # ----------------------------------------------------------------------------------------------------
 def test_asymmetric_setup_with_empty_file_raises_error(content_config: ContentConfig, search_config: SearchConfig):
    # Arrange
    file_to_index = Path(content_config.org.input_filter).parent / "new_file_to_index.org"
    file_to_index.touch()
    new_org_content_config = deepcopy(content_config.org)
    new_org_content_config.input_files = [f'{file_to_index}']
    new_org_content_config.input_filter = None
    # Act
    # Generate notes embeddings during asymmetric setup
    with pytest.raises(ValueError, match=r'^No valid entries found*'):
        text_search.setup(org_to_jsonl, new_org_content_config, search_config.asymmetric, regenerate=True)
    # Cleanup
    # delete created test file
    file_to_index.unlink()
 # ----------------------------------------------------------------------------------------------------
 def test_asymmetric_setup(content_config: ContentConfig, search_config: SearchConfig):
    # Act
@@ -23,7 +46,7 @@ def test_asymmetric_setup(content_config: ContentConfig, search_config: SearchCo
 # ----------------------------------------------------------------------------------------------------
 def test_asymmetric_search(content_config: ContentConfig, search_config: SearchConfig):
    # Arrange
-    model.notes_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False)
+    model.notes_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=True)
    query = "How to git install application?"
    # Act