Fix extracting Markdown Entries with Top Level Headings

- Previously, top-level headings would get stripped of the space between the heading text and the prefix # symbols. That is, `# Top Level Heading` would get converted to `#Top Level Heading`. - This would mess up their rendering as a heading in search results. - Add unit tests to text_to_jsonl processors to prevent regression.
2026-03-09 05:39:12 +00:00 · 2023-01-17 12:42:36 -03:00
parent 1a296518c5
commit 7b4f78776c
3 changed files with 44 additions and 4 deletions
--- a/tests/test_org_to_jsonl.py
+++ b/tests/test_org_to_jsonl.py
@@ -154,6 +154,25 @@ def test_get_org_files(tmp_path):
    assert extracted_org_files == expected_files


+def test_extract_entries_with_different_level_headings(tmp_path):
+    "Check that org headings at different levels are each extracted as an entry with the heading prefix intact."
+    # Arrange: one org file holding a level-1 heading followed by a level-2 heading
+    entry = f'''
+* Heading 1
+** Heading 2
+'''
+    orgfile = create_file(tmp_path, entry)
+
+    # Act
+    # Extract entries from the org file written above; the second return value is not used here
+    entries, _ = OrgToJsonl.extract_org_entries(org_files=[orgfile])
+
+    # Assert: two entries come back, and each one's string form starts with its stars, a space, then the title
+    assert len(entries) == 2
+    assert f'{entries[0]}'.startswith("* Heading 1")
+    assert f'{entries[1]}'.startswith("** Heading 2")
+
+
 # Helper Functions
 def create_file(tmp_path, entry=None, filename="test.org"):
    org_file = tmp_path / filename