mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-09 05:39:12 +00:00
Fix extracting Markdown Entries with Top Level Headings
- Previously, top-level headings would get stripped of the space between the heading text and the prefix # symbols. That is, `# Top Level Heading` would get converted to `#Top Level Heading` - This would mess up their rendering as a heading in search results - Add unit tests to text_to_jsonl processors to prevent regression
This commit is contained in:
@@ -154,6 +154,25 @@ def test_get_org_files(tmp_path):
|
||||
assert extracted_org_files == expected_files
|
||||
|
||||
|
||||
def test_extract_entries_with_different_level_headings(tmp_path):
    """Extract org entries with different level headings."""
    # Arrange: Org text holding a one-star heading followed by a two-star heading.
    org_text = '''
* Heading 1
** Heading 2
'''
    org_path = create_file(tmp_path, org_text)

    # Act: extract entries from the Org file just written; the second
    # element of the returned pair is not needed by this test.
    parsed_entries, _ = OrgToJsonl.extract_org_entries(org_files=[org_path])

    # Assert: one entry per heading, each rendered with its star prefix
    # still separated from the heading text by a space.
    assert len(parsed_entries) == 2
    first_rendered = format(parsed_entries[0])
    assert first_rendered.startswith("* Heading 1")
    second_rendered = format(parsed_entries[1])
    assert second_rendered.startswith("** Heading 2")
|
||||
|
||||
|
||||
# Helper Functions
|
||||
def create_file(tmp_path, entry=None, filename="test.org"):
|
||||
org_file = tmp_path / filename
|
||||
|
||||
Reference in New Issue
Block a user