Files
khoj/tests/test_markdown_to_entries.py
Debanjum Singh Solanky a627f56a64 Remove unused Entry to Jsonl converter from text to entry class, tests
This was used earlier, when the index was a plaintext jsonl file. Now
that documents are indexed in a DB, this function is no longer required.

Simplify org,md,pdf,plaintext to entries tests by removing the entry
to jsonl conversion step
2024-04-04 02:41:55 +05:30

145 lines
4.2 KiB
Python

import json
import os
from pathlib import Path
from khoj.processor.content.markdown.markdown_to_entries import MarkdownToEntries
from khoj.utils.fs_syncer import get_markdown_files
from khoj.utils.rawconfig import TextContentConfig
def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
    """Extract a single entry from markdown content that has no headings."""
    # Arrange: heading-less markdown content, keyed by the path it belongs to
    markdown_body = (
        "\n"
        "- Bullet point 1\n"
        "- Bullet point 2\n"
    )
    markdown_files = {str(tmp_path): markdown_body}
    heading_from_filename = f"# {tmp_path.stem}"

    # Act: extract entries from the specified markdown files
    entries = MarkdownToEntries.extract_markdown_entries(markdown_files=markdown_files)

    # Assert
    assert len(entries) == 1
    # The raw entry has no headings, so it must not get a heading prefix prepended
    assert not entries[0].raw.startswith("#")
    # The compiled entry must start with the file name as a top level heading
    assert entries[0].compiled.startswith(heading_from_filename)
    # The compiled entry must also include the full file path
    assert str(tmp_path) in entries[0].compiled
def test_single_markdown_entry_to_jsonl(tmp_path):
    """Extract exactly one entry from markdown content with a single heading."""
    # Arrange: a heading, a whitespace-only line (tab + carriage return), then body text
    markdown_body = (
        "### Heading\n"
        "\t\r\n"
        "Body Line 1\n"
    )
    markdown_files = {str(tmp_path): markdown_body}

    # Act: extract entries from the specified markdown files
    entries = MarkdownToEntries.extract_markdown_entries(markdown_files=markdown_files)

    # Assert
    assert len(entries) == 1
def test_multiple_markdown_entries_to_jsonl(tmp_path):
    """Extract one entry per heading from markdown content with two headings."""
    # Arrange: two same-level headings, each followed by a whitespace-only line and a body line
    markdown_body = (
        "\n"
        "### Heading 1\n"
        "\t\r\n"
        "Heading 1 Body Line 1\n"
        "### Heading 2\n"
        "\t\r\n"
        "Heading 2 Body Line 2\n"
    )
    markdown_files = {str(tmp_path): markdown_body}

    # Act: extract entries from the specified markdown files
    entries = MarkdownToEntries.extract_markdown_entries(markdown_files=markdown_files)

    # Assert
    assert len(entries) == 2
    # Every compiled entry must mention the markdown file it originates from
    assert all(tmp_path.stem in extracted.compiled for extracted in entries)
def test_get_markdown_files(tmp_path):
    """Ensure markdown files selected via input-filter and input-files are extracted."""
    # Arrange
    # Files expected to be matched by the input-filter globs
    group1_file1 = create_file(tmp_path, filename="group1-file1.md")
    group1_file2 = create_file(tmp_path, filename="group1-file2.md")
    group2_file1 = create_file(tmp_path, filename="group2-file1.markdown")
    group2_file2 = create_file(tmp_path, filename="group2-file2.markdown")

    # File listed explicitly via the input-files field
    file1 = create_file(tmp_path, filename="notes.md")

    # Files that neither the input-files nor the input-filter entries should select
    create_file(tmp_path, filename="not-included-markdown.md")
    create_file(tmp_path, filename="not-included-text.txt")

    expected_files = {
        os.path.join(tmp_path, file.name)
        for file in [group1_file1, group1_file2, group2_file1, group2_file2, file1]
    }

    # Setup input-files, input-filters
    input_files = [tmp_path / "notes.md"]
    input_filter = [tmp_path / "group1*.md", tmp_path / "group2*.markdown"]

    markdown_config = TextContentConfig(
        input_files=input_files,
        # Glob patterns are passed to the config as plain strings
        input_filter=[str(pattern) for pattern in input_filter],
        compressed_jsonl=tmp_path / "test.jsonl",
        embeddings_file=tmp_path / "test_embeddings.jsonl",
    )

    # Act
    extracted_markdown_files = get_markdown_files(markdown_config)

    # Assert
    assert len(extracted_markdown_files) == 5
    assert set(extracted_markdown_files.keys()) == expected_files
def test_extract_entries_with_different_level_headings(tmp_path):
    """Extract one entry per heading when the headings are at different levels."""
    # Arrange: a level-1 heading directly followed by a level-2 heading
    markdown_body = (
        "\n"
        "# Heading 1\n"
        "## Heading 2\n"
    )
    markdown_files = {str(tmp_path): markdown_body}

    # Act: extract entries from the specified markdown files
    entries = MarkdownToEntries.extract_markdown_entries(markdown_files=markdown_files)

    # Assert: each heading becomes its own entry, in document order
    assert len(entries) == 2
    assert entries[0].raw == "# Heading 1"
    assert entries[1].raw == "## Heading 2"
# Helper Functions
def create_file(tmp_path: Path, entry=None, filename="test.md") -> Path:
    """Create a file named `filename` inside `tmp_path` and return its path.

    Args:
        tmp_path: Directory in which to create the file.
        entry: Optional text to write into the file. When omitted or empty,
            no content is written.
        filename: Name of the file to create inside `tmp_path`.

    Returns:
        Path to the created file.
    """
    markdown_file = tmp_path / filename
    # Make sure the file exists even when there is no content to write
    markdown_file.touch()
    if entry:
        # Write as UTF-8 explicitly so non-ASCII content does not depend on
        # the platform's default encoding
        markdown_file.write_text(entry, encoding="utf-8")
    return markdown_file