Create wrapper function to get entries from org, md, pdf & text files

- Convert the extract_org_entries function to actually extract org entries.
  Previously it was extracting intermediary org-node objects instead.
  Now it extracts the org-node objects from files and converts them into entries.
- Create a separate, new function, extract_org_nodes, to extract org nodes from files.
- Similarly, create wrapper functions to convert md, pdf and plaintext files to entries.
- Update the org, md, pdf and plaintext to entries tests to use the new, simplified wrapper functions to extract entries.
2026-03-02 13:18:18 +00:00 · 2024-02-09 16:04:41 +05:30
parent f01a12b1d2
commit 28105ee027
8 changed files with 71 additions and 94 deletions
--- a/tests/test_markdown_to_entries.py
+++ b/tests/test_markdown_to_entries.py
@@ -21,12 +21,10 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path):

    # Act
    # Extract Entries from specified Markdown files
-    entry_nodes, file_to_entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
+    entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data)

    # Process Each Entry from All Notes Files
-    jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(
-        MarkdownToEntries.convert_markdown_entries_to_maps(entry_nodes, file_to_entries)
-    )
+    jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(entries)
    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

    # Assert
@@ -52,12 +50,10 @@ def test_single_markdown_entry_to_jsonl(tmp_path):

    # Act
    # Extract Entries from specified Markdown files
-    entries, entry_to_file_map = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
+    entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data)

    # Process Each Entry from All Notes Files
-    jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(
-        MarkdownToEntries.convert_markdown_entries_to_maps(entries, entry_to_file_map)
-    )
+    jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(entries)
    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

    # Assert
@@ -81,8 +77,7 @@ def test_multiple_markdown_entries_to_jsonl(tmp_path):

    # Act
    # Extract Entries from specified Markdown files
-    entry_strings, entry_to_file_map = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
-    entries = MarkdownToEntries.convert_markdown_entries_to_maps(entry_strings, entry_to_file_map)
+    entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data)

    # Process Each Entry from All Notes Files
    jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(entries)
@@ -144,12 +139,12 @@ def test_extract_entries_with_different_level_headings(tmp_path):

    # Act
    # Extract Entries from specified Markdown files
-    entries, _ = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
+    entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data)

    # Assert
    assert len(entries) == 2
-    assert entries[0] == "# Heading 1"
-    assert entries[1] == "## Heading 2"
+    assert entries[0].raw == "# Heading 1"
+    assert entries[1].raw == "## Heading 2"


 # Helper Functions
--- a/tests/test_org_to_entries.py
+++ b/tests/test_org_to_entries.py
@@ -27,9 +27,7 @@ def test_configure_heading_entry_to_jsonl(tmp_path):
        # Act
        # Extract entries into jsonl from specified Org files
        jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(
-            OrgToEntries.convert_org_nodes_to_entries(
-                *OrgToEntries.extract_org_entries(org_files=data), index_heading_entries=index_heading_entries
-            )
+            OrgToEntries.extract_org_entries(org_files=data, index_heading_entries=index_heading_entries)
        )
        jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

@@ -57,13 +55,11 @@ def test_entry_split_when_exceeds_max_words():

    # Act
    # Extract Entries from specified Org files
-    entries, entry_to_file_map = OrgToEntries.extract_org_entries(org_files=data)
+    entries = OrgToEntries.extract_org_entries(org_files=data)

    # Split each entry from specified Org files by max words
    jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(
-        TextToEntries.split_entries_by_max_tokens(
-            OrgToEntries.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=4
-        )
+        TextToEntries.split_entries_by_max_tokens(entries, max_tokens=4)
    )
    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

@@ -107,12 +103,7 @@ def test_entry_with_body_to_jsonl(tmp_path):

    # Act
    # Extract Entries from specified Org files
-    entries, entry_to_file_map = OrgToEntries.extract_org_entries(org_files=data)
-
-    # Process Each Entry from All Notes Files
-    jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(
-        OrgToEntries.convert_org_nodes_to_entries(entries, entry_to_file_map)
-    )
+    jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(OrgToEntries.extract_org_entries(org_files=data))
    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

    # Assert
@@ -134,10 +125,9 @@ Intro text

    # Act
    # Extract Entries from specified Org files
-    entry_nodes, file_to_entries = OrgToEntries.extract_org_entries(org_files=data)
+    entries = OrgToEntries.extract_org_entries(org_files=data)

    # Process Each Entry from All Notes Files
-    entries = OrgToEntries.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
    jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(entries)
    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

@@ -158,10 +148,9 @@ def test_file_with_no_headings_to_jsonl(tmp_path):

    # Act
    # Extract Entries from specified Org files
-    entry_nodes, file_to_entries = OrgToEntries.extract_org_entries(org_files=data)
+    entries = OrgToEntries.extract_org_entries(org_files=data)

    # Process Each Entry from All Notes Files
-    entries = OrgToEntries.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
    jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(entries)
    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

@@ -222,12 +211,12 @@ def test_extract_entries_with_different_level_headings(tmp_path):

    # Act
    # Extract Entries from specified Org files
-    entries, _ = OrgToEntries.extract_org_entries(org_files=data)
+    entries = OrgToEntries.extract_org_entries(org_files=data, index_heading_entries=True)

    # Assert
    assert len(entries) == 2
-    assert f"{entries[0]}".startswith("* Heading 1")
-    assert f"{entries[1]}".startswith("** Heading 2")
+    assert f"{entries[0].raw}".startswith("* Heading 1")
+    assert f"{entries[1].raw}".startswith("** Heading 2")


 # Helper Functions
--- a/tests/test_pdf_to_entries.py
+++ b/tests/test_pdf_to_entries.py
@@ -15,12 +15,10 @@ def test_single_page_pdf_to_jsonl():
        pdf_bytes = f.read()

    data = {"tests/data/pdf/singlepage.pdf": pdf_bytes}
-    entries, entry_to_file_map = PdfToEntries.extract_pdf_entries(pdf_files=data)
+    entries = PdfToEntries.extract_pdf_entries(pdf_files=data)

    # Process Each Entry from All Pdf Files
-    jsonl_string = PdfToEntries.convert_pdf_maps_to_jsonl(
-        PdfToEntries.convert_pdf_entries_to_maps(entries, entry_to_file_map)
-    )
+    jsonl_string = PdfToEntries.convert_pdf_maps_to_jsonl(entries)
    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

    # Assert
@@ -35,12 +33,10 @@ def test_multi_page_pdf_to_jsonl():
        pdf_bytes = f.read()

    data = {"tests/data/pdf/multipage.pdf": pdf_bytes}
-    entries, entry_to_file_map = PdfToEntries.extract_pdf_entries(pdf_files=data)
+    entries = PdfToEntries.extract_pdf_entries(pdf_files=data)

    # Process Each Entry from All Pdf Files
-    jsonl_string = PdfToEntries.convert_pdf_maps_to_jsonl(
-        PdfToEntries.convert_pdf_entries_to_maps(entries, entry_to_file_map)
-    )
+    jsonl_string = PdfToEntries.convert_pdf_maps_to_jsonl(entries)
    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

    # Assert
@@ -55,10 +51,7 @@ def test_ocr_page_pdf_to_jsonl():
        pdf_bytes = f.read()

    data = {"tests/data/pdf/ocr_samples.pdf": pdf_bytes}
-    entries, entry_to_file_map = PdfToEntries.extract_pdf_entries(pdf_files=data)
-
-    # Process Each Entry from All Pdf Files
-    entries = PdfToEntries.convert_pdf_entries_to_maps(entries, entry_to_file_map)
+    entries = PdfToEntries.extract_pdf_entries(pdf_files=data)

    assert len(entries) == 1
    assert "playing on a strip of marsh" in entries[0].raw
--- a/tests/test_plaintext_to_entries.py
+++ b/tests/test_plaintext_to_entries.py
@@ -11,10 +11,10 @@ from khoj.utils.rawconfig import TextContentConfig
 def test_plaintext_file(tmp_path):
    "Convert files with no heading to jsonl."
    # Arrange
-    entry = f"""
+    raw_entry = f"""
    Hi, I am a plaintext file and I have some plaintext words.
    """
-    plaintextfile = create_file(tmp_path, entry)
+    plaintextfile = create_file(tmp_path, raw_entry)

    filename = plaintextfile.stem

@@ -22,17 +22,17 @@ def test_plaintext_file(tmp_path):
    # Extract Entries from specified plaintext files

    data = {
-        f"{plaintextfile}": entry,
+        f"{plaintextfile}": raw_entry,
    }

-    maps = PlaintextToEntries.convert_plaintext_entries_to_maps(entry_to_file_map=data)
+    entries = PlaintextToEntries.extract_plaintext_entries(entry_to_file_map=data)

    # Convert each entry.file to absolute path to make them JSON serializable
-    for map in maps:
-        map.file = str(Path(map.file).absolute())
+    for entry in entries:
+        entry.file = str(Path(entry.file).absolute())

    # Process Each Entry from All Notes Files
-    jsonl_string = PlaintextToEntries.convert_entries_to_jsonl(maps)
+    jsonl_string = PlaintextToEntries.convert_entries_to_jsonl(entries)
    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

    # Assert
@@ -40,7 +40,7 @@ def test_plaintext_file(tmp_path):
    # Ensure raw entry with no headings do not get heading prefix prepended
    assert not jsonl_data[0]["raw"].startswith("#")
    # Ensure compiled entry has filename prepended as top level heading
-    assert jsonl_data[0]["compiled"] == f"{filename}\n{entry}"
+    assert jsonl_data[0]["compiled"] == f"{filename}\n{raw_entry}"


 def test_get_plaintext_files(tmp_path):
@@ -98,11 +98,11 @@ def test_parse_html_plaintext_file(content_config, default_user: KhojUser):
    extracted_plaintext_files = get_plaintext_files(config=config)

    # Act
-    maps = PlaintextToEntries.convert_plaintext_entries_to_maps(extracted_plaintext_files)
+    entries = PlaintextToEntries.extract_plaintext_entries(extracted_plaintext_files)

    # Assert
-    assert len(maps) == 1
-    assert "<div>" not in maps[0].raw
+    assert len(entries) == 1
+    assert "<div>" not in entries[0].raw


 # Helper Functions