Standardize structure of text to entries to match other entry processors

Add process_single_plaintext_file func etc with similar signatures as
org_to_entries and markdown_to_entries processors

This standardization makes future modifications and abstractions easier to create.
This commit is contained in:
Debanjum Singh Solanky
2024-04-08 23:49:34 +05:30
parent 079f409238
commit 8291b898ca
3 changed files with 84 additions and 28 deletions

View File

@@ -1,6 +1,5 @@
# Standard Modules
import os
from io import BytesIO
from urllib.parse import quote
import pytest

View File

@@ -15,8 +15,6 @@ def test_plaintext_file(tmp_path):
"""
plaintextfile = create_file(tmp_path, raw_entry)
filename = plaintextfile.stem
# Act
# Extract Entries from specified plaintext files
@@ -24,7 +22,7 @@ def test_plaintext_file(tmp_path):
f"{plaintextfile}": raw_entry,
}
entries = PlaintextToEntries.extract_plaintext_entries(entry_to_file_map=data)
entries = PlaintextToEntries.extract_plaintext_entries(data)
# Convert each entry.file to absolute path to make them JSON serializable
for entry in entries:
@@ -35,7 +33,7 @@ def test_plaintext_file(tmp_path):
# Ensure raw entry with no headings do not get heading prefix prepended
assert not entries[0].raw.startswith("#")
# Ensure compiled entry has filename prepended as top level heading
assert entries[0].compiled == f"{filename}\n{raw_entry}"
assert entries[0].compiled == f"{plaintextfile}\n{raw_entry}"
def test_get_plaintext_files(tmp_path):
@@ -100,6 +98,35 @@ def test_parse_html_plaintext_file(content_config, default_user: KhojUser):
assert "<div>" not in entries[0].raw
def test_large_plaintext_file_split_into_multiple_entries(tmp_path):
    "Ensure a plaintext file whose content exceeds max_tokens is split into multiple entries."
    # Arrange
    max_tokens = 256
    # normal_entry has one fewer space-separated word than max_tokens;
    # large_entry has exactly max_tokens words, pushing it past the split threshold
    normal_entry = " ".join([f"{number}" for number in range(max_tokens - 1)])
    large_entry = " ".join([f"{number}" for number in range(max_tokens)])

    normal_plaintextfile = create_file(tmp_path, normal_entry)
    large_plaintextfile = create_file(tmp_path, large_entry)

    # Map file path -> raw file content, the input shape extract_plaintext_entries expects
    normal_data = {f"{normal_plaintextfile}": normal_entry}
    large_data = {f"{large_plaintextfile}": large_entry}

    # Act
    # Extract entries from the plaintext files, then split each entry by max token count
    normal_entries = PlaintextToEntries.split_entries_by_max_tokens(
        PlaintextToEntries.extract_plaintext_entries(normal_data),
        max_tokens=max_tokens,
        raw_is_compiled=True,
    )
    large_entries = PlaintextToEntries.split_entries_by_max_tokens(
        PlaintextToEntries.extract_plaintext_entries(large_data), max_tokens=max_tokens, raw_is_compiled=True
    )

    # Assert
    # The within-limit file stays one entry; the over-limit file is split into two
    assert len(normal_entries) == 1
    assert len(large_entries) == 2
# Helper Functions
def create_file(tmp_path: Path, entry=None, filename="test.md"):
file_ = tmp_path / filename