Add support for indexing plaintext files (#420)

* Add support for indexing plaintext files - Adds backend support for parsing plaintext files generically (.html, .txt, .xml, .csv, .md) - Add equivalent frontend views for setting up plaintext file indexing - Update config, rawconfig, default config, search API, setup endpoints * Add a nifty plaintext file icon to configure plaintext files in the Web UI * Use generic glob path for plaintext files. Skip indexing files that aren't in whitelist
2026-03-02 21:19:12 +00:00 · 2023-08-09 22:44:40 +00:00
parent 84d774ea34
commit 7b907add77
14 changed files with 314 additions and 15 deletions
--- a/tests/data/config.yml
+++ b/tests/data/config.yml
@@ -25,4 +25,4 @@ search-type:
  asymmetric:
    cross-encoder: cross-encoder/ms-marco-MiniLM-L-6-v2
    encoder: sentence-transformers/msmarco-MiniLM-L-6-v3
-version: 0.9.1.dev0
+version: 0.10.1
--- a/tests/test_plaintext_to_jsonl.py
+++ b/tests/test_plaintext_to_jsonl.py
@@ -0,0 +1,84 @@
+# Standard Packages
+import json
+from pathlib import Path
+
+# Internal Packages
+from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl
+
+
+def test_plaintext_file(tmp_path):
+    "Convert files with no heading to jsonl."
+    # Arrange
+    entry = f"""
+    Hi, I am a plaintext file and I have some plaintext words.
+    """
+    plaintextfile = create_file(tmp_path, entry)
+
+    filename = plaintextfile.stem
+
+    # Act
+    # Extract Entries from specified plaintext files
+    file_to_entries = PlaintextToJsonl.extract_plaintext_entries(plaintext_files=[plaintextfile])
+
+    maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(file_to_entries)
+
+    # Convert each entry.file to absolute path to make them JSON serializable
+    for map in maps:
+        map.file = str(Path(map.file).absolute())
+
+    # Process Each Entry from All Notes Files
+    jsonl_string = PlaintextToJsonl.convert_entries_to_jsonl(maps)
+    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
+
+    # Assert
+    assert len(jsonl_data) == 1
+    # Ensure raw entry with no headings do not get heading prefix prepended
+    assert not jsonl_data[0]["raw"].startswith("#")
+    # Ensure compiled entry has filename prepended as top level heading
+    assert jsonl_data[0]["compiled"] == f"{filename}\n{entry}"
+
+
+def test_get_plaintext_files(tmp_path):
+    "Ensure Plaintext files specified via input-filter, input-files extracted"
+    # Arrange
+    # Include via input-filter globs
+    group1_file1 = create_file(tmp_path, filename="group1-file1.md")
+    group1_file2 = create_file(tmp_path, filename="group1-file2.md")
+
+    group2_file1 = create_file(tmp_path, filename="group2-file1.markdown")
+    group2_file2 = create_file(tmp_path, filename="group2-file2.markdown")
+    group2_file3 = create_file(tmp_path, filename="group2-file3.mbox")
+    group2_file4 = create_file(tmp_path, filename="group2-file4.html")
+    # Include via input-file field
+    file1 = create_file(tmp_path, filename="notes.txt")
+    # Include unsupported file types
+    create_file(tmp_path, filename="group2-unincluded.py")
+    create_file(tmp_path, filename="group2-unincluded.csv")
+    create_file(tmp_path, filename="group2-unincluded.csv")
+    # Not included by any filter
+    create_file(tmp_path, filename="not-included-markdown.md")
+    create_file(tmp_path, filename="not-included-text.txt")
+
+    expected_files = sorted(
+        map(str, [group1_file1, group1_file2, group2_file1, group2_file2, file1, group2_file3, group2_file4])
+    )
+
+    # Setup input-files, input-filters
+    input_files = [tmp_path / "notes.txt"]
+    input_filter = [tmp_path / "group1*.md", tmp_path / "group2*.*"]
+
+    # Act
+    extracted_plaintext_files = PlaintextToJsonl.get_plaintext_files(input_files, input_filter)
+
+    # Assert
+    assert len(extracted_plaintext_files) == 7
+    assert set(extracted_plaintext_files) == set(expected_files)
+
+
+# Helper Functions
+def create_file(tmp_path: Path, entry=None, filename="test.md"):
+    file_ = tmp_path / filename
+    file_.touch()
+    if entry:
+        file_.write_text(entry)
+    return file_