Support Indexing Docx Files (#801)

* Add support for indexing docx files and associated unit tests

---------

Co-authored-by: sabaimran <narmiabas@gmail.com>
2026-03-02 13:18:18 +00:00 · 2024-06-20 01:48:01 -04:00
parent d4e5c95711
commit bd3b590153
15 changed files with 193 additions and 4 deletions
--- a/tests/data/docx/bangalore.docx
+++ b/tests/data/docx/bangalore.docx
--- a/tests/data/docx/iceland.docx
+++ b/tests/data/docx/iceland.docx
--- a/tests/test_client.py
+++ b/tests/test_client.py
@@ -61,7 +61,7 @@ def test_search_with_invalid_content_type(client):
@pytest.mark.django_db(transaction=True)
 def test_search_with_valid_content_type(client):
    headers = {"Authorization": "Bearer kk-secret"}
-    for content_type in ["all", "org", "markdown", "image", "pdf", "github", "notion", "plaintext"]:
+    for content_type in ["all", "org", "markdown", "image", "pdf", "github", "notion", "plaintext", "docx"]:
        # Act
        response = client.get(f"/api/search?q=random&t={content_type}", headers=headers)
        # Assert
@@ -480,6 +480,14 @@ def get_sample_files_data():
        ("files", ("path/to/filename1.txt", "<html>my first web page</html>", "text/plain")),
        ("files", ("path/to/filename2.txt", "2021-02-02 Journal Entry", "text/plain")),
        ("files", ("path/to/filename.md", "# Notes from client call", "text/markdown")),
+        (
+            "files",
+            (
+                "path/to/filename.docx",
+                "## Studying anthropological records from the Fatimid caliphate",
+                "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            ),
+        ),
        (
            "files",
            ("path/to/filename1.md", "## Studying anthropological records from the Fatimid caliphate", "text/markdown"),
--- a/tests/test_docx_to_entries.py
+++ b/tests/test_docx_to_entries.py
@@ -0,0 +1,37 @@
+import os
+
+from khoj.processor.content.docx.docx_to_entries import DocxToEntries
+
+
+def test_single_page_docx_to_jsonl():
+    "Convert single page DOCX file to jsonl."
+    # Arrange, then Act
+    # Read the iceland.docx fixture into memory as bytes, then
+    # extract entries from a {file path: file bytes} mapping
+    with open("tests/data/docx/iceland.docx", "rb") as f:
+        docx_bytes = f.read()
+
+    data = {"tests/data/docx/iceland.docx": docx_bytes}
+    entries = DocxToEntries.extract_docx_entries(docx_files=data)
+
+    # Assert: result has 2 elements; element 0 is keyed by file path, element 1 holds 1 item
+    assert "The Icelandic horse" in entries[0]["tests/data/docx/iceland.docx"][0]
+    assert len(entries) == 2
+    assert len(entries[1]) == 1
+
+
+def test_multi_page_docx_to_jsonl():
+    "Convert multi page DOCX file to jsonl."
+    # Arrange, then Act
+    # Read the bangalore.docx fixture into memory as bytes, then
+    # extract entries from a {file path: file bytes} mapping
+    with open("tests/data/docx/bangalore.docx", "rb") as f:
+        docx_bytes = f.read()
+
+    data = {"tests/data/docx/bangalore.docx": docx_bytes}
+    entries = DocxToEntries.extract_docx_entries(docx_files=data)
+
+    # Assert: result has 2 elements; element 0 is keyed by file path, element 1 holds 1 item
+    assert "Bangalore" in entries[0]["tests/data/docx/bangalore.docx"][0]
+    assert len(entries) == 2
+    assert len(entries[1]) == 1