mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 13:18:18 +00:00
Support Indexing Docx Files (#801)
* Add support for indexing docx files and associated unit tests --------- Co-authored-by: sabaimran <narmiabas@gmail.com>
This commit is contained in:
BIN
tests/data/docx/bangalore.docx
vendored
Normal file
BIN
tests/data/docx/bangalore.docx
vendored
Normal file
Binary file not shown.
BIN
tests/data/docx/iceland.docx
vendored
Normal file
BIN
tests/data/docx/iceland.docx
vendored
Normal file
Binary file not shown.
@@ -61,7 +61,7 @@ def test_search_with_invalid_content_type(client):
|
||||
@pytest.mark.django_db(transaction=True)
|
||||
def test_search_with_valid_content_type(client):
|
||||
headers = {"Authorization": "Bearer kk-secret"}
|
||||
for content_type in ["all", "org", "markdown", "image", "pdf", "github", "notion", "plaintext"]:
|
||||
for content_type in ["all", "org", "markdown", "image", "pdf", "github", "notion", "plaintext", "docx"]:
|
||||
# Act
|
||||
response = client.get(f"/api/search?q=random&t={content_type}", headers=headers)
|
||||
# Assert
|
||||
@@ -480,6 +480,14 @@ def get_sample_files_data():
|
||||
("files", ("path/to/filename1.txt", "<html>my first web page</html>", "text/plain")),
|
||||
("files", ("path/to/filename2.txt", "2021-02-02 Journal Entry", "text/plain")),
|
||||
("files", ("path/to/filename.md", "# Notes from client call", "text/markdown")),
|
||||
(
|
||||
"files",
|
||||
(
|
||||
"path/to/filename.docx",
|
||||
"## Studying anthropological records from the Fatimid caliphate",
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
),
|
||||
),
|
||||
(
|
||||
"files",
|
||||
("path/to/filename1.md", "## Studying anthropological records from the Fatimid caliphate", "text/markdown"),
|
||||
|
||||
37
tests/test_docx_to_entries.py
Normal file
37
tests/test_docx_to_entries.py
Normal file
@@ -0,0 +1,37 @@
|
||||
import os
|
||||
|
||||
from khoj.processor.content.docx.docx_to_entries import DocxToEntries
|
||||
|
||||
|
||||
def test_single_page_docx_to_jsonl():
    """Check DocxToEntries.extract_docx_entries on the iceland.docx fixture."""
    # Arrange
    # The fixture path doubles as the key of the input mapping and as the key
    # used to look up the extracted text in the result below.
    # NOTE(review): the path is relative, so this presumably expects to be run
    # from the repository root - confirm.
    docx_path = "tests/data/docx/iceland.docx"
    with open(docx_path, "rb") as docx_file:
        docx_files = {docx_path: docx_file.read()}

    # Act
    extracted = DocxToEntries.extract_docx_entries(docx_files=docx_files)

    # Assert
    # First element of the result maps the file path to its extracted text.
    first_text = extracted[0][docx_path][0]
    assert "The Icelandic horse" in first_text
    assert len(extracted) == 2
    assert len(extracted[1]) == 1
|
||||
|
||||
|
||||
def test_multi_page_docx_to_jsonl():
    """Check DocxToEntries.extract_docx_entries on the bangalore.docx fixture."""
    # Arrange
    # The fixture path doubles as the key of the input mapping and as the key
    # used to look up the extracted text in the result below.
    # NOTE(review): the path is relative, so this presumably expects to be run
    # from the repository root - confirm.
    docx_path = "tests/data/docx/bangalore.docx"
    with open(docx_path, "rb") as docx_file:
        docx_files = {docx_path: docx_file.read()}

    # Act
    extracted = DocxToEntries.extract_docx_entries(docx_files=docx_files)

    # Assert
    # First element of the result maps the file path to its extracted text.
    first_text = extracted[0][docx_path][0]
    assert "Bangalore" in first_text
    assert len(extracted) == 2
    assert len(extracted[1]) == 1
|
||||
Reference in New Issue
Block a user