Support Indexing Docx Files (#801)

* Add support for indexing docx files, along with associated unit tests

---------

Co-authored-by: sabaimran <narmiabas@gmail.com>
This commit is contained in:
Raghav Tirumale
2024-06-20 01:48:01 -04:00
committed by GitHub
parent d4e5c95711
commit bd3b590153
15 changed files with 193 additions and 4 deletions

View File

@@ -0,0 +1,37 @@
import os
from khoj.processor.content.docx.docx_to_entries import DocxToEntries
def test_single_page_docx_to_jsonl():
    """Extract entries from the single page DOCX fixture and check the result's shape."""
    # Arrange
    # Load the fixture's raw bytes into a mapping keyed by the fixture path
    docx_path = "tests/data/docx/iceland.docx"
    with open(docx_path, "rb") as docx_file:
        docx_files = {docx_path: docx_file.read()}

    # Act
    # Run entry extraction over the in-memory DOCX content
    extracted = DocxToEntries.extract_docx_entries(docx_files=docx_files)

    # Assert
    # The first element maps the file path to its extracted text; check the leading item
    leading_text = extracted[0][docx_path][0]
    assert "The Icelandic horse" in leading_text
    # The extractor is expected to return a pair whose second element holds one item
    assert len(extracted) == 2
    assert len(extracted[1]) == 1
def test_multi_page_docx_to_jsonl():
    """Extract entries from the multi page DOCX fixture and check the result's shape."""
    # Arrange
    # Load the fixture's raw bytes into a mapping keyed by the fixture path
    docx_path = "tests/data/docx/bangalore.docx"
    with open(docx_path, "rb") as docx_file:
        docx_files = {docx_path: docx_file.read()}

    # Act
    # Run entry extraction over the in-memory DOCX content
    extracted = DocxToEntries.extract_docx_entries(docx_files=docx_files)

    # Assert
    # The first element maps the file path to its extracted text; check the leading item
    leading_text = extracted[0][docx_path][0]
    assert "Bangalore" in leading_text
    # The extractor is expected to return a pair whose second element holds one item
    assert len(extracted) == 2
    assert len(extracted[1]) == 1