mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 13:18:18 +00:00
- Convert the extract_org_entries function to actually extract org entries. Previously it was extracting intermediary org-node objects instead. Now it extracts the org-node objects from files and converts them into entries. - Create a separate, new function to extract_org_nodes from files. - Similarly, create wrapper funcs for md, pdf, plaintext to entries. - Update the org, md, pdf, plaintext to entries tests to use the new simplified wrapper function to extract org entries.
104 lines
3.4 KiB
Python
104 lines
3.4 KiB
Python
import json
|
|
import os
|
|
|
|
from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries
|
|
from khoj.utils.fs_syncer import get_pdf_files
|
|
from khoj.utils.rawconfig import TextContentConfig
|
|
|
|
|
|
def test_single_page_pdf_to_jsonl():
    """Check that the single page PDF fixture converts to exactly one jsonl record."""
    # Arrange
    # Load singlepage.pdf as raw bytes, keyed by its path
    pdf_path = "tests/data/pdf/singlepage.pdf"
    with open(pdf_path, "rb") as pdf_handle:
        pdf_files = {pdf_path: pdf_handle.read()}

    # Act
    # Extract entries from the in-memory PDF, then serialize them to jsonl
    extracted = PdfToEntries.extract_pdf_entries(pdf_files=pdf_files)
    serialized = PdfToEntries.convert_pdf_maps_to_jsonl(extracted)
    # Each jsonl line must parse as a standalone JSON document
    records = list(map(json.loads, serialized.splitlines()))

    # Assert
    assert len(records) == 1
|
|
|
|
|
|
def test_multi_page_pdf_to_jsonl():
    """Check that the multi page PDF fixture converts to six jsonl records."""
    # Arrange
    # Load multipage.pdf as raw bytes, keyed by its path
    pdf_path = "tests/data/pdf/multipage.pdf"
    with open(pdf_path, "rb") as pdf_handle:
        pdf_files = {pdf_path: pdf_handle.read()}

    # Act
    # Extract entries from the in-memory PDF, then serialize them to jsonl
    extracted = PdfToEntries.extract_pdf_entries(pdf_files=pdf_files)
    serialized = PdfToEntries.convert_pdf_maps_to_jsonl(extracted)
    # Each jsonl line must parse as a standalone JSON document
    records = list(map(json.loads, serialized.splitlines()))

    # Assert
    assert len(records) == 6
|
|
|
|
|
|
def test_ocr_page_pdf_to_jsonl():
    """Check that the OCR sample PDF fixture yields one entry containing the expected text."""
    # Arrange
    # Load ocr_samples.pdf as raw bytes, keyed by its path
    pdf_path = "tests/data/pdf/ocr_samples.pdf"
    with open(pdf_path, "rb") as pdf_handle:
        pdf_files = {pdf_path: pdf_handle.read()}

    # Act
    extracted = PdfToEntries.extract_pdf_entries(pdf_files=pdf_files)

    # Assert
    assert len(extracted) == 1
    first_entry = extracted[0]
    assert "playing on a strip of marsh" in first_entry.raw
|
|
|
|
|
|
def test_get_pdf_files(tmp_path):
    """Ensure Pdf files specified via input-filter, input-files are extracted."""
    # Arrange
    # Files expected to be picked up through the input-filter globs
    globbed = [
        create_file(tmp_path, filename=name)
        for name in (
            "group1-file1.pdf",
            "group1-file2.pdf",
            "group2-file1.pdf",
            "group2-file2.pdf",
        )
    ]
    # File expected to be picked up through the input-files field
    listed = create_file(tmp_path, filename="document.pdf")
    # Files no filter or explicit listing refers to
    for ignored_name in ("not-included-document.pdf", "not-included-text.txt"):
        create_file(tmp_path, filename=ignored_name)

    expected_files = {os.path.join(tmp_path, created.name) for created in [*globbed, listed]}

    # Setup input-files, input-filters
    input_files = [tmp_path / "document.pdf"]
    input_filter = [str(tmp_path / pattern) for pattern in ("group1*.pdf", "group2*.pdf")]

    pdf_config = TextContentConfig(
        input_files=input_files,
        input_filter=input_filter,
        compressed_jsonl=tmp_path / "test.jsonl",
        embeddings_file=tmp_path / "test_embeddings.jsonl",
    )

    # Act
    extracted_pdf_files = get_pdf_files(pdf_config)

    # Assert
    assert len(extracted_pdf_files) == 5
    assert set(extracted_pdf_files.keys()) == expected_files
|
|
|
|
|
|
# Helper Functions
|
|
def create_file(tmp_path, entry=None, filename="document.pdf"):
    """Create ``filename`` under ``tmp_path`` and return its path.

    The file is always touched so it exists; ``entry`` is written into it
    as text only when it is truthy.
    """
    target = tmp_path / filename
    target.touch()
    if not entry:
        # Nothing to write; leave the touched file as is
        return target
    target.write_text(entry)
    return target
|