Create wrapper function to get entries from org, md, pdf & text files

- Convert extract_org_entries function to actually extract org entries
  Previously it was extracting intermediary org-node objects instead
  Now it extracts the org-node objects from files and converts them
  into entries
- Create separate, new function to extract_org_nodes from files
- Similarly create wrapper funcs for md, pdf, plaintext to entries

- Update org, md, pdf, plaintext to entries tests to use the new
  simplified wrapper function to extract org entries
This commit is contained in:
Debanjum Singh Solanky
2024-02-09 16:04:41 +05:30
parent f01a12b1d2
commit 28105ee027
8 changed files with 71 additions and 94 deletions

View File

@@ -15,12 +15,10 @@ def test_single_page_pdf_to_jsonl():
pdf_bytes = f.read()
data = {"tests/data/pdf/singlepage.pdf": pdf_bytes}
entries, entry_to_file_map = PdfToEntries.extract_pdf_entries(pdf_files=data)
entries = PdfToEntries.extract_pdf_entries(pdf_files=data)
# Process Each Entry from All Pdf Files
jsonl_string = PdfToEntries.convert_pdf_maps_to_jsonl(
PdfToEntries.convert_pdf_entries_to_maps(entries, entry_to_file_map)
)
jsonl_string = PdfToEntries.convert_pdf_maps_to_jsonl(entries)
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
# Assert
@@ -35,12 +33,10 @@ def test_multi_page_pdf_to_jsonl():
pdf_bytes = f.read()
data = {"tests/data/pdf/multipage.pdf": pdf_bytes}
entries, entry_to_file_map = PdfToEntries.extract_pdf_entries(pdf_files=data)
entries = PdfToEntries.extract_pdf_entries(pdf_files=data)
# Process Each Entry from All Pdf Files
jsonl_string = PdfToEntries.convert_pdf_maps_to_jsonl(
PdfToEntries.convert_pdf_entries_to_maps(entries, entry_to_file_map)
)
jsonl_string = PdfToEntries.convert_pdf_maps_to_jsonl(entries)
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
# Assert
@@ -55,10 +51,7 @@ def test_ocr_page_pdf_to_jsonl():
pdf_bytes = f.read()
data = {"tests/data/pdf/ocr_samples.pdf": pdf_bytes}
entries, entry_to_file_map = PdfToEntries.extract_pdf_entries(pdf_files=data)
# Process Each Entry from All Pdf Files
entries = PdfToEntries.convert_pdf_entries_to_maps(entries, entry_to_file_map)
entries = PdfToEntries.extract_pdf_entries(pdf_files=data)
assert len(entries) == 1
assert "playing on a strip of marsh" in entries[0].raw