mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-09 13:25:11 +00:00
Let fs_syncer pass PDF files directly as binary before indexing
No need to do unneeded base64 encoding/decoding to pass pdf contents for indexing from fs_syncer to pdf_to_jsonl
This commit is contained in:
@@ -1,7 +1,6 @@
|
||||
# Standard Packages
|
||||
import json
|
||||
import os
|
||||
import base64
|
||||
|
||||
# Internal Packages
|
||||
from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
|
||||
@@ -16,7 +15,7 @@ def test_single_page_pdf_to_jsonl():
|
||||
# Extract Entries from specified Pdf files
|
||||
# Read singlepage.pdf into memory as bytes
|
||||
with open("tests/data/pdf/singlepage.pdf", "rb") as f:
|
||||
pdf_bytes = base64.b64encode(f.read()).decode("utf-8")
|
||||
pdf_bytes = f.read()
|
||||
|
||||
data = {"tests/data/pdf/singlepage.pdf": pdf_bytes}
|
||||
entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
|
||||
@@ -36,7 +35,7 @@ def test_multi_page_pdf_to_jsonl():
|
||||
# Act
|
||||
# Extract Entries from specified Pdf files
|
||||
with open("tests/data/pdf/multipage.pdf", "rb") as f:
|
||||
pdf_bytes = base64.b64encode(f.read()).decode("utf-8")
|
||||
pdf_bytes = f.read()
|
||||
|
||||
data = {"tests/data/pdf/multipage.pdf": pdf_bytes}
|
||||
entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
|
||||
|
||||
Reference in New Issue
Block a user