From 8c3d5a49da58f0c631cfa80083cfd9289e2958a3 Mon Sep 17 00:00:00 2001 From: sabaimran Date: Sat, 4 Nov 2023 19:27:18 -0700 Subject: [PATCH] Add try/except around image extraction step --- src/khoj/processor/pdf/pdf_to_entries.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/khoj/processor/pdf/pdf_to_entries.py b/src/khoj/processor/pdf/pdf_to_entries.py index 19d463eb..64e13031 100644 --- a/src/khoj/processor/pdf/pdf_to_entries.py +++ b/src/khoj/processor/pdf/pdf_to_entries.py @@ -68,13 +68,16 @@ class PdfToEntries(TextToEntries): with open(f"{tmp_file}", "wb") as f: bytes = pdf_files[pdf_file] f.write(bytes) - loader = PyMuPDFLoader(f"{tmp_file}", extract_images=True) + try: + loader = PyMuPDFLoader(f"{tmp_file}", extract_images=True) + except ModuleNotFoundError: + loader = PyMuPDFLoader(f"{tmp_file}") pdf_entries_per_file = [page.page_content for page in loader.load()] entry_to_location_map += zip(pdf_entries_per_file, [pdf_file] * len(pdf_entries_per_file)) entries.extend(pdf_entries_per_file) except Exception as e: logger.warning(f"Unable to process file: {pdf_file}. This file will not be indexed.") - logger.warning(e) + logger.warning(e, exc_info=True) finally: if os.path.exists(f"{tmp_file}"): os.remove(f"{tmp_file}")