mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-07 21:29:13 +00:00
Add try/except around image extraction step
This commit is contained in:
@@ -68,13 +68,16 @@ class PdfToEntries(TextToEntries):
|
|||||||
with open(f"{tmp_file}", "wb") as f:
|
with open(f"{tmp_file}", "wb") as f:
|
||||||
bytes = pdf_files[pdf_file]
|
bytes = pdf_files[pdf_file]
|
||||||
f.write(bytes)
|
f.write(bytes)
|
||||||
loader = PyMuPDFLoader(f"{tmp_file}", extract_images=True)
|
try:
|
||||||
|
loader = PyMuPDFLoader(f"{tmp_file}", extract_images=True)
|
||||||
|
except ModuleNotFoundError:
|
||||||
|
loader = PyMuPDFLoader(f"{tmp_file}")
|
||||||
pdf_entries_per_file = [page.page_content for page in loader.load()]
|
pdf_entries_per_file = [page.page_content for page in loader.load()]
|
||||||
entry_to_location_map += zip(pdf_entries_per_file, [pdf_file] * len(pdf_entries_per_file))
|
entry_to_location_map += zip(pdf_entries_per_file, [pdf_file] * len(pdf_entries_per_file))
|
||||||
entries.extend(pdf_entries_per_file)
|
entries.extend(pdf_entries_per_file)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Unable to process file: {pdf_file}. This file will not be indexed.")
|
logger.warning(f"Unable to process file: {pdf_file}. This file will not be indexed.")
|
||||||
logger.warning(e)
|
logger.warning(e, exc_info=True)
|
||||||
finally:
|
finally:
|
||||||
if os.path.exists(f"{tmp_file}"):
|
if os.path.exists(f"{tmp_file}"):
|
||||||
os.remove(f"{tmp_file}")
|
os.remove(f"{tmp_file}")
|
||||||
|
|||||||
Reference in New Issue
Block a user