Mirror of https://github.com/khoaliber/khoj.git (synced 2026-03-02 21:19:12 +00:00)
Fix parsing for PDFs via content indexing API
This commit is contained in:
@@ -59,9 +59,14 @@ class PdfToEntries(TextToEntries):
|
||||
entries: List[str] = []
|
||||
entry_to_location_map: List[Tuple[str, str]] = []
|
||||
for pdf_file in pdf_files:
|
||||
pdf_entries_per_file = PdfToEntries.extract_text(pdf_file)
|
||||
entries.extend(pdf_entries_per_file)
|
||||
file_to_text_map[pdf_file] = pdf_entries_per_file
|
||||
try:
|
||||
pdf_entries_per_file = PdfToEntries.extract_text(pdf_files[pdf_file])
|
||||
entry_to_location_map += zip(pdf_entries_per_file, [pdf_file] * len(pdf_entries_per_file))
|
||||
entries.extend(pdf_entries_per_file)
|
||||
file_to_text_map[pdf_file] = pdf_entries_per_file
|
||||
except Exception as e:
|
||||
logger.warning(f"Unable to extract entries from file: {pdf_file}")
|
||||
logger.warning(e, exc_info=True)
|
||||
|
||||
return file_to_text_map, PdfToEntries.convert_pdf_entries_to_maps(entries, dict(entry_to_location_map))
|
||||
|
||||
|
||||
@@ -450,11 +450,11 @@ async def indexer(
|
||||
for file in files:
|
||||
file_data = get_file_content(file)
|
||||
if file_data.file_type in index_files:
|
||||
index_files[file_data.file_type][file_data.filename] = (
|
||||
index_files[file_data.file_type][file_data.name] = (
|
||||
file_data.content.decode(file_data.encoding) if file_data.encoding else file_data.content
|
||||
)
|
||||
else:
|
||||
logger.warning(f"Skipped indexing unsupported file type sent by {client} client: {file.filename}")
|
||||
logger.warning(f"Skipped indexing unsupported file type sent by {client} client: {file_data.name}")
|
||||
|
||||
indexer_input = IndexerInput(
|
||||
org=index_files["org"],
|
||||
|
||||
Reference in New Issue
Block a user