mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 13:18:18 +00:00
Upgrade RapidOCR and enable for Python 3.12. Fix PDF OCR test
This commit is contained in:
@@ -73,7 +73,7 @@ dependencies = [
     "psycopg2-binary == 2.9.9",
     "lxml == 4.9.3",
     "tzdata == 2023.3",
-    "rapidocr-onnxruntime == 1.3.11; python_version<'3.12'",
+    "rapidocr-onnxruntime == 1.3.22",
     "openai-whisper >= 20231117",
     "django-phonenumber-field == 7.3.0",
     "phonenumbers == 8.13.27",
@@ -1,5 +1,3 @@
-import os
-
 from khoj.processor.content.docx.docx_to_entries import DocxToEntries
 
 
@@ -1,4 +1,5 @@
 import os
+import re
 
 from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries
 from khoj.utils.fs_syncer import get_pdf_files
@@ -38,16 +39,23 @@ def test_multi_page_pdf_to_jsonl():
 
 def test_ocr_page_pdf_to_jsonl():
     "Convert multiple pages from single PDF file to jsonl."
-    # Act
+    # Arrange
+    expected_str = "playing on a strip of marsh"
+    expected_str_with_variable_spaces = re.compile(expected_str.replace(" ", r"\s*"), re.IGNORECASE)
+
     # Extract Entries from specified Pdf files
     with open("tests/data/pdf/ocr_samples.pdf", "rb") as f:
         pdf_bytes = f.read()
-
     data = {"tests/data/pdf/ocr_samples.pdf": pdf_bytes}
+
+    # Act
     entries = PdfToEntries.extract_pdf_entries(pdf_files=data)
+    raw_entry = entries[1][0].raw
+
+    # Assert
     assert len(entries) == 2
     assert len(entries[1]) == 1
-    assert "playing on a strip of marsh" in entries[1][0].raw
+    assert re.search(expected_str_with_variable_spaces, raw_entry) is not None
 
 
 def test_get_pdf_files(tmp_path):
Reference in New Issue
Block a user