Support Indexing Images via OCR (#823)

- Added support for uploading .jpeg, .jpg, and .png files to Khoj from the Web and Desktop apps
- Updated the indexer to generate raw text and entries using RapidOCR
- Details
  * added support for indexing images via ocr
  * fixed pyproject.toml
  * Update src/khoj/processor/content/images/image_to_entries.py
     Co-authored-by: Debanjum <debanjum@gmail.com>
  * Update src/khoj/processor/content/images/image_to_entries.py
     Co-authored-by: Debanjum <debanjum@gmail.com>
  * removed redundant try except blocks
  * updated desktop js file to support image formats
  * added tests for jpg and png
  * Fix processing for image to entries files
  * Update unit tests with working image indexer
  * Change png test from version verification to open-cv verification

---------

Co-authored-by: Debanjum <debanjum@gmail.com>
Co-authored-by: sabaimran <narmiabas@gmail.com>
This commit is contained in:
Raghav Tirumale
2024-07-01 09:00:00 -04:00
committed by GitHub
parent c83b8f2768
commit 8eccd8a5e4
10 changed files with 180 additions and 7 deletions

View File

@@ -19,7 +19,7 @@ const textFileTypes = [
'org', 'md', 'markdown', 'txt', 'html', 'xml',
// Other valid text file extensions from https://google.github.io/magika/model/config.json
'appleplist', 'asm', 'asp', 'batch', 'c', 'cs', 'css', 'csv', 'eml', 'go', 'html', 'ini', 'internetshortcut', 'java', 'javascript', 'json', 'latex', 'lisp', 'makefile', 'markdown', 'mht', 'mum', 'pem', 'perl', 'php', 'powershell', 'python', 'rdf', 'rst', 'rtf', 'ruby', 'rust', 'scala', 'shell', 'smali', 'sql', 'svg', 'symlinktext', 'txt', 'vba', 'winregistry', 'xml', 'yaml']
-const binaryFileTypes = ['pdf']
+const binaryFileTypes = ['pdf', 'jpg', 'jpeg', 'png']
const validFileTypes = textFileTypes.concat(binaryFileTypes);
const schema = {