Add front-end Electron application for Khoj local file syncing (#473)

* Initial version - set up a file-push architecture for generating embeddings with Khoj
* Use state.host and state.port for configuring the URL for the indexer
* Fix parsing of PDF files
* Read markdown files from streamed data and update unit tests
* On application startup, load in embeddings from configuration files, rather than regenerating the corpus based on the file system
* Init: refactor indexer/batch endpoint to support a generic file ingestion format
* Add features to better support indexing from files sent by the desktop client
* Initial commit with Electron application
  - Adds Electron app
* Add import for pymupdf, remove import for pypdf
* Allow user to configure Khoj host URL
* Remove search type configuration from index.html
* Use v1 path for current indexer routes
2026-03-07 21:29:13 +00:00 · 2023-09-06 12:04:18 -07:00
parent 205dc90746
commit 76562f4250
54 changed files with 20132 additions and 82 deletions
--- a/tests/test_pdf_to_jsonl.py
+++ b/tests/test_pdf_to_jsonl.py
@@ -1,6 +1,7 @@
 # Standard Packages
 import json
 import os
+import base64

 # Internal Packages
 from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
@@ -15,7 +16,7 @@ def test_single_page_pdf_to_jsonl():
    # Extract Entries from specified Pdf files
    # Read singlepage.pdf into memory as bytes
    with open("tests/data/pdf/singlepage.pdf", "rb") as f:
-        pdf_bytes = f.read()
+        pdf_bytes = base64.b64encode(f.read()).decode("utf-8")

    data = {"tests/data/pdf/singlepage.pdf": pdf_bytes}
    entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
@@ -35,7 +36,7 @@ def test_multi_page_pdf_to_jsonl():
    # Act
    # Extract Entries from specified Pdf files
    with open("tests/data/pdf/multipage.pdf", "rb") as f:
-        pdf_bytes = f.read()
+        pdf_bytes = base64.b64encode(f.read()).decode("utf-8")

    data = {"tests/data/pdf/multipage.pdf": pdf_bytes}
    entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)