mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 21:19:12 +00:00
Push Files to Index from Emacs, Obsidian & Desktop Clients using Multi-Part Forms (#499)
### Overview
- Add ability to push data to index from the Emacs, Obsidian client
- Switch to standard mechanism of syncing files via HTTP multi-part/form. Previously we were streaming the data as JSON
- Benefits of new mechanism
- No manual parsing of files to send or receive on clients or server is required as most have in-built mechanisms to send multi-part/form requests
- The whole response is not required to be kept in memory to parse content as JSON. As individual files arrive they're automatically pushed to disk to conserve memory if required
- Binary files don't need to be encoded on client and decoded on server
### Code Details
### Major
- Use multi-part form to receive files to index on server
- Use multi-part form to send files to index on desktop client
- Send files to index on server from the khoj.el emacs client
- Send content for indexing on server at a regular interval from khoj.el
- Send files to index on server from the khoj obsidian client
- Update tests to test multi-part/form method of pushing files to index
#### Minor
- Put indexer API endpoint under /api path segment
- Explicitly make GET request to /config/data from khoj.el:khoj-server-configure method
- Improve emoji, message on content index updated via logger
- Don't call khoj server on khoj.el load, only once khoj invoked explicitly by user
- Improve indexing of binary files
- Let fs_syncer pass PDF files directly as binary before indexing
- Use encoding of each file set in indexer request to read file
- Add CORS policy to khoj server. Allow requests from khoj apps, obsidian & localhost
- Update indexer API endpoint URL to` index/update` from `indexer/batch`
Resolves #471 #243
This commit is contained in:
@@ -6,6 +6,7 @@ from urllib.parse import quote
|
||||
|
||||
# External Packages
|
||||
from fastapi.testclient import TestClient
|
||||
import pytest
|
||||
|
||||
# Internal Packages
|
||||
from khoj.main import app
|
||||
@@ -60,13 +61,13 @@ def test_regenerate_with_invalid_content_type(client):
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_index_batch(client):
|
||||
def test_index_update(client):
|
||||
# Arrange
|
||||
request_body = get_sample_files_data()
|
||||
files = get_sample_files_data()
|
||||
headers = {"x-api-key": "secret"}
|
||||
|
||||
# Act
|
||||
response = client.post("/v1/indexer/batch", json=request_body, headers=headers)
|
||||
response = client.post("/api/v1/index/update", files=files, headers=headers)
|
||||
|
||||
# Assert
|
||||
assert response.status_code == 200
|
||||
@@ -76,12 +77,11 @@ def test_index_batch(client):
|
||||
def test_regenerate_with_valid_content_type(client):
|
||||
for content_type in ["all", "org", "markdown", "image", "pdf", "notion", "plugin1"]:
|
||||
# Arrange
|
||||
request_body = get_sample_files_data()
|
||||
|
||||
files = get_sample_files_data()
|
||||
headers = {"x-api-key": "secret"}
|
||||
|
||||
# Act
|
||||
response = client.post(f"/v1/indexer/batch?search_type={content_type}", json=request_body, headers=headers)
|
||||
response = client.post(f"/api/v1/index/update?t={content_type}", files=files, headers=headers)
|
||||
# Assert
|
||||
assert response.status_code == 200, f"Returned status: {response.status_code} for content type: {content_type}"
|
||||
|
||||
@@ -92,17 +92,17 @@ def test_regenerate_with_github_fails_without_pat(client):
|
||||
response = client.get(f"/api/update?force=true&t=github")
|
||||
|
||||
# Arrange
|
||||
request_body = get_sample_files_data()
|
||||
|
||||
files = get_sample_files_data()
|
||||
headers = {"x-api-key": "secret"}
|
||||
|
||||
# Act
|
||||
response = client.post(f"/v1/indexer/batch?search_type=github", json=request_body, headers=headers)
|
||||
response = client.post(f"/api/v1/index/update?t=github", files=files, headers=headers)
|
||||
# Assert
|
||||
assert response.status_code == 200, f"Returned status: {response.status_code} for content type: github"
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.skip(reason="Flaky test on parallel test runs")
|
||||
def test_get_configured_types_via_api(client):
|
||||
# Act
|
||||
response = client.get(f"/api/config/types")
|
||||
@@ -288,24 +288,20 @@ def test_notes_search_with_exclude_filter(
|
||||
|
||||
def get_sample_files_data():
|
||||
return {
|
||||
"org": {
|
||||
"path/to/filename.org": "* practicing piano",
|
||||
"path/to/filename1.org": "** top 3 reasons why I moved to SF",
|
||||
"path/to/filename2.org": "* how to build a search engine",
|
||||
},
|
||||
"pdf": {
|
||||
"path/to/filename.pdf": "Moore's law does not apply to consumer hardware",
|
||||
"path/to/filename1.pdf": "The sun is a ball of helium",
|
||||
"path/to/filename2.pdf": "Effect of sunshine on baseline human happiness",
|
||||
},
|
||||
"plaintext": {
|
||||
"path/to/filename.txt": "data,column,value",
|
||||
"path/to/filename1.txt": "<html>my first web page</html>",
|
||||
"path/to/filename2.txt": "2021-02-02 Journal Entry",
|
||||
},
|
||||
"markdown": {
|
||||
"path/to/filename.md": "# Notes from client call",
|
||||
"path/to/filename1.md": "## Studying anthropological records from the Fatimid caliphate",
|
||||
"path/to/filename2.md": "**Understanding science through the lens of art**",
|
||||
},
|
||||
"files": ("path/to/filename.org", "* practicing piano", "text/org"),
|
||||
"files": ("path/to/filename1.org", "** top 3 reasons why I moved to SF", "text/org"),
|
||||
"files": ("path/to/filename2.org", "* how to build a search engine", "text/org"),
|
||||
"files": ("path/to/filename.pdf", "Moore's law does not apply to consumer hardware", "application/pdf"),
|
||||
"files": ("path/to/filename1.pdf", "The sun is a ball of helium", "application/pdf"),
|
||||
"files": ("path/to/filename2.pdf", "Effect of sunshine on baseline human happiness", "application/pdf"),
|
||||
"files": ("path/to/filename.txt", "data,column,value", "text/plain"),
|
||||
"files": ("path/to/filename1.txt", "<html>my first web page</html>", "text/plain"),
|
||||
"files": ("path/to/filename2.txt", "2021-02-02 Journal Entry", "text/plain"),
|
||||
"files": ("path/to/filename.md", "# Notes from client call", "text/markdown"),
|
||||
"files": (
|
||||
"path/to/filename1.md",
|
||||
"## Studying anthropological records from the Fatimid caliphate",
|
||||
"text/markdown",
|
||||
),
|
||||
"files": ("path/to/filename2.md", "**Understanding science through the lens of art**", "text/markdown"),
|
||||
}
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
# Standard Packages
|
||||
import json
|
||||
import os
|
||||
import base64
|
||||
|
||||
# Internal Packages
|
||||
from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
|
||||
@@ -16,7 +15,7 @@ def test_single_page_pdf_to_jsonl():
|
||||
# Extract Entries from specified Pdf files
|
||||
# Read singlepage.pdf into memory as bytes
|
||||
with open("tests/data/pdf/singlepage.pdf", "rb") as f:
|
||||
pdf_bytes = base64.b64encode(f.read()).decode("utf-8")
|
||||
pdf_bytes = f.read()
|
||||
|
||||
data = {"tests/data/pdf/singlepage.pdf": pdf_bytes}
|
||||
entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
|
||||
@@ -36,7 +35,7 @@ def test_multi_page_pdf_to_jsonl():
|
||||
# Act
|
||||
# Extract Entries from specified Pdf files
|
||||
with open("tests/data/pdf/multipage.pdf", "rb") as f:
|
||||
pdf_bytes = base64.b64encode(f.read()).decode("utf-8")
|
||||
pdf_bytes = f.read()
|
||||
|
||||
data = {"tests/data/pdf/multipage.pdf": pdf_bytes}
|
||||
entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
|
||||
|
||||
Reference in New Issue
Block a user