From 4ded32cc644763ba857a5e0ed37e33d79e11c70a Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 3 Jan 2024 22:03:08 +0530 Subject: [PATCH 1/8] Test 1000 file upload limit to index/update API endpoint Due to FastAPI limitation --- tests/test_client.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tests/test_client.py b/tests/test_client.py index 0bc3c02f..0a25bea2 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -169,6 +169,7 @@ def test_index_update_normal_file_unsubscribed(client, api_user4: KhojApiUser): assert response.status_code == 200 +# ---------------------------------------------------------------------------------------------------- @pytest.mark.django_db(transaction=True) def test_index_update_big_files_no_billing(client): # Arrange @@ -197,6 +198,26 @@ def test_index_update(client): assert response.status_code == 200 +# ---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db(transaction=True) +def test_index_update_fails_if_more_than_1000_files(client, api_user4: KhojApiUser): + # Arrange + api_token = api_user4.token + state.billing_enabled = True + files = [("files", (f"path/to/filename{i}.org", f"Symphony No {i}", "text/org")) for i in range(1001)] + + headers = {"Authorization": f"Bearer {api_token}"} + + # Act + ok_response = client.post("/api/v1/index/update", files=files[:1000], headers=headers) + bad_response = client.post("/api/v1/index/update", files=files, headers=headers) + + # Assert + assert ok_response.status_code == 200 + assert bad_response.status_code == 400 + assert bad_response.content.decode("utf-8") == '{"detail":"Too many files. Maximum number of files is 1000."}' + + # ---------------------------------------------------------------------------------------------------- @pytest.mark.django_db(transaction=True) def test_regenerate_with_valid_content_type(client): From fca7a5ff32e3fe0a1caa041b6cea523517bed934 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 3 Jan 2024 22:16:54 +0530 Subject: [PATCH 2/8] Push 1000 files at a time from the Obsidian client for indexing FastAPI API endpoints only support uploading 1000 files at a time. So split all files to index into groups of 1000 for upload to index/update API endpoint --- src/interface/obsidian/src/utils.ts | 45 +++++++++++++++++++---------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/src/interface/obsidian/src/utils.ts b/src/interface/obsidian/src/utils.ts index 7eda8a47..8a75d6e6 100644 --- a/src/interface/obsidian/src/utils.ts +++ b/src/interface/obsidian/src/utils.ts @@ -31,39 +31,54 @@ function fileExtensionToMimeType (extension: string): string { export async function updateContentIndex(vault: Vault, setting: KhojSetting, lastSyncedFiles: TFile[], regenerate: boolean = false): Promise { // Get all markdown, pdf files in the vault console.log(`Khoj: Updating Khoj content index...`) - const files = vault.getFiles().filter(file => file.extension === 'md' || file.extension === 'pdf'); - const binaryFileTypes = ['pdf', 'png', 'jpg', 'jpeg'] + const files = vault.getFiles().filter(file => file.extension === 'md' || file.extension === 'markdown' || file.extension === 'pdf'); + const binaryFileTypes = ['pdf'] let countOfFilesToIndex = 0; let countOfFilesToDelete = 0; // Add all files to index as multipart form data - const formData = new FormData(); + const fileData = []; for (const file of files) { countOfFilesToIndex++; const encoding = binaryFileTypes.includes(file.extension) ? "binary" : "utf8"; const mimeType = fileExtensionToMimeType(file.extension) + (encoding === "utf8" ? "; charset=UTF-8" : ""); const fileContent = encoding == 'binary' ? await vault.readBinary(file) : await vault.read(file); - formData.append('files', new Blob([fileContent], { type: mimeType }), file.path); + fileData.push({blob: new Blob([fileContent], { type: mimeType }), path: file.path}); } // Add any previously synced files to be deleted to multipart form data for (const lastSyncedFile of lastSyncedFiles) { if (!files.includes(lastSyncedFile)) { countOfFilesToDelete++; - formData.append('files', new Blob([]), lastSyncedFile.path); + fileData.push({blob: new Blob([]), path: lastSyncedFile.path}); } } - // Call Khoj backend to update index with all markdown, pdf files - const response = await fetch(`${setting.khojUrl}/api/v1/index/update?force=${regenerate}&client=obsidian`, { - method: 'POST', - headers: { - 'Authorization': `Bearer ${setting.khojApiKey}`, - }, - body: formData, - }); + // Iterate through all indexable files in vault, 1000 at a time + let batchResponseSuccess = true; + let batchResponseThrottled = false; + for (let i = 0; i < fileData.length && !batchResponseThrottled; i += 1000) { + const filesGroup = fileData.slice(i, i + 1000); + const formData = new FormData(); + filesGroup.forEach(fileItem => { formData.append('files', fileItem.blob, fileItem.path) }); + // Call Khoj backend to update index with all markdown, pdf files + const response = await fetch(`${setting.khojUrl}/api/v1/index/update?force=${regenerate}&client=obsidian`, { + method: 'POST', + headers: { + 'Authorization': `Bearer ${setting.khojApiKey}`, + }, + body: formData, + }); - if (!response.ok) { + if (!response.ok) { + batchResponseSuccess = false; + batchResponseThrottled = response.status === 429; + } + } + + if (batchResponseThrottled) { + new Notice(`❗️Failed to update Khoj content index. Requests were throttled. Upgrade your subscription or try again later.`); + } else if (!batchResponseSuccess) { new Notice(`❗️Failed to update Khoj content index. Ensure Khoj server connected or raise issue on Khoj Discord/Github\nError: ${response.statusText}`); } else { console.log(`✅ Refreshed Khoj content index. Updated: ${countOfFilesToIndex} files, Deleted: ${countOfFilesToDelete} files.`); @@ -92,7 +107,7 @@ export async function createNote(name: string, newLeaf = false): Promise { console.error('Khoj: Could not create note.\n' + (e as any).message); throw e } - } +} export async function createNoteAndCloseModal(query: string, modal: Modal, opt?: { newLeaf: boolean }): Promise { try { From efe41aaacaca54c7ae664fec7d4c5f596ab44c28 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 3 Jan 2024 23:08:20 +0530 Subject: [PATCH 3/8] Push 1000 files at a time from the Desktop client for indexing FastAPI API endpoints only support uploading 1000 files at a time. So split all files to index into groups of 1000 for upload to index/update API endpoint --- src/interface/desktop/main.js | 73 +++++++++++++++++------------------ 1 file changed, 36 insertions(+), 37 deletions(-) diff --git a/src/interface/desktop/main.js b/src/interface/desktop/main.js index 4bb087d9..8e197dc3 100644 --- a/src/interface/desktop/main.js +++ b/src/interface/desktop/main.js @@ -151,7 +151,7 @@ function pushDataToKhoj (regenerate = false) { } const lastSync = store.get('lastSync') || []; - const formData = new FormData(); + const filesDataToPush = []; for (const file of filesToPush) { const stats = fs.statSync(file); if (!regenerate) { @@ -167,7 +167,7 @@ function pushDataToKhoj (regenerate = false) { let mimeType = filenameToMimeType(file) + (encoding === "utf8" ? "; charset=UTF-8" : ""); let fileContent = Buffer.from(fs.readFileSync(file, { encoding: encoding }), encoding); let fileObj = new Blob([fileContent], { type: mimeType }); - formData.append('files', fileObj, file); + filesDataToPush.push({blob: fileObj, path: file}); state[file] = { success: true, } @@ -184,49 +184,48 @@ function pushDataToKhoj (regenerate = false) { for (const syncedFile of lastSync) { if (!filesToPush.includes(syncedFile.path)) { fileObj = new Blob([""], { type: filenameToMimeType(syncedFile.path) }); - formData.append('files', fileObj, syncedFile.path); + filesDataToPush.push({blob: fileObj, path: syncedFile.path}); } } // Send collected files to Khoj server for indexing - if (!!formData?.entries()?.next().value) { - const hostURL = store.get('hostURL') || KHOJ_URL; - const headers = { - 'Authorization': `Bearer ${store.get("khojToken")}` - }; - axios.post(`${hostURL}/api/v1/index/update?force=${regenerate}&client=desktop`, formData, { headers }) - .then(response => { - console.log(response.data); - let lastSync = []; - for (const file of filesToPush) { - lastSync.push({ - path: file, - datetime: new Date().toISOString() - }); - } - store.set('lastSync', lastSync); - }) - .catch(error => { - console.error(error); - if (error.response.status == 429) { - const win = BrowserWindow.getAllWindows()[0]; - if (win) win.webContents.send('needsSubscription', true); - if (win) win.webContents.send('update-state', state); - } - state['completed'] = false - }) - .finally(() => { - // Syncing complete - syncing = false; - const win = BrowserWindow.getAllWindows()[0]; - if (win) win.webContents.send('update-state', state); - }); - } else { + const hostURL = store.get('hostURL') || KHOJ_URL; + const headers = { 'Authorization': `Bearer ${store.get("khojToken")}` }; + let requests = []; + + // Request indexing files on server. With upto 1000 files in each request + for (let i = 0; i < filesDataToPush.length; i += 1000) { + const filesDataGroup = filesDataToPush.slice(i, i + 1000); + const formData = new FormData(); + filesDataGroup.forEach(fileData => { formData.append('files', fileData.blob, fileData.path) }); + let request = axios.post(`${hostURL}/api/v1/index/update?force=${regenerate}&client=desktop`, formData, { headers }); + requests.push(request); + } + + // Wait for requests batch to finish + Promise + .all(requests) + .then(responses => { + const lastSync = filesToPush + .filter(file => responses.find(response => response.data.includes(file))) + .map(file => ({ path: file, datetime: new Date().toISOString() })); + store.set('lastSync', lastSync); + }) + .catch(error => { + console.error(error); + state["completed"] = false; + if (error.response.status === 429) { + win = BrowserWindow.getAllWindows()[0] + if (win) win.webContents.send('needsSubscription', true); + if (win) win.webContents.send('update-state', state); + } + }) + .finally(() => { // Syncing complete syncing = false; const win = BrowserWindow.getAllWindows()[0]; if (win) win.webContents.send('update-state', state); - } + }); } pushDataToKhoj(); From 5f9ac5a630378f752d72b317f62902ec4532e33d Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 9 Jan 2024 15:10:41 +0530 Subject: [PATCH 4/8] Collect files to index in single dict to simplify index/update controller Simplifies code while maintaining typing --- src/khoj/routers/indexer.py | 36 +++++++++++------------------------- 1 file changed, 11 insertions(+), 25 deletions(-) diff --git a/src/khoj/routers/indexer.py b/src/khoj/routers/indexer.py index c7d4c01d..2f60d09d 100644 --- a/src/khoj/routers/indexer.py +++ b/src/khoj/routers/indexer.py @@ -63,37 +63,23 @@ async def update( ), ): user = request.user.object + index_files: Dict[str, Dict[str, str]] = {"org": {}, "markdown": {}, "pdf": {}, "plaintext": {}} try: logger.info(f"📬 Updating content index via API call by {client} client") - org_files: Dict[str, str] = {} - markdown_files: Dict[str, str] = {} - pdf_files: Dict[str, bytes] = {} - plaintext_files: Dict[str, str] = {} - for file in files: file_type, encoding = get_file_type(file.content_type) - dict_to_update = None - if file_type == "org": - dict_to_update = org_files - elif file_type == "markdown": - dict_to_update = markdown_files - elif file_type == "pdf": - dict_to_update = pdf_files # type: ignore - elif file_type == "plaintext": - dict_to_update = plaintext_files - - if dict_to_update is not None: - dict_to_update[file.filename] = ( + if file_type in index_files: + index_files[file_type][file.filename] = ( file.file.read().decode("utf-8") if encoding == "utf-8" else file.file.read() # type: ignore ) else: logger.warning(f"Skipped indexing unsupported file type sent by {client} client: {file.filename}") indexer_input = IndexerInput( - org=org_files, - markdown=markdown_files, - pdf=pdf_files, - plaintext=plaintext_files, + org=index_files["org"], + markdown=index_files["markdown"], + pdf=index_files["pdf"], + plaintext=index_files["plaintext"], ) if state.config == None: @@ -143,10 +129,10 @@ async def update( return Response(content="Failed", status_code=500) indexing_metadata = { - "num_org": len(org_files), - "num_markdown": len(markdown_files), - "num_pdf": len(pdf_files), - "num_plaintext": len(plaintext_files), + "num_org": len(index_files["org"]), + "num_markdown": len(index_files["markdown"]), + "num_pdf": len(index_files["pdf"]), + "num_plaintext": len(index_files["plaintext"]), } update_telemetry_state( From 43423432ce47b4037313d4bba1faba1751045380 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 9 Jan 2024 15:16:13 +0530 Subject: [PATCH 5/8] Pass indexed filenames in API response for client validation --- src/khoj/routers/indexer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/khoj/routers/indexer.py b/src/khoj/routers/indexer.py index 2f60d09d..cd20ab69 100644 --- a/src/khoj/routers/indexer.py +++ b/src/khoj/routers/indexer.py @@ -148,7 +148,8 @@ async def update( logger.info(f"📪 Content index updated via API call by {client} client") - return Response(content="OK", status_code=200) + indexed_filenames = ",".join(file for ctype in index_files for file in index_files[ctype]) + return Response(content=indexed_filenames, status_code=200) def configure_search(search_models: SearchModels, search_config: Optional[SearchConfig]) -> Optional[SearchModels]: From af9ceb00a0d7b564b39740ac5e851332463e073e Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 9 Jan 2024 22:32:06 +0530 Subject: [PATCH 6/8] Show relevant error msg in desktop app, e.g when can't connect to server --- src/interface/desktop/main.js | 17 ++++++++++------- src/interface/desktop/renderer.js | 2 +- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/interface/desktop/main.js b/src/interface/desktop/main.js index 8e197dc3..a98b15c8 100644 --- a/src/interface/desktop/main.js +++ b/src/interface/desktop/main.js @@ -114,8 +114,8 @@ function processDirectory(filesToPush, folder) { const files = fs.readdirSync(folder.path, { withFileTypes: true, recursive: true }); for (const file of files) { - console.log(file); if (file.isFile() && validFileTypes.includes(file.name.split('.').pop())) { + console.log(`Add ${file.name} in ${folder.path} for indexing`); filesToPush.push(path.join(folder.path, file.name)); } @@ -214,17 +214,20 @@ function pushDataToKhoj (regenerate = false) { .catch(error => { console.error(error); state["completed"] = false; - if (error.response.status === 429) { - win = BrowserWindow.getAllWindows()[0] - if (win) win.webContents.send('needsSubscription', true); - if (win) win.webContents.send('update-state', state); + if (error?.response?.status === 429 && (win = BrowserWindow.getAllWindows()[0])) { + state["error"] = `Looks like you're out of space to sync your files. Upgrade your plan to unlock more space.`; + } else if (error?.code === 'ECONNREFUSED') { + state["error"] = `Could not connect to Khoj server. Ensure you can connect to it at ${error.address}:${error.port}.`; + } else { + state["error"] = `Sync was unsuccessful at ${currentTime.toLocaleTimeString()}. Contact team@khoj.dev to report this issue.`; } }) .finally(() => { // Syncing complete syncing = false; - const win = BrowserWindow.getAllWindows()[0]; - if (win) win.webContents.send('update-state', state); + if (win = BrowserWindow.getAllWindows()[0]) { + win.webContents.send('update-state', state); + } }); } diff --git a/src/interface/desktop/renderer.js b/src/interface/desktop/renderer.js index 16df6d2f..02f92163 100644 --- a/src/interface/desktop/renderer.js +++ b/src/interface/desktop/renderer.js @@ -158,7 +158,7 @@ window.updateStateAPI.onUpdateState((event, state) => { nextSyncTime = new Date(); nextSyncTime.setMinutes(Math.ceil((nextSyncTime.getMinutes() + 1) / 10) * 10); if (state.completed == false) { - syncStatusElement.innerHTML = `Sync was unsuccessful at ${currentTime.toLocaleTimeString()}. Contact team@khoj.dev to report this issue.`; + if (state.error) syncStatusElement.innerHTML = state.error; return; } const options = { hour: '2-digit', minute: '2-digit' }; From ba37b28fb51ff98ad5628ce57dc4f591ca30c992 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 13 Jan 2024 23:59:00 +0530 Subject: [PATCH 7/8] Improve batched error handling. Catch can't connect to server error Break out of batch processing when unable to connect to server or when requests throttled by server --- src/interface/obsidian/src/utils.ts | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/interface/obsidian/src/utils.ts b/src/interface/obsidian/src/utils.ts index 8a75d6e6..1f309ff6 100644 --- a/src/interface/obsidian/src/utils.ts +++ b/src/interface/obsidian/src/utils.ts @@ -55,9 +55,8 @@ export async function updateContentIndex(vault: Vault, setting: KhojSetting, las } // Iterate through all indexable files in vault, 1000 at a time - let batchResponseSuccess = true; - let batchResponseThrottled = false; - for (let i = 0; i < fileData.length && !batchResponseThrottled; i += 1000) { + let error_message = null; + for (let i = 0; i < fileData.length; i += 1000) { const filesGroup = fileData.slice(i, i + 1000); const formData = new FormData(); filesGroup.forEach(fileItem => { formData.append('files', fileItem.blob, fileItem.path) }); @@ -71,15 +70,20 @@ export async function updateContentIndex(vault: Vault, setting: KhojSetting, las }); if (!response.ok) { - batchResponseSuccess = false; - batchResponseThrottled = response.status === 429; + if (response.status === 429) { + error_message = `❗️Failed to sync your content with Khoj server. Requests were throttled. Upgrade your subscription or try again later.`; + break; + } else if (response.status === 404) { + error_message = `❗️Could not connect to Khoj server. Ensure you can connect to it.`; + break; + } else { + error_message = `❗️Failed to sync your content with Khoj server. Raise issue on Khoj Discord or Github\nError: ${response.statusText}`; + } } } - if (batchResponseThrottled) { - new Notice(`❗️Failed to update Khoj content index. Requests were throttled. Upgrade your subscription or try again later.`); - } else if (!batchResponseSuccess) { - new Notice(`❗️Failed to update Khoj content index. Ensure Khoj server connected or raise issue on Khoj Discord/Github\nError: ${response.statusText}`); + if (error_message) { + new Notice(error_message); } else { console.log(`✅ Refreshed Khoj content index. Updated: ${countOfFilesToIndex} files, Deleted: ${countOfFilesToDelete} files.`); } From 1ae6669fbf540af896c728c657e425a691781fdf Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 16 Jan 2024 11:57:40 +0530 Subject: [PATCH 8/8] Correctly handle API response when no files to index --- src/khoj/routers/indexer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/khoj/routers/indexer.py b/src/khoj/routers/indexer.py index cd20ab69..de80aed4 100644 --- a/src/khoj/routers/indexer.py +++ b/src/khoj/routers/indexer.py @@ -148,7 +148,7 @@ async def update( logger.info(f"📪 Content index updated via API call by {client} client") - indexed_filenames = ",".join(file for ctype in index_files for file in index_files[ctype]) + indexed_filenames = ",".join(file for ctype in index_files for file in index_files[ctype]) or "" return Response(content=indexed_filenames, status_code=200)