From 3675ab486471e96af2f478d5dec9ce118aafacb8 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 24 Feb 2024 03:01:27 +0530 Subject: [PATCH] Only sync modified files from the Obsidian client Previously we'd send all files in vault and let the server deduplicate. This change takes inspiration from the desktop app, and only pushes files which were modified after their previous sync with the server. This should reduce the processing load on the server. --- src/interface/obsidian/src/main.ts | 4 +- src/interface/obsidian/src/settings.ts | 8 ++-- src/interface/obsidian/src/utils.ts | 52 ++++++++++++++++++++++++-- src/khoj/search_type/text_search.py | 2 +- 4 files changed, 55 insertions(+), 11 deletions(-) diff --git a/src/interface/obsidian/src/main.ts b/src/interface/obsidian/src/main.ts index 6b24ee37..7e152c49 100644 --- a/src/interface/obsidian/src/main.ts +++ b/src/interface/obsidian/src/main.ts @@ -44,9 +44,7 @@ export default class Khoj extends Plugin { // Add scheduled job to update index every 60 minutes this.indexingTimer = setInterval(async () => { if (this.settings.autoConfigure) { - this.settings.lastSyncedFiles = await updateContentIndex( - this.app.vault, this.settings, this.settings.lastSyncedFiles - ); + this.settings.lastSync = await updateContentIndex(this.app.vault, this.settings, this.settings.lastSync); } }, 60 * 60 * 1000); } diff --git a/src/interface/obsidian/src/settings.ts b/src/interface/obsidian/src/settings.ts index 4eb6553e..47e8e802 100644 --- a/src/interface/obsidian/src/settings.ts +++ b/src/interface/obsidian/src/settings.ts @@ -8,7 +8,8 @@ export interface KhojSetting { khojApiKey: string; connectedToBackend: boolean; autoConfigure: boolean; - lastSyncedFiles: TFile[]; + lastSyncedFiles: TFile[]; // Deprecated setting, will be removed in future + lastSync: Map; userEmail: string; } @@ -19,6 +20,7 @@ export const DEFAULT_SETTINGS: KhojSetting = { connectedToBackend: false, autoConfigure: true, lastSyncedFiles: [], + 
lastSync: new Map(), userEmail: '', } @@ -132,8 +134,8 @@ export class KhojSettingTab extends PluginSettingTab { }, 300); this.plugin.registerInterval(progress_indicator); - this.plugin.settings.lastSyncedFiles = await updateContentIndex( - this.app.vault, this.plugin.settings, this.plugin.settings.lastSyncedFiles, true + this.plugin.settings.lastSync = await updateContentIndex( + this.app.vault, this.plugin.settings, this.plugin.settings.lastSync, true ); new Notice('✅ Updated Khoj index.'); diff --git a/src/interface/obsidian/src/utils.ts b/src/interface/obsidian/src/utils.ts index 5c1061bd..91a398ac 100644 --- a/src/interface/obsidian/src/utils.ts +++ b/src/interface/obsidian/src/utils.ts @@ -28,17 +28,42 @@ function fileExtensionToMimeType (extension: string): string { } } -export async function updateContentIndex(vault: Vault, setting: KhojSetting, lastSyncedFiles: TFile[], regenerate: boolean = false): Promise { +function filenameToMimeType (filename: TFile): string { + switch (filename.extension) { + case 'pdf': + return 'application/pdf'; + case 'png': + return 'image/png'; + case 'jpg': + case 'jpeg': + return 'image/jpeg'; + case 'md': + case 'markdown': + return 'text/markdown'; + case 'org': + return 'text/org'; + default: + return 'text/plain'; + } +} + +export async function updateContentIndex(vault: Vault, setting: KhojSetting, lastSync: Map, regenerate: boolean = false): Promise> { // Get all markdown, pdf files in the vault console.log(`Khoj: Updating Khoj content index...`) const files = vault.getFiles().filter(file => file.extension === 'md' || file.extension === 'markdown' || file.extension === 'pdf'); const binaryFileTypes = ['pdf'] let countOfFilesToIndex = 0; let countOfFilesToDelete = 0; + lastSync = lastSync.size > 0 ? 
lastSync : new Map(); // Add all files to index as multipart form data const fileData = []; for (const file of files) { + // Only push files that have been modified since last sync if not regenerating + if (!regenerate && file.stat.mtime < (lastSync.get(file) ?? 0)){ + continue; + } + countOfFilesToIndex++; const encoding = binaryFileTypes.includes(file.extension) ? "binary" : "utf8"; const mimeType = fileExtensionToMimeType(file.extension) + (encoding === "utf8" ? "; charset=UTF-8" : ""); @@ -47,14 +72,18 @@ export async function updateContentIndex(vault: Vault, setting: KhojSetting, las } // Add any previously synced files to be deleted to multipart form data - for (const lastSyncedFile of lastSyncedFiles) { + let filesToDelete: TFile[] = []; + for (const lastSyncedFile of lastSync.keys()) { if (!files.includes(lastSyncedFile)) { countOfFilesToDelete++; - fileData.push({blob: new Blob([]), path: lastSyncedFile.path}); + let fileObj = new Blob([""], { type: filenameToMimeType(lastSyncedFile) }); + fileData.push({blob: fileObj, path: lastSyncedFile.path}); + filesToDelete.push(lastSyncedFile); } } // Iterate through all indexable files in vault, 1000 at a time + let responses: string[] = []; let error_message = null; for (let i = 0; i < fileData.length; i += 1000) { const filesGroup = fileData.slice(i, i + 1000); @@ -79,16 +108,31 @@ export async function updateContentIndex(vault: Vault, setting: KhojSetting, las } else { error_message = `❗️Failed to sync your content with Khoj server. 
Raise issue on Khoj Discord or Github\nError: ${response.statusText}`; } + } else { + responses.push(await response.text()); } } + // Update last sync time for each successfully indexed file + files + .filter(file => responses.find(response => response.includes(file.path))) + .reduce((newSync, file) => { + newSync.set(file, new Date().getTime()); + return newSync; + }, lastSync); + + // Remove files that were deleted from last sync + filesToDelete + .filter(file => responses.find(response => response.includes(file.path))) + .forEach(file => lastSync.delete(file)); + if (error_message) { new Notice(error_message); } else { console.log(`✅ Refreshed Khoj content index. Updated: ${countOfFilesToIndex} files, Deleted: ${countOfFilesToDelete} files.`); } - return files; + return lastSync; } export async function createNote(name: string, newLeaf = false): Promise { diff --git a/src/khoj/search_type/text_search.py b/src/khoj/search_type/text_search.py index 58547ca5..d5ea35e6 100644 --- a/src/khoj/search_type/text_search.py +++ b/src/khoj/search_type/text_search.py @@ -211,7 +211,7 @@ def setup( file_names = [file_name for file_name in files] logger.info( - f"Deleted {num_deleted_embeddings} entries. Created {num_new_embeddings} new entries for user {user} from files {file_names}" + f"Deleted {num_deleted_embeddings} entries. Created {num_new_embeddings} new entries for user {user} from files {file_names[:10]} ..." )