Only sync modified files from the Obsidian client

Previously we'd send all files in the vault and let the server
deduplicate them.

This change takes inspiration from the desktop app and only pushes
files which were modified after their previous sync with the server.

This should reduce the processing load on the server.
This commit is contained in:
Debanjum Singh Solanky
2024-02-24 03:01:27 +05:30
parent ddfbf31bc8
commit 3675ab4864
4 changed files with 55 additions and 11 deletions

View File

@@ -44,9 +44,7 @@ export default class Khoj extends Plugin {
// Add scheduled job to update index every 60 minutes
this.indexingTimer = setInterval(async () => {
if (this.settings.autoConfigure) {
this.settings.lastSyncedFiles = await updateContentIndex(
this.app.vault, this.settings, this.settings.lastSyncedFiles
);
this.settings.lastSync = await updateContentIndex(this.app.vault, this.settings, this.settings.lastSync);
}
}, 60 * 60 * 1000);
}

View File

@@ -8,7 +8,8 @@ export interface KhojSetting {
khojApiKey: string;
connectedToBackend: boolean;
autoConfigure: boolean;
lastSyncedFiles: TFile[];
lastSyncedFiles: TFile[]; // Deprecated setting, will be removed in future
lastSync: Map<TFile, number>;
userEmail: string;
}
@@ -19,6 +20,7 @@ export const DEFAULT_SETTINGS: KhojSetting = {
connectedToBackend: false,
autoConfigure: true,
lastSyncedFiles: [],
lastSync: new Map(),
userEmail: '',
}
@@ -132,8 +134,8 @@ export class KhojSettingTab extends PluginSettingTab {
}, 300);
this.plugin.registerInterval(progress_indicator);
this.plugin.settings.lastSyncedFiles = await updateContentIndex(
this.app.vault, this.plugin.settings, this.plugin.settings.lastSyncedFiles, true
this.plugin.settings.lastSync = await updateContentIndex(
this.app.vault, this.plugin.settings, this.plugin.settings.lastSync, true
);
new Notice('✅ Updated Khoj index.');

View File

@@ -28,17 +28,42 @@ function fileExtensionToMimeType (extension: string): string {
}
}
export async function updateContentIndex(vault: Vault, setting: KhojSetting, lastSyncedFiles: TFile[], regenerate: boolean = false): Promise<TFile[]> {
/**
 * Resolve the MIME type to report for a vault file, based on its extension.
 *
 * The match is exact (case-sensitive). Any extension that is not listed
 * below resolves to `text/plain`.
 *
 * @param filename - File whose `extension` property is inspected.
 * @returns The MIME type string associated with the file's extension.
 */
function filenameToMimeType (filename: TFile): string {
    // A Map (rather than a plain object) keeps the lookup limited to the
    // entries listed here, so names like "constructor" still fall through
    // to the default.
    const mimeTypeByExtension = new Map<string, string>([
        ['pdf', 'application/pdf'],
        ['png', 'image/png'],
        ['jpg', 'image/jpeg'],
        ['jpeg', 'image/jpeg'],
        ['md', 'text/markdown'],
        ['markdown', 'text/markdown'],
        ['org', 'text/org'],
    ]);

    const knownMimeType = mimeTypeByExtension.get(filename.extension);
    return knownMimeType ?? 'text/plain';
}
export async function updateContentIndex(vault: Vault, setting: KhojSetting, lastSync: Map<TFile, number>, regenerate: boolean = false): Promise<Map<TFile, number>> {
// Get all markdown, pdf files in the vault
console.log(`Khoj: Updating Khoj content index...`)
const files = vault.getFiles().filter(file => file.extension === 'md' || file.extension === 'markdown' || file.extension === 'pdf');
const binaryFileTypes = ['pdf']
let countOfFilesToIndex = 0;
let countOfFilesToDelete = 0;
lastSync = lastSync.size > 0 ? lastSync : new Map<TFile, number>();
// Add all files to index as multipart form data
const fileData = [];
for (const file of files) {
// Only push files that have been modified since last sync if not regenerating
if (!regenerate && file.stat.mtime < (lastSync.get(file) ?? 0)){
continue;
}
countOfFilesToIndex++;
const encoding = binaryFileTypes.includes(file.extension) ? "binary" : "utf8";
const mimeType = fileExtensionToMimeType(file.extension) + (encoding === "utf8" ? "; charset=UTF-8" : "");
@@ -47,14 +72,18 @@ export async function updateContentIndex(vault: Vault, setting: KhojSetting, las
}
// Add any previously synced files to be deleted to multipart form data
for (const lastSyncedFile of lastSyncedFiles) {
let filesToDelete: TFile[] = [];
for (const lastSyncedFile of lastSync.keys()) {
if (!files.includes(lastSyncedFile)) {
countOfFilesToDelete++;
fileData.push({blob: new Blob([]), path: lastSyncedFile.path});
let fileObj = new Blob([""], { type: filenameToMimeType(lastSyncedFile) });
fileData.push({blob: fileObj, path: lastSyncedFile.path});
filesToDelete.push(lastSyncedFile);
}
}
// Iterate through all indexable files in vault, 1000 at a time
let responses: string[] = [];
let error_message = null;
for (let i = 0; i < fileData.length; i += 1000) {
const filesGroup = fileData.slice(i, i + 1000);
@@ -79,16 +108,31 @@ export async function updateContentIndex(vault: Vault, setting: KhojSetting, las
} else {
error_message = `Failed to sync your content with Khoj server. Raise issue on Khoj Discord or Github\nError: ${response.statusText}`;
}
} else {
responses.push(await response.text());
}
}
// Update last sync time for each successfully indexed file
files
.filter(file => responses.find(response => response.includes(file.path)))
.reduce((newSync, file) => {
newSync.set(file, new Date().getTime());
return newSync;
}, lastSync);
// Remove files that were deleted from last sync
filesToDelete
.filter(file => responses.find(response => response.includes(file.path)))
.forEach(file => lastSync.delete(file));
if (error_message) {
new Notice(error_message);
} else {
console.log(`✅ Refreshed Khoj content index. Updated: ${countOfFilesToIndex} files, Deleted: ${countOfFilesToDelete} files.`);
}
return files;
return lastSync;
}
export async function createNote(name: string, newLeaf = false): Promise<void> {

View File

@@ -211,7 +211,7 @@ def setup(
file_names = [file_name for file_name in files]
logger.info(
f"Deleted {num_deleted_embeddings} entries. Created {num_new_embeddings} new entries for user {user} from files {file_names}"
f"Deleted {num_deleted_embeddings} entries. Created {num_new_embeddings} new entries for user {user} from files {file_names[:10]} ..."
)