mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 21:19:12 +00:00
Only sync modified files from the Obsidian client
Previously we'd send all files in the vault and let the server deduplicate. This change takes inspiration from the desktop app and only pushes files that were modified after their previous sync with the server. This should reduce the processing load on the server.
This commit is contained in:
@@ -44,9 +44,7 @@ export default class Khoj extends Plugin {
|
||||
// Add scheduled job to update index every 60 minutes
|
||||
this.indexingTimer = setInterval(async () => {
|
||||
if (this.settings.autoConfigure) {
|
||||
this.settings.lastSyncedFiles = await updateContentIndex(
|
||||
this.app.vault, this.settings, this.settings.lastSyncedFiles
|
||||
);
|
||||
this.settings.lastSync = await updateContentIndex(this.app.vault, this.settings, this.settings.lastSync);
|
||||
}
|
||||
}, 60 * 60 * 1000);
|
||||
}
|
||||
|
||||
@@ -8,7 +8,8 @@ export interface KhojSetting {
|
||||
khojApiKey: string;
|
||||
connectedToBackend: boolean;
|
||||
autoConfigure: boolean;
|
||||
lastSyncedFiles: TFile[];
|
||||
lastSyncedFiles: TFile[]; // Deprecated setting, will be removed in future
|
||||
lastSync: Map<TFile, number>;
|
||||
userEmail: string;
|
||||
}
|
||||
|
||||
@@ -19,6 +20,7 @@ export const DEFAULT_SETTINGS: KhojSetting = {
|
||||
connectedToBackend: false,
|
||||
autoConfigure: true,
|
||||
lastSyncedFiles: [],
|
||||
lastSync: new Map(),
|
||||
userEmail: '',
|
||||
}
|
||||
|
||||
@@ -132,8 +134,8 @@ export class KhojSettingTab extends PluginSettingTab {
|
||||
}, 300);
|
||||
this.plugin.registerInterval(progress_indicator);
|
||||
|
||||
this.plugin.settings.lastSyncedFiles = await updateContentIndex(
|
||||
this.app.vault, this.plugin.settings, this.plugin.settings.lastSyncedFiles, true
|
||||
this.plugin.settings.lastSync = await updateContentIndex(
|
||||
this.app.vault, this.plugin.settings, this.plugin.settings.lastSync, true
|
||||
);
|
||||
new Notice('✅ Updated Khoj index.');
|
||||
|
||||
|
||||
@@ -28,17 +28,42 @@ function fileExtensionToMimeType (extension: string): string {
|
||||
}
|
||||
}
|
||||
|
||||
export async function updateContentIndex(vault: Vault, setting: KhojSetting, lastSyncedFiles: TFile[], regenerate: boolean = false): Promise<TFile[]> {
|
||||
/**
 * Resolve the MIME type for a vault file from its extension.
 *
 * The extension is compared case-insensitively, so `PDF` and `pdf` resolve
 * to the same type instead of the upper-case form falling through to the
 * plain-text default.
 *
 * @param filename - File whose `extension` property (without the leading dot) is inspected.
 * @returns The matching MIME type, or `text/plain` when the extension is not recognised.
 */
function filenameToMimeType (filename: TFile): string {
    // Normalise case defensively; the lookup below only lists lower-case extensions.
    switch (filename.extension.toLowerCase()) {
        case 'pdf':
            return 'application/pdf';
        case 'png':
            return 'image/png';
        case 'jpg':
        case 'jpeg':
            return 'image/jpeg';
        case 'md':
        case 'markdown':
            return 'text/markdown';
        case 'org':
            return 'text/org';
        default:
            // Unknown extensions are treated as plain text.
            return 'text/plain';
    }
}
|
||||
|
||||
export async function updateContentIndex(vault: Vault, setting: KhojSetting, lastSync: Map<TFile, number>, regenerate: boolean = false): Promise<Map<TFile, number>> {
|
||||
// Get all markdown, pdf files in the vault
|
||||
console.log(`Khoj: Updating Khoj content index...`)
|
||||
const files = vault.getFiles().filter(file => file.extension === 'md' || file.extension === 'markdown' || file.extension === 'pdf');
|
||||
const binaryFileTypes = ['pdf']
|
||||
let countOfFilesToIndex = 0;
|
||||
let countOfFilesToDelete = 0;
|
||||
lastSync = lastSync.size > 0 ? lastSync : new Map<TFile, number>();
|
||||
|
||||
// Add all files to index as multipart form data
|
||||
const fileData = [];
|
||||
for (const file of files) {
|
||||
// Only push files that have been modified since last sync if not regenerating
|
||||
if (!regenerate && file.stat.mtime < (lastSync.get(file) ?? 0)){
|
||||
continue;
|
||||
}
|
||||
|
||||
countOfFilesToIndex++;
|
||||
const encoding = binaryFileTypes.includes(file.extension) ? "binary" : "utf8";
|
||||
const mimeType = fileExtensionToMimeType(file.extension) + (encoding === "utf8" ? "; charset=UTF-8" : "");
|
||||
@@ -47,14 +72,18 @@ export async function updateContentIndex(vault: Vault, setting: KhojSetting, las
|
||||
}
|
||||
|
||||
// Add any previously synced files to be deleted to multipart form data
|
||||
for (const lastSyncedFile of lastSyncedFiles) {
|
||||
let filesToDelete: TFile[] = [];
|
||||
for (const lastSyncedFile of lastSync.keys()) {
|
||||
if (!files.includes(lastSyncedFile)) {
|
||||
countOfFilesToDelete++;
|
||||
fileData.push({blob: new Blob([]), path: lastSyncedFile.path});
|
||||
let fileObj = new Blob([""], { type: filenameToMimeType(lastSyncedFile) });
|
||||
fileData.push({blob: fileObj, path: lastSyncedFile.path});
|
||||
filesToDelete.push(lastSyncedFile);
|
||||
}
|
||||
}
|
||||
|
||||
// Iterate through all indexable files in vault, 1000 at a time
|
||||
let responses: string[] = [];
|
||||
let error_message = null;
|
||||
for (let i = 0; i < fileData.length; i += 1000) {
|
||||
const filesGroup = fileData.slice(i, i + 1000);
|
||||
@@ -79,16 +108,31 @@ export async function updateContentIndex(vault: Vault, setting: KhojSetting, las
|
||||
} else {
|
||||
error_message = `❗️Failed to sync your content with Khoj server. Raise issue on Khoj Discord or Github\nError: ${response.statusText}`;
|
||||
}
|
||||
} else {
|
||||
responses.push(await response.text());
|
||||
}
|
||||
}
|
||||
|
||||
// Update last sync time for each successfully indexed file
|
||||
files
|
||||
.filter(file => responses.find(response => response.includes(file.path)))
|
||||
.reduce((newSync, file) => {
|
||||
newSync.set(file, new Date().getTime());
|
||||
return newSync;
|
||||
}, lastSync);
|
||||
|
||||
// Remove files that were deleted from last sync
|
||||
filesToDelete
|
||||
.filter(file => responses.find(response => response.includes(file.path)))
|
||||
.forEach(file => lastSync.delete(file));
|
||||
|
||||
if (error_message) {
|
||||
new Notice(error_message);
|
||||
} else {
|
||||
console.log(`✅ Refreshed Khoj content index. Updated: ${countOfFilesToIndex} files, Deleted: ${countOfFilesToDelete} files.`);
|
||||
}
|
||||
|
||||
return files;
|
||||
return lastSync;
|
||||
}
|
||||
|
||||
export async function createNote(name: string, newLeaf = false): Promise<void> {
|
||||
|
||||
@@ -211,7 +211,7 @@ def setup(
|
||||
file_names = [file_name for file_name in files]
|
||||
|
||||
logger.info(
|
||||
f"Deleted {num_deleted_embeddings} entries. Created {num_new_embeddings} new entries for user {user} from files {file_names}"
|
||||
f"Deleted {num_deleted_embeddings} entries. Created {num_new_embeddings} new entries for user {user} from files {file_names[:10]} ..."
|
||||
)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user