mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-04 13:20:17 +00:00
Push Files to Index from Emacs, Obsidian & Desktop Clients using Multi-Part Forms (#499)
### Overview
- Add ability to push data to index from the Emacs, Obsidian client
- Switch to standard mechanism of syncing files via HTTP multi-part/form. Previously we were streaming the data as JSON
- Benefits of new mechanism
- No manual parsing of files to send or receive on clients or server is required as most have in-built mechanisms to send multi-part/form requests
- The whole response is not required to be kept in memory to parse content as JSON. As individual files arrive they're automatically pushed to disk to conserve memory if required
- Binary files don't need to be encoded on client and decoded on server
### Code Details
#### Major
- Use multi-part form to receive files to index on server
- Use multi-part form to send files to index on desktop client
- Send files to index on server from the khoj.el emacs client
- Send content for indexing on server at a regular interval from khoj.el
- Send files to index on server from the khoj obsidian client
- Update tests to test multi-part/form method of pushing files to index
#### Minor
- Put indexer API endpoint under /api path segment
- Explicitly make GET request to /config/data from khoj.el:khoj-server-configure method
- Improve emoji, message on content index updated via logger
- Don't call the khoj server when khoj.el loads; only call it once khoj is explicitly invoked by the user
- Improve indexing of binary files
- Let fs_syncer pass PDF files directly as binary before indexing
- Use encoding of each file set in indexer request to read file
- Add CORS policy to khoj server. Allow requests from khoj apps, obsidian & localhost
- Update indexer API endpoint URL to `index/update` from `indexer/batch`
Resolves #471 #243
This commit is contained in:
@@ -1,12 +1,13 @@
|
||||
import { Notice, Plugin } from 'obsidian';
|
||||
import { Notice, Plugin, TFile } from 'obsidian';
|
||||
import { KhojSetting, KhojSettingTab, DEFAULT_SETTINGS } from 'src/settings'
|
||||
import { KhojSearchModal } from 'src/search_modal'
|
||||
import { KhojChatModal } from 'src/chat_modal'
|
||||
import { configureKhojBackend } from './utils';
|
||||
import { configureKhojBackend, updateContentIndex } from './utils';
|
||||
|
||||
|
||||
export default class Khoj extends Plugin {
|
||||
settings: KhojSetting;
|
||||
indexingTimer: NodeJS.Timeout;
|
||||
|
||||
async onload() {
|
||||
await this.loadSettings();
|
||||
@@ -54,6 +55,15 @@ export default class Khoj extends Plugin {
|
||||
|
||||
// Add a settings tab so the user can configure khoj
|
||||
this.addSettingTab(new KhojSettingTab(this.app, this));
|
||||
|
||||
// Add scheduled job to update index every 60 minutes
|
||||
this.indexingTimer = setInterval(async () => {
|
||||
if (this.settings.autoConfigure) {
|
||||
this.settings.lastSyncedFiles = await updateContentIndex(
|
||||
this.app.vault, this.settings, this.settings.lastSyncedFiles
|
||||
);
|
||||
}
|
||||
}, 60 * 60 * 1000);
|
||||
}
|
||||
|
||||
async loadSettings() {
|
||||
@@ -72,4 +82,12 @@ export default class Khoj extends Plugin {
|
||||
}
|
||||
this.saveData(this.settings);
|
||||
}
|
||||
|
||||
async onunload() {
    // Cancel the recurring index-update job, if one was scheduled
    if (this.indexingTimer) {
        clearInterval(this.indexingTimer);
    }

    this.unload();
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import { App, Notice, PluginSettingTab, request, Setting } from 'obsidian';
|
||||
import { App, Notice, PluginSettingTab, request, Setting, TFile } from 'obsidian';
|
||||
import Khoj from 'src/main';
|
||||
import { updateContentIndex } from './utils';
|
||||
|
||||
export interface KhojSetting {
|
||||
enableOfflineChat: boolean;
|
||||
@@ -8,6 +9,7 @@ export interface KhojSetting {
|
||||
khojUrl: string;
|
||||
connectedToBackend: boolean;
|
||||
autoConfigure: boolean;
|
||||
lastSyncedFiles: TFile[];
|
||||
}
|
||||
|
||||
export const DEFAULT_SETTINGS: KhojSetting = {
|
||||
@@ -17,6 +19,7 @@ export const DEFAULT_SETTINGS: KhojSetting = {
|
||||
connectedToBackend: false,
|
||||
autoConfigure: true,
|
||||
openaiApiKey: '',
|
||||
lastSyncedFiles: []
|
||||
}
|
||||
|
||||
export class KhojSettingTab extends PluginSettingTab {
|
||||
@@ -118,8 +121,9 @@ export class KhojSettingTab extends PluginSettingTab {
|
||||
}, 300);
|
||||
this.plugin.registerInterval(progress_indicator);
|
||||
|
||||
await request(`${this.plugin.settings.khojUrl}/api/update?t=markdown&force=true&client=obsidian`);
|
||||
await request(`${this.plugin.settings.khojUrl}/api/update?t=pdf&force=true&client=obsidian`);
|
||||
this.plugin.settings.lastSyncedFiles = await updateContentIndex(
|
||||
this.app.vault, this.plugin.settings, this.plugin.settings.lastSyncedFiles, true
|
||||
);
|
||||
new Notice('✅ Updated Khoj index.');
|
||||
|
||||
// Reset button once index is updated
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import { FileSystemAdapter, Notice, RequestUrlParam, request, Vault, Modal } from 'obsidian';
|
||||
import { FileSystemAdapter, Notice, RequestUrlParam, request, Vault, Modal, TFile } from 'obsidian';
|
||||
import { KhojSetting } from 'src/settings'
|
||||
|
||||
export function getVaultAbsolutePath(vault: Vault): string {
|
||||
@@ -22,10 +22,70 @@ interface ProcessorData {
|
||||
};
|
||||
}
|
||||
|
||||
/** MIME type for each file extension the indexer knows about. */
const MIME_TYPE_BY_EXTENSION: ReadonlyMap<string, string> = new Map([
    ['pdf', 'application/pdf'],
    ['png', 'image/png'],
    ['jpg', 'image/jpeg'],
    ['jpeg', 'image/jpeg'],
    ['md', 'text/markdown'],
    ['markdown', 'text/markdown'],
    ['org', 'text/org'],
]);

/**
 * Map a file extension (without the leading dot) to its MIME type.
 *
 * The lookup is an exact, case-sensitive match.
 *
 * @param extension - File extension such as `md` or `pdf`.
 * @returns The matching MIME type, or `text/plain` for any unrecognised extension.
 */
function fileExtensionToMimeType (extension: string): string {
    return MIME_TYPE_BY_EXTENSION.get(extension) ?? 'text/plain';
}
|
||||
|
||||
/**
 * Push every markdown and PDF file in the vault to the Khoj server as a
 * multipart form so the server can update its content index.
 *
 * Files listed in `lastSyncedFiles` that are no longer present in the vault are
 * appended to the same form as empty blobs under their old path, which is how
 * this client marks them for deletion.
 *
 * @param vault - Obsidian vault to read files from.
 * @param setting - Plugin settings; `khojUrl` is used to build the endpoint URL.
 * @param lastSyncedFiles - Files sent on the previous successful sync.
 * @param regenerate - Passed to the server as the `force` query parameter.
 * @returns The files sent in this sync when the server accepted the request;
 *          otherwise `lastSyncedFiles` unchanged, so pending deletions are
 *          retried on the next sync instead of being forgotten.
 */
export async function updateContentIndex(vault: Vault, setting: KhojSetting, lastSyncedFiles: TFile[], regenerate: boolean = false): Promise<TFile[]> {
    // Get all markdown, pdf files in the vault
    console.log(`Khoj: Updating Khoj content index...`)
    const files = vault.getFiles().filter(file => file.extension === 'md' || file.extension === 'pdf');
    const binaryFileTypes = ['pdf', 'png', 'jpg', 'jpeg']
    let countOfFilesToIndex = 0;
    let countOfFilesToDelete = 0;

    // Add all files to index as multipart form data
    const formData = new FormData();
    for (const file of files) {
        countOfFilesToIndex++;
        const isBinary = binaryFileTypes.includes(file.extension);
        const mimeType = fileExtensionToMimeType(file.extension) + (isBinary ? "" : "; charset=UTF-8");
        // Binary files must be read as raw bytes; reading them as text would
        // decode them and corrupt the payload sent to the server.
        const fileContent = isBinary ? await vault.readBinary(file) : await vault.read(file);
        formData.append('files', new Blob([fileContent], { type: mimeType }), file.path);
    }

    // Add any previously synced files to be deleted to multipart form data.
    // Compare by path rather than object identity: lastSyncedFiles is kept in
    // the plugin settings, so its entries are presumably not the same object
    // instances that vault.getFiles() returns once settings have been reloaded.
    const currentFilePaths = new Set(files.map(file => file.path));
    for (const lastSyncedFile of lastSyncedFiles) {
        if (!currentFilePaths.has(lastSyncedFile.path)) {
            countOfFilesToDelete++;
            formData.append('files', new Blob([]), lastSyncedFile.path);
        }
    }

    // Call Khoj backend to update index with all markdown, pdf files
    try {
        const response = await fetch(`${setting.khojUrl}/api/v1/index/update?force=${regenerate}&client=obsidian`, {
            method: 'POST',
            headers: {
                // NOTE(review): hard-coded API key value; confirm what the server expects
                'x-api-key': 'secret',
            },
            body: formData,
        });

        if (!response.ok) {
            new Notice(`❗️Failed to update Khoj content index. Ensure Khoj server connected or raise issue on Khoj Discord/Github\nError: ${response.statusText}`);
            return lastSyncedFiles;
        }
    } catch (error: unknown) {
        // fetch rejects when the server cannot be reached at all; surface that
        // to the user instead of leaving an unhandled promise rejection.
        const reason = error instanceof Error ? error.message : String(error);
        new Notice(`❗️Failed to update Khoj content index. Ensure Khoj server connected or raise issue on Khoj Discord/Github\nError: ${reason}`);
        return lastSyncedFiles;
    }

    console.log(`✅ Refreshed Khoj content index. Updated: ${countOfFilesToIndex} files, Deleted: ${countOfFilesToDelete} files.`);
    return files;
}
|
||||
|
||||
export async function configureKhojBackend(vault: Vault, setting: KhojSetting, notify: boolean = true) {
|
||||
let vaultPath = getVaultAbsolutePath(vault);
|
||||
let mdInVault = `${vaultPath}/**/*.md`;
|
||||
let pdfInVault = `${vaultPath}/**/*.pdf`;
|
||||
let khojConfigUrl = `${setting.khojUrl}/api/config/data`;
|
||||
|
||||
// Check if khoj backend is configured, note if cannot connect to backend
|
||||
@@ -43,11 +103,8 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n
|
||||
if (!setting.connectedToBackend) return;
|
||||
|
||||
// Set index name from the path of the current vault
|
||||
let indexName = vaultPath.replace(/\//g, '_').replace(/\\/g, '_').replace(/ /g, '_').replace(/:/g, '_');
|
||||
// Get default config fields from khoj backend
|
||||
let defaultConfig = await request(`${khojConfigUrl}/default`).then(response => JSON.parse(response));
|
||||
let khojDefaultMdIndexDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["markdown"]["embeddings-file"]);
|
||||
let khojDefaultPdfIndexDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["pdf"]["embeddings-file"]);
|
||||
let khojDefaultChatDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["processor"]["conversation"]["conversation-logfile"]);
|
||||
let khojDefaultChatModelName = defaultConfig["processor"]["conversation"]["openai"]["chat-model"];
|
||||
|
||||
@@ -55,99 +112,7 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n
|
||||
await request(khoj_already_configured ? khojConfigUrl : `${khojConfigUrl}/default`)
|
||||
.then(response => JSON.parse(response))
|
||||
.then(data => {
|
||||
khoj_already_configured = data["content-type"] != null;
|
||||
// If khoj backend not configured yet
|
||||
if (!khoj_already_configured) {
|
||||
// Create khoj content-type config with only markdown configured
|
||||
data["content-type"] = {
|
||||
"markdown": {
|
||||
"input-filter": [mdInVault],
|
||||
"input-files": null,
|
||||
"embeddings-file": `${khojDefaultMdIndexDirectory}/${indexName}.pt`,
|
||||
"compressed-jsonl": `${khojDefaultMdIndexDirectory}/${indexName}.jsonl.gz`,
|
||||
}
|
||||
}
|
||||
|
||||
const hasPdfFiles = app.vault.getFiles().some(file => file.extension === 'pdf');
|
||||
|
||||
if (hasPdfFiles) {
|
||||
data["content-type"]["pdf"] = {
|
||||
"input-filter": [pdfInVault],
|
||||
"input-files": null,
|
||||
"embeddings-file": `${khojDefaultPdfIndexDirectory}/${indexName}.pt`,
|
||||
"compressed-jsonl": `${khojDefaultPdfIndexDirectory}/${indexName}.jsonl.gz`,
|
||||
}
|
||||
}
|
||||
}
|
||||
// Else if khoj config has no markdown content config
|
||||
else if (!data["content-type"]["markdown"]) {
|
||||
// Add markdown config to khoj content-type config
|
||||
// Set markdown config to index markdown files in configured obsidian vault
|
||||
data["content-type"]["markdown"] = {
|
||||
"input-filter": [mdInVault],
|
||||
"input-files": null,
|
||||
"embeddings-file": `${khojDefaultMdIndexDirectory}/${indexName}.pt`,
|
||||
"compressed-jsonl": `${khojDefaultMdIndexDirectory}/${indexName}.jsonl.gz`,
|
||||
}
|
||||
}
|
||||
// Else if khoj is not configured to index markdown files in configured obsidian vault
|
||||
else if (
|
||||
data["content-type"]["markdown"]["input-files"] != null ||
|
||||
data["content-type"]["markdown"]["input-filter"] == null ||
|
||||
data["content-type"]["markdown"]["input-filter"].length != 1 ||
|
||||
data["content-type"]["markdown"]["input-filter"][0] !== mdInVault) {
|
||||
// Update markdown config in khoj content-type config
|
||||
// Set markdown config to only index markdown files in configured obsidian vault
|
||||
let khojMdIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["markdown"]["embeddings-file"]);
|
||||
data["content-type"]["markdown"] = {
|
||||
"input-filter": [mdInVault],
|
||||
"input-files": null,
|
||||
"embeddings-file": `${khojMdIndexDirectory}/${indexName}.pt`,
|
||||
"compressed-jsonl": `${khojMdIndexDirectory}/${indexName}.jsonl.gz`,
|
||||
}
|
||||
}
|
||||
|
||||
if (khoj_already_configured && !data["content-type"]["pdf"]) {
|
||||
const hasPdfFiles = app.vault.getFiles().some(file => file.extension === 'pdf');
|
||||
|
||||
if (hasPdfFiles) {
|
||||
data["content-type"]["pdf"] = {
|
||||
"input-filter": [pdfInVault],
|
||||
"input-files": null,
|
||||
"embeddings-file": `${khojDefaultPdfIndexDirectory}/${indexName}.pt`,
|
||||
"compressed-jsonl": `${khojDefaultPdfIndexDirectory}/${indexName}.jsonl.gz`,
|
||||
}
|
||||
} else {
|
||||
data["content-type"]["pdf"] = null;
|
||||
}
|
||||
}
|
||||
// Else if khoj is not configured to index pdf files in configured obsidian vault
|
||||
else if (khoj_already_configured &&
|
||||
(
|
||||
data["content-type"]["pdf"]["input-files"] != null ||
|
||||
data["content-type"]["pdf"]["input-filter"] == null ||
|
||||
data["content-type"]["pdf"]["input-filter"].length != 1 ||
|
||||
data["content-type"]["pdf"]["input-filter"][0] !== pdfInVault)) {
|
||||
|
||||
let hasPdfFiles = app.vault.getFiles().some(file => file.extension === 'pdf');
|
||||
|
||||
if (hasPdfFiles) {
|
||||
// Update pdf config in khoj content-type config
|
||||
// Set pdf config to only index pdf files in configured obsidian vault
|
||||
let khojPdfIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["pdf"]["embeddings-file"]);
|
||||
data["content-type"]["pdf"] = {
|
||||
"input-filter": [pdfInVault],
|
||||
"input-files": null,
|
||||
"embeddings-file": `${khojPdfIndexDirectory}/${indexName}.pt`,
|
||||
"compressed-jsonl": `${khojPdfIndexDirectory}/${indexName}.jsonl.gz`,
|
||||
}
|
||||
} else {
|
||||
data["content-type"]["pdf"] = null;
|
||||
}
|
||||
}
|
||||
|
||||
let conversationLogFile = data?.["processor"]?.["conversation"]?.["conversation-logfile"] ?? `${khojDefaultChatDirectory}/conversation.json`;
|
||||
|
||||
let processorData: ProcessorData = {
|
||||
"conversation": {
|
||||
"conversation-logfile": conversationLogFile,
|
||||
@@ -158,9 +123,7 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n
|
||||
|
||||
// If the Open AI API Key was configured in the plugin settings
|
||||
if (!!setting.openaiApiKey) {
|
||||
|
||||
let openAIChatModel = data?.["processor"]?.["conversation"]?.["openai"]?.["chat-model"] ?? khojDefaultChatModelName;
|
||||
|
||||
processorData = {
|
||||
"conversation": {
|
||||
"conversation-logfile": conversationLogFile,
|
||||
|
||||
Reference in New Issue
Block a user