Push Files to Index from Emacs, Obsidian & Desktop Clients using Multi-Part Forms (#499)

### Overview
- Add ability to push data to index from the Emacs and Obsidian clients
- Switch to standard mechanism of syncing files via HTTP multi-part/form. Previously we were streaming the data as JSON
  - Benefits of new mechanism
    - No manual parsing of files to send or receive on clients or server is required as most have in-built mechanisms to send multi-part/form requests
    - The whole response is not required to be kept in memory to parse content as JSON. As individual files arrive they're automatically pushed to disk to conserve memory if required
    - Binary files don't need to be encoded on client and decoded on server

### Code Details
#### Major
- Use multi-part form to receive files to index on server
- Use multi-part form to send files to index on desktop client
- Send files to index on server from the khoj.el emacs client
  - Send content for indexing on server at a regular interval from khoj.el
- Send files to index on server from the khoj obsidian client
- Update tests to test multi-part/form method of pushing files to index

#### Minor
- Put indexer API endpoint under /api path segment
- Explicitly make GET request to /config/data from khoj.el:khoj-server-configure method
- Improve emoji, message on content index updated via logger
- Don't call khoj server on khoj.el load, only once khoj invoked explicitly by user
- Improve indexing of binary files
  - Let fs_syncer pass PDF files directly as binary before indexing
  - Use encoding of each file set in indexer request to read file 
- Add CORS policy to khoj server. Allow requests from khoj apps, obsidian & localhost
- Update indexer API endpoint URL to `index/update` from `indexer/batch`

Resolves #471 #243
This commit is contained in:
Debanjum
2023-10-17 06:05:15 -07:00
committed by GitHub
16 changed files with 445 additions and 307 deletions

View File

@@ -1,12 +1,13 @@
import { Notice, Plugin } from 'obsidian';
import { Notice, Plugin, TFile } from 'obsidian';
import { KhojSetting, KhojSettingTab, DEFAULT_SETTINGS } from 'src/settings'
import { KhojSearchModal } from 'src/search_modal'
import { KhojChatModal } from 'src/chat_modal'
import { configureKhojBackend } from './utils';
import { configureKhojBackend, updateContentIndex } from './utils';
export default class Khoj extends Plugin {
settings: KhojSetting;
indexingTimer: NodeJS.Timeout;
async onload() {
await this.loadSettings();
@@ -54,6 +55,15 @@ export default class Khoj extends Plugin {
// Add a settings tab so the user can configure khoj
this.addSettingTab(new KhojSettingTab(this.app, this));
// Add scheduled job to update index every 60 minutes
this.indexingTimer = setInterval(async () => {
if (this.settings.autoConfigure) {
this.settings.lastSyncedFiles = await updateContentIndex(
this.app.vault, this.settings, this.settings.lastSyncedFiles
);
}
}, 60 * 60 * 1000);
}
async loadSettings() {
@@ -72,4 +82,12 @@ export default class Khoj extends Plugin {
}
this.saveData(this.settings);
}
/**
 * Tear down the plugin's scheduled indexing job.
 *
 * Clears the interval stored in `this.indexingTimer` (when one is set) and
 * then calls `this.unload()`.
 *
 * NOTE(review): `unload()` is invoked from inside `onunload()`; presumably the
 * base class guards against re-entering this hook -- confirm against the
 * Obsidian `Component` lifecycle.
 */
async onunload() {
// Remove scheduled job to update index at regular cadence
if (this.indexingTimer)
clearInterval(this.indexingTimer);
this.unload();
}
}

View File

@@ -1,5 +1,6 @@
import { App, Notice, PluginSettingTab, request, Setting } from 'obsidian';
import { App, Notice, PluginSettingTab, request, Setting, TFile } from 'obsidian';
import Khoj from 'src/main';
import { updateContentIndex } from './utils';
export interface KhojSetting {
enableOfflineChat: boolean;
@@ -8,6 +9,7 @@ export interface KhojSetting {
khojUrl: string;
connectedToBackend: boolean;
autoConfigure: boolean;
lastSyncedFiles: TFile[];
}
export const DEFAULT_SETTINGS: KhojSetting = {
@@ -17,6 +19,7 @@ export const DEFAULT_SETTINGS: KhojSetting = {
connectedToBackend: false,
autoConfigure: true,
openaiApiKey: '',
lastSyncedFiles: []
}
export class KhojSettingTab extends PluginSettingTab {
@@ -118,8 +121,9 @@ export class KhojSettingTab extends PluginSettingTab {
}, 300);
this.plugin.registerInterval(progress_indicator);
await request(`${this.plugin.settings.khojUrl}/api/update?t=markdown&force=true&client=obsidian`);
await request(`${this.plugin.settings.khojUrl}/api/update?t=pdf&force=true&client=obsidian`);
this.plugin.settings.lastSyncedFiles = await updateContentIndex(
this.app.vault, this.plugin.settings, this.plugin.settings.lastSyncedFiles, true
);
new Notice('✅ Updated Khoj index.');
// Reset button once index is updated

View File

@@ -1,4 +1,4 @@
import { FileSystemAdapter, Notice, RequestUrlParam, request, Vault, Modal } from 'obsidian';
import { FileSystemAdapter, Notice, RequestUrlParam, request, Vault, Modal, TFile } from 'obsidian';
import { KhojSetting } from 'src/settings'
export function getVaultAbsolutePath(vault: Vault): string {
@@ -22,10 +22,70 @@ interface ProcessorData {
};
}
/** MIME types keyed by (lower-case, dot-less) file extension. */
const MIME_TYPE_BY_EXTENSION: ReadonlyMap<string, string> = new Map<string, string>([
    ['pdf', 'application/pdf'],
    ['png', 'image/png'],
    ['jpg', 'image/jpeg'],
    ['jpeg', 'image/jpeg'],
    ['md', 'text/markdown'],
    ['markdown', 'text/markdown'],
    ['org', 'text/org'],
]);

/**
 * Map a file extension to the MIME type used when uploading the file.
 *
 * The lookup is an exact, case-sensitive match on the extension without its
 * leading dot.
 *
 * @param extension - File extension such as `md` or `pdf`.
 * @returns The matching MIME type, or `text/plain` for any other extension.
 */
function fileExtensionToMimeType (extension: string): string {
    const knownMimeType = MIME_TYPE_BY_EXTENSION.get(extension);
    if (knownMimeType === undefined) {
        return 'text/plain';
    }
    return knownMimeType;
}
/**
 * Push the vault's markdown and PDF files to the Khoj server for indexing.
 *
 * Every `md`/`pdf` file in the vault is attached to a single multipart form
 * under the `files` field. Files that were synced previously but are no longer
 * in the vault are attached with an empty body so they can be deleted.
 *
 * @param vault - Vault whose files are read and uploaded.
 * @param setting - Plugin settings; `khojUrl` selects the server to call.
 * @param lastSyncedFiles - Files sent on the previous successful sync.
 * @param regenerate - Passed to the server as the `force` query parameter.
 * @returns The files that are now synced: the current vault files on success,
 *          or `lastSyncedFiles` unchanged when the server rejects the request
 *          (so pending deletions are retried on the next run).
 */
export async function updateContentIndex(vault: Vault, setting: KhojSetting, lastSyncedFiles: TFile[], regenerate: boolean = false): Promise<TFile[]> {
    // Get all markdown, pdf files in the vault
    console.log(`Khoj: Updating Khoj content index...`)
    const files = vault.getFiles().filter(file => file.extension === 'md' || file.extension === 'pdf');
    const binaryFileTypes = ['pdf', 'png', 'jpg', 'jpeg']
    let countOfFilesToIndex = 0;
    let countOfFilesToDelete = 0;

    // Add all files to index as multipart form data
    const formData = new FormData();
    for (const file of files) {
        countOfFilesToIndex++;
        const encoding = binaryFileTypes.includes(file.extension) ? "binary" : "utf8";
        const mimeType = fileExtensionToMimeType(file.extension) + (encoding === "utf8" ? "; charset=UTF-8" : "");
        // Binary files must be read as raw bytes: decoding them as text corrupts the payload
        const fileContent = encoding === "binary" ? await vault.readBinary(file) : await vault.read(file);
        formData.append('files', new Blob([fileContent], { type: mimeType }), file.path);
    }

    // Add any previously synced files to be deleted to multipart form data.
    // Compare by path rather than object identity, so a previously synced entry
    // that is a different object from the one the vault returns is still recognised.
    const currentFilePaths = new Set(files.map(file => file.path));
    for (const lastSyncedFile of lastSyncedFiles) {
        if (!currentFilePaths.has(lastSyncedFile.path)) {
            countOfFilesToDelete++;
            formData.append('files', new Blob([]), lastSyncedFile.path);
        }
    }

    // Call Khoj backend to update index with all markdown, pdf files
    const response = await fetch(`${setting.khojUrl}/api/v1/index/update?force=${regenerate}&client=obsidian`, {
        method: 'POST',
        headers: {
            'x-api-key': 'secret',
        },
        body: formData,
    });

    if (!response.ok) {
        new Notice(`Failed to update Khoj content index. Ensure Khoj server connected or raise issue on Khoj Discord/Github\nError: ${response.statusText}`);
        // Nothing was synced: keep the previous state so deletions are not forgotten
        return lastSyncedFiles;
    }

    console.log(`✅ Refreshed Khoj content index. Updated: ${countOfFilesToIndex} files, Deleted: ${countOfFilesToDelete} files.`);
    return files;
}
export async function configureKhojBackend(vault: Vault, setting: KhojSetting, notify: boolean = true) {
let vaultPath = getVaultAbsolutePath(vault);
let mdInVault = `${vaultPath}/**/*.md`;
let pdfInVault = `${vaultPath}/**/*.pdf`;
let khojConfigUrl = `${setting.khojUrl}/api/config/data`;
// Check if khoj backend is configured, note if cannot connect to backend
@@ -43,11 +103,8 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n
if (!setting.connectedToBackend) return;
// Set index name from the path of the current vault
let indexName = vaultPath.replace(/\//g, '_').replace(/\\/g, '_').replace(/ /g, '_').replace(/:/g, '_');
// Get default config fields from khoj backend
let defaultConfig = await request(`${khojConfigUrl}/default`).then(response => JSON.parse(response));
let khojDefaultMdIndexDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["markdown"]["embeddings-file"]);
let khojDefaultPdfIndexDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["pdf"]["embeddings-file"]);
let khojDefaultChatDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["processor"]["conversation"]["conversation-logfile"]);
let khojDefaultChatModelName = defaultConfig["processor"]["conversation"]["openai"]["chat-model"];
@@ -55,99 +112,7 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n
await request(khoj_already_configured ? khojConfigUrl : `${khojConfigUrl}/default`)
.then(response => JSON.parse(response))
.then(data => {
khoj_already_configured = data["content-type"] != null;
// If khoj backend not configured yet
if (!khoj_already_configured) {
// Create khoj content-type config with only markdown configured
data["content-type"] = {
"markdown": {
"input-filter": [mdInVault],
"input-files": null,
"embeddings-file": `${khojDefaultMdIndexDirectory}/${indexName}.pt`,
"compressed-jsonl": `${khojDefaultMdIndexDirectory}/${indexName}.jsonl.gz`,
}
}
const hasPdfFiles = app.vault.getFiles().some(file => file.extension === 'pdf');
if (hasPdfFiles) {
data["content-type"]["pdf"] = {
"input-filter": [pdfInVault],
"input-files": null,
"embeddings-file": `${khojDefaultPdfIndexDirectory}/${indexName}.pt`,
"compressed-jsonl": `${khojDefaultPdfIndexDirectory}/${indexName}.jsonl.gz`,
}
}
}
// Else if khoj config has no markdown content config
else if (!data["content-type"]["markdown"]) {
// Add markdown config to khoj content-type config
// Set markdown config to index markdown files in configured obsidian vault
data["content-type"]["markdown"] = {
"input-filter": [mdInVault],
"input-files": null,
"embeddings-file": `${khojDefaultMdIndexDirectory}/${indexName}.pt`,
"compressed-jsonl": `${khojDefaultMdIndexDirectory}/${indexName}.jsonl.gz`,
}
}
// Else if khoj is not configured to index markdown files in configured obsidian vault
else if (
data["content-type"]["markdown"]["input-files"] != null ||
data["content-type"]["markdown"]["input-filter"] == null ||
data["content-type"]["markdown"]["input-filter"].length != 1 ||
data["content-type"]["markdown"]["input-filter"][0] !== mdInVault) {
// Update markdown config in khoj content-type config
// Set markdown config to only index markdown files in configured obsidian vault
let khojMdIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["markdown"]["embeddings-file"]);
data["content-type"]["markdown"] = {
"input-filter": [mdInVault],
"input-files": null,
"embeddings-file": `${khojMdIndexDirectory}/${indexName}.pt`,
"compressed-jsonl": `${khojMdIndexDirectory}/${indexName}.jsonl.gz`,
}
}
if (khoj_already_configured && !data["content-type"]["pdf"]) {
const hasPdfFiles = app.vault.getFiles().some(file => file.extension === 'pdf');
if (hasPdfFiles) {
data["content-type"]["pdf"] = {
"input-filter": [pdfInVault],
"input-files": null,
"embeddings-file": `${khojDefaultPdfIndexDirectory}/${indexName}.pt`,
"compressed-jsonl": `${khojDefaultPdfIndexDirectory}/${indexName}.jsonl.gz`,
}
} else {
data["content-type"]["pdf"] = null;
}
}
// Else if khoj is not configured to index pdf files in configured obsidian vault
else if (khoj_already_configured &&
(
data["content-type"]["pdf"]["input-files"] != null ||
data["content-type"]["pdf"]["input-filter"] == null ||
data["content-type"]["pdf"]["input-filter"].length != 1 ||
data["content-type"]["pdf"]["input-filter"][0] !== pdfInVault)) {
let hasPdfFiles = app.vault.getFiles().some(file => file.extension === 'pdf');
if (hasPdfFiles) {
// Update pdf config in khoj content-type config
// Set pdf config to only index pdf files in configured obsidian vault
let khojPdfIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["pdf"]["embeddings-file"]);
data["content-type"]["pdf"] = {
"input-filter": [pdfInVault],
"input-files": null,
"embeddings-file": `${khojPdfIndexDirectory}/${indexName}.pt`,
"compressed-jsonl": `${khojPdfIndexDirectory}/${indexName}.jsonl.gz`,
}
} else {
data["content-type"]["pdf"] = null;
}
}
let conversationLogFile = data?.["processor"]?.["conversation"]?.["conversation-logfile"] ?? `${khojDefaultChatDirectory}/conversation.json`;
let processorData: ProcessorData = {
"conversation": {
"conversation-logfile": conversationLogFile,
@@ -158,9 +123,7 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n
// If the Open AI API Key was configured in the plugin settings
if (!!setting.openaiApiKey) {
let openAIChatModel = data?.["processor"]?.["conversation"]?.["openai"]?.["chat-model"] ?? khojDefaultChatModelName;
processorData = {
"conversation": {
"conversation-logfile": conversationLogFile,