mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-08 05:39:13 +00:00
Remove the 1000 files limit when syncing from Desktop, Obsidian clients (#605)
### Major - Push 1000 files at a time from the Desktop client for indexing - Push 1000 files at a time from the Obsidian client for indexing - Test 1000 file upload limit to index/update API endpoint ### Minor - Show relevant error message in desktop app, e.g when can't connect to server - Pass indexed filenames in API response for client validation - Collect files to index in single dict to simplify index/update controller Resolves #573
This commit is contained in:
@@ -114,8 +114,8 @@ function processDirectory(filesToPush, folder) {
|
|||||||
const files = fs.readdirSync(folder.path, { withFileTypes: true, recursive: true });
|
const files = fs.readdirSync(folder.path, { withFileTypes: true, recursive: true });
|
||||||
|
|
||||||
for (const file of files) {
|
for (const file of files) {
|
||||||
console.log(file);
|
|
||||||
if (file.isFile() && validFileTypes.includes(file.name.split('.').pop())) {
|
if (file.isFile() && validFileTypes.includes(file.name.split('.').pop())) {
|
||||||
|
console.log(`Add ${file.name} in ${folder.path} for indexing`);
|
||||||
filesToPush.push(path.join(folder.path, file.name));
|
filesToPush.push(path.join(folder.path, file.name));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -151,7 +151,7 @@ function pushDataToKhoj (regenerate = false) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const lastSync = store.get('lastSync') || [];
|
const lastSync = store.get('lastSync') || [];
|
||||||
const formData = new FormData();
|
const filesDataToPush = [];
|
||||||
for (const file of filesToPush) {
|
for (const file of filesToPush) {
|
||||||
const stats = fs.statSync(file);
|
const stats = fs.statSync(file);
|
||||||
if (!regenerate) {
|
if (!regenerate) {
|
||||||
@@ -167,7 +167,7 @@ function pushDataToKhoj (regenerate = false) {
|
|||||||
let mimeType = filenameToMimeType(file) + (encoding === "utf8" ? "; charset=UTF-8" : "");
|
let mimeType = filenameToMimeType(file) + (encoding === "utf8" ? "; charset=UTF-8" : "");
|
||||||
let fileContent = Buffer.from(fs.readFileSync(file, { encoding: encoding }), encoding);
|
let fileContent = Buffer.from(fs.readFileSync(file, { encoding: encoding }), encoding);
|
||||||
let fileObj = new Blob([fileContent], { type: mimeType });
|
let fileObj = new Blob([fileContent], { type: mimeType });
|
||||||
formData.append('files', fileObj, file);
|
filesDataToPush.push({blob: fileObj, path: file});
|
||||||
state[file] = {
|
state[file] = {
|
||||||
success: true,
|
success: true,
|
||||||
}
|
}
|
||||||
@@ -184,52 +184,51 @@ function pushDataToKhoj (regenerate = false) {
|
|||||||
for (const syncedFile of lastSync) {
|
for (const syncedFile of lastSync) {
|
||||||
if (!filesToPush.includes(syncedFile.path)) {
|
if (!filesToPush.includes(syncedFile.path)) {
|
||||||
fileObj = new Blob([""], { type: filenameToMimeType(syncedFile.path) });
|
fileObj = new Blob([""], { type: filenameToMimeType(syncedFile.path) });
|
||||||
formData.append('files', fileObj, syncedFile.path);
|
filesDataToPush.push({blob: fileObj, path: syncedFile.path});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Send collected files to Khoj server for indexing
|
// Send collected files to Khoj server for indexing
|
||||||
if (!!formData?.entries()?.next().value) {
|
const hostURL = store.get('hostURL') || KHOJ_URL;
|
||||||
const hostURL = store.get('hostURL') || KHOJ_URL;
|
const headers = { 'Authorization': `Bearer ${store.get("khojToken")}` };
|
||||||
const headers = {
|
let requests = [];
|
||||||
'Authorization': `Bearer ${store.get("khojToken")}`
|
|
||||||
};
|
// Request indexing files on server. With upto 1000 files in each request
|
||||||
axios.post(`${hostURL}/api/v1/index/update?force=${regenerate}&client=desktop`, formData, { headers })
|
for (let i = 0; i < filesDataToPush.length; i += 1000) {
|
||||||
.then(response => {
|
const filesDataGroup = filesDataToPush.slice(i, i + 1000);
|
||||||
console.log(response.data);
|
const formData = new FormData();
|
||||||
let lastSync = [];
|
filesDataGroup.forEach(fileData => { formData.append('files', fileData.blob, fileData.path) });
|
||||||
for (const file of filesToPush) {
|
let request = axios.post(`${hostURL}/api/v1/index/update?force=${regenerate}&client=desktop`, formData, { headers });
|
||||||
lastSync.push({
|
requests.push(request);
|
||||||
path: file,
|
}
|
||||||
datetime: new Date().toISOString()
|
|
||||||
});
|
// Wait for requests batch to finish
|
||||||
}
|
Promise
|
||||||
store.set('lastSync', lastSync);
|
.all(requests)
|
||||||
})
|
.then(responses => {
|
||||||
.catch(error => {
|
const lastSync = filesToPush
|
||||||
console.error(error);
|
.filter(file => responses.find(response => response.data.includes(file)))
|
||||||
if (error.code == 'ECONNREFUSED') {
|
.map(file => ({ path: file, datetime: new Date().toISOString() }));
|
||||||
const win = BrowserWindow.getAllWindows()[0];
|
store.set('lastSync', lastSync);
|
||||||
if (win) win.webContents.send('update-state', state);
|
})
|
||||||
} else if (error.response.status == 429) {
|
.catch(error => {
|
||||||
const win = BrowserWindow.getAllWindows()[0];
|
console.error(error);
|
||||||
if (win) win.webContents.send('needsSubscription', true);
|
state["completed"] = false;
|
||||||
if (win) win.webContents.send('update-state', state);
|
if (error?.response?.status === 429 && (win = BrowserWindow.getAllWindows()[0])) {
|
||||||
}
|
state["error"] = `Looks like you're out of space to sync your files. <a href="https://app.khoj.dev/config">Upgrade your plan</a> to unlock more space.`;
|
||||||
state['completed'] = false
|
} else if (error?.code === 'ECONNREFUSED') {
|
||||||
})
|
state["error"] = `Could not connect to Khoj server. Ensure you can connect to it at ${error.address}:${error.port}.`;
|
||||||
.finally(() => {
|
} else {
|
||||||
// Syncing complete
|
state["error"] = `Sync was unsuccessful at ${currentTime.toLocaleTimeString()}. Contact team@khoj.dev to report this issue.`;
|
||||||
syncing = false;
|
}
|
||||||
const win = BrowserWindow.getAllWindows()[0];
|
})
|
||||||
if (win) win.webContents.send('update-state', state);
|
.finally(() => {
|
||||||
});
|
|
||||||
} else {
|
|
||||||
// Syncing complete
|
// Syncing complete
|
||||||
syncing = false;
|
syncing = false;
|
||||||
const win = BrowserWindow.getAllWindows()[0];
|
if (win = BrowserWindow.getAllWindows()[0]) {
|
||||||
if (win) win.webContents.send('update-state', state);
|
win.webContents.send('update-state', state);
|
||||||
}
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
pushDataToKhoj();
|
pushDataToKhoj();
|
||||||
|
|||||||
@@ -158,7 +158,7 @@ window.updateStateAPI.onUpdateState((event, state) => {
|
|||||||
nextSyncTime = new Date();
|
nextSyncTime = new Date();
|
||||||
nextSyncTime.setMinutes(Math.ceil((nextSyncTime.getMinutes() + 1) / 10) * 10);
|
nextSyncTime.setMinutes(Math.ceil((nextSyncTime.getMinutes() + 1) / 10) * 10);
|
||||||
if (state.completed == false) {
|
if (state.completed == false) {
|
||||||
syncStatusElement.innerHTML = `Sync was unsuccessful at ${currentTime.toLocaleTimeString()}. Contact team@khoj.dev to report this issue.`;
|
if (state.error) syncStatusElement.innerHTML = state.error;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
const options = { hour: '2-digit', minute: '2-digit' };
|
const options = { hour: '2-digit', minute: '2-digit' };
|
||||||
|
|||||||
@@ -31,40 +31,59 @@ function fileExtensionToMimeType (extension: string): string {
|
|||||||
export async function updateContentIndex(vault: Vault, setting: KhojSetting, lastSyncedFiles: TFile[], regenerate: boolean = false): Promise<TFile[]> {
|
export async function updateContentIndex(vault: Vault, setting: KhojSetting, lastSyncedFiles: TFile[], regenerate: boolean = false): Promise<TFile[]> {
|
||||||
// Get all markdown, pdf files in the vault
|
// Get all markdown, pdf files in the vault
|
||||||
console.log(`Khoj: Updating Khoj content index...`)
|
console.log(`Khoj: Updating Khoj content index...`)
|
||||||
const files = vault.getFiles().filter(file => file.extension === 'md' || file.extension === 'pdf');
|
const files = vault.getFiles().filter(file => file.extension === 'md' || file.extension === 'markdown' || file.extension === 'pdf');
|
||||||
const binaryFileTypes = ['pdf', 'png', 'jpg', 'jpeg']
|
const binaryFileTypes = ['pdf']
|
||||||
let countOfFilesToIndex = 0;
|
let countOfFilesToIndex = 0;
|
||||||
let countOfFilesToDelete = 0;
|
let countOfFilesToDelete = 0;
|
||||||
|
|
||||||
// Add all files to index as multipart form data
|
// Add all files to index as multipart form data
|
||||||
const formData = new FormData();
|
const fileData = [];
|
||||||
for (const file of files) {
|
for (const file of files) {
|
||||||
countOfFilesToIndex++;
|
countOfFilesToIndex++;
|
||||||
const encoding = binaryFileTypes.includes(file.extension) ? "binary" : "utf8";
|
const encoding = binaryFileTypes.includes(file.extension) ? "binary" : "utf8";
|
||||||
const mimeType = fileExtensionToMimeType(file.extension) + (encoding === "utf8" ? "; charset=UTF-8" : "");
|
const mimeType = fileExtensionToMimeType(file.extension) + (encoding === "utf8" ? "; charset=UTF-8" : "");
|
||||||
const fileContent = encoding == 'binary' ? await vault.readBinary(file) : await vault.read(file);
|
const fileContent = encoding == 'binary' ? await vault.readBinary(file) : await vault.read(file);
|
||||||
formData.append('files', new Blob([fileContent], { type: mimeType }), file.path);
|
fileData.push({blob: new Blob([fileContent], { type: mimeType }), path: file.path});
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add any previously synced files to be deleted to multipart form data
|
// Add any previously synced files to be deleted to multipart form data
|
||||||
for (const lastSyncedFile of lastSyncedFiles) {
|
for (const lastSyncedFile of lastSyncedFiles) {
|
||||||
if (!files.includes(lastSyncedFile)) {
|
if (!files.includes(lastSyncedFile)) {
|
||||||
countOfFilesToDelete++;
|
countOfFilesToDelete++;
|
||||||
formData.append('files', new Blob([]), lastSyncedFile.path);
|
fileData.push({blob: new Blob([]), path: lastSyncedFile.path});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Call Khoj backend to update index with all markdown, pdf files
|
// Iterate through all indexable files in vault, 1000 at a time
|
||||||
const response = await fetch(`${setting.khojUrl}/api/v1/index/update?force=${regenerate}&client=obsidian`, {
|
let error_message = null;
|
||||||
method: 'POST',
|
for (let i = 0; i < fileData.length; i += 1000) {
|
||||||
headers: {
|
const filesGroup = fileData.slice(i, i + 1000);
|
||||||
'Authorization': `Bearer ${setting.khojApiKey}`,
|
const formData = new FormData();
|
||||||
},
|
filesGroup.forEach(fileItem => { formData.append('files', fileItem.blob, fileItem.path) });
|
||||||
body: formData,
|
// Call Khoj backend to update index with all markdown, pdf files
|
||||||
});
|
const response = await fetch(`${setting.khojUrl}/api/v1/index/update?force=${regenerate}&client=obsidian`, {
|
||||||
|
method: 'POST',
|
||||||
|
headers: {
|
||||||
|
'Authorization': `Bearer ${setting.khojApiKey}`,
|
||||||
|
},
|
||||||
|
body: formData,
|
||||||
|
});
|
||||||
|
|
||||||
if (!response.ok) {
|
if (!response.ok) {
|
||||||
new Notice(`❗️Failed to update Khoj content index. Ensure Khoj server connected or raise issue on Khoj Discord/Github\nError: ${response.statusText}`);
|
if (response.status === 429) {
|
||||||
|
error_message = `❗️Failed to sync your content with Khoj server. Requests were throttled. Upgrade your subscription or try again later.`;
|
||||||
|
break;
|
||||||
|
} else if (response.status === 404) {
|
||||||
|
error_message = `❗️Could not connect to Khoj server. Ensure you can connect to it.`;
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
error_message = `❗️Failed to sync your content with Khoj server. Raise issue on Khoj Discord or Github\nError: ${response.statusText}`;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (error_message) {
|
||||||
|
new Notice(error_message);
|
||||||
} else {
|
} else {
|
||||||
console.log(`✅ Refreshed Khoj content index. Updated: ${countOfFilesToIndex} files, Deleted: ${countOfFilesToDelete} files.`);
|
console.log(`✅ Refreshed Khoj content index. Updated: ${countOfFilesToIndex} files, Deleted: ${countOfFilesToDelete} files.`);
|
||||||
}
|
}
|
||||||
@@ -92,7 +111,7 @@ export async function createNote(name: string, newLeaf = false): Promise<void> {
|
|||||||
console.error('Khoj: Could not create note.\n' + (e as any).message);
|
console.error('Khoj: Could not create note.\n' + (e as any).message);
|
||||||
throw e
|
throw e
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function createNoteAndCloseModal(query: string, modal: Modal, opt?: { newLeaf: boolean }): Promise<void> {
|
export async function createNoteAndCloseModal(query: string, modal: Modal, opt?: { newLeaf: boolean }): Promise<void> {
|
||||||
try {
|
try {
|
||||||
|
|||||||
@@ -63,37 +63,23 @@ async def update(
|
|||||||
),
|
),
|
||||||
):
|
):
|
||||||
user = request.user.object
|
user = request.user.object
|
||||||
|
index_files: Dict[str, Dict[str, str]] = {"org": {}, "markdown": {}, "pdf": {}, "plaintext": {}}
|
||||||
try:
|
try:
|
||||||
logger.info(f"📬 Updating content index via API call by {client} client")
|
logger.info(f"📬 Updating content index via API call by {client} client")
|
||||||
org_files: Dict[str, str] = {}
|
|
||||||
markdown_files: Dict[str, str] = {}
|
|
||||||
pdf_files: Dict[str, bytes] = {}
|
|
||||||
plaintext_files: Dict[str, str] = {}
|
|
||||||
|
|
||||||
for file in files:
|
for file in files:
|
||||||
file_type, encoding = get_file_type(file.content_type)
|
file_type, encoding = get_file_type(file.content_type)
|
||||||
dict_to_update = None
|
if file_type in index_files:
|
||||||
if file_type == "org":
|
index_files[file_type][file.filename] = (
|
||||||
dict_to_update = org_files
|
|
||||||
elif file_type == "markdown":
|
|
||||||
dict_to_update = markdown_files
|
|
||||||
elif file_type == "pdf":
|
|
||||||
dict_to_update = pdf_files # type: ignore
|
|
||||||
elif file_type == "plaintext":
|
|
||||||
dict_to_update = plaintext_files
|
|
||||||
|
|
||||||
if dict_to_update is not None:
|
|
||||||
dict_to_update[file.filename] = (
|
|
||||||
file.file.read().decode("utf-8") if encoding == "utf-8" else file.file.read() # type: ignore
|
file.file.read().decode("utf-8") if encoding == "utf-8" else file.file.read() # type: ignore
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
logger.warning(f"Skipped indexing unsupported file type sent by {client} client: {file.filename}")
|
logger.warning(f"Skipped indexing unsupported file type sent by {client} client: {file.filename}")
|
||||||
|
|
||||||
indexer_input = IndexerInput(
|
indexer_input = IndexerInput(
|
||||||
org=org_files,
|
org=index_files["org"],
|
||||||
markdown=markdown_files,
|
markdown=index_files["markdown"],
|
||||||
pdf=pdf_files,
|
pdf=index_files["pdf"],
|
||||||
plaintext=plaintext_files,
|
plaintext=index_files["plaintext"],
|
||||||
)
|
)
|
||||||
|
|
||||||
if state.config == None:
|
if state.config == None:
|
||||||
@@ -143,10 +129,10 @@ async def update(
|
|||||||
return Response(content="Failed", status_code=500)
|
return Response(content="Failed", status_code=500)
|
||||||
|
|
||||||
indexing_metadata = {
|
indexing_metadata = {
|
||||||
"num_org": len(org_files),
|
"num_org": len(index_files["org"]),
|
||||||
"num_markdown": len(markdown_files),
|
"num_markdown": len(index_files["markdown"]),
|
||||||
"num_pdf": len(pdf_files),
|
"num_pdf": len(index_files["pdf"]),
|
||||||
"num_plaintext": len(plaintext_files),
|
"num_plaintext": len(index_files["plaintext"]),
|
||||||
}
|
}
|
||||||
|
|
||||||
update_telemetry_state(
|
update_telemetry_state(
|
||||||
@@ -162,7 +148,8 @@ async def update(
|
|||||||
|
|
||||||
logger.info(f"📪 Content index updated via API call by {client} client")
|
logger.info(f"📪 Content index updated via API call by {client} client")
|
||||||
|
|
||||||
return Response(content="OK", status_code=200)
|
indexed_filenames = ",".join(file for ctype in index_files for file in index_files[ctype]) or ""
|
||||||
|
return Response(content=indexed_filenames, status_code=200)
|
||||||
|
|
||||||
|
|
||||||
def configure_search(search_models: SearchModels, search_config: Optional[SearchConfig]) -> Optional[SearchModels]:
|
def configure_search(search_models: SearchModels, search_config: Optional[SearchConfig]) -> Optional[SearchModels]:
|
||||||
|
|||||||
@@ -169,6 +169,7 @@ def test_index_update_normal_file_unsubscribed(client, api_user4: KhojApiUser):
|
|||||||
assert response.status_code == 200
|
assert response.status_code == 200
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------------------------------------
|
||||||
@pytest.mark.django_db(transaction=True)
|
@pytest.mark.django_db(transaction=True)
|
||||||
def test_index_update_big_files_no_billing(client):
|
def test_index_update_big_files_no_billing(client):
|
||||||
# Arrange
|
# Arrange
|
||||||
@@ -197,6 +198,26 @@ def test_index_update(client):
|
|||||||
assert response.status_code == 200
|
assert response.status_code == 200
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------------------------------------
|
||||||
|
@pytest.mark.django_db(transaction=True)
|
||||||
|
def test_index_update_fails_if_more_than_1000_files(client, api_user4: KhojApiUser):
|
||||||
|
# Arrange
|
||||||
|
api_token = api_user4.token
|
||||||
|
state.billing_enabled = True
|
||||||
|
files = [("files", (f"path/to/filename{i}.org", f"Symphony No {i}", "text/org")) for i in range(1001)]
|
||||||
|
|
||||||
|
headers = {"Authorization": f"Bearer {api_token}"}
|
||||||
|
|
||||||
|
# Act
|
||||||
|
ok_response = client.post("/api/v1/index/update", files=files[:1000], headers=headers)
|
||||||
|
bad_response = client.post("/api/v1/index/update", files=files, headers=headers)
|
||||||
|
|
||||||
|
# Assert
|
||||||
|
assert ok_response.status_code == 200
|
||||||
|
assert bad_response.status_code == 400
|
||||||
|
assert bad_response.content.decode("utf-8") == '{"detail":"Too many files. Maximum number of files is 1000."}'
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------------------------------
|
||||||
@pytest.mark.django_db(transaction=True)
|
@pytest.mark.django_db(transaction=True)
|
||||||
def test_regenerate_with_valid_content_type(client):
|
def test_regenerate_with_valid_content_type(client):
|
||||||
|
|||||||
Reference in New Issue
Block a user