From 192cd53003d36f59860f4b953a4bc0e7f467dd1b Mon Sep 17 00:00:00 2001 From: Desmond Date: Wed, 8 May 2024 13:38:40 +0800 Subject: [PATCH 1/5] Batch send of index files --- src/interface/emacs/khoj.el | 93 ++++++++++++++++++++++++++++--------- 1 file changed, 70 insertions(+), 23 deletions(-) diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index ceed1136..632c9355 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -416,10 +416,47 @@ Auto invokes setup steps on calling main entrypoint." (files-to-index (or file-paths (append (mapcan (lambda (dir) (directory-files-recursively dir "\\.\\(org\\|md\\|markdown\\|pdf\\|txt\\|rst\\|xml\\|htm\\|html\\)$")) content-directories) content-files))) (type-query (if (or (equal content-type "all") (not content-type)) "" (format "t=%s" content-type))) + (delete-files (khoj--get-delete-file-list khoj--indexed-files files-to-index)) (inhibit-message t) - (message-log-max nil)) - (let ((url-request-method "POST") - (url-request-data (khoj--render-files-as-request-body files-to-index khoj--indexed-files boundary)) + (message-log-max nil) + (batch-size 50)) + (dolist (files (khoj--split-file-list files-to-index batch-size)) + (khoj--send-index-update-request (khoj--render-update-files-as-request-body files boundary) boundary content-type type-query force)) + (when delete-files + (khoj--send-index-update-request (khoj--render-delete-files-as-request-body delete-files boundary) boundary content-type type-query force)) + (setq khoj--indexed-files files-to-index))) + +(defun khoj--get-delete-file-list (indexed-files upload-files) + "Get delete file list. when `INDEXED-FILES' no longer in `UPLOAD-FILES'. +delete them. return delete-file-list." + (let (delete-files '()) + (dolist (indexed-file indexed-files) + (when (not (member indexed-file upload-files)) + (push indexed-file delete-files))) + delete-files)) + +(defun khoj--split-file-list (file-list size) + "Split `FILE-LIST' into subgroups of `SIZE' files each." + (let ((subgroups '()) + (current-group '())) + (dolist (file file-list) + (if (= (length current-group) size) + ;; If the current group has size files, start a new group + (progn + (push current-group subgroups) + (setq current-group '())) + (push file current-group))) + ;; Add the last group if it's not empty + (when current-group + (push (nreverse current-group) subgroups)) + (nreverse subgroups))) ; Reverse to maintain the original order of file-list + +(defun khoj--send-index-update-request (body boundary &optional content-type type-query force) + "Send `BODY' request to khoj server. 'TYPE-QUERY' is appended to the URL. +Use `BOUNDARY' to add headder conte +nt-type." + (let ((url-request-method "POST") + (url-request-data body) (url-request-extra-headers `(("content-type" . ,(format "multipart/form-data; boundary=%s" boundary)) ("Authorization" . ,(format "Bearer %s" khoj-api-key))))) (with-current-buffer @@ -437,10 +474,9 @@ Auto invokes setup steps on calling main entrypoint." (if content-type (format "%s " content-type) "all") (string-trim (format "%s %s" (nth 1 (nth 1 status)) (nth 2 (nth 1 status)))) (if (> (- (point-max) (point)) 0) (format ". Response: %s" (string-trim (buffer-substring-no-properties (point) (point-max)))) "")))))) - nil t t))) - (setq khoj--indexed-files files-to-index))) + nil t t)))) -(defun khoj--render-files-as-request-body (files-to-index previously-indexed-files boundary) +(defun khoj--render-update-files-as-request-body (files-to-index boundary) "Render `FILES-TO-INDEX', `PREVIOUSLY-INDEXED-FILES' as multi-part form body. Use `BOUNDARY' to separate files. This is sent to Khoj server as a POST request." (with-temp-buffer @@ -448,32 +484,43 @@ Use `BOUNDARY' to separate files. This is sent to Khoj server as a POST request. (insert "\n") (dolist (file-to-index files-to-index) ;; find file content-type. Choose from org, markdown, pdf, plaintext - (let ((content-type (cond ((string-match "\\.org$" file-to-index) "text/org") - ((string-match "\\.\\(md\\|markdown\\)$" file-to-index) "text/markdown") - ((string-match "\\.pdf$" file-to-index) "application/pdf") - (t "text/plain")))) + (let ((content-type (khoj--filename-to-mime-type file-to-index)) + (file-name (encode-coding-string file-to-index 'utf-8))) (insert (format "--%s\r\n" boundary)) - (insert (format "Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n" file-to-index)) + (insert (format "Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n" file-name)) (insert (format "Content-Type: %s\r\n\r\n" content-type)) (insert (with-temp-buffer (insert-file-contents-literally file-to-index) (buffer-string))) (insert "\r\n"))) - (dolist (file-to-index previously-indexed-files) - (when (not (member file-to-index files-to-index)) - ;; find file content-type. Choose from org, markdown, pdf, plaintext - (let ((content-type (cond ((string-match "\\.org$" file-to-index) "text/org") - ((string-match "\\.\\(md\\|markdown\\)$" file-to-index) "text/markdown") - ((string-match "\\.pdf$" file-to-index) "application/pdf") - (t "text/plain")))) - (insert (format "--%s\r\n" boundary)) - (insert (format "Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n" file-to-index)) - (insert "Content-Type: text/org\r\n\r\n") - (insert "") - (insert "\r\n")))) (insert (format "--%s--\r\n" boundary)) (buffer-string))) +(defun khoj--render-delete-files-as-request-body (delete-files boundary) + "Render `DELETE-FILES' as multi-part form body. +Use `BOUNDARY' to separate files. This is sent to Khoj server as a POST request." + (with-temp-buffer + (set-buffer-multibyte nil) + (insert "\n") + (debug delete-files) + (dolist (file-to-index delete-files) + (let ((content-type (khoj--filename-to-mime-type file-to-index)) + (file-name (encode-coding-string file-to-index 'utf-8))) + (insert (format "--%s\r\n" boundary)) + (insert (format "Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n" file-name)) + (insert "Content-Type: %s\r\n\r\n" content-type) + (insert "") + (insert "\r\n"))) + (insert (format "--%s--\r\n" boundary)) + (buffer-string))) + +(defun khoj--filename-to-mime-type (file-name) + "`FILE-NAME' to mimeType." + (cond ((string-match "\\.org$" file-name) "text/org") + ((string-match "\\.\\(md\\|markdown\\)$" file-name) "text/markdown") + ((string-match "\\.pdf$" file-name) "application/pdf") + (t "text/plain"))) + ;; Cancel any running indexing timer, first (when khoj--index-timer (cancel-timer khoj--index-timer)) From 150cd18bf34a2db92c47043f3b5406032c797fd6 Mon Sep 17 00:00:00 2001 From: Desmond Date: Wed, 8 May 2024 13:44:22 +0800 Subject: [PATCH 2/5] Update batch-size --- src/interface/emacs/khoj.el | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index 632c9355..43226ebe 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -419,7 +419,7 @@ Auto invokes setup steps on calling main entrypoint." (delete-files (khoj--get-delete-file-list khoj--indexed-files files-to-index)) (inhibit-message t) (message-log-max nil) - (batch-size 50)) + (batch-size 30)) (dolist (files (khoj--split-file-list files-to-index batch-size)) (khoj--send-index-update-request (khoj--render-update-files-as-request-body files boundary) boundary content-type type-query force)) (when delete-files From b0630c1a986f148ef25e9a8852e8073692e80286 Mon Sep 17 00:00:00 2001 From: Desmond Date: Tue, 21 May 2024 21:52:01 +0800 Subject: [PATCH 3/5] Simplify partition --- src/interface/emacs/khoj.el | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index 43226ebe..f3083a76 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -420,7 +420,7 @@ Auto invokes setup steps on calling main entrypoint." (inhibit-message t) (message-log-max nil) (batch-size 30)) - (dolist (files (khoj--split-file-list files-to-index batch-size)) + (dolist (files (-partition-all batch-size files-to-index)) (khoj--send-index-update-request (khoj--render-update-files-as-request-body files boundary) boundary content-type type-query force)) (when delete-files (khoj--send-index-update-request (khoj--render-delete-files-as-request-body delete-files boundary) boundary content-type type-query force)) @@ -435,22 +435,6 @@ delete them. return delete-file-list." (push indexed-file delete-files))) delete-files)) -(defun khoj--split-file-list (file-list size) - "Split `FILE-LIST' into subgroups of `SIZE' files each." - (let ((subgroups '()) - (current-group '())) - (dolist (file file-list) - (if (= (length current-group) size) - ;; If the current group has size files, start a new group - (progn - (push current-group subgroups) - (setq current-group '())) - (push file current-group))) - ;; Add the last group if it's not empty - (when current-group - (push (nreverse current-group) subgroups)) - (nreverse subgroups))) ; Reverse to maintain the original order of file-list - (defun khoj--send-index-update-request (body boundary &optional content-type type-query force) "Send `BODY' request to khoj server. 'TYPE-QUERY' is appended to the URL. Use `BOUNDARY' to add headder conte @@ -502,7 +486,6 @@ Use `BOUNDARY' to separate files. This is sent to Khoj server as a POST request. (with-temp-buffer (set-buffer-multibyte nil) (insert "\n") - (debug delete-files) (dolist (file-to-index delete-files) (let ((content-type (khoj--filename-to-mime-type file-to-index)) (file-name (encode-coding-string file-to-index 'utf-8))) From 3f49b5a4ab3c92e5a0cc4187c30859db64b650ab Mon Sep 17 00:00:00 2001 From: Desmond Date: Mon, 27 May 2024 10:42:09 +0800 Subject: [PATCH 4/5] fix: emacs tests --- src/interface/emacs/tests/khoj-tests.el | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/interface/emacs/tests/khoj-tests.el b/src/interface/emacs/tests/khoj-tests.el index c0d9f4a6..4df5f348 100644 --- a/src/interface/emacs/tests/khoj-tests.el +++ b/src/interface/emacs/tests/khoj-tests.el @@ -219,7 +219,7 @@ Rule everything\n") (progn (should (equal - (khoj--render-files-as-request-body (list upgrade-file act-file) '() "khoj") + (khoj--render-update-files-as-request-body (list upgrade-file act-file) "khoj") (format "\n--khoj\r\n\ Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\ @@ -244,7 +244,7 @@ Rule everything\n\n\r\n\ (progn (should (equal - (khoj--render-files-as-request-body (list upgrade-file act-file) (list upgrade-file act-file "/tmp/deleted-file.org") "khoj") + (khoj--render-update-files-as-request-body (list upgrade-file act-file "/tmp/deleted-file.org") "khoj") (format "\n--khoj\r\n\ Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\ From 70fea6c6b64bf82ffed3dd380fb3e140bd05be20 Mon Sep 17 00:00:00 2001 From: Desmond Date: Mon, 27 May 2024 14:46:26 +0800 Subject: [PATCH 5/5] fix: delete file request --- src/interface/emacs/khoj.el | 2 +- src/interface/emacs/tests/khoj-tests.el | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index 392d581e..1a9f7efd 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -491,7 +491,7 @@ Use `BOUNDARY' to separate files. This is sent to Khoj server as a POST request. (file-name (encode-coding-string file-to-index 'utf-8))) (insert (format "--%s\r\n" boundary)) (insert (format "Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n" file-name)) - (insert "Content-Type: %s\r\n\r\n" content-type) + (insert (format "Content-Type: %s\r\n\r\n" content-type)) (insert "") (insert "\r\n"))) (insert (format "--%s--\r\n" boundary)) diff --git a/src/interface/emacs/tests/khoj-tests.el b/src/interface/emacs/tests/khoj-tests.el index 4df5f348..cd2a1f02 100644 --- a/src/interface/emacs/tests/khoj-tests.el +++ b/src/interface/emacs/tests/khoj-tests.el @@ -244,19 +244,16 @@ Rule everything\n\n\r\n\ (progn (should (equal - (khoj--render-update-files-as-request-body (list upgrade-file act-file "/tmp/deleted-file.org") "khoj") + (khoj--render-delete-files-as-request-body (list upgrade-file act-file "/tmp/deleted-file.org") "khoj") (format "\n--khoj\r\n\ Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\ Content-Type: text/org\r\n\r\n\ -# Become God\n\ -## Upgrade\n\n\ -Penance to Immortality\n\n\r +\r --khoj\r\n\ Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\ Content-Type: text/org\r\n\r\n\ -## Act\n\n\ -Rule everything\n\n\r +\r --khoj\r\n\ Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\ Content-Type: text/org\r\n\r\n\