diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index ceed1136..632c9355 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -416,10 +416,47 @@ Auto invokes setup steps on calling main entrypoint." (files-to-index (or file-paths (append (mapcan (lambda (dir) (directory-files-recursively dir "\\.\\(org\\|md\\|markdown\\|pdf\\|txt\\|rst\\|xml\\|htm\\|html\\)$")) content-directories) content-files))) (type-query (if (or (equal content-type "all") (not content-type)) "" (format "t=%s" content-type))) + (delete-files (khoj--get-delete-file-list khoj--indexed-files files-to-index)) (inhibit-message t) - (message-log-max nil)) - (let ((url-request-method "POST") - (url-request-data (khoj--render-files-as-request-body files-to-index khoj--indexed-files boundary)) + (message-log-max nil) + (batch-size 50)) + (dolist (files (khoj--split-file-list files-to-index batch-size)) + (khoj--send-index-update-request (khoj--render-update-files-as-request-body files boundary) boundary content-type type-query force)) + (when delete-files + (khoj--send-index-update-request (khoj--render-delete-files-as-request-body delete-files boundary) boundary content-type type-query force)) + (setq khoj--indexed-files files-to-index))) + +(defun khoj--get-delete-file-list (indexed-files upload-files) + "Get delete file list. when `INDEXED-FILES' no longer in `UPLOAD-FILES'. +delete them. return delete-file-list." + (let (delete-files '()) + (dolist (indexed-file indexed-files) + (when (not (member indexed-file upload-files)) + (push indexed-file delete-files))) + delete-files)) + +(defun khoj--split-file-list (file-list size) + "Split `FILE-LIST' into subgroups of `SIZE' files each." + (let ((subgroups '()) + (current-group '())) + (dolist (file file-list) + (if (= (length current-group) size) + ;; If the current group has size files, start a new group + (progn + (push current-group subgroups) + (setq current-group '())) + (push file current-group))) + ;; Add the last group if it's not empty + (when current-group + (push (nreverse current-group) subgroups)) + (nreverse subgroups))) ; Reverse to maintain the original order of file-list + +(defun khoj--send-index-update-request (body boundary &optional content-type type-query force) + "Send `BODY' request to khoj server. 'TYPE-QUERY' is appended to the URL. +Use `BOUNDARY' to add headder conte +nt-type." + (let ((url-request-method "POST") + (url-request-data body) (url-request-extra-headers `(("content-type" . ,(format "multipart/form-data; boundary=%s" boundary)) ("Authorization" . ,(format "Bearer %s" khoj-api-key))))) (with-current-buffer @@ -437,10 +474,9 @@ Auto invokes setup steps on calling main entrypoint." (if content-type (format "%s " content-type) "all") (string-trim (format "%s %s" (nth 1 (nth 1 status)) (nth 2 (nth 1 status)))) (if (> (- (point-max) (point)) 0) (format ". Response: %s" (string-trim (buffer-substring-no-properties (point) (point-max)))) "")))))) - nil t t))) - (setq khoj--indexed-files files-to-index))) + nil t t)))) -(defun khoj--render-files-as-request-body (files-to-index previously-indexed-files boundary) +(defun khoj--render-update-files-as-request-body (files-to-index boundary) "Render `FILES-TO-INDEX', `PREVIOUSLY-INDEXED-FILES' as multi-part form body. Use `BOUNDARY' to separate files. This is sent to Khoj server as a POST request." (with-temp-buffer @@ -448,32 +484,43 @@ Use `BOUNDARY' to separate files. This is sent to Khoj server as a POST request. (insert "\n") (dolist (file-to-index files-to-index) ;; find file content-type. Choose from org, markdown, pdf, plaintext - (let ((content-type (cond ((string-match "\\.org$" file-to-index) "text/org") - ((string-match "\\.\\(md\\|markdown\\)$" file-to-index) "text/markdown") - ((string-match "\\.pdf$" file-to-index) "application/pdf") - (t "text/plain")))) + (let ((content-type (khoj--filename-to-mime-type file-to-index)) + (file-name (encode-coding-string file-to-index 'utf-8))) (insert (format "--%s\r\n" boundary)) - (insert (format "Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n" file-to-index)) + (insert (format "Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n" file-name)) (insert (format "Content-Type: %s\r\n\r\n" content-type)) (insert (with-temp-buffer (insert-file-contents-literally file-to-index) (buffer-string))) (insert "\r\n"))) - (dolist (file-to-index previously-indexed-files) - (when (not (member file-to-index files-to-index)) - ;; find file content-type. Choose from org, markdown, pdf, plaintext - (let ((content-type (cond ((string-match "\\.org$" file-to-index) "text/org") - ((string-match "\\.\\(md\\|markdown\\)$" file-to-index) "text/markdown") - ((string-match "\\.pdf$" file-to-index) "application/pdf") - (t "text/plain")))) - (insert (format "--%s\r\n" boundary)) - (insert (format "Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n" file-to-index)) - (insert "Content-Type: text/org\r\n\r\n") - (insert "") - (insert "\r\n")))) (insert (format "--%s--\r\n" boundary)) (buffer-string))) +(defun khoj--render-delete-files-as-request-body (delete-files boundary) + "Render `DELETE-FILES' as multi-part form body. +Use `BOUNDARY' to separate files. This is sent to Khoj server as a POST request." + (with-temp-buffer + (set-buffer-multibyte nil) + (insert "\n") + (debug delete-files) + (dolist (file-to-index delete-files) + (let ((content-type (khoj--filename-to-mime-type file-to-index)) + (file-name (encode-coding-string file-to-index 'utf-8))) + (insert (format "--%s\r\n" boundary)) + (insert (format "Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n" file-name)) + (insert "Content-Type: %s\r\n\r\n" content-type) + (insert "") + (insert "\r\n"))) + (insert (format "--%s--\r\n" boundary)) + (buffer-string))) + +(defun khoj--filename-to-mime-type (file-name) + "`FILE-NAME' to mimeType." + (cond ((string-match "\\.org$" file-name) "text/org") + ((string-match "\\.\\(md\\|markdown\\)$" file-name) "text/markdown") + ((string-match "\\.pdf$" file-name) "application/pdf") + (t "text/plain"))) + ;; Cancel any running indexing timer, first (when khoj--index-timer (cancel-timer khoj--index-timer))