From 65dade4838bede062328fbae3cabb47bffd80137 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 16 Jul 2024 16:34:46 +0530 Subject: [PATCH 1/9] Create API endpoints to get user content configurations This is to be used by the new Next.js web client --- src/khoj/routers/api_config.py | 78 ++++++++++++++++++++++++++++++++-- 1 file changed, 75 insertions(+), 3 deletions(-) diff --git a/src/khoj/routers/api_config.py b/src/khoj/routers/api_config.py index edf2a648..96269f5a 100644 --- a/src/khoj/routers/api_config.py +++ b/src/khoj/routers/api_config.py @@ -10,19 +10,28 @@ from fastapi.responses import Response from starlette.authentication import has_required_scope, requires from khoj.database import adapters -from khoj.database.adapters import ConversationAdapters, EntryAdapters +from khoj.database.adapters import ( + ConversationAdapters, + EntryAdapters, + get_user_github_config, + get_user_notion_config, +) from khoj.database.models import Entry as DbEntry from khoj.database.models import ( GithubConfig, + GithubRepoConfig, KhojUser, LocalMarkdownConfig, LocalOrgConfig, LocalPdfConfig, LocalPlaintextConfig, NotionConfig, - Subscription, ) -from khoj.routers.helpers import CommonQueryParams, update_telemetry_state +from khoj.routers.helpers import ( + CommonQueryParams, + get_user_config, + update_telemetry_state, +) from khoj.utils import constants, state from khoj.utils.rawconfig import ( FullConfig, @@ -98,6 +107,69 @@ def _initialize_config(): state.config.search_type = SearchConfig.model_validate(constants.default_config["search-type"]) +@api_config.get("", response_class=Response) +@requires(["authenticated"]) +def get_config(request: Request, detailed: Optional[bool] = False) -> Response: + user = request.user.object + user_config = get_user_config(user, request, is_detailed=detailed) + del user_config["request"] + + # Return config data as a JSON response + return Response(content=json.dumps(user_config), media_type="application/json", 
status_code=200) + + +@api_config.get("/content/github", response_class=Response) +@requires(["authenticated"]) +def get_content_github(request: Request) -> Response: + user = request.user.object + user_config = get_user_config(user, request) + del user_config["request"] + + current_github_config = get_user_github_config(user) + + if current_github_config: + raw_repos = current_github_config.githubrepoconfig.all() + repos = [] + for repo in raw_repos: + repos.append( + GithubRepoConfig( + name=repo.name, + owner=repo.owner, + branch=repo.branch, + ) + ) + current_config = GithubContentConfig( + pat_token=current_github_config.pat_token, + repos=repos, + ) + current_config = json.loads(current_config.json()) + else: + current_config = {} # type: ignore + + user_config["current_config"] = current_config + + # Return config data as a JSON response + return Response(content=json.dumps(user_config), media_type="application/json", status_code=200) + + +@api_config.get("/content/notion", response_class=Response) +@requires(["authenticated"]) +def get_content_notion(request: Request) -> Response: + user = request.user.object + user_config = get_user_config(user, request) + del user_config["request"] + + current_notion_config = get_user_notion_config(user) + token = current_notion_config.token if current_notion_config else "" + current_config = NotionContentConfig(token=token) + current_config = json.loads(current_config.model_dump_json()) + + user_config["current_config"] = current_config + + # Return config data as a JSON response + return Response(content=json.dumps(user_config), media_type="application/json", status_code=200) + + @api_config.post("/content/github", status_code=200) @requires(["authenticated"]) async def set_content_github( From 5923b6d89eaa4a0c0d51888343350ab2cbc983b0 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 18 Jul 2024 22:30:52 +0530 Subject: [PATCH 2/9] Split /api/v1/index/update into /api/content PUT, PATCH API endpoints - 
This utilizes PUT, PATCH HTTP method semantics to remove need for the "regenerate" query param and "/update" url suffix - This should make the url more succinct and API request intent more understandable by using existing HTTP method semantics --- src/interface/desktop/main.js | 8 ++- src/interface/emacs/khoj.el | 4 +- src/interface/obsidian/src/utils.ts | 5 +- src/interface/web/app/common/chatFunctions.ts | 4 +- src/khoj/configure.py | 6 +-- src/khoj/interface/web/chat.html | 4 +- .../routers/{indexer.py => api_content.py} | 54 +++++++++++++++---- tests/conftest.py | 2 +- tests/test_client.py | 22 ++++---- tests/test_multiple_users.py | 4 +- 10 files changed, 76 insertions(+), 37 deletions(-) rename src/khoj/routers/{indexer.py => api_content.py} (74%) diff --git a/src/interface/desktop/main.js b/src/interface/desktop/main.js index 76abe63c..618825e1 100644 --- a/src/interface/desktop/main.js +++ b/src/interface/desktop/main.js @@ -233,11 +233,15 @@ function pushDataToKhoj (regenerate = false) { // Request indexing files on server. With upto 1000 files in each request for (let i = 0; i < filesDataToPush.length; i += 1000) { + const syncUrl = `${hostURL}/api/content?client=desktop`; const filesDataGroup = filesDataToPush.slice(i, i + 1000); const formData = new FormData(); filesDataGroup.forEach(fileData => { formData.append('files', fileData.blob, fileData.path) }); - let request = axios.post(`${hostURL}/api/v1/index/update?force=${regenerate}&client=desktop`, formData, { headers }); - requests.push(request); + requests.push( + regenerate + ? axios.put(syncUrl, formData, { headers }) + : axios.patch(syncUrl, formData, { headers }) + ); } // Wait for requests batch to finish diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index c5a07cde..04c821e1 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -424,12 +424,12 @@ Auto invokes setup steps on calling main entrypoint." 
"Send multi-part form `BODY' of `CONTENT-TYPE' in request to khoj server. Append 'TYPE-QUERY' as query parameter in request url. Specify `BOUNDARY' used to separate files in request header." - (let ((url-request-method "POST") + (let ((url-request-method (if force "PUT" "PATCH")) (url-request-data body) (url-request-extra-headers `(("content-type" . ,(format "multipart/form-data; boundary=%s" boundary)) ("Authorization" . ,(format "Bearer %s" khoj-api-key))))) (with-current-buffer - (url-retrieve (format "%s/api/v1/index/update?%s&force=%s&client=emacs" khoj-server-url type-query (or force "false")) + (url-retrieve (format "%s/api/content?%s&client=emacs" khoj-server-url type-query) ;; render response from indexing API endpoint on server (lambda (status) (if (not (plist-get status :error)) diff --git a/src/interface/obsidian/src/utils.ts b/src/interface/obsidian/src/utils.ts index 5c8b3cf9..55e3f63a 100644 --- a/src/interface/obsidian/src/utils.ts +++ b/src/interface/obsidian/src/utils.ts @@ -89,10 +89,11 @@ export async function updateContentIndex(vault: Vault, setting: KhojSetting, las for (let i = 0; i < fileData.length; i += 1000) { const filesGroup = fileData.slice(i, i + 1000); const formData = new FormData(); + const method = regenerate ? 
"PUT" : "PATCH"; filesGroup.forEach(fileItem => { formData.append('files', fileItem.blob, fileItem.path) }); // Call Khoj backend to update index with all markdown, pdf files - const response = await fetch(`${setting.khojUrl}/api/v1/index/update?force=${regenerate}&client=obsidian`, { - method: 'POST', + const response = await fetch(`${setting.khojUrl}/api/content?client=obsidian`, { + method: method, headers: { 'Authorization': `Bearer ${setting.khojApiKey}`, }, diff --git a/src/interface/web/app/common/chatFunctions.ts b/src/interface/web/app/common/chatFunctions.ts index 480f6746..0e299823 100644 --- a/src/interface/web/app/common/chatFunctions.ts +++ b/src/interface/web/app/common/chatFunctions.ts @@ -275,8 +275,8 @@ export function uploadDataForIndexing( // Wait for all files to be read before making the fetch request Promise.all(fileReadPromises) .then(() => { - return fetch("/api/v1/index/update?force=false&client=web", { - method: "POST", + return fetch("/api/content?client=web", { + method: "PATCH", body: formData, }); }) diff --git a/src/khoj/configure.py b/src/khoj/configure.py index 819fc15d..44dcf584 100644 --- a/src/khoj/configure.py +++ b/src/khoj/configure.py @@ -42,7 +42,7 @@ from khoj.database.adapters import ( ) from khoj.database.models import ClientApplication, KhojUser, ProcessLock, Subscription from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel -from khoj.routers.indexer import configure_content, configure_search +from khoj.routers.api_content import configure_content, configure_search from khoj.routers.twilio import is_twilio_enabled from khoj.utils import constants, state from khoj.utils.config import SearchType @@ -309,7 +309,7 @@ def configure_routes(app): from khoj.routers.api_agents import api_agents from khoj.routers.api_chat import api_chat from khoj.routers.api_config import api_config - from khoj.routers.indexer import indexer + from khoj.routers.api_content import api_content from khoj.routers.notion import 
notion_router from khoj.routers.web_client import web_client @@ -317,7 +317,7 @@ def configure_routes(app): app.include_router(api_chat, prefix="/api/chat") app.include_router(api_agents, prefix="/api/agents") app.include_router(api_config, prefix="/api/configure") - app.include_router(indexer, prefix="/api/v1/index") + app.include_router(api_content, prefix="/api/content") app.include_router(notion_router, prefix="/api/notion") app.include_router(web_client) diff --git a/src/khoj/interface/web/chat.html b/src/khoj/interface/web/chat.html index 38007ce2..fbbe6a3a 100644 --- a/src/khoj/interface/web/chat.html +++ b/src/khoj/interface/web/chat.html @@ -998,8 +998,8 @@ To get started, just start typing below. You can also type / to see a list of co // Wait for all files to be read before making the fetch request Promise.all(fileReadPromises) .then(() => { - return fetch("/api/v1/index/update?force=false&client=web", { - method: "POST", + return fetch("/api/content?client=web", { + method: "PATCH", body: formData, }); }) diff --git a/src/khoj/routers/indexer.py b/src/khoj/routers/api_content.py similarity index 74% rename from src/khoj/routers/indexer.py rename to src/khoj/routers/api_content.py index 5c080cd4..d4f9f6ec 100644 --- a/src/khoj/routers/indexer.py +++ b/src/khoj/routers/api_content.py @@ -19,7 +19,7 @@ from khoj.utils.yaml import save_config_to_file_updated_state logger = logging.getLogger(__name__) -indexer = APIRouter() +api_content = APIRouter() class File(BaseModel): @@ -40,12 +40,11 @@ class IndexerInput(BaseModel): docx: Optional[dict[str, bytes]] = None -@indexer.post("/update") +@api_content.put("") @requires(["authenticated"]) -async def update( +async def put_content( request: Request, files: list[UploadFile], - force: bool = False, t: Optional[Union[state.SearchType, str]] = state.SearchType.All, client: Optional[str] = None, user_agent: Optional[str] = Header(None), @@ -59,8 +58,44 @@ async def update( subscribed_total_entries_size_limit=100, ) 
), +): + return await indexer(request, files, t, True, client, user_agent, referer, host) + + +@api_content.patch("") +@requires(["authenticated"]) +async def patch_content( + request: Request, + files: list[UploadFile], + t: Optional[Union[state.SearchType, str]] = state.SearchType.All, + client: Optional[str] = None, + user_agent: Optional[str] = Header(None), + referer: Optional[str] = Header(None), + host: Optional[str] = Header(None), + indexed_data_limiter: ApiIndexedDataLimiter = Depends( + ApiIndexedDataLimiter( + incoming_entries_size_limit=10, + subscribed_incoming_entries_size_limit=25, + total_entries_size_limit=10, + subscribed_total_entries_size_limit=100, + ) + ), +): + return await indexer(request, files, t, False, client, user_agent, referer, host) + + +async def indexer( + request: Request, + files: list[UploadFile], + t: Optional[Union[state.SearchType, str]] = state.SearchType.All, + regenerate: bool = False, + client: Optional[str] = None, + user_agent: Optional[str] = Header(None), + referer: Optional[str] = Header(None), + host: Optional[str] = Header(None), ): user = request.user.object + method = "regenerate" if regenerate else "sync" index_files: Dict[str, Dict[str, str]] = { "org": {}, "markdown": {}, @@ -116,18 +151,17 @@ async def update( None, configure_content, indexer_input.model_dump(), - force, + regenerate, t, - False, user, ) if not success: - raise RuntimeError("Failed to update content index") - logger.info(f"Finished processing batch indexing request") + raise RuntimeError(f"Failed to {method} {t} data sent by {client} client into content index") + logger.info(f"Finished {method} {t} data sent by {client} client into content index") except Exception as e: - logger.error(f"Failed to process batch indexing request: {e}", exc_info=True) + logger.error(f"Failed to {method} {t} data sent by {client} client into content index: {e}", exc_info=True) logger.error( - f'🚨 Failed to {"force " if force else ""}update {t} content index 
triggered via API call by {client} client: {e}', + f"🚨 Failed to {method} {t} data sent by {client} client into content index: {e}", exc_info=True, ) return Response(content="Failed", status_code=500) diff --git a/tests/conftest.py b/tests/conftest.py index a16413a0..61578ce2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -25,7 +25,7 @@ from khoj.database.models import ( from khoj.processor.content.org_mode.org_to_entries import OrgToEntries from khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel -from khoj.routers.indexer import configure_content +from khoj.routers.api_content import configure_content from khoj.search_type import text_search from khoj.utils import fs_syncer, state from khoj.utils.config import SearchModels diff --git a/tests/test_client.py b/tests/test_client.py index 0fcdb733..b7c11590 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -75,7 +75,7 @@ def test_index_update_with_no_auth_key(client): files = get_sample_files_data() # Act - response = client.post("/api/v1/index/update", files=files) + response = client.patch("/api/content", files=files) # Assert assert response.status_code == 403 @@ -89,7 +89,7 @@ def test_index_update_with_invalid_auth_key(client): headers = {"Authorization": "Bearer kk-invalid-token"} # Act - response = client.post("/api/v1/index/update", files=files, headers=headers) + response = client.patch("/api/content", files=files, headers=headers) # Assert assert response.status_code == 403 @@ -130,7 +130,7 @@ def test_index_update_big_files(client): headers = {"Authorization": "Bearer kk-secret"} # Act - response = client.post("/api/v1/index/update", files=files, headers=headers) + response = client.patch("/api/content", files=files, headers=headers) # Assert assert response.status_code == 429 @@ -146,7 +146,7 @@ def test_index_update_medium_file_unsubscribed(client, api_user4: KhojApiUser): headers 
= {"Authorization": f"Bearer {api_token}"} # Act - response = client.post("/api/v1/index/update", files=files, headers=headers) + response = client.patch("/api/content", files=files, headers=headers) # Assert assert response.status_code == 429 @@ -162,7 +162,7 @@ def test_index_update_normal_file_unsubscribed(client, api_user4: KhojApiUser): headers = {"Authorization": f"Bearer {api_token}"} # Act - response = client.post("/api/v1/index/update", files=files, headers=headers) + response = client.patch("/api/content", files=files, headers=headers) # Assert assert response.status_code == 200 @@ -177,7 +177,7 @@ def test_index_update_big_files_no_billing(client): headers = {"Authorization": "Bearer kk-secret"} # Act - response = client.post("/api/v1/index/update", files=files, headers=headers) + response = client.patch("/api/content", files=files, headers=headers) # Assert assert response.status_code == 200 @@ -191,7 +191,7 @@ def test_index_update(client): headers = {"Authorization": "Bearer kk-secret"} # Act - response = client.post("/api/v1/index/update", files=files, headers=headers) + response = client.patch("/api/content", files=files, headers=headers) # Assert assert response.status_code == 200 @@ -208,8 +208,8 @@ def test_index_update_fails_if_more_than_1000_files(client, api_user4: KhojApiUs headers = {"Authorization": f"Bearer {api_token}"} # Act - ok_response = client.post("/api/v1/index/update", files=files[:1000], headers=headers) - bad_response = client.post("/api/v1/index/update", files=files, headers=headers) + ok_response = client.patch("/api/content", files=files[:1000], headers=headers) + bad_response = client.patch("/api/content", files=files, headers=headers) # Assert assert ok_response.status_code == 200 @@ -226,7 +226,7 @@ def test_regenerate_with_valid_content_type(client): headers = {"Authorization": "Bearer kk-secret"} # Act - response = client.post(f"/api/v1/index/update?t={content_type}", files=files, headers=headers) + response = 
client.patch(f"/api/content?t={content_type}", files=files, headers=headers) # Assert assert response.status_code == 200, f"Returned status: {response.status_code} for content type: {content_type}" @@ -243,7 +243,7 @@ def test_regenerate_with_github_fails_without_pat(client): files = get_sample_files_data() # Act - response = client.post(f"/api/v1/index/update?t=github", files=files, headers=headers) + response = client.patch(f"/api/content?t=github", files=files, headers=headers) # Assert assert response.status_code == 200, f"Returned status: {response.status_code} for content type: github" diff --git a/tests/test_multiple_users.py b/tests/test_multiple_users.py index 4e8e456a..d8f8725e 100644 --- a/tests/test_multiple_users.py +++ b/tests/test_multiple_users.py @@ -29,7 +29,7 @@ def test_index_update_with_user2(client, api_user2: KhojApiUser): source_file_symbol = set([f[1][0] for f in files]) headers = {"Authorization": f"Bearer {api_user2.token}"} - update_response = client.post("/api/v1/index/update", files=files, headers=headers) + update_response = client.patch("/api/content", files=files, headers=headers) search_response = client.get("/api/search?q=hardware&t=all", headers=headers) results = search_response.json() @@ -47,7 +47,7 @@ def test_index_update_with_user2_inaccessible_user1(client, api_user2: KhojApiUs source_file_symbol = set([f[1][0] for f in files]) headers = {"Authorization": f"Bearer {api_user2.token}"} - update_response = client.post("/api/v1/index/update", files=files, headers=headers) + update_response = client.patch("/api/content", files=files, headers=headers) # Act headers = {"Authorization": f"Bearer {api_user.token}"} From bba4e0b529b30e4c1785efd227c99140e1c0dfbd Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 18 Jul 2024 23:24:12 +0530 Subject: [PATCH 3/9] Accept file deletion requests by clients during sync - Remove unused full_corpus boolean. 
The full_corpus=False code path wasn't being used (except in a test) - The full_corpus=True code path used was ignoring file deletion requests sent by clients during sync. Unclear why this was done - Added unit test to prevent regression and show file deletion by clients during sync is not ignored now --- .../processor/content/docx/docx_to_entries.py | 13 ++--- .../content/github/github_to_entries.py | 4 +- .../content/images/image_to_entries.py | 13 ++--- .../content/markdown/markdown_to_entries.py | 13 ++--- .../content/notion/notion_to_entries.py | 4 +- .../content/org_mode/org_to_entries.py | 13 ++--- .../processor/content/pdf/pdf_to_entries.py | 13 ++--- .../content/plaintext/plaintext_to_entries.py | 13 ++--- src/khoj/processor/content/text_to_entries.py | 4 +- src/khoj/routers/helpers.py | 9 ---- src/khoj/routers/notion.py | 2 +- src/khoj/search_type/text_search.py | 9 ++-- src/khoj/utils/fs_syncer.py | 2 +- tests/test_text_search.py | 52 ++++++++++++++++++- 14 files changed, 84 insertions(+), 80 deletions(-) diff --git a/src/khoj/processor/content/docx/docx_to_entries.py b/src/khoj/processor/content/docx/docx_to_entries.py index ab28066d..00ed3ca4 100644 --- a/src/khoj/processor/content/docx/docx_to_entries.py +++ b/src/khoj/processor/content/docx/docx_to_entries.py @@ -19,16 +19,11 @@ class DocxToEntries(TextToEntries): super().__init__() # Define Functions - def process( - self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False - ) -> Tuple[int, int]: + def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]: # Extract required fields from config - if not full_corpus: - deletion_file_names = set([file for file in files if files[file] == b""]) - files_to_process = set(files) - deletion_file_names - files = {file: files[file] for file in files_to_process} - else: - deletion_file_names = None + deletion_file_names = set([file for file in files 
if files[file] == b""]) + files_to_process = set(files) - deletion_file_names + files = {file: files[file] for file in files_to_process} # Extract Entries from specified Docx files with timer("Extract entries from specified DOCX files", logger): diff --git a/src/khoj/processor/content/github/github_to_entries.py b/src/khoj/processor/content/github/github_to_entries.py index 2aa63d4e..1f3dea00 100644 --- a/src/khoj/processor/content/github/github_to_entries.py +++ b/src/khoj/processor/content/github/github_to_entries.py @@ -48,9 +48,7 @@ class GithubToEntries(TextToEntries): else: return - def process( - self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False - ) -> Tuple[int, int]: + def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]: if self.config.pat_token is None or self.config.pat_token == "": logger.error(f"Github PAT token is not set. Skipping github content") raise ValueError("Github PAT token is not set. 
Skipping github content") diff --git a/src/khoj/processor/content/images/image_to_entries.py b/src/khoj/processor/content/images/image_to_entries.py index 20705a0f..d28518b7 100644 --- a/src/khoj/processor/content/images/image_to_entries.py +++ b/src/khoj/processor/content/images/image_to_entries.py @@ -20,16 +20,11 @@ class ImageToEntries(TextToEntries): super().__init__() # Define Functions - def process( - self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False - ) -> Tuple[int, int]: + def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]: # Extract required fields from config - if not full_corpus: - deletion_file_names = set([file for file in files if files[file] == b""]) - files_to_process = set(files) - deletion_file_names - files = {file: files[file] for file in files_to_process} - else: - deletion_file_names = None + deletion_file_names = set([file for file in files if files[file] == b""]) + files_to_process = set(files) - deletion_file_names + files = {file: files[file] for file in files_to_process} # Extract Entries from specified image files with timer("Extract entries from specified Image files", logger): diff --git a/src/khoj/processor/content/markdown/markdown_to_entries.py b/src/khoj/processor/content/markdown/markdown_to_entries.py index f18e1e21..fdb0c549 100644 --- a/src/khoj/processor/content/markdown/markdown_to_entries.py +++ b/src/khoj/processor/content/markdown/markdown_to_entries.py @@ -19,16 +19,11 @@ class MarkdownToEntries(TextToEntries): super().__init__() # Define Functions - def process( - self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False - ) -> Tuple[int, int]: + def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]: # Extract required fields from config - if not full_corpus: - deletion_file_names = 
set([file for file in files if files[file] == ""]) - files_to_process = set(files) - deletion_file_names - files = {file: files[file] for file in files_to_process} - else: - deletion_file_names = None + deletion_file_names = set([file for file in files if files[file] == ""]) + files_to_process = set(files) - deletion_file_names + files = {file: files[file] for file in files_to_process} max_tokens = 256 # Extract Entries from specified Markdown files diff --git a/src/khoj/processor/content/notion/notion_to_entries.py b/src/khoj/processor/content/notion/notion_to_entries.py index 57456ed5..c53d4020 100644 --- a/src/khoj/processor/content/notion/notion_to_entries.py +++ b/src/khoj/processor/content/notion/notion_to_entries.py @@ -78,9 +78,7 @@ class NotionToEntries(TextToEntries): self.body_params = {"page_size": 100} - def process( - self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False - ) -> Tuple[int, int]: + def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]: current_entries = [] # Get all pages diff --git a/src/khoj/processor/content/org_mode/org_to_entries.py b/src/khoj/processor/content/org_mode/org_to_entries.py index c528244d..1272da11 100644 --- a/src/khoj/processor/content/org_mode/org_to_entries.py +++ b/src/khoj/processor/content/org_mode/org_to_entries.py @@ -20,15 +20,10 @@ class OrgToEntries(TextToEntries): super().__init__() # Define Functions - def process( - self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False - ) -> Tuple[int, int]: - if not full_corpus: - deletion_file_names = set([file for file in files if files[file] == ""]) - files_to_process = set(files) - deletion_file_names - files = {file: files[file] for file in files_to_process} - else: - deletion_file_names = None + def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) 
-> Tuple[int, int]: + deletion_file_names = set([file for file in files if files[file] == ""]) + files_to_process = set(files) - deletion_file_names + files = {file: files[file] for file in files_to_process} # Extract Entries from specified Org files max_tokens = 256 diff --git a/src/khoj/processor/content/pdf/pdf_to_entries.py b/src/khoj/processor/content/pdf/pdf_to_entries.py index 45ff7261..59ffc388 100644 --- a/src/khoj/processor/content/pdf/pdf_to_entries.py +++ b/src/khoj/processor/content/pdf/pdf_to_entries.py @@ -22,16 +22,11 @@ class PdfToEntries(TextToEntries): super().__init__() # Define Functions - def process( - self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False - ) -> Tuple[int, int]: + def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]: # Extract required fields from config - if not full_corpus: - deletion_file_names = set([file for file in files if files[file] == b""]) - files_to_process = set(files) - deletion_file_names - files = {file: files[file] for file in files_to_process} - else: - deletion_file_names = None + deletion_file_names = set([file for file in files if files[file] == b""]) + files_to_process = set(files) - deletion_file_names + files = {file: files[file] for file in files_to_process} # Extract Entries from specified Pdf files with timer("Extract entries from specified PDF files", logger): diff --git a/src/khoj/processor/content/plaintext/plaintext_to_entries.py b/src/khoj/processor/content/plaintext/plaintext_to_entries.py index 2c994899..483e752f 100644 --- a/src/khoj/processor/content/plaintext/plaintext_to_entries.py +++ b/src/khoj/processor/content/plaintext/plaintext_to_entries.py @@ -20,15 +20,10 @@ class PlaintextToEntries(TextToEntries): super().__init__() # Define Functions - def process( - self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False - 
) -> Tuple[int, int]: - if not full_corpus: - deletion_file_names = set([file for file in files if files[file] == ""]) - files_to_process = set(files) - deletion_file_names - files = {file: files[file] for file in files_to_process} - else: - deletion_file_names = None + def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]: + deletion_file_names = set([file for file in files if files[file] == ""]) + files_to_process = set(files) - deletion_file_names + files = {file: files[file] for file in files_to_process} # Extract Entries from specified plaintext files with timer("Extract entries from specified Plaintext files", logger): diff --git a/src/khoj/processor/content/text_to_entries.py b/src/khoj/processor/content/text_to_entries.py index cdb2e207..6fee9c0c 100644 --- a/src/khoj/processor/content/text_to_entries.py +++ b/src/khoj/processor/content/text_to_entries.py @@ -31,9 +31,7 @@ class TextToEntries(ABC): self.date_filter = DateFilter() @abstractmethod - def process( - self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False - ) -> Tuple[int, int]: + def process(self, files: dict[str, str] = None, user: KhojUser = None, regenerate: bool = False) -> Tuple[int, int]: ... 
@staticmethod diff --git a/src/khoj/routers/helpers.py b/src/khoj/routers/helpers.py index 9bff0dc6..10984fdd 100644 --- a/src/khoj/routers/helpers.py +++ b/src/khoj/routers/helpers.py @@ -1313,7 +1313,6 @@ def configure_content( files: Optional[dict[str, dict[str, str]]], regenerate: bool = False, t: Optional[state.SearchType] = state.SearchType.All, - full_corpus: bool = True, user: KhojUser = None, ) -> bool: success = True @@ -1344,7 +1343,6 @@ def configure_content( OrgToEntries, files.get("org"), regenerate=regenerate, - full_corpus=full_corpus, user=user, ) except Exception as e: @@ -1362,7 +1360,6 @@ def configure_content( MarkdownToEntries, files.get("markdown"), regenerate=regenerate, - full_corpus=full_corpus, user=user, ) @@ -1379,7 +1376,6 @@ def configure_content( PdfToEntries, files.get("pdf"), regenerate=regenerate, - full_corpus=full_corpus, user=user, ) @@ -1398,7 +1394,6 @@ def configure_content( PlaintextToEntries, files.get("plaintext"), regenerate=regenerate, - full_corpus=full_corpus, user=user, ) @@ -1418,7 +1413,6 @@ def configure_content( GithubToEntries, None, regenerate=regenerate, - full_corpus=full_corpus, user=user, config=github_config, ) @@ -1439,7 +1433,6 @@ def configure_content( NotionToEntries, None, regenerate=regenerate, - full_corpus=full_corpus, user=user, config=notion_config, ) @@ -1459,7 +1452,6 @@ def configure_content( ImageToEntries, files.get("image"), regenerate=regenerate, - full_corpus=full_corpus, user=user, ) except Exception as e: @@ -1472,7 +1464,6 @@ def configure_content( DocxToEntries, files.get("docx"), regenerate=regenerate, - full_corpus=full_corpus, user=user, ) except Exception as e: diff --git a/src/khoj/routers/notion.py b/src/khoj/routers/notion.py index 9f5d803f..e61b5fd7 100644 --- a/src/khoj/routers/notion.py +++ b/src/khoj/routers/notion.py @@ -80,6 +80,6 @@ async def notion_auth_callback(request: Request, background_tasks: BackgroundTas notion_redirect = 
str(request.app.url_path_for("notion_config_page")) # Trigger an async job to configure_content. Let it run without blocking the response. - background_tasks.add_task(run_in_executor, configure_content, {}, False, SearchType.Notion, True, user) + background_tasks.add_task(run_in_executor, configure_content, {}, False, SearchType.Notion, user) return RedirectResponse(notion_redirect) diff --git a/src/khoj/search_type/text_search.py b/src/khoj/search_type/text_search.py index f3ce7110..93a2b724 100644 --- a/src/khoj/search_type/text_search.py +++ b/src/khoj/search_type/text_search.py @@ -199,17 +199,16 @@ def setup( text_to_entries: Type[TextToEntries], files: dict[str, str], regenerate: bool, - full_corpus: bool = True, user: KhojUser = None, config=None, -) -> None: +) -> Tuple[int, int]: if config: num_new_embeddings, num_deleted_embeddings = text_to_entries(config).process( - files=files, full_corpus=full_corpus, user=user, regenerate=regenerate + files=files, user=user, regenerate=regenerate ) else: num_new_embeddings, num_deleted_embeddings = text_to_entries().process( - files=files, full_corpus=full_corpus, user=user, regenerate=regenerate + files=files, user=user, regenerate=regenerate ) if files: @@ -219,6 +218,8 @@ def setup( f"Deleted {num_deleted_embeddings} entries. Created {num_new_embeddings} new entries for user {user} from files {file_names[:10]} ..." 
) + return num_new_embeddings, num_deleted_embeddings + def cross_encoder_score(query: str, hits: List[SearchResponse], search_model_name: str) -> List[SearchResponse]: """Score all retrieved entries using the cross-encoder""" diff --git a/src/khoj/utils/fs_syncer.py b/src/khoj/utils/fs_syncer.py index 5a20f418..ade55f34 100644 --- a/src/khoj/utils/fs_syncer.py +++ b/src/khoj/utils/fs_syncer.py @@ -122,7 +122,7 @@ def get_org_files(config: TextContentConfig): logger.debug("At least one of org-files or org-file-filter is required to be specified") return {} - "Get Org files to process" + # Get Org files to process absolute_org_files, filtered_org_files = set(), set() if org_files: absolute_org_files = {get_absolute_path(org_file) for org_file in org_files} diff --git a/tests/test_text_search.py b/tests/test_text_search.py index 915425bf..4529aa53 100644 --- a/tests/test_text_search.py +++ b/tests/test_text_search.py @@ -6,9 +6,16 @@ from pathlib import Path import pytest +from khoj.database.adapters import EntryAdapters from khoj.database.models import Entry, GithubConfig, KhojUser, LocalOrgConfig +from khoj.processor.content.docx.docx_to_entries import DocxToEntries from khoj.processor.content.github.github_to_entries import GithubToEntries +from khoj.processor.content.images.image_to_entries import ImageToEntries +from khoj.processor.content.markdown.markdown_to_entries import MarkdownToEntries from khoj.processor.content.org_mode.org_to_entries import OrgToEntries +from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries +from khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries +from khoj.processor.content.text_to_entries import TextToEntries from khoj.search_type import text_search from khoj.utils.fs_syncer import collect_files, get_org_files from khoj.utils.rawconfig import ContentConfig, SearchConfig @@ -151,7 +158,6 @@ async def test_text_search(search_config: SearchConfig): OrgToEntries, data, True, - True, default_user, 
) @@ -240,7 +246,6 @@ conda activate khoj OrgToEntries, data, regenerate=False, - full_corpus=False, user=default_user, ) @@ -396,6 +401,49 @@ def test_update_index_with_new_entry(content_config: ContentConfig, new_org_file verify_embeddings(3, default_user) +# ---------------------------------------------------------------------------------------------------- +@pytest.mark.django_db +@pytest.mark.parametrize( + "text_to_entries", + [ + (OrgToEntries), + ], +) +def test_update_index_with_deleted_file( + org_config_with_only_new_file: LocalOrgConfig, text_to_entries: TextToEntries, default_user: KhojUser +): + "Delete entries associated with new file when file path with empty content passed." + # Arrange + file_to_index = "test" + new_entry = "* TODO A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n" + initial_data = {file_to_index: new_entry} + final_data = {file_to_index: ""} + + # Act + # load entries after adding file + initial_added_entries, _ = text_search.setup(text_to_entries, initial_data, regenerate=True, user=default_user) + initial_total_entries = EntryAdapters.get_existing_entry_hashes_by_file(default_user, file_to_index).count() + + # load entries after deleting file + final_added_entries, final_deleted_entries = text_search.setup( + text_to_entries, final_data, regenerate=False, user=default_user + ) + final_total_entries = EntryAdapters.get_existing_entry_hashes_by_file(default_user, file_to_index).count() + + # Assert + assert initial_total_entries > 0, "File entries not indexed" + assert initial_added_entries > 0, "No entries got added" + + assert final_total_entries == 0, "File did not get deleted" + assert final_added_entries == 0, "Entries were unexpectedly added in delete entries pass" + assert final_deleted_entries == initial_added_entries, "All added entries were not deleted" + + verify_embeddings(0, default_user), "Embeddings still exist for user" + + # Clean up + 
EntryAdapters.delete_all_entries(default_user) + + # ---------------------------------------------------------------------------------------------------- @pytest.mark.skipif(os.getenv("GITHUB_PAT_TOKEN") is None, reason="GITHUB_PAT_TOKEN not set") def test_text_search_setup_github(content_config: ContentConfig, default_user: KhojUser): From 469a1cb6a235aeb28d4d8cdbd8ad59adaab0e74b Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Fri, 19 Jul 2024 00:00:49 +0530 Subject: [PATCH 4/9] Move API endpoints under /api/configure/content/ to /api/content/ Pull out /api/configure/content API endpoints into /api/content to allow for more logical organization of API path hierarchy This should make the URL more succinct and API request intent more understandable by using existing HTTP method semantics along with the path. The /configure URL path segment was either - redundant (e.g. POST /configure/notion) or - incorrect (e.g. GET /configure/files) Some examples of naming improvements: - GET /configure/types -> GET /content/types - GET /configure/files -> GET /content/files - DELETE /configure/files -> DELETE /content/files This should also align, merge better with the content indexing API triggered via PUT, PATCH /content Refactor Flow 1. Rename /api/configure/types -> /api/content/types 2. Rename /api/configure -> /api 3. 
Move /api/content to api_content from under api_config --- src/interface/desktop/search.html | 2 +- src/interface/emacs/khoj.el | 2 +- src/khoj/interface/web/chat.html | 2 +- .../web/content_source_computer_input.html | 6 +- .../web/content_source_github_input.html | 2 +- .../web/content_source_notion_input.html | 2 +- src/khoj/interface/web/search.html | 2 +- src/khoj/interface/web/settings.html | 8 +- src/khoj/routers/api_config.py | 324 +----------------- src/khoj/routers/api_content.py | 315 ++++++++++++++++- tests/test_client.py | 6 +- 11 files changed, 331 insertions(+), 340 deletions(-) diff --git a/src/interface/desktop/search.html b/src/interface/desktop/search.html index 6a6cf694..792470a6 100644 --- a/src/interface/desktop/search.html +++ b/src/interface/desktop/search.html @@ -212,7 +212,7 @@ const headers = { 'Authorization': `Bearer ${khojToken}` }; // Populate type dropdown field with enabled content types only - fetch(`${hostURL}/api/configure/types`, { headers }) + fetch(`${hostURL}/api/content/types`, { headers }) .then(response => response.json()) .then(enabled_types => { // Show warning if no content types are enabled diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index 04c821e1..6f6747a8 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -697,7 +697,7 @@ Optionally apply CALLBACK with JSON parsed response and CBARGS." (defun khoj--get-enabled-content-types () "Get content types enabled for search from API." - (khoj--call-api "/api/configure/types" "GET" nil `(lambda (item) (mapcar #'intern item)))) + (khoj--call-api "/api/content/types" "GET" nil `(lambda (item) (mapcar #'intern item)))) (defun khoj--query-search-api-and-render-results (query content-type buffer-name &optional rerank is-find-similar) "Query Khoj Search API with QUERY, CONTENT-TYPE and RERANK as query params. 
diff --git a/src/khoj/interface/web/chat.html b/src/khoj/interface/web/chat.html index fbbe6a3a..149a0a66 100644 --- a/src/khoj/interface/web/chat.html +++ b/src/khoj/interface/web/chat.html @@ -1954,7 +1954,7 @@ To get started, just start typing below. You can also type / to see a list of co } var allFiles; function renderAllFiles() { - fetch('/api/configure/content/computer') + fetch('/api/content/computer') .then(response => response.json()) .then(data => { var indexedFiles = document.getElementsByClassName("indexed-files")[0]; diff --git a/src/khoj/interface/web/content_source_computer_input.html b/src/khoj/interface/web/content_source_computer_input.html index e0ffc4e9..fd66360d 100644 --- a/src/khoj/interface/web/content_source_computer_input.html +++ b/src/khoj/interface/web/content_source_computer_input.html @@ -32,7 +32,7 @@