From 1ab59865b5d1ed6847777f7606718ff01fbeb2c8 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 6 Jul 2024 12:55:51 +0530 Subject: [PATCH 1/7] Improve scaling admin flow to delete all entries for user --- src/khoj/app/settings.py | 2 +- src/khoj/database/admin.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/khoj/app/settings.py b/src/khoj/app/settings.py index 2cb4440b..93f449ab 100644 --- a/src/khoj/app/settings.py +++ b/src/khoj/app/settings.py @@ -112,7 +112,7 @@ ASGI_APPLICATION = "app.asgi.application" # Database # https://docs.djangoproject.com/en/4.2/ref/settings/#databases - +DATA_UPLOAD_MAX_NUMBER_FIELDS = 20000 DATABASES = { "default": { "ENGINE": "django.db.backends.postgresql", diff --git a/src/khoj/database/admin.py b/src/khoj/database/admin.py index 95e3508c..d3e630c3 100644 --- a/src/khoj/database/admin.py +++ b/src/khoj/database/admin.py @@ -125,7 +125,10 @@ class EntryAdmin(admin.ModelAdmin): "file_path", ) search_fields = ("id", "user__email", "user__username", "file_path") - list_filter = ("file_type",) + list_filter = ( + "file_type", + "user__email", + ) ordering = ("-created_at",) From e6ffb6b52c0d374f1ca3e3db250c8818928ebbc3 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 6 Jul 2024 14:41:47 +0530 Subject: [PATCH 2/7] Improve scaling user flow to delete all entries - Delete entries by batch to improve efficiency of query at scale - Share code to delete all user entries between it's async, sync methods - Add indicator to show when files being deleted on web config page --- src/khoj/database/adapters/__init__.py | 40 +++++++++++-------- .../web/content_source_computer_input.html | 18 +++++---- src/khoj/processor/content/text_to_entries.py | 2 +- src/khoj/routers/api_config.py | 2 +- 4 files changed, 37 insertions(+), 25 deletions(-) diff --git a/src/khoj/database/adapters/__init__.py b/src/khoj/database/adapters/__init__.py index 2ea9e9af..3a21a919 100644 --- a/src/khoj/database/adapters/__init__.py +++ b/src/khoj/database/adapters/__init__.py @@ -1012,27 +1012,35 @@ class EntryAdapters: return deleted_count @staticmethod - def delete_all_entries_by_type(user: KhojUser, file_type: str = None): - if file_type is None: - deleted_count, _ = Entry.objects.filter(user=user).delete() - else: - deleted_count, _ = Entry.objects.filter(user=user, file_type=file_type).delete() + def get_entries_by_batch(user: KhojUser, batch_size: int, file_type: str = None, file_source: str = None): + queryset = Entry.objects.filter(user=user) + + if file_type is not None: + queryset = queryset.filter(file_type=file_type) + + if file_source is not None: + queryset = queryset.filter(file_source=file_source) + + while queryset.exists(): + batch_ids = list(queryset.values_list("id", flat=True)[:batch_size]) + yield Entry.objects.filter(id__in=batch_ids) + + @staticmethod + def delete_all_entries(user: KhojUser, file_type: str = None, file_source: str = None, batch_size=1000): + deleted_count = 0 + for batch in EntryAdapters.get_entries_by_batch(user, batch_size, file_type, file_source): + count, _ = batch.delete() + deleted_count += count return deleted_count @staticmethod - def delete_all_entries(user: KhojUser, file_source: str = None): - if file_source is None: - deleted_count, _ = Entry.objects.filter(user=user).delete() - else: - deleted_count, _ = Entry.objects.filter(user=user, file_source=file_source).delete() + async def adelete_all_entries(user: KhojUser, file_type: str = None, file_source: str = None, batch_size=1000): + deleted_count = 0 + async for batch in EntryAdapters.get_entries_by_batch(user, batch_size, file_type, file_source): + count, _ = await batch.adelete() + deleted_count += count return deleted_count - @staticmethod - async def adelete_all_entries(user: KhojUser, file_source: str = None): - if file_source is None: - return await Entry.objects.filter(user=user).adelete() - return await Entry.objects.filter(user=user, file_source=file_source).adelete() - @staticmethod def get_existing_entry_hashes_by_file(user: KhojUser, file_path: str): return Entry.objects.filter(user=user, file_path=file_path).values_list("hashed_value", flat=True) diff --git a/src/khoj/interface/web/content_source_computer_input.html b/src/khoj/interface/web/content_source_computer_input.html index 77816f35..77ce2287 100644 --- a/src/khoj/interface/web/content_source_computer_input.html +++ b/src/khoj/interface/web/content_source_computer_input.html @@ -12,7 +12,7 @@
- +
@@ -112,9 +112,13 @@ // Get all currently indexed files on page load getAllComputerFilenames(); - let deleteAllComputerFilesButton = document.getElementById("delete-all-files"); + let deleteAllComputerFilesButton = document.getElementById("delete-all-files-button"); deleteAllComputerFilesButton.addEventListener("click", function(event) { event.preventDefault(); + originalDeleteAllComputerFilesButtonText = deleteAllComputerFilesButton.textContent; + deleteAllComputerFilesButton.textContent = "🗑️ Deleting..."; + deleteAllComputerFilesButton.disabled = true; + fetch('/api/config/data/content-source/computer', { method: 'DELETE', headers: { @@ -122,11 +126,11 @@ } }) .then(response => response.json()) - .then(data => { - if (data.status == "ok") { - getAllComputerFilenames(); - } - }) + .finally(() => { + getAllComputerFilenames(); + deleteAllComputerFilesButton.textContent = originalDeleteAllComputerFilesButtonText; + deleteAllComputerFilesButton.disabled = false; + }); }); {% endblock %} diff --git a/src/khoj/processor/content/text_to_entries.py b/src/khoj/processor/content/text_to_entries.py index 49331d6b..af0f95d9 100644 --- a/src/khoj/processor/content/text_to_entries.py +++ b/src/khoj/processor/content/text_to_entries.py @@ -137,7 +137,7 @@ class TextToEntries(ABC): if regenerate: with timer("Cleared existing dataset for regeneration in", logger): logger.debug(f"Deleting all entries for file type {file_type}") - num_deleted_entries = EntryAdapters.delete_all_entries_by_type(user, file_type) + num_deleted_entries = EntryAdapters.delete_all_entries(user, file_type=file_type) hashes_to_process = set() with timer("Identified entries to add to database in", logger): diff --git a/src/khoj/routers/api_config.py b/src/khoj/routers/api_config.py index 10b1044c..58a8abae 100644 --- a/src/khoj/routers/api_config.py +++ b/src/khoj/routers/api_config.py @@ -183,7 +183,7 @@ async def remove_content_source_data( raise ValueError(f"Invalid content source: {content_source}") elif content_object != "Computer": await content_object.objects.filter(user=user).adelete() - await sync_to_async(EntryAdapters.delete_all_entries)(user, content_source) + await sync_to_async(EntryAdapters.delete_all_entries)(user, file_source=content_source) enabled_content = await sync_to_async(EntryAdapters.get_unique_file_types)(user) return {"status": "ok"} From 6a135b1ed795e0c788ecb7535b7f73946901e782 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 6 Jul 2024 15:38:10 +0530 Subject: [PATCH 3/7] Fix degrade in speed of indexing large files. Improve summarization Adding files to the DB for summarization was slow, buggy in two ways: - We were updating same text of modified files in DB = no of chunks per file times - The `" ".join(file_content)' code was breaking each character in the file content by a space. This formats the original file content incorrectly before storing in the DB Because this code ran in the main file indexing path, it was slowing down file indexing. Knowledge bases with larger files were impacted more strongly --- src/khoj/database/adapters/__init__.py | 2 +- src/khoj/processor/content/text_to_entries.py | 23 ++++++++++--------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/khoj/database/adapters/__init__.py b/src/khoj/database/adapters/__init__.py index 3a21a919..cfbe7ca6 100644 --- a/src/khoj/database/adapters/__init__.py +++ b/src/khoj/database/adapters/__init__.py @@ -956,7 +956,7 @@ class FileObjectAdapters: return FileObject.objects.create(user=user, file_name=file_name, raw_text=raw_text) @staticmethod - def get_file_objects_by_name(user: KhojUser, file_name: str): + def get_file_object_by_name(user: KhojUser, file_name: str): return FileObject.objects.filter(user=user, file_name=file_name).first() @staticmethod diff --git a/src/khoj/processor/content/text_to_entries.py b/src/khoj/processor/content/text_to_entries.py index af0f95d9..cdb2e207 100644 --- a/src/khoj/processor/content/text_to_entries.py +++ b/src/khoj/processor/content/text_to_entries.py @@ -124,7 +124,7 @@ class TextToEntries(ABC): deletion_filenames: Set[str] = None, user: KhojUser = None, regenerate: bool = False, - file_to_text_map: dict[str, List[str]] = None, + file_to_text_map: dict[str, str] = None, ): with timer("Constructed current entry hashes in", logger): hashes_by_file = dict[str, set[str]]() @@ -192,16 +192,17 @@ class TextToEntries(ABC): logger.debug(f"Added {len(added_entries)} {file_type} entries to database") if file_to_text_map: - # get the list of file_names using added_entries - filenames_to_update = [entry.file_path for entry in added_entries] - # for each file_name in filenames_to_update, try getting the file object and updating raw_text and if it fails create a new file object - for file_name in filenames_to_update: - raw_text = " ".join(file_to_text_map[file_name]) - file_object = FileObjectAdapters.get_file_objects_by_name(user, file_name) - if file_object: - FileObjectAdapters.update_raw_text(file_object, raw_text) - else: - FileObjectAdapters.create_file_object(user, file_name, raw_text) + with timer("Indexed text of modified file in", logger): + # get the set of modified files from added_entries + modified_files = {entry.file_path for entry in added_entries} + # create or update text of each updated file indexed on DB + for modified_file in modified_files: + raw_text = file_to_text_map[modified_file] + file_object = FileObjectAdapters.get_file_object_by_name(user, modified_file) + if file_object: + FileObjectAdapters.update_raw_text(file_object, raw_text) + else: + FileObjectAdapters.create_file_object(user, modified_file, raw_text) new_dates = [] with timer("Indexed dates from added entries in", logger): From 010486fb3617349a0f29eadaa0077c1b3d5ffbef Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 6 Jul 2024 16:58:05 +0530 Subject: [PATCH 4/7] Split current section once by heading to resolve org-mode indexing bug - Split once by heading (=first_non_empty) to extract current section body Otherwise child headings with same prefix as current heading will cause the section split to go into infinite loop - Also add check to prevent getting into recursive loop while trying to split entry into sub sections --- .../content/org_mode/org_to_entries.py | 12 ++++++-- tests/test_org_to_entries.py | 30 +++++++++++++++++++ 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/src/khoj/processor/content/org_mode/org_to_entries.py b/src/khoj/processor/content/org_mode/org_to_entries.py index af85a6bd..c528244d 100644 --- a/src/khoj/processor/content/org_mode/org_to_entries.py +++ b/src/khoj/processor/content/org_mode/org_to_entries.py @@ -115,14 +115,20 @@ class OrgToEntries(TextToEntries): return entries, entry_to_file_map # Split this entry tree into sections by the next heading level in it - # Increment heading level until able to split entry into sections + # Increment heading level until able to split entry into sections or reach max heading level # A successful split will result in at least 2 sections + max_heading_level = 100 next_heading_level = len(ancestry) sections: List[str] = [] - while len(sections) < 2: + while len(sections) < 2 and next_heading_level < max_heading_level: next_heading_level += 1 sections = re.split(rf"(\n|^)(?=[*]{{{next_heading_level}}} .+\n?)", org_content, flags=re.MULTILINE) + # If unable to split entry into sections, log error and skip indexing it + if next_heading_level == max_heading_level: + logger.error(f"Unable to split current entry chunk: {org_content_with_ancestry[:20]}. Skip indexing it.") + return entries, entry_to_file_map + # Recurse down each non-empty section after parsing its body, heading and ancestry for section in sections: # Skip empty sections @@ -135,7 +141,7 @@ class OrgToEntries(TextToEntries): # If first non-empty line is a heading with expected heading level if re.search(rf"^\*{{{next_heading_level}}}\s", first_non_empty_line): # Extract the section body without the heading - current_section_body = "\n".join(section.split(first_non_empty_line)[1:]) + current_section_body = "\n".join(section.split(first_non_empty_line, 1)[1:]) # Parse the section heading into current section ancestry current_section_title = first_non_empty_line[next_heading_level:].strip() current_ancestry[next_heading_level] = current_section_title diff --git a/tests/test_org_to_entries.py b/tests/test_org_to_entries.py index e8940269..a84fe6e8 100644 --- a/tests/test_org_to_entries.py +++ b/tests/test_org_to_entries.py @@ -1,5 +1,6 @@ import os import re +import time from khoj.processor.content.org_mode.org_to_entries import OrgToEntries from khoj.processor.content.text_to_entries import TextToEntries @@ -41,6 +42,35 @@ def test_configure_indexing_heading_only_entries(tmp_path): assert is_none_or_empty(entries[1]) +def test_extract_entries_when_child_headings_have_same_prefix(): + """Extract org entries from entries having child headings with same prefix. + Prevents regressions like the one fixed in PR #840. + """ + # Arrange + tmp_path = "tests/data/org/same_prefix_headings.org" + entry: str = """ +** 1 +*** 1.1 +**** 1.1.2 +""".strip() + data = { + f"{tmp_path}": entry, + } + + # Act + # Extract Entries from specified Org files + start = time.time() + entries = OrgToEntries.extract_org_entries(org_files=data, max_tokens=2) + end = time.time() + indexing_time = end - start + + # Assert + explanation_msg = ( + "It should not take more than 6 seconds to index. Entry extraction may have gone into an infinite loop." + ) + assert indexing_time < 6 * len(entries), explanation_msg + + def test_entry_split_when_exceeds_max_tokens(): "Ensure entries with compiled words exceeding max_tokens are split." # Arrange From 1baebb8d0e837014671e0f007359a91c3a1c17c2 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 6 Jul 2024 19:05:55 +0530 Subject: [PATCH 5/7] Identify markdown headings by any whitespace character after ^#+ Previously only markdown headings with space characters after # would be considered a heading. So ^##\t wouldn't be considered a valid heading --- src/khoj/processor/content/markdown/markdown_to_entries.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/khoj/processor/content/markdown/markdown_to_entries.py b/src/khoj/processor/content/markdown/markdown_to_entries.py index ae0bd822..f18e1e21 100644 --- a/src/khoj/processor/content/markdown/markdown_to_entries.py +++ b/src/khoj/processor/content/markdown/markdown_to_entries.py @@ -146,7 +146,7 @@ class MarkdownToEntries(TextToEntries): else: entry_filename = str(Path(raw_filename)) - heading = parsed_entry.splitlines()[0] if re.search("^#+\s", parsed_entry) else "" + heading = parsed_entry.splitlines()[0] if re.search(r"^#+\s", parsed_entry) else "" # Append base filename to compiled entry for context to model # Increment heading level for heading entries and make filename as its top level heading prefix = f"# {entry_filename}\n#" if heading else f"# {entry_filename}\n" From d693baccbc3451c152382e2ca0afbb4ae4ebd3ee Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Fri, 5 Jul 2024 16:07:42 +0530 Subject: [PATCH 6/7] Make it optional to set the encoder, cross-encoder configs via admin UI --- src/khoj/database/models/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/khoj/database/models/__init__.py b/src/khoj/database/models/__init__.py index 62afdd2b..096d14bc 100644 --- a/src/khoj/database/models/__init__.py +++ b/src/khoj/database/models/__init__.py @@ -215,11 +215,11 @@ class SearchModelConfig(BaseModel): # Bi-encoder model of sentence-transformer type to load from HuggingFace bi_encoder = models.CharField(max_length=200, default="thenlper/gte-small") # Config passed to the sentence-transformer model constructor. E.g. device="cuda:0", trust_remote_server=True etc. - bi_encoder_model_config = models.JSONField(default=dict) + bi_encoder_model_config = models.JSONField(default=dict, blank=True) # Query encode configs like prompt, precision, normalize_embeddings, etc. for sentence-transformer models - bi_encoder_query_encode_config = models.JSONField(default=dict) + bi_encoder_query_encode_config = models.JSONField(default=dict, blank=True) # Docs encode configs like prompt, precision, normalize_embeddings, etc. for sentence-transformer models - bi_encoder_docs_encode_config = models.JSONField(default=dict) + bi_encoder_docs_encode_config = models.JSONField(default=dict, blank=True) # Cross-encoder model of sentence-transformer type to load from HuggingFace cross_encoder = models.CharField(max_length=200, default="mixedbread-ai/mxbai-rerank-xsmall-v1") # Inference server API endpoint to use for embeddings inference. Bi-encoder model should be hosted on this server From 4a471979ebc8c5ad0306920954439516c129943d Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 6 Jul 2024 18:44:38 +0530 Subject: [PATCH 7/7] Upgrade sentence-transformer package to version 3.0.1 Add einops dependency for some sentence transformer models like the nomic-embed --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b3b4af35..de82586f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,7 +52,8 @@ dependencies = [ "pyyaml ~= 6.0", "rich >= 13.3.1", "schedule == 1.1.0", - "sentence-transformers == 2.5.1", + "sentence-transformers == 3.0.1", + "einops == 0.8.0", "transformers >= 4.28.0", "torch == 2.2.2", "uvicorn == 0.17.6",