From 58d86d7876dff846c98a5cd3fe7d08c5418f58ba Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 15 Jul 2023 12:01:37 -0700 Subject: [PATCH 01/16] Use single func to configure server via API and on server start Improve error messages on failure to configure server components --- src/khoj/configure.py | 66 +++++++++++++++++++++++++++-------------- src/khoj/main.py | 6 ++-- src/khoj/routers/api.py | 50 +++++++++++++------------------ 3 files changed, 67 insertions(+), 55 deletions(-) diff --git a/src/khoj/configure.py b/src/khoj/configure.py index 18c5ac8a..087245f8 100644 --- a/src/khoj/configure.py +++ b/src/khoj/configure.py @@ -37,23 +37,40 @@ from khoj.search_filter.file_filter import FileFilter logger = logging.getLogger(__name__) -def configure_server(args, required=False): - if args.config is None: - if required: - logger.error( - f"Exiting as Khoj is not configured.\nConfigure it via http://localhost:42110/config or by editing {state.config_file}." - ) - sys.exit(1) - else: - logger.warning( - f"Khoj is not configured.\nConfigure it via http://localhost:42110/config, plugins or by editing {state.config_file}." - ) - return - else: - state.config = args.config +def initialize_server( + config: Optional[FullConfig], regenerate: bool, type: Optional[SearchType] = None, required=False +): + if config is None and required: + logger.error( + f"🚨 Exiting as Khoj is not configured.\nConfigure it via http://localhost:42110/config or by editing {state.config_file}." + ) + sys.exit(1) + elif config is None: + logger.warning( + f"🚨 Khoj is not configured.\nConfigure it via http://localhost:42110/config, plugins or by editing {state.config_file}." 
+ ) + return None + + try: + configure_server(config, regenerate, type) + except Exception as e: + logger.error(f"🚨 Failed to configure server on app load: {e}") + raise e + + +def configure_server(config: FullConfig, regenerate: bool, search_type: Optional[SearchType] = None): + # Update Config + state.config = config # Initialize Processor from Config - state.processor_config = configure_processor(args.config.processor) + try: + state.search_index_lock.acquire() + state.processor_config = configure_processor(state.config.processor) + except Exception as e: + logger.error(f"🚨 Failed to configure processor") + raise e + finally: + state.search_index_lock.release() # Initialize Search Models from Config try: @@ -61,7 +78,8 @@ def configure_server(args, required=False): state.SearchType = configure_search_types(state.config) state.search_models = configure_search(state.search_models, state.config.search_type) except Exception as e: - logger.error(f"🚨 Error configuring search models on app load: {e}") + logger.error(f"🚨 Failed to configure search models") + raise e finally: state.search_index_lock.release() @@ -70,10 +88,11 @@ def configure_server(args, required=False): try: state.search_index_lock.acquire() state.content_index = configure_content( - state.content_index, state.config.content_type, state.search_models, args.regenerate + state.content_index, state.config.content_type, state.search_models, regenerate, search_type ) except Exception as e: - logger.error(f"🚨 Error configuring content index on app load: {e}") + logger.error(f"🚨 Failed to index content") + raise e finally: state.search_index_lock.release() @@ -118,10 +137,10 @@ def configure_search_types(config: FullConfig): return Enum("SearchType", merge_dicts(core_search_types, plugin_search_types)) -def configure_search(search_models: SearchModels, search_config: SearchConfig) -> Optional[SearchModels]: +def configure_search(search_models: SearchModels, search_config: Optional[SearchConfig]) -> 
Optional[SearchModels]: # Run Validation Checks if search_config is None: - logger.warning("🚨 No Search type is configured.") + logger.warning("🚨 No Search configuration available.") return None if search_models is None: search_models = SearchModels() @@ -147,7 +166,7 @@ def configure_content( ) -> Optional[ContentIndex]: # Run Validation Checks if content_config is None: - logger.warning("🚨 No Content type is configured.") + logger.warning("🚨 No Content configuration available.") return None if content_index is None: content_index = ContentIndex() @@ -242,9 +261,10 @@ def configure_content( return content_index -def configure_processor(processor_config: ProcessorConfig): +def configure_processor(processor_config: Optional[ProcessorConfig]): if not processor_config: - return + logger.warning("🚨 No Processor configuration available.") + return None processor = ProcessorConfigModel() diff --git a/src/khoj/main.py b/src/khoj/main.py index 1e4d407d..5b24bc05 100644 --- a/src/khoj/main.py +++ b/src/khoj/main.py @@ -27,7 +27,7 @@ from rich.logging import RichHandler import schedule # Internal Packages -from khoj.configure import configure_routes, configure_server +from khoj.configure import configure_routes, initialize_server from khoj.utils import state from khoj.utils.cli import cli from khoj.interface.desktop.main_window import MainWindow @@ -75,7 +75,7 @@ def run(): poll_task_scheduler() # Start Server - configure_server(args, required=False) + initialize_server(args.config, args.regenerate, required=False) configure_routes(app) start_server(app, host=args.host, port=args.port, socket=args.socket) else: @@ -93,7 +93,7 @@ def run(): tray.show() # Setup Server - configure_server(args, required=False) + initialize_server(args.config, args.regenerate, required=False) configure_routes(app) server = ServerThread(app, args.host, args.port, args.socket) diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py index 50e8e1f2..0269987d 100644 --- 
a/src/khoj/routers/api.py +++ b/src/khoj/routers/api.py @@ -5,20 +5,20 @@ import time import yaml import logging import json -from typing import List, Optional, Union +from typing import Iterable, List, Optional, Union # External Packages from fastapi import APIRouter, HTTPException, Header, Request from sentence_transformers import util # Internal Packages -from khoj.configure import configure_content, configure_processor, configure_search +from khoj.configure import configure_processor, configure_server from khoj.search_type import image_search, text_search from khoj.search_filter.date_filter import DateFilter from khoj.search_filter.file_filter import FileFilter from khoj.search_filter.word_filter import WordFilter from khoj.utils.config import TextSearchModel -from khoj.utils.helpers import log_telemetry, timer +from khoj.utils.helpers import timer from khoj.utils.rawconfig import ( ContentConfig, FullConfig, @@ -524,34 +524,26 @@ def update( referer: Optional[str] = Header(None), host: Optional[str] = Header(None), ): + if not state.config: + error_msg = f"🚨 Khoj is not configured.\nConfigure it via http://localhost:42110/config, plugins or by editing {state.config_file}." 
+ logger.warning(error_msg) + raise HTTPException(status_code=500, detail=error_msg) try: - state.search_index_lock.acquire() - try: - if state.config and state.config.search_type: - state.search_models = configure_search(state.search_models, state.config.search_type) - if state.search_models: - state.content_index = configure_content( - state.content_index, state.config.content_type, state.search_models, regenerate=force or False, t=t - ) - except Exception as e: - logger.error(e) - raise HTTPException(status_code=500, detail=str(e)) - finally: - state.search_index_lock.release() - except ValueError as e: - logger.error(e) - raise HTTPException(status_code=500, detail=str(e)) + configure_server(state.config, regenerate=force or False, search_type=t) + except Exception as e: + error_msg = f"🚨 Failed to update server via API: {e}" + logger.error(error_msg) + raise HTTPException(status_code=500, detail=error_msg) else: - logger.info("📬 Search index updated via API") - - try: - if state.config and state.config.processor: - state.processor_config = configure_processor(state.config.processor) - except ValueError as e: - logger.error(e) - raise HTTPException(status_code=500, detail=str(e)) - else: - logger.info("📬 Processor reconfigured via API") + components = [] + if state.search_models: + components.append("Search models") + if state.content_index: + components.append("Content index") + if state.processor_config: + components.append("Conversation processor") + components_msg = ", ".join(components) + logger.info(f"📬 {components_msg} updated via API") update_telemetry_state( request=request, From 7ad96036b0ff09f4f72056740ac9cdb51e947492 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 15 Jul 2023 12:07:07 -0700 Subject: [PATCH 02/16] Improve lock name to config_lock instead of search_index_lock It is used to lock updates to all app config state, including processor --- src/khoj/configure.py | 16 ++++++++-------- src/khoj/utils/state.py | 2 +- 2 files 
changed, 9 insertions(+), 9 deletions(-) diff --git a/src/khoj/configure.py b/src/khoj/configure.py index 087245f8..680c5417 100644 --- a/src/khoj/configure.py +++ b/src/khoj/configure.py @@ -64,29 +64,29 @@ def configure_server(config: FullConfig, regenerate: bool, search_type: Optional # Initialize Processor from Config try: - state.search_index_lock.acquire() + state.config_lock.acquire() state.processor_config = configure_processor(state.config.processor) except Exception as e: logger.error(f"🚨 Failed to configure processor") raise e finally: - state.search_index_lock.release() + state.config_lock.release() # Initialize Search Models from Config try: - state.search_index_lock.acquire() + state.config_lock.acquire() state.SearchType = configure_search_types(state.config) state.search_models = configure_search(state.search_models, state.config.search_type) except Exception as e: logger.error(f"🚨 Failed to configure search models") raise e finally: - state.search_index_lock.release() + state.config_lock.release() # Initialize Content from Config if state.search_models: try: - state.search_index_lock.acquire() + state.config_lock.acquire() state.content_index = configure_content( state.content_index, state.config.content_type, state.search_models, regenerate, search_type ) @@ -94,7 +94,7 @@ def configure_server(config: FullConfig, regenerate: bool, search_type: Optional logger.error(f"🚨 Failed to index content") raise e finally: - state.search_index_lock.release() + state.config_lock.release() def configure_routes(app): @@ -114,7 +114,7 @@ if not state.demo: @schedule.repeat(schedule.every(61).minutes) def update_search_index(): try: - state.search_index_lock.acquire() + state.config_lock.acquire() state.content_index = configure_content( state.content_index, state.config.content_type, state.search_models, regenerate=False ) @@ -122,7 +122,7 @@ if not state.demo: except Exception as e: logger.error(f"🚨 Error updating content index via Scheduler: {e}") finally: - 
state.search_index_lock.release() + state.config_lock.release() def configure_search_types(config: FullConfig): diff --git a/src/khoj/utils/state.py b/src/khoj/utils/state.py index 89688e15..40b3daae 100644 --- a/src/khoj/utils/state.py +++ b/src/khoj/utils/state.py @@ -24,7 +24,7 @@ host: str = None port: int = None cli_args: List[str] = None query_cache = LRU() -search_index_lock = threading.Lock() +config_lock = threading.Lock() SearchType = utils_config.SearchType telemetry: List[Dict[str, str]] = [] previous_query: str = None From da98b92dd46e6eabadcf5df54f81af5a0c1cf734 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 15 Jul 2023 14:33:15 -0700 Subject: [PATCH 03/16] Create helper function to test value, order of entries & embeddings This helper should be used to observe if the current embeddings are stable sorted on regenerate and incremental update of index in text search tests --- tests/test_text_search.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/test_text_search.py b/tests/test_text_search.py index c18a4c42..5809f327 100644 --- a/tests/test_text_search.py +++ b/tests/test_text_search.py @@ -5,6 +5,7 @@ import os # External Packages import pytest +import torch from khoj.utils.config import SearchModels # Internal Packages @@ -202,3 +203,25 @@ def test_asymmetric_setup_github(content_config: ContentConfig, search_models: S # Assert assert len(github_model.entries) > 1 + + +def compare_index(initial_notes_model, final_notes_model): + mismatched_entries, mismatched_embeddings = [], [] + for index in range(len(initial_notes_model.entries)): + if initial_notes_model.entries[index].to_json() != final_notes_model.entries[index].to_json(): + mismatched_entries.append(index) + + # verify new entry embedding appended to embeddings tensor, without disrupting order or content of existing embeddings + for index in range(len(initial_notes_model.corpus_embeddings)): + if not 
torch.equal(final_notes_model.corpus_embeddings[index], initial_notes_model.corpus_embeddings[index]): + mismatched_embeddings.append(index) + + error_details = "" + if mismatched_entries: + mismatched_entries_str = ",".join(map(str, mismatched_entries)) + error_details += f"Entries at {mismatched_entries_str} not equal\n" + if mismatched_embeddings: + mismatched_embeddings_str = ", ".join(map(str, mismatched_embeddings)) + error_details += f"Embeddings at {mismatched_embeddings_str} not equal\n" + + return error_details From 88d1a29a849e491f5fcd3c2b3a993d56df6d6103 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 15 Jul 2023 17:18:07 -0700 Subject: [PATCH 04/16] Test index is stable for duplicate entries across regenerate, update - Current incorrect behavior: All entries with duplicate compiled form are kept on regenerate but on update only the last of the duplicated entries is kept This divergent behavior is not ideal to prevent index corruption across reconfigure and update --- tests/test_text_search.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tests/test_text_search.py b/tests/test_text_search.py index 5809f327..3e8f7d3d 100644 --- a/tests/test_text_search.py +++ b/tests/test_text_search.py @@ -161,6 +161,40 @@ def test_asymmetric_reload(content_config: ContentConfig, search_models: SearchM content_config.org.input_files = [] +# ---------------------------------------------------------------------------------------------------- +def test_update_index_with_duplicate_entries_in_stable_order( + org_config_with_only_new_file: TextContentConfig, search_models: SearchModels +): + # Arrange + new_file_to_index = Path(org_config_with_only_new_file.input_files[0]) + + # Insert org-mode entries with same compiled form into new org file + new_entry = "* TODO A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n" + with open(new_file_to_index, "w") as f: + 
f.write(f"{new_entry}{new_entry}") + + # Act + # load embeddings, entries, notes model after adding new org-mode file + initial_index = text_search.setup( + OrgToJsonl, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=True + ) + + # update embeddings, entries, notes model after adding new org-mode file + updated_index = text_search.setup( + OrgToJsonl, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=False + ) + + # Assert + # verify only 1 entry added even if there are multiple duplicate entries + assert len(initial_index.entries) == len(updated_index.entries) == 1 + assert len(initial_index.corpus_embeddings) == len(updated_index.corpus_embeddings) == 1 + + # verify the same entry is added even when there are multiple duplicate entries + error_details = compare_index(initial_index, updated_index) + if error_details: + pytest.fail(error_details) + + # ---------------------------------------------------------------------------------------------------- def test_incremental_update(content_config: ContentConfig, search_models: SearchModels, new_org_file: Path): # Arrange From 1673bb55587e56c3e306e4dd2390632c8f75880e Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 15 Jul 2023 19:55:40 -0700 Subject: [PATCH 05/16] Add todo state to compiled form of each org-mode entry --- src/khoj/processor/org_mode/org_to_jsonl.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/khoj/processor/org_mode/org_to_jsonl.py b/src/khoj/processor/org_mode/org_to_jsonl.py index 664427d9..608fdd41 100644 --- a/src/khoj/processor/org_mode/org_to_jsonl.py +++ b/src/khoj/processor/org_mode/org_to_jsonl.py @@ -125,9 +125,13 @@ class OrgToJsonl(TextToJsonl): # Ignore title notes i.e notes with just headings and empty body continue + todo_str = f"{parsed_entry.todo} " if parsed_entry.todo else "" # Prepend filename as top heading to entry filename = Path(entry_to_file_map[parsed_entry]).stem - 
heading = f"* {filename}\n** {parsed_entry.heading}." if parsed_entry.heading else f"* {filename}." + if parsed_entry.heading: + heading = f"* {filename}\n** {todo_str}{parsed_entry.heading}." + else: + heading = f"* {filename}." compiled = heading if state.verbose > 2: From 9bcca432996fdd3366760a999f518d6681038903 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 15 Jul 2023 20:03:19 -0700 Subject: [PATCH 06/16] Use single func to handle indexing from scratch and incrementally Previous regenerate mechanism did not deduplicate entries with same key So entries looked different between regenerate and update Having single func, mark_entries_for_update, to handle both scenarios will avoid this divergence Update all text_to_jsonl methods to use the above method for generating index from scratch --- src/khoj/processor/github/github_to_jsonl.py | 3 +-- src/khoj/processor/jsonl/jsonl_to_jsonl.py | 14 ++++---------- src/khoj/processor/markdown/markdown_to_jsonl.py | 11 ++++------- src/khoj/processor/notion/notion_to_jsonl.py | 11 ++++------- src/khoj/processor/org_mode/org_to_jsonl.py | 6 ++---- src/khoj/processor/pdf/pdf_to_jsonl.py | 11 ++++------- src/khoj/processor/text_to_jsonl.py | 2 +- src/khoj/search_type/text_search.py | 8 ++++---- 8 files changed, 24 insertions(+), 42 deletions(-) diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py index dd797c31..9dbdc093 100644 --- a/src/khoj/processor/github/github_to_jsonl.py +++ b/src/khoj/processor/github/github_to_jsonl.py @@ -15,7 +15,6 @@ from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl from khoj.processor.text_to_jsonl import TextToJsonl from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data from khoj.utils.rawconfig import Entry -from khoj.utils import state logger = logging.getLogger(__name__) @@ -38,7 +37,7 @@ class GithubToJsonl(TextToJsonl): else: return - def process(self, previous_entries=None): + def process(self, 
previous_entries=[]): current_entries = [] for repo in self.config.repos: current_entries += self.process_repo(repo) diff --git a/src/khoj/processor/jsonl/jsonl_to_jsonl.py b/src/khoj/processor/jsonl/jsonl_to_jsonl.py index f743d5d5..c033f522 100644 --- a/src/khoj/processor/jsonl/jsonl_to_jsonl.py +++ b/src/khoj/processor/jsonl/jsonl_to_jsonl.py @@ -16,7 +16,7 @@ logger = logging.getLogger(__name__) class JsonlToJsonl(TextToJsonl): # Define Functions - def process(self, previous_entries=None): + def process(self, previous_entries=[]): # Extract required fields from config input_jsonl_files, input_jsonl_filter, output_file = ( self.config.input_files, @@ -38,15 +38,9 @@ class JsonlToJsonl(TextToJsonl): # Identify, mark and merge any new entries with previous entries with timer("Identify new or updated entries", logger): - if not previous_entries: - entries_with_ids = list(enumerate(current_entries)) - else: - entries_with_ids = TextToJsonl.mark_entries_for_update( - current_entries, - previous_entries, - key="compiled", - logger=logger, - ) + entries_with_ids = TextToJsonl.mark_entries_for_update( + current_entries, previous_entries, key="compiled", logger=logger + ) with timer("Write entries to JSONL file", logger): # Process Each Entry from All Notes Files diff --git a/src/khoj/processor/markdown/markdown_to_jsonl.py b/src/khoj/processor/markdown/markdown_to_jsonl.py index 21cbda72..2da5bd4c 100644 --- a/src/khoj/processor/markdown/markdown_to_jsonl.py +++ b/src/khoj/processor/markdown/markdown_to_jsonl.py @@ -23,7 +23,7 @@ class MarkdownToJsonl(TextToJsonl): self.config = config # Define Functions - def process(self, previous_entries=None): + def process(self, previous_entries=[]): # Extract required fields from config markdown_files, markdown_file_filter, output_file = ( self.config.input_files, @@ -51,12 +51,9 @@ class MarkdownToJsonl(TextToJsonl): # Identify, mark and merge any new entries with previous entries with timer("Identify new or updated entries", 
logger): - if not previous_entries: - entries_with_ids = list(enumerate(current_entries)) - else: - entries_with_ids = TextToJsonl.mark_entries_for_update( - current_entries, previous_entries, key="compiled", logger=logger - ) + entries_with_ids = TextToJsonl.mark_entries_for_update( + current_entries, previous_entries, key="compiled", logger=logger + ) with timer("Write markdown entries to JSONL file", logger): # Process Each Entry from All Notes Files diff --git a/src/khoj/processor/notion/notion_to_jsonl.py b/src/khoj/processor/notion/notion_to_jsonl.py index 20a11cd7..d4cd78f3 100644 --- a/src/khoj/processor/notion/notion_to_jsonl.py +++ b/src/khoj/processor/notion/notion_to_jsonl.py @@ -80,7 +80,7 @@ class NotionToJsonl(TextToJsonl): self.body_params = {"page_size": 100} - def process(self, previous_entries=None): + def process(self, previous_entries=[]): current_entries = [] # Get all pages @@ -240,12 +240,9 @@ class NotionToJsonl(TextToJsonl): def update_entries_with_ids(self, current_entries, previous_entries): # Identify, mark and merge any new entries with previous entries with timer("Identify new or updated entries", logger): - if not previous_entries: - entries_with_ids = list(enumerate(current_entries)) - else: - entries_with_ids = TextToJsonl.mark_entries_for_update( - current_entries, previous_entries, key="compiled", logger=logger - ) + entries_with_ids = TextToJsonl.mark_entries_for_update( + current_entries, previous_entries, key="compiled", logger=logger + ) with timer("Write Notion entries to JSONL file", logger): # Process Each Entry from all Notion entries diff --git a/src/khoj/processor/org_mode/org_to_jsonl.py b/src/khoj/processor/org_mode/org_to_jsonl.py index 608fdd41..b00a6c50 100644 --- a/src/khoj/processor/org_mode/org_to_jsonl.py +++ b/src/khoj/processor/org_mode/org_to_jsonl.py @@ -22,7 +22,7 @@ class OrgToJsonl(TextToJsonl): self.config = config # Define Functions - def process(self, previous_entries: List[Entry] = None): + def 
process(self, previous_entries: List[Entry] = []): # Extract required fields from config org_files, org_file_filter, output_file = ( self.config.input_files, @@ -51,9 +51,7 @@ class OrgToJsonl(TextToJsonl): current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256) # Identify, mark and merge any new entries with previous entries - if not previous_entries: - entries_with_ids = list(enumerate(current_entries)) - else: + with timer("Identify new or updated entries", logger): entries_with_ids = TextToJsonl.mark_entries_for_update( current_entries, previous_entries, key="compiled", logger=logger ) diff --git a/src/khoj/processor/pdf/pdf_to_jsonl.py b/src/khoj/processor/pdf/pdf_to_jsonl.py index c5c395bc..e41fd976 100644 --- a/src/khoj/processor/pdf/pdf_to_jsonl.py +++ b/src/khoj/processor/pdf/pdf_to_jsonl.py @@ -19,7 +19,7 @@ logger = logging.getLogger(__name__) class PdfToJsonl(TextToJsonl): # Define Functions - def process(self, previous_entries=None): + def process(self, previous_entries=[]): # Extract required fields from config pdf_files, pdf_file_filter, output_file = ( self.config.input_files, @@ -45,12 +45,9 @@ class PdfToJsonl(TextToJsonl): # Identify, mark and merge any new entries with previous entries with timer("Identify new or updated entries", logger): - if not previous_entries: - entries_with_ids = list(enumerate(current_entries)) - else: - entries_with_ids = TextToJsonl.mark_entries_for_update( - current_entries, previous_entries, key="compiled", logger=logger - ) + entries_with_ids = TextToJsonl.mark_entries_for_update( + current_entries, previous_entries, key="compiled", logger=logger + ) with timer("Write PDF entries to JSONL file", logger): # Process Each Entry from All Notes Files diff --git a/src/khoj/processor/text_to_jsonl.py b/src/khoj/processor/text_to_jsonl.py index a4d01cf5..ff169fac 100644 --- a/src/khoj/processor/text_to_jsonl.py +++ b/src/khoj/processor/text_to_jsonl.py @@ -17,7 +17,7 @@ class TextToJsonl(ABC): 
self.config = config @abstractmethod - def process(self, previous_entries: List[Entry] = None) -> List[Tuple[int, Entry]]: + def process(self, previous_entries: List[Entry] = []) -> List[Tuple[int, Entry]]: ... @staticmethod diff --git a/src/khoj/search_type/text_search.py b/src/khoj/search_type/text_search.py index a77be6e1..edc735f2 100644 --- a/src/khoj/search_type/text_search.py +++ b/src/khoj/search_type/text_search.py @@ -176,10 +176,10 @@ def setup( ) -> TextContent: # Map notes in text files to (compressed) JSONL formatted file config.compressed_jsonl = resolve_absolute_path(config.compressed_jsonl) - previous_entries = ( - extract_entries(config.compressed_jsonl) if config.compressed_jsonl.exists() and not regenerate else None - ) - entries_with_indices = text_to_jsonl(config).process(previous_entries or []) + previous_entries = [] + if config.compressed_jsonl.exists() and not regenerate: + previous_entries = extract_entries(config.compressed_jsonl) + entries_with_indices = text_to_jsonl(config).process(previous_entries) # Extract Updated Entries entries = extract_entries(config.compressed_jsonl) From 6e70b914c2e8792dc7c8fb44d555b4864dcc691c Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 15 Jul 2023 20:42:26 -0700 Subject: [PATCH 07/16] Remove unused dump_jsonl method The entries index is stored in gzipped jsonl files for each content type --- src/khoj/processor/github/github_to_jsonl.py | 7 ++----- src/khoj/processor/jsonl/jsonl_to_jsonl.py | 7 ++----- src/khoj/processor/markdown/markdown_to_jsonl.py | 7 ++----- src/khoj/processor/notion/notion_to_jsonl.py | 7 ++----- src/khoj/processor/org_mode/org_to_jsonl.py | 7 ++----- src/khoj/processor/pdf/pdf_to_jsonl.py | 7 ++----- src/khoj/utils/jsonl.py | 13 +------------ tests/conftest.py | 6 +++--- 8 files changed, 16 insertions(+), 45 deletions(-) diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py index 9dbdc093..91dbd6da 100644 --- 
a/src/khoj/processor/github/github_to_jsonl.py +++ b/src/khoj/processor/github/github_to_jsonl.py @@ -13,7 +13,7 @@ from khoj.utils.rawconfig import Entry, GithubContentConfig, GithubRepoConfig from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl from khoj.processor.text_to_jsonl import TextToJsonl -from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data +from khoj.utils.jsonl import compress_jsonl_data from khoj.utils.rawconfig import Entry @@ -97,10 +97,7 @@ class GithubToJsonl(TextToJsonl): jsonl_data = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries) # Compress JSONL formatted Data - if self.config.compressed_jsonl.suffix == ".gz": - compress_jsonl_data(jsonl_data, self.config.compressed_jsonl) - elif self.config.compressed_jsonl.suffix == ".jsonl": - dump_jsonl(jsonl_data, self.config.compressed_jsonl) + compress_jsonl_data(jsonl_data, self.config.compressed_jsonl) return entries_with_ids diff --git a/src/khoj/processor/jsonl/jsonl_to_jsonl.py b/src/khoj/processor/jsonl/jsonl_to_jsonl.py index c033f522..3c824545 100644 --- a/src/khoj/processor/jsonl/jsonl_to_jsonl.py +++ b/src/khoj/processor/jsonl/jsonl_to_jsonl.py @@ -7,7 +7,7 @@ from typing import List # Internal Packages from khoj.processor.text_to_jsonl import TextToJsonl from khoj.utils.helpers import get_absolute_path, timer -from khoj.utils.jsonl import load_jsonl, dump_jsonl, compress_jsonl_data +from khoj.utils.jsonl import load_jsonl, compress_jsonl_data from khoj.utils.rawconfig import Entry @@ -48,10 +48,7 @@ class JsonlToJsonl(TextToJsonl): jsonl_data = JsonlToJsonl.convert_entries_to_jsonl(entries) # Compress JSONL formatted Data - if output_file.suffix == ".gz": - compress_jsonl_data(jsonl_data, output_file) - elif output_file.suffix == ".jsonl": - dump_jsonl(jsonl_data, output_file) + compress_jsonl_data(jsonl_data, output_file) return entries_with_ids diff --git 
a/src/khoj/processor/markdown/markdown_to_jsonl.py b/src/khoj/processor/markdown/markdown_to_jsonl.py index 2da5bd4c..b6acbfbb 100644 --- a/src/khoj/processor/markdown/markdown_to_jsonl.py +++ b/src/khoj/processor/markdown/markdown_to_jsonl.py @@ -10,7 +10,7 @@ from typing import List from khoj.processor.text_to_jsonl import TextToJsonl from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer from khoj.utils.constants import empty_escape_sequences -from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data +from khoj.utils.jsonl import compress_jsonl_data from khoj.utils.rawconfig import Entry, TextContentConfig @@ -61,10 +61,7 @@ class MarkdownToJsonl(TextToJsonl): jsonl_data = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries) # Compress JSONL formatted Data - if output_file.suffix == ".gz": - compress_jsonl_data(jsonl_data, output_file) - elif output_file.suffix == ".jsonl": - dump_jsonl(jsonl_data, output_file) + compress_jsonl_data(jsonl_data, output_file) return entries_with_ids diff --git a/src/khoj/processor/notion/notion_to_jsonl.py b/src/khoj/processor/notion/notion_to_jsonl.py index d4cd78f3..489f0341 100644 --- a/src/khoj/processor/notion/notion_to_jsonl.py +++ b/src/khoj/processor/notion/notion_to_jsonl.py @@ -8,7 +8,7 @@ import requests from khoj.utils.helpers import timer from khoj.utils.rawconfig import Entry, NotionContentConfig from khoj.processor.text_to_jsonl import TextToJsonl -from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data +from khoj.utils.jsonl import compress_jsonl_data from khoj.utils.rawconfig import Entry from enum import Enum @@ -250,9 +250,6 @@ class NotionToJsonl(TextToJsonl): jsonl_data = TextToJsonl.convert_text_maps_to_jsonl(entries) # Compress JSONL formatted Data - if self.config.compressed_jsonl.suffix == ".gz": - compress_jsonl_data(jsonl_data, self.config.compressed_jsonl) - elif self.config.compressed_jsonl.suffix == ".jsonl": - dump_jsonl(jsonl_data, self.config.compressed_jsonl) + 
compress_jsonl_data(jsonl_data, self.config.compressed_jsonl) return entries_with_ids diff --git a/src/khoj/processor/org_mode/org_to_jsonl.py b/src/khoj/processor/org_mode/org_to_jsonl.py index b00a6c50..b3bc06fd 100644 --- a/src/khoj/processor/org_mode/org_to_jsonl.py +++ b/src/khoj/processor/org_mode/org_to_jsonl.py @@ -8,7 +8,7 @@ from typing import Iterable, List from khoj.processor.org_mode import orgnode from khoj.processor.text_to_jsonl import TextToJsonl from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer -from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data +from khoj.utils.jsonl import compress_jsonl_data from khoj.utils.rawconfig import Entry, TextContentConfig from khoj.utils import state @@ -62,10 +62,7 @@ class OrgToJsonl(TextToJsonl): jsonl_data = self.convert_org_entries_to_jsonl(entries) # Compress JSONL formatted Data - if output_file.suffix == ".gz": - compress_jsonl_data(jsonl_data, output_file) - elif output_file.suffix == ".jsonl": - dump_jsonl(jsonl_data, output_file) + compress_jsonl_data(jsonl_data, output_file) return entries_with_ids diff --git a/src/khoj/processor/pdf/pdf_to_jsonl.py b/src/khoj/processor/pdf/pdf_to_jsonl.py index e41fd976..f8a20692 100644 --- a/src/khoj/processor/pdf/pdf_to_jsonl.py +++ b/src/khoj/processor/pdf/pdf_to_jsonl.py @@ -10,7 +10,7 @@ from langchain.document_loaders import PyPDFLoader # Internal Packages from khoj.processor.text_to_jsonl import TextToJsonl from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer -from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data +from khoj.utils.jsonl import compress_jsonl_data from khoj.utils.rawconfig import Entry @@ -55,10 +55,7 @@ class PdfToJsonl(TextToJsonl): jsonl_data = PdfToJsonl.convert_pdf_maps_to_jsonl(entries) # Compress JSONL formatted Data - if output_file.suffix == ".gz": - compress_jsonl_data(jsonl_data, output_file) - elif output_file.suffix == ".jsonl": - dump_jsonl(jsonl_data, output_file) + 
compress_jsonl_data(jsonl_data, output_file) return entries_with_ids diff --git a/src/khoj/utils/jsonl.py b/src/khoj/utils/jsonl.py index c9576810..ed779e79 100644 --- a/src/khoj/utils/jsonl.py +++ b/src/khoj/utils/jsonl.py @@ -20,7 +20,7 @@ def load_jsonl(input_path): # Open JSONL file if input_path.suffix == ".gz": jsonl_file = gzip.open(get_absolute_path(input_path), "rt", encoding="utf-8") - elif input_path.suffix == ".jsonl": + else: jsonl_file = open(get_absolute_path(input_path), "r", encoding="utf-8") # Read JSONL file @@ -36,17 +36,6 @@ def load_jsonl(input_path): return data -def dump_jsonl(jsonl_data, output_path): - "Write List of JSON objects to JSON line file" - # Create output directory, if it doesn't exist - output_path.parent.mkdir(parents=True, exist_ok=True) - - with open(output_path, "w", encoding="utf-8") as f: - f.write(jsonl_data) - - logger.debug(f"Wrote jsonl data to {output_path}") - - def compress_jsonl_data(jsonl_data, output_path): # Create output directory, if it doesn't exist output_path.parent.mkdir(parents=True, exist_ok=True) diff --git a/tests/conftest.py b/tests/conftest.py index a92d33ca..07c5156f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -90,7 +90,7 @@ def content_config(tmp_path_factory, search_models: SearchModels, search_config: content_config.org = TextContentConfig( input_files=None, input_filter=["tests/data/org/*.org"], - compressed_jsonl=content_dir.joinpath("notes.jsonl"), + compressed_jsonl=content_dir.joinpath("notes.jsonl.gz"), embeddings_file=content_dir.joinpath("note_embeddings.pt"), ) @@ -101,7 +101,7 @@ def content_config(tmp_path_factory, search_models: SearchModels, search_config: content_config.plugins = { "plugin1": TextContentConfig( - input_files=[content_dir.joinpath("notes.jsonl")], + input_files=[content_dir.joinpath("notes.jsonl.gz")], input_filter=None, compressed_jsonl=content_dir.joinpath("plugin.jsonl.gz"), embeddings_file=content_dir.joinpath("plugin_embeddings.pt"), @@ -142,7 
+142,7 @@ def md_content_config(tmp_path_factory): content_config.markdown = TextContentConfig( input_files=None, input_filter=["tests/data/markdown/*.markdown"], - compressed_jsonl=content_dir.joinpath("markdown.jsonl"), + compressed_jsonl=content_dir.joinpath("markdown.jsonl.gz"), embeddings_file=content_dir.joinpath("markdown_embeddings.pt"), ) From 7669b85da648ec02be508685a35d78bbdde4ff1d Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 16 Jul 2023 00:47:11 -0700 Subject: [PATCH 08/16] Test index is stable sorted on regenerate with new entry --- tests/test_text_search.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/tests/test_text_search.py b/tests/test_text_search.py index 3e8f7d3d..9a692cf9 100644 --- a/tests/test_text_search.py +++ b/tests/test_text_search.py @@ -122,7 +122,9 @@ def test_entry_chunking_by_max_tokens(org_config_with_only_new_file: TextContent # ---------------------------------------------------------------------------------------------------- -def test_asymmetric_reload(content_config: ContentConfig, search_models: SearchModels, new_org_file: Path): +def test_regenerate_index_with_new_entry( + content_config: ContentConfig, search_models: SearchModels, new_org_file: Path +): # Arrange initial_notes_model = text_search.setup( OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=True @@ -136,25 +138,20 @@ def test_asymmetric_reload(content_config: ContentConfig, search_models: SearchM with open(new_org_file, "w") as f: f.write("\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n") + # Act # regenerate notes jsonl, model embeddings and model to include entry from new file regenerated_notes_model = text_search.setup( OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=True ) - # Act - # reload embeddings, entries, notes model from previously generated notes jsonl and model embeddings 
files - initial_notes_model = text_search.setup( - OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=False - ) - # Assert assert len(regenerated_notes_model.entries) == 11 assert len(regenerated_notes_model.corpus_embeddings) == 11 - # Assert - # verify new entry loaded from updated embeddings, entries - assert len(initial_notes_model.entries) == 11 - assert len(initial_notes_model.corpus_embeddings) == 11 + # verify new entry appended to index, without disrupting order or content of existing entries + error_details = compare_index(initial_notes_model, regenerated_notes_model) + if error_details: + pytest.fail(error_details, False) # Cleanup # reset input_files in config to empty list From 6a0297cc86b898646d406d2a4fae732539a04658 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 15 Jul 2023 23:58:13 -0700 Subject: [PATCH 09/16] Stable sort new entries when marking entries for update --- src/khoj/processor/text_to_jsonl.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/khoj/processor/text_to_jsonl.py b/src/khoj/processor/text_to_jsonl.py index ff169fac..f92ab7b1 100644 --- a/src/khoj/processor/text_to_jsonl.py +++ b/src/khoj/processor/text_to_jsonl.py @@ -78,16 +78,23 @@ class TextToJsonl(ABC): # All entries that exist in both current and previous sets are kept existing_entry_hashes = set(current_entry_hashes) & set(previous_entry_hashes) + # load new entries in the order in which they are processed for a stable sort + new_entries = [ + (current_entry_hashes.index(entry_hash), hash_to_current_entries[entry_hash]) + for entry_hash in new_entry_hashes + ] + new_entries_sorted = sorted(new_entries, key=lambda e: e[0]) # Mark new entries with -1 id to flag for later embeddings generation - new_entries = [(-1, hash_to_current_entries[entry_hash]) for entry_hash in new_entry_hashes] + new_entries_sorted = [(-1, entry[1]) for entry in new_entries_sorted] + # Set id of existing entries to 
their previous ids to reuse their existing encoded embeddings existing_entries = [ (previous_entry_hashes.index(entry_hash), hash_to_previous_entries[entry_hash]) for entry_hash in existing_entry_hashes ] - existing_entries_sorted = sorted(existing_entries, key=lambda e: e[0]) - entries_with_ids = existing_entries_sorted + new_entries + + entries_with_ids = existing_entries_sorted + new_entries_sorted return entries_with_ids From 89c7819cb79cd2a51a73c611f83e51916be24337 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 16 Jul 2023 00:22:14 -0700 Subject: [PATCH 10/16] Unify logic to generate embeddings from scratch and incrementally This simplifies the `compute_embeddings' method and avoids potential later divergence in handling the index regenerate vs update scenarios --- src/khoj/search_type/text_search.py | 40 +++++++++++++---------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/src/khoj/search_type/text_search.py b/src/khoj/search_type/text_search.py index edc735f2..ed3be33c 100644 --- a/src/khoj/search_type/text_search.py +++ b/src/khoj/search_type/text_search.py @@ -62,36 +62,32 @@ def compute_embeddings( ): "Compute (and Save) Embeddings or Load Pre-Computed Embeddings" new_entries = [] + create_index_msg = "" # Load pre-computed embeddings from file if exists and update them if required if embeddings_file.exists() and not regenerate: corpus_embeddings: torch.Tensor = torch.load(get_absolute_path(embeddings_file), map_location=state.device) logger.debug(f"Loaded {len(corpus_embeddings)} text embeddings from {embeddings_file}") - - # Encode any new entries in the corpus and update corpus embeddings - new_entries = [entry.compiled for id, entry in entries_with_ids if id == -1] - if new_entries: - logger.info(f"📩 Indexing {len(new_entries)} text entries.") - new_embeddings = bi_encoder.encode( - new_entries, convert_to_tensor=True, device=state.device, show_progress_bar=True - ) - existing_entry_ids = [id for id, _ in 
entries_with_ids if id != -1] - if existing_entry_ids: - existing_embeddings = torch.index_select( - corpus_embeddings, 0, torch.tensor(existing_entry_ids, device=state.device) - ) - else: - existing_embeddings = torch.tensor([], device=state.device) - corpus_embeddings = torch.cat([existing_embeddings, new_embeddings], dim=0) - # Else compute the corpus embeddings from scratch else: - new_entries = [entry.compiled for _, entry in entries_with_ids] - logger.info(f"📩 Indexing {len(new_entries)} text entries. Creating index from scratch.") - corpus_embeddings = bi_encoder.encode( + corpus_embeddings = torch.tensor([], device=state.device) + create_index_msg = " Creating index from scratch." + + # Encode any new entries in the corpus and update corpus embeddings + new_entries = [entry.compiled for id, entry in entries_with_ids if id == -1] + if new_entries: + logger.info(f"📩 Indexing {len(new_entries)} text entries.{create_index_msg}") + new_embeddings = bi_encoder.encode( new_entries, convert_to_tensor=True, device=state.device, show_progress_bar=True ) + existing_entry_ids = [id for id, _ in entries_with_ids if id != -1] + if existing_entry_ids: + existing_embeddings = torch.index_select( + corpus_embeddings, 0, torch.tensor(existing_entry_ids, device=state.device) + ) + else: + existing_embeddings = torch.tensor([], device=state.device) + corpus_embeddings = torch.cat([existing_embeddings, new_embeddings], dim=0) - # Save regenerated or updated embeddings to file - if new_entries: + # Save regenerated or updated embeddings to file corpus_embeddings = util.normalize_embeddings(corpus_embeddings) torch.save(corpus_embeddings, embeddings_file) logger.info(f"📩 Saved computed text embeddings to {embeddings_file}") From b02323ade6713b97857c7073f9ccee94aa5e1303 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 16 Jul 2023 01:23:22 -0700 Subject: [PATCH 11/16] Improve name of text search test functions Asymmetric was older name used to differentiate between 
symmetric, asymmetric search. Now that text search just uses asymmetric search stick to simpler name --- tests/test_text_search.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_text_search.py b/tests/test_text_search.py index 9a692cf9..bdd1c5c4 100644 --- a/tests/test_text_search.py +++ b/tests/test_text_search.py @@ -18,7 +18,7 @@ from khoj.processor.github.github_to_jsonl import GithubToJsonl # Test # ---------------------------------------------------------------------------------------------------- -def test_asymmetric_setup_with_missing_file_raises_error( +def test_text_search_setup_with_missing_file_raises_error( org_config_with_only_new_file: TextContentConfig, search_config: SearchConfig ): # Arrange @@ -33,7 +33,7 @@ def test_asymmetric_setup_with_missing_file_raises_error( # ---------------------------------------------------------------------------------------------------- -def test_asymmetric_setup_with_empty_file_raises_error( +def test_text_search_setup_with_empty_file_raises_error( org_config_with_only_new_file: TextContentConfig, search_config: SearchConfig ): # Act @@ -43,7 +43,7 @@ def test_asymmetric_setup_with_empty_file_raises_error( # ---------------------------------------------------------------------------------------------------- -def test_asymmetric_setup(content_config: ContentConfig, search_models: SearchModels): +def test_text_search_setup(content_config: ContentConfig, search_models: SearchModels): # Act # Regenerate notes embeddings during asymmetric setup notes_model = text_search.setup( @@ -56,7 +56,7 @@ def test_asymmetric_setup(content_config: ContentConfig, search_models: SearchMo # ---------------------------------------------------------------------------------------------------- -def test_text_content_index_only_updates_on_changes(content_config: ContentConfig, search_models: SearchModels, caplog): +def test_text_index_same_if_content_unchanged(content_config: ContentConfig, 
search_models: SearchModels, caplog): # Arrange caplog.set_level(logging.INFO, logger="khoj") @@ -193,7 +193,7 @@ def test_update_index_with_duplicate_entries_in_stable_order( # ---------------------------------------------------------------------------------------------------- -def test_incremental_update(content_config: ContentConfig, search_models: SearchModels, new_org_file: Path): +def test_update_index_with_new_entry(content_config: ContentConfig, search_models: SearchModels, new_org_file: Path): # Arrange initial_notes_model = text_search.setup( OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=True From 1482fd4d4d6e14008871bea185964754d1092706 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 16 Jul 2023 01:24:03 -0700 Subject: [PATCH 12/16] Test index is stable sorted on incremental update with new entry Ensure order of new embedding, entry insertion on incremental update is stable --- tests/test_text_search.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/tests/test_text_search.py b/tests/test_text_search.py index bdd1c5c4..6496a80d 100644 --- a/tests/test_text_search.py +++ b/tests/test_text_search.py @@ -199,24 +199,27 @@ def test_update_index_with_new_entry(content_config: ContentConfig, search_model OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=True ) - assert len(initial_notes_model.entries) == 10 - assert len(initial_notes_model.corpus_embeddings) == 10 - # append org-mode entry to first org input file in config with open(new_org_file, "w") as f: - f.write("\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n") + new_entry = "\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n" + f.write(new_entry) # Act # update embeddings, entries with the newly added note content_config.org.input_files = [f"{new_org_file}"] - initial_notes_model = 
text_search.setup( + final_notes_model = text_search.setup( OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=False ) # Assert - # verify new entry added in updated embeddings, entries - assert len(initial_notes_model.entries) == 11 - assert len(initial_notes_model.corpus_embeddings) == 11 + assert len(final_notes_model.entries) == len(initial_notes_model.entries) + 1 + assert len(final_notes_model.corpus_embeddings) == len(initial_notes_model.corpus_embeddings) + 1 + + # verify new entry appended to index, without disrupting order or content of existing entries + error_details = compare_index(initial_notes_model, final_notes_model) + if error_details: + # fails at embeddings index 4, 7. These are not swapped with the new entry embedding or each other + pytest.fail(error_details, False) # Cleanup # reset input_files in config to empty list From ad41ef39918120463f6af7e53de0a5b50a1a8230 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 16 Jul 2023 02:16:33 -0700 Subject: [PATCH 13/16] Make normalizing embeddings configurable --- src/khoj/search_type/text_search.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/khoj/search_type/text_search.py b/src/khoj/search_type/text_search.py index ed3be33c..c123974a 100644 --- a/src/khoj/search_type/text_search.py +++ b/src/khoj/search_type/text_search.py @@ -58,7 +58,11 @@ def extract_entries(jsonl_file) -> List[Entry]: def compute_embeddings( - entries_with_ids: List[Tuple[int, Entry]], bi_encoder: BaseEncoder, embeddings_file: Path, regenerate=False + entries_with_ids: List[Tuple[int, Entry]], + bi_encoder: BaseEncoder, + embeddings_file: Path, + regenerate=False, + normalize=True, ): "Compute (and Save) Embeddings or Load Pre-Computed Embeddings" new_entries = [] @@ -87,8 +91,11 @@ def compute_embeddings( existing_embeddings = torch.tensor([], device=state.device) corpus_embeddings = torch.cat([existing_embeddings, new_embeddings], dim=0) + 
if normalize: + # Normalize embeddings for faster lookup via dot product when querying + corpus_embeddings = util.normalize_embeddings(corpus_embeddings) + # Save regenerated or updated embeddings to file - corpus_embeddings = util.normalize_embeddings(corpus_embeddings) torch.save(corpus_embeddings, embeddings_file) logger.info(f"📩 Saved computed text embeddings to {embeddings_file}") @@ -169,6 +176,7 @@ def setup( bi_encoder: BaseEncoder, regenerate: bool, filters: List[BaseFilter] = [], + normalize: bool = True, ) -> TextContent: # Map notes in text files to (compressed) JSONL formatted file config.compressed_jsonl = resolve_absolute_path(config.compressed_jsonl) @@ -186,7 +194,7 @@ def setup( # Compute or Load Embeddings config.embeddings_file = resolve_absolute_path(config.embeddings_file) corpus_embeddings = compute_embeddings( - entries_with_indices, bi_encoder, config.embeddings_file, regenerate=regenerate + entries_with_indices, bi_encoder, config.embeddings_file, regenerate=regenerate, normalize=normalize ) for filter in filters: From c73feebf257442325d193e47645901271d436a2f Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 16 Jul 2023 02:16:58 -0700 Subject: [PATCH 14/16] Test index embeddings are stable on incremental update & no norm Ensure order of new embedding insertion on incremental update does not affect the order and value of existing embeddings when normalization is turned off --- tests/test_text_search.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/test_text_search.py b/tests/test_text_search.py index 6496a80d..830b0da5 100644 --- a/tests/test_text_search.py +++ b/tests/test_text_search.py @@ -196,7 +196,7 @@ def test_update_index_with_duplicate_entries_in_stable_order( def test_update_index_with_new_entry(content_config: ContentConfig, search_models: SearchModels, new_org_file: Path): # Arrange initial_notes_model = text_search.setup( - OrgToJsonl, content_config.org, 
search_models.text_search.bi_encoder, regenerate=True + OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=True, normalize=False ) # append org-mode entry to first org input file in config @@ -208,7 +208,7 @@ def test_update_index_with_new_entry(content_config: ContentConfig, search_model # update embeddings, entries with the newly added note content_config.org.input_files = [f"{new_org_file}"] final_notes_model = text_search.setup( - OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=False + OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=False, normalize=False ) # Assert @@ -218,7 +218,6 @@ def test_update_index_with_new_entry(content_config: ContentConfig, search_model # verify new entry appended to index, without disrupting order or content of existing entries error_details = compare_index(initial_notes_model, final_notes_model) if error_details: - # fails at embeddings index 4, 7. These are not swapped with the new entry embedding or each other pytest.fail(error_details, False) # Cleanup From ef6a0044f45aa26b89f40415d1fb19006799a35a Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 16 Jul 2023 03:47:05 -0700 Subject: [PATCH 15/16] Drop embeddings of deleted text entries from index Previously the deleted embeddings would continue to be in the index, even after the entry was deleted --- src/khoj/search_type/text_search.py | 32 ++++++++++++----------- tests/test_text_search.py | 39 +++++++++++++++++++++++++++-- 2 files changed, 54 insertions(+), 17 deletions(-) diff --git a/src/khoj/search_type/text_search.py b/src/khoj/search_type/text_search.py index c123974a..09174186 100644 --- a/src/khoj/search_type/text_search.py +++ b/src/khoj/search_type/text_search.py @@ -65,7 +65,8 @@ def compute_embeddings( normalize=True, ): "Compute (and Save) Embeddings or Load Pre-Computed Embeddings" - new_entries = [] + new_embeddings = torch.tensor([], device=state.device) 
+ existing_embeddings = torch.tensor([], device=state.device) create_index_msg = "" # Load pre-computed embeddings from file if exists and update them if required if embeddings_file.exists() and not regenerate: @@ -82,22 +83,23 @@ def compute_embeddings( new_embeddings = bi_encoder.encode( new_entries, convert_to_tensor=True, device=state.device, show_progress_bar=True ) - existing_entry_ids = [id for id, _ in entries_with_ids if id != -1] - if existing_entry_ids: - existing_embeddings = torch.index_select( - corpus_embeddings, 0, torch.tensor(existing_entry_ids, device=state.device) - ) - else: - existing_embeddings = torch.tensor([], device=state.device) - corpus_embeddings = torch.cat([existing_embeddings, new_embeddings], dim=0) - if normalize: - # Normalize embeddings for faster lookup via dot product when querying - corpus_embeddings = util.normalize_embeddings(corpus_embeddings) + # Extract existing embeddings from previous corpus embeddings + existing_entry_ids = [id for id, _ in entries_with_ids if id != -1] + if existing_entry_ids: + existing_embeddings = torch.index_select( + corpus_embeddings, 0, torch.tensor(existing_entry_ids, device=state.device) + ) - # Save regenerated or updated embeddings to file - torch.save(corpus_embeddings, embeddings_file) - logger.info(f"📩 Saved computed text embeddings to {embeddings_file}") + # Set corpus embeddings to merger of existing and new embeddings + corpus_embeddings = torch.cat([existing_embeddings, new_embeddings], dim=0) + if normalize: + # Normalize embeddings for faster lookup via dot product when querying + corpus_embeddings = util.normalize_embeddings(corpus_embeddings) + + # Save regenerated or updated embeddings to file + torch.save(corpus_embeddings, embeddings_file) + logger.info(f"📩 Saved computed text embeddings to {embeddings_file}") return corpus_embeddings diff --git a/tests/test_text_search.py b/tests/test_text_search.py index 830b0da5..1ae7e770 100644 --- a/tests/test_text_search.py +++ 
b/tests/test_text_search.py @@ -71,8 +71,8 @@ def test_text_index_same_if_content_unchanged(content_config: ContentConfig, sea final_logs = caplog.text # Assert - assert "📩 Saved computed text embeddings to" in initial_logs - assert "📩 Saved computed text embeddings to" not in final_logs + assert "Creating index from scratch." in initial_logs + assert "Creating index from scratch." not in final_logs # ---------------------------------------------------------------------------------------------------- @@ -192,6 +192,41 @@ def test_update_index_with_duplicate_entries_in_stable_order( pytest.fail(error_details) +# ---------------------------------------------------------------------------------------------------- +def test_update_index_with_deleted_entry(org_config_with_only_new_file: TextContentConfig, search_models: SearchModels): + # Arrange + new_file_to_index = Path(org_config_with_only_new_file.input_files[0]) + + # Insert org-mode entries with same compiled form into new org file + new_entry = "* TODO A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n" + with open(new_file_to_index, "w") as f: + f.write(f"{new_entry}{new_entry} -- Tatooine") + + # load embeddings, entries, notes model after adding new org file with 2 entries + initial_index = text_search.setup( + OrgToJsonl, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=True + ) + + # update embeddings, entries, notes model after removing an entry from the org file + with open(new_file_to_index, "w") as f: + f.write(f"{new_entry}") + + # Act + updated_index = text_search.setup( + OrgToJsonl, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=False + ) + + # Assert + # verify only 1 entry added even if there are multiple duplicate entries + assert len(initial_index.entries) == len(updated_index.entries) + 1 + assert len(initial_index.corpus_embeddings) == len(updated_index.corpus_embeddings) + 1 + + # verify 
the same entry is added even when there are multiple duplicate entries + error_details = compare_index(updated_index, initial_index) + if error_details: + pytest.fail(error_details) + + # ---------------------------------------------------------------------------------------------------- def test_update_index_with_new_entry(content_config: ContentConfig, search_models: SearchModels, new_org_file: Path): # Arrange From 3e3a1ecbc87c43ff20945b7233671cc37dd48746 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Mon, 17 Jul 2023 14:33:02 -0700 Subject: [PATCH 16/16] Start app even if server init fails to let user fix it Show stacktrace on error to help debugging --- src/khoj/configure.py | 3 +-- src/khoj/routers/api.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/khoj/configure.py b/src/khoj/configure.py index 680c5417..74f7bd9b 100644 --- a/src/khoj/configure.py +++ b/src/khoj/configure.py @@ -54,8 +54,7 @@ def initialize_server( try: configure_server(config, regenerate, type) except Exception as e: - logger.error(f"🚨 Failed to configure server on app load: {e}") - raise e + logger.error(f"🚨 Failed to configure server on app load: {e}", exc_info=True) def configure_server(config: FullConfig, regenerate: bool, search_type: Optional[SearchType] = None): diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py index 0269987d..834e8997 100644 --- a/src/khoj/routers/api.py +++ b/src/khoj/routers/api.py @@ -532,7 +532,7 @@ def update( configure_server(state.config, regenerate=force or False, search_type=t) except Exception as e: error_msg = f"🚨 Failed to update server via API: {e}" - logger.error(error_msg) + logger.error(error_msg, exc_info=True) raise HTTPException(status_code=500, detail=error_msg) else: components = []