From 58d86d7876dff846c98a5cd3fe7d08c5418f58ba Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sat, 15 Jul 2023 12:01:37 -0700
Subject: [PATCH 01/16] Use single func to configure server via API and on
 server start

Improve error messages on failure to configure server components
---
 src/khoj/configure.py   | 66 +++++++++++++++++++++++++++--------------
 src/khoj/main.py        |  6 ++--
 src/khoj/routers/api.py | 50 +++++++++++++------------------
 3 files changed, 67 insertions(+), 55 deletions(-)

diff --git a/src/khoj/configure.py b/src/khoj/configure.py
index 18c5ac8a..087245f8 100644
--- a/src/khoj/configure.py
+++ b/src/khoj/configure.py
@@ -37,23 +37,40 @@ from khoj.search_filter.file_filter import FileFilter
 logger = logging.getLogger(__name__)
 
 
-def configure_server(args, required=False):
-    if args.config is None:
-        if required:
-            logger.error(
-                f"Exiting as Khoj is not configured.\nConfigure it via http://localhost:42110/config or by editing {state.config_file}."
-            )
-            sys.exit(1)
-        else:
-            logger.warning(
-                f"Khoj is not configured.\nConfigure it via http://localhost:42110/config, plugins or by editing {state.config_file}."
-            )
-            return
-    else:
-        state.config = args.config
+def initialize_server(
+    config: Optional[FullConfig], regenerate: bool, type: Optional[SearchType] = None, required=False
+):
+    if config is None and required:
+        logger.error(
+            f"🚨 Exiting as Khoj is not configured.\nConfigure it via http://localhost:42110/config or by editing {state.config_file}."
+        )
+        sys.exit(1)
+    elif config is None:
+        logger.warning(
+            f"🚨 Khoj is not configured.\nConfigure it via http://localhost:42110/config, plugins or by editing {state.config_file}."
+        )
+        return None
+
+    try:
+        configure_server(config, regenerate, type)
+    except Exception as e:
+        logger.error(f"🚨 Failed to configure server on app load: {e}")
+        raise e
+
+
+def configure_server(config: FullConfig, regenerate: bool, search_type: Optional[SearchType] = None):
+    # Update Config
+    state.config = config
 
     # Initialize Processor from Config
-    state.processor_config = configure_processor(args.config.processor)
+    try:
+        state.search_index_lock.acquire()
+        state.processor_config = configure_processor(state.config.processor)
+    except Exception as e:
+        logger.error(f"🚨 Failed to configure processor")
+        raise e
+    finally:
+        state.search_index_lock.release()
 
     # Initialize Search Models from Config
     try:
@@ -61,7 +78,8 @@ def configure_server(args, required=False):
         state.SearchType = configure_search_types(state.config)
         state.search_models = configure_search(state.search_models, state.config.search_type)
     except Exception as e:
-        logger.error(f"🚨 Error configuring search models on app load: {e}")
+        logger.error(f"🚨 Failed to configure search models")
+        raise e
     finally:
         state.search_index_lock.release()
 
@@ -70,10 +88,11 @@ def configure_server(args, required=False):
         try:
             state.search_index_lock.acquire()
             state.content_index = configure_content(
-                state.content_index, state.config.content_type, state.search_models, args.regenerate
+                state.content_index, state.config.content_type, state.search_models, regenerate, search_type
             )
         except Exception as e:
-            logger.error(f"🚨 Error configuring content index on app load: {e}")
+            logger.error(f"🚨 Failed to index content")
+            raise e
         finally:
             state.search_index_lock.release()
 
@@ -118,10 +137,10 @@ def configure_search_types(config: FullConfig):
     return Enum("SearchType", merge_dicts(core_search_types, plugin_search_types))
 
 
-def configure_search(search_models: SearchModels, search_config: SearchConfig) -> Optional[SearchModels]:
+def configure_search(search_models: SearchModels, search_config: Optional[SearchConfig]) -> Optional[SearchModels]:
     # Run Validation Checks
     if search_config is None:
-        logger.warning("🚨 No Search type is configured.")
+        logger.warning("🚨 No Search configuration available.")
         return None
     if search_models is None:
         search_models = SearchModels()
@@ -147,7 +166,7 @@ def configure_content(
 ) -> Optional[ContentIndex]:
     # Run Validation Checks
     if content_config is None:
-        logger.warning("🚨 No Content type is configured.")
+        logger.warning("🚨 No Content configuration available.")
         return None
     if content_index is None:
         content_index = ContentIndex()
@@ -242,9 +261,10 @@ def configure_content(
     return content_index
 
 
-def configure_processor(processor_config: ProcessorConfig):
+def configure_processor(processor_config: Optional[ProcessorConfig]):
     if not processor_config:
-        return
+        logger.warning("🚨 No Processor configuration available.")
+        return None
 
     processor = ProcessorConfigModel()
 
diff --git a/src/khoj/main.py b/src/khoj/main.py
index 1e4d407d..5b24bc05 100644
--- a/src/khoj/main.py
+++ b/src/khoj/main.py
@@ -27,7 +27,7 @@ from rich.logging import RichHandler
 import schedule
 
 # Internal Packages
-from khoj.configure import configure_routes, configure_server
+from khoj.configure import configure_routes, initialize_server
 from khoj.utils import state
 from khoj.utils.cli import cli
 from khoj.interface.desktop.main_window import MainWindow
@@ -75,7 +75,7 @@ def run():
         poll_task_scheduler()
 
         # Start Server
-        configure_server(args, required=False)
+        initialize_server(args.config, args.regenerate, required=False)
         configure_routes(app)
         start_server(app, host=args.host, port=args.port, socket=args.socket)
     else:
@@ -93,7 +93,7 @@ def run():
             tray.show()
 
         # Setup Server
-        configure_server(args, required=False)
+        initialize_server(args.config, args.regenerate, required=False)
         configure_routes(app)
         server = ServerThread(app, args.host, args.port, args.socket)
 
diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py
index 50e8e1f2..0269987d 100644
--- a/src/khoj/routers/api.py
+++ b/src/khoj/routers/api.py
@@ -5,20 +5,20 @@ import time
 import yaml
 import logging
 import json
-from typing import List, Optional, Union
+from typing import Iterable, List, Optional, Union
 
 # External Packages
 from fastapi import APIRouter, HTTPException, Header, Request
 from sentence_transformers import util
 
 # Internal Packages
-from khoj.configure import configure_content, configure_processor, configure_search
+from khoj.configure import configure_processor, configure_server
 from khoj.search_type import image_search, text_search
 from khoj.search_filter.date_filter import DateFilter
 from khoj.search_filter.file_filter import FileFilter
 from khoj.search_filter.word_filter import WordFilter
 from khoj.utils.config import TextSearchModel
-from khoj.utils.helpers import log_telemetry, timer
+from khoj.utils.helpers import timer
 from khoj.utils.rawconfig import (
     ContentConfig,
     FullConfig,
@@ -524,34 +524,26 @@ def update(
     referer: Optional[str] = Header(None),
     host: Optional[str] = Header(None),
 ):
+    if not state.config:
+        error_msg = f"🚨 Khoj is not configured.\nConfigure it via http://localhost:42110/config, plugins or by editing {state.config_file}."
+        logger.warning(error_msg)
+        raise HTTPException(status_code=500, detail=error_msg)
     try:
-        state.search_index_lock.acquire()
-        try:
-            if state.config and state.config.search_type:
-                state.search_models = configure_search(state.search_models, state.config.search_type)
-            if state.search_models:
-                state.content_index = configure_content(
-                    state.content_index, state.config.content_type, state.search_models, regenerate=force or False, t=t
-                )
-        except Exception as e:
-            logger.error(e)
-            raise HTTPException(status_code=500, detail=str(e))
-        finally:
-            state.search_index_lock.release()
-    except ValueError as e:
-        logger.error(e)
-        raise HTTPException(status_code=500, detail=str(e))
+        configure_server(state.config, regenerate=force or False, search_type=t)
+    except Exception as e:
+        error_msg = f"🚨 Failed to update server via API: {e}"
+        logger.error(error_msg)
+        raise HTTPException(status_code=500, detail=error_msg)
     else:
-        logger.info("📬 Search index updated via API")
-
-    try:
-        if state.config and state.config.processor:
-            state.processor_config = configure_processor(state.config.processor)
-    except ValueError as e:
-        logger.error(e)
-        raise HTTPException(status_code=500, detail=str(e))
-    else:
-        logger.info("📬 Processor reconfigured via API")
+        components = []
+        if state.search_models:
+            components.append("Search models")
+        if state.content_index:
+            components.append("Content index")
+        if state.processor_config:
+            components.append("Conversation processor")
+        components_msg = ", ".join(components)
+        logger.info(f"📬 {components_msg} updated via API")
 
     update_telemetry_state(
         request=request,

From 7ad96036b0ff09f4f72056740ac9cdb51e947492 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sat, 15 Jul 2023 12:07:07 -0700
Subject: [PATCH 02/16] Improve lock name to config_lock instead of
 search_index_lock

It is used to lock updates to all app config state, including processor
---
 src/khoj/configure.py   | 16 ++++++++--------
 src/khoj/utils/state.py |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/khoj/configure.py b/src/khoj/configure.py
index 087245f8..680c5417 100644
--- a/src/khoj/configure.py
+++ b/src/khoj/configure.py
@@ -64,29 +64,29 @@ def configure_server(config: FullConfig, regenerate: bool, search_type: Optional
 
     # Initialize Processor from Config
     try:
-        state.search_index_lock.acquire()
+        state.config_lock.acquire()
         state.processor_config = configure_processor(state.config.processor)
     except Exception as e:
         logger.error(f"🚨 Failed to configure processor")
         raise e
     finally:
-        state.search_index_lock.release()
+        state.config_lock.release()
 
     # Initialize Search Models from Config
     try:
-        state.search_index_lock.acquire()
+        state.config_lock.acquire()
         state.SearchType = configure_search_types(state.config)
         state.search_models = configure_search(state.search_models, state.config.search_type)
     except Exception as e:
         logger.error(f"🚨 Failed to configure search models")
         raise e
     finally:
-        state.search_index_lock.release()
+        state.config_lock.release()
 
     # Initialize Content from Config
     if state.search_models:
         try:
-            state.search_index_lock.acquire()
+            state.config_lock.acquire()
             state.content_index = configure_content(
                 state.content_index, state.config.content_type, state.search_models, regenerate, search_type
             )
@@ -94,7 +94,7 @@ def configure_server(config: FullConfig, regenerate: bool, search_type: Optional
             logger.error(f"🚨 Failed to index content")
             raise e
         finally:
-            state.search_index_lock.release()
+            state.config_lock.release()
 
 
 def configure_routes(app):
@@ -114,7 +114,7 @@ if not state.demo:
     @schedule.repeat(schedule.every(61).minutes)
     def update_search_index():
         try:
-            state.search_index_lock.acquire()
+            state.config_lock.acquire()
             state.content_index = configure_content(
                 state.content_index, state.config.content_type, state.search_models, regenerate=False
             )
@@ -122,7 +122,7 @@ if not state.demo:
         except Exception as e:
             logger.error(f"🚨 Error updating content index via Scheduler: {e}")
         finally:
-            state.search_index_lock.release()
+            state.config_lock.release()
 
 
 def configure_search_types(config: FullConfig):
diff --git a/src/khoj/utils/state.py b/src/khoj/utils/state.py
index 89688e15..40b3daae 100644
--- a/src/khoj/utils/state.py
+++ b/src/khoj/utils/state.py
@@ -24,7 +24,7 @@ host: str = None
 port: int = None
 cli_args: List[str] = None
 query_cache = LRU()
-search_index_lock = threading.Lock()
+config_lock = threading.Lock()
 SearchType = utils_config.SearchType
 telemetry: List[Dict[str, str]] = []
 previous_query: str = None

From da98b92dd46e6eabadcf5df54f81af5a0c1cf734 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sat, 15 Jul 2023 14:33:15 -0700
Subject: [PATCH 03/16] Create helper function to test value, order of entries
 & embeddings

This helper should be used in text search tests to check whether the
current embeddings stay stably sorted across regenerate and incremental
update of the index
---
 tests/test_text_search.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/tests/test_text_search.py b/tests/test_text_search.py
index c18a4c42..5809f327 100644
--- a/tests/test_text_search.py
+++ b/tests/test_text_search.py
@@ -5,6 +5,7 @@ import os
 
 # External Packages
 import pytest
+import torch
 from khoj.utils.config import SearchModels
 
 # Internal Packages
@@ -202,3 +203,25 @@ def test_asymmetric_setup_github(content_config: ContentConfig, search_models: S
 
     # Assert
     assert len(github_model.entries) > 1
+
+
+def compare_index(initial_notes_model, final_notes_model):
+    mismatched_entries, mismatched_embeddings = [], []
+    for index in range(len(initial_notes_model.entries)):
+        if initial_notes_model.entries[index].to_json() != final_notes_model.entries[index].to_json():
+            mismatched_entries.append(index)
+
+    # verify new entry embedding appended to embeddings tensor, without disrupting order or content of existing embeddings
+    for index in range(len(initial_notes_model.corpus_embeddings)):
+        if not torch.equal(final_notes_model.corpus_embeddings[index], initial_notes_model.corpus_embeddings[index]):
+            mismatched_embeddings.append(index)
+
+    error_details = ""
+    if mismatched_entries:
+        mismatched_entries_str = ",".join(map(str, mismatched_entries))
+        error_details += f"Entries at {mismatched_entries_str} not equal\n"
+    if mismatched_embeddings:
+        mismatched_embeddings_str = ", ".join(map(str, mismatched_embeddings))
+        error_details += f"Embeddings at {mismatched_embeddings_str} not equal\n"
+
+    return error_details

From 88d1a29a849e491f5fcd3c2b3a993d56df6d6103 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sat, 15 Jul 2023 17:18:07 -0700
Subject: [PATCH 04/16] Test index is stable for duplicate entries across
 regenerate, update

- Current incorrect behavior:
  All entries with duplicate compiled form are kept on regenerate
  but on update only the last of the duplicated entries is kept

This divergent behavior should be avoided to prevent index corruption
across reconfigure and update
---
 tests/test_text_search.py | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/tests/test_text_search.py b/tests/test_text_search.py
index 5809f327..3e8f7d3d 100644
--- a/tests/test_text_search.py
+++ b/tests/test_text_search.py
@@ -161,6 +161,40 @@ def test_asymmetric_reload(content_config: ContentConfig, search_models: SearchM
     content_config.org.input_files = []
 
 
+# ----------------------------------------------------------------------------------------------------
+def test_update_index_with_duplicate_entries_in_stable_order(
+    org_config_with_only_new_file: TextContentConfig, search_models: SearchModels
+):
+    # Arrange
+    new_file_to_index = Path(org_config_with_only_new_file.input_files[0])
+
+    # Insert org-mode entries with same compiled form into new org file
+    new_entry = "* TODO A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n"
+    with open(new_file_to_index, "w") as f:
+        f.write(f"{new_entry}{new_entry}")
+
+    # Act
+    # load embeddings, entries, notes model after adding new org-mode file
+    initial_index = text_search.setup(
+        OrgToJsonl, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=True
+    )
+
+    # update embeddings, entries, notes model after adding new org-mode file
+    updated_index = text_search.setup(
+        OrgToJsonl, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=False
+    )
+
+    # Assert
+    # verify only 1 entry added even if there are multiple duplicate entries
+    assert len(initial_index.entries) == len(updated_index.entries) == 1
+    assert len(initial_index.corpus_embeddings) == len(updated_index.corpus_embeddings) == 1
+
+    # verify the same entry is added even when there are multiple duplicate entries
+    error_details = compare_index(initial_index, updated_index)
+    if error_details:
+        pytest.fail(error_details)
+
+
 # ----------------------------------------------------------------------------------------------------
 def test_incremental_update(content_config: ContentConfig, search_models: SearchModels, new_org_file: Path):
     # Arrange

From 1673bb55587e56c3e306e4dd2390632c8f75880e Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sat, 15 Jul 2023 19:55:40 -0700
Subject: [PATCH 05/16] Add todo state to compiled form of each org-mode entry

---
 src/khoj/processor/org_mode/org_to_jsonl.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/khoj/processor/org_mode/org_to_jsonl.py b/src/khoj/processor/org_mode/org_to_jsonl.py
index 664427d9..608fdd41 100644
--- a/src/khoj/processor/org_mode/org_to_jsonl.py
+++ b/src/khoj/processor/org_mode/org_to_jsonl.py
@@ -125,9 +125,13 @@ class OrgToJsonl(TextToJsonl):
                 # Ignore title notes i.e notes with just headings and empty body
                 continue
 
+            todo_str = f"{parsed_entry.todo} " if parsed_entry.todo else ""
             # Prepend filename as top heading to entry
             filename = Path(entry_to_file_map[parsed_entry]).stem
-            heading = f"* {filename}\n** {parsed_entry.heading}." if parsed_entry.heading else f"* {filename}."
+            if parsed_entry.heading:
+                heading = f"* {filename}\n** {todo_str}{parsed_entry.heading}."
+            else:
+                heading = f"* {filename}."
 
             compiled = heading
             if state.verbose > 2:

From 9bcca432996fdd3366760a999f518d6681038903 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sat, 15 Jul 2023 20:03:19 -0700
Subject: [PATCH 06/16] Use single func to handle indexing from scratch and
 incrementally

The previous regenerate mechanism did not deduplicate entries with the
same key, so entries looked different between regenerate and update.
Having a single func, mark_entries_for_update, handle both scenarios
will avoid this divergence.

Update all text_to_jsonl methods to use the above method for
generating index from scratch
---
 src/khoj/processor/github/github_to_jsonl.py     |  3 +--
 src/khoj/processor/jsonl/jsonl_to_jsonl.py       | 14 ++++----------
 src/khoj/processor/markdown/markdown_to_jsonl.py | 11 ++++-------
 src/khoj/processor/notion/notion_to_jsonl.py     | 11 ++++-------
 src/khoj/processor/org_mode/org_to_jsonl.py      |  6 ++----
 src/khoj/processor/pdf/pdf_to_jsonl.py           | 11 ++++-------
 src/khoj/processor/text_to_jsonl.py              |  2 +-
 src/khoj/search_type/text_search.py              |  8 ++++----
 8 files changed, 24 insertions(+), 42 deletions(-)

diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py
index dd797c31..9dbdc093 100644
--- a/src/khoj/processor/github/github_to_jsonl.py
+++ b/src/khoj/processor/github/github_to_jsonl.py
@@ -15,7 +15,6 @@ from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
 from khoj.processor.text_to_jsonl import TextToJsonl
 from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
 from khoj.utils.rawconfig import Entry
-from khoj.utils import state
 
 
 logger = logging.getLogger(__name__)
@@ -38,7 +37,7 @@ class GithubToJsonl(TextToJsonl):
         else:
             return
 
-    def process(self, previous_entries=None):
+    def process(self, previous_entries=[]):
         current_entries = []
         for repo in self.config.repos:
             current_entries += self.process_repo(repo)
diff --git a/src/khoj/processor/jsonl/jsonl_to_jsonl.py b/src/khoj/processor/jsonl/jsonl_to_jsonl.py
index f743d5d5..c033f522 100644
--- a/src/khoj/processor/jsonl/jsonl_to_jsonl.py
+++ b/src/khoj/processor/jsonl/jsonl_to_jsonl.py
@@ -16,7 +16,7 @@ logger = logging.getLogger(__name__)
 
 class JsonlToJsonl(TextToJsonl):
     # Define Functions
-    def process(self, previous_entries=None):
+    def process(self, previous_entries=[]):
         # Extract required fields from config
         input_jsonl_files, input_jsonl_filter, output_file = (
             self.config.input_files,
@@ -38,15 +38,9 @@ class JsonlToJsonl(TextToJsonl):
 
         # Identify, mark and merge any new entries with previous entries
         with timer("Identify new or updated entries", logger):
-            if not previous_entries:
-                entries_with_ids = list(enumerate(current_entries))
-            else:
-                entries_with_ids = TextToJsonl.mark_entries_for_update(
-                    current_entries,
-                    previous_entries,
-                    key="compiled",
-                    logger=logger,
-                )
+            entries_with_ids = TextToJsonl.mark_entries_for_update(
+                current_entries, previous_entries, key="compiled", logger=logger
+            )
 
         with timer("Write entries to JSONL file", logger):
             # Process Each Entry from All Notes Files
diff --git a/src/khoj/processor/markdown/markdown_to_jsonl.py b/src/khoj/processor/markdown/markdown_to_jsonl.py
index 21cbda72..2da5bd4c 100644
--- a/src/khoj/processor/markdown/markdown_to_jsonl.py
+++ b/src/khoj/processor/markdown/markdown_to_jsonl.py
@@ -23,7 +23,7 @@ class MarkdownToJsonl(TextToJsonl):
         self.config = config
 
     # Define Functions
-    def process(self, previous_entries=None):
+    def process(self, previous_entries=[]):
         # Extract required fields from config
         markdown_files, markdown_file_filter, output_file = (
             self.config.input_files,
@@ -51,12 +51,9 @@ class MarkdownToJsonl(TextToJsonl):
 
         # Identify, mark and merge any new entries with previous entries
         with timer("Identify new or updated entries", logger):
-            if not previous_entries:
-                entries_with_ids = list(enumerate(current_entries))
-            else:
-                entries_with_ids = TextToJsonl.mark_entries_for_update(
-                    current_entries, previous_entries, key="compiled", logger=logger
-                )
+            entries_with_ids = TextToJsonl.mark_entries_for_update(
+                current_entries, previous_entries, key="compiled", logger=logger
+            )
 
         with timer("Write markdown entries to JSONL file", logger):
             # Process Each Entry from All Notes Files
diff --git a/src/khoj/processor/notion/notion_to_jsonl.py b/src/khoj/processor/notion/notion_to_jsonl.py
index 20a11cd7..d4cd78f3 100644
--- a/src/khoj/processor/notion/notion_to_jsonl.py
+++ b/src/khoj/processor/notion/notion_to_jsonl.py
@@ -80,7 +80,7 @@ class NotionToJsonl(TextToJsonl):
 
         self.body_params = {"page_size": 100}
 
-    def process(self, previous_entries=None):
+    def process(self, previous_entries=[]):
         current_entries = []
 
         # Get all pages
@@ -240,12 +240,9 @@ class NotionToJsonl(TextToJsonl):
     def update_entries_with_ids(self, current_entries, previous_entries):
         # Identify, mark and merge any new entries with previous entries
         with timer("Identify new or updated entries", logger):
-            if not previous_entries:
-                entries_with_ids = list(enumerate(current_entries))
-            else:
-                entries_with_ids = TextToJsonl.mark_entries_for_update(
-                    current_entries, previous_entries, key="compiled", logger=logger
-                )
+            entries_with_ids = TextToJsonl.mark_entries_for_update(
+                current_entries, previous_entries, key="compiled", logger=logger
+            )
 
         with timer("Write Notion entries to JSONL file", logger):
             # Process Each Entry from all Notion entries
diff --git a/src/khoj/processor/org_mode/org_to_jsonl.py b/src/khoj/processor/org_mode/org_to_jsonl.py
index 608fdd41..b00a6c50 100644
--- a/src/khoj/processor/org_mode/org_to_jsonl.py
+++ b/src/khoj/processor/org_mode/org_to_jsonl.py
@@ -22,7 +22,7 @@ class OrgToJsonl(TextToJsonl):
         self.config = config
 
     # Define Functions
-    def process(self, previous_entries: List[Entry] = None):
+    def process(self, previous_entries: List[Entry] = []):
         # Extract required fields from config
         org_files, org_file_filter, output_file = (
             self.config.input_files,
@@ -51,9 +51,7 @@ class OrgToJsonl(TextToJsonl):
             current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256)
 
         # Identify, mark and merge any new entries with previous entries
-        if not previous_entries:
-            entries_with_ids = list(enumerate(current_entries))
-        else:
+        with timer("Identify new or updated entries", logger):
             entries_with_ids = TextToJsonl.mark_entries_for_update(
                 current_entries, previous_entries, key="compiled", logger=logger
             )
diff --git a/src/khoj/processor/pdf/pdf_to_jsonl.py b/src/khoj/processor/pdf/pdf_to_jsonl.py
index c5c395bc..e41fd976 100644
--- a/src/khoj/processor/pdf/pdf_to_jsonl.py
+++ b/src/khoj/processor/pdf/pdf_to_jsonl.py
@@ -19,7 +19,7 @@ logger = logging.getLogger(__name__)
 
 class PdfToJsonl(TextToJsonl):
     # Define Functions
-    def process(self, previous_entries=None):
+    def process(self, previous_entries=[]):
         # Extract required fields from config
         pdf_files, pdf_file_filter, output_file = (
             self.config.input_files,
@@ -45,12 +45,9 @@ class PdfToJsonl(TextToJsonl):
 
         # Identify, mark and merge any new entries with previous entries
         with timer("Identify new or updated entries", logger):
-            if not previous_entries:
-                entries_with_ids = list(enumerate(current_entries))
-            else:
-                entries_with_ids = TextToJsonl.mark_entries_for_update(
-                    current_entries, previous_entries, key="compiled", logger=logger
-                )
+            entries_with_ids = TextToJsonl.mark_entries_for_update(
+                current_entries, previous_entries, key="compiled", logger=logger
+            )
 
         with timer("Write PDF entries to JSONL file", logger):
             # Process Each Entry from All Notes Files
diff --git a/src/khoj/processor/text_to_jsonl.py b/src/khoj/processor/text_to_jsonl.py
index a4d01cf5..ff169fac 100644
--- a/src/khoj/processor/text_to_jsonl.py
+++ b/src/khoj/processor/text_to_jsonl.py
@@ -17,7 +17,7 @@ class TextToJsonl(ABC):
         self.config = config
 
     @abstractmethod
-    def process(self, previous_entries: List[Entry] = None) -> List[Tuple[int, Entry]]:
+    def process(self, previous_entries: List[Entry] = []) -> List[Tuple[int, Entry]]:
         ...
 
     @staticmethod
diff --git a/src/khoj/search_type/text_search.py b/src/khoj/search_type/text_search.py
index a77be6e1..edc735f2 100644
--- a/src/khoj/search_type/text_search.py
+++ b/src/khoj/search_type/text_search.py
@@ -176,10 +176,10 @@ def setup(
 ) -> TextContent:
     # Map notes in text files to (compressed) JSONL formatted file
     config.compressed_jsonl = resolve_absolute_path(config.compressed_jsonl)
-    previous_entries = (
-        extract_entries(config.compressed_jsonl) if config.compressed_jsonl.exists() and not regenerate else None
-    )
-    entries_with_indices = text_to_jsonl(config).process(previous_entries or [])
+    previous_entries = []
+    if config.compressed_jsonl.exists() and not regenerate:
+        previous_entries = extract_entries(config.compressed_jsonl)
+    entries_with_indices = text_to_jsonl(config).process(previous_entries)
 
     # Extract Updated Entries
     entries = extract_entries(config.compressed_jsonl)

From 6e70b914c2e8792dc7c8fb44d555b4864dcc691c Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sat, 15 Jul 2023 20:42:26 -0700
Subject: [PATCH 07/16] Remove unused dump_jsonl method

The entries index is stored in gzipped jsonl files for each content type
---
 src/khoj/processor/github/github_to_jsonl.py     |  7 ++-----
 src/khoj/processor/jsonl/jsonl_to_jsonl.py       |  7 ++-----
 src/khoj/processor/markdown/markdown_to_jsonl.py |  7 ++-----
 src/khoj/processor/notion/notion_to_jsonl.py     |  7 ++-----
 src/khoj/processor/org_mode/org_to_jsonl.py      |  7 ++-----
 src/khoj/processor/pdf/pdf_to_jsonl.py           |  7 ++-----
 src/khoj/utils/jsonl.py                          | 13 +------------
 tests/conftest.py                                |  6 +++---
 8 files changed, 16 insertions(+), 45 deletions(-)

diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py
index 9dbdc093..91dbd6da 100644
--- a/src/khoj/processor/github/github_to_jsonl.py
+++ b/src/khoj/processor/github/github_to_jsonl.py
@@ -13,7 +13,7 @@ from khoj.utils.rawconfig import Entry, GithubContentConfig, GithubRepoConfig
 from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
 from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
 from khoj.processor.text_to_jsonl import TextToJsonl
-from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
+from khoj.utils.jsonl import compress_jsonl_data
 from khoj.utils.rawconfig import Entry
 
 
@@ -97,10 +97,7 @@ class GithubToJsonl(TextToJsonl):
             jsonl_data = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries)
 
             # Compress JSONL formatted Data
-            if self.config.compressed_jsonl.suffix == ".gz":
-                compress_jsonl_data(jsonl_data, self.config.compressed_jsonl)
-            elif self.config.compressed_jsonl.suffix == ".jsonl":
-                dump_jsonl(jsonl_data, self.config.compressed_jsonl)
+            compress_jsonl_data(jsonl_data, self.config.compressed_jsonl)
 
         return entries_with_ids
 
diff --git a/src/khoj/processor/jsonl/jsonl_to_jsonl.py b/src/khoj/processor/jsonl/jsonl_to_jsonl.py
index c033f522..3c824545 100644
--- a/src/khoj/processor/jsonl/jsonl_to_jsonl.py
+++ b/src/khoj/processor/jsonl/jsonl_to_jsonl.py
@@ -7,7 +7,7 @@ from typing import List
 # Internal Packages
 from khoj.processor.text_to_jsonl import TextToJsonl
 from khoj.utils.helpers import get_absolute_path, timer
-from khoj.utils.jsonl import load_jsonl, dump_jsonl, compress_jsonl_data
+from khoj.utils.jsonl import load_jsonl, compress_jsonl_data
 from khoj.utils.rawconfig import Entry
 
 
@@ -48,10 +48,7 @@ class JsonlToJsonl(TextToJsonl):
             jsonl_data = JsonlToJsonl.convert_entries_to_jsonl(entries)
 
             # Compress JSONL formatted Data
-            if output_file.suffix == ".gz":
-                compress_jsonl_data(jsonl_data, output_file)
-            elif output_file.suffix == ".jsonl":
-                dump_jsonl(jsonl_data, output_file)
+            compress_jsonl_data(jsonl_data, output_file)
 
         return entries_with_ids
 
diff --git a/src/khoj/processor/markdown/markdown_to_jsonl.py b/src/khoj/processor/markdown/markdown_to_jsonl.py
index 2da5bd4c..b6acbfbb 100644
--- a/src/khoj/processor/markdown/markdown_to_jsonl.py
+++ b/src/khoj/processor/markdown/markdown_to_jsonl.py
@@ -10,7 +10,7 @@ from typing import List
 from khoj.processor.text_to_jsonl import TextToJsonl
 from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer
 from khoj.utils.constants import empty_escape_sequences
-from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
+from khoj.utils.jsonl import compress_jsonl_data
 from khoj.utils.rawconfig import Entry, TextContentConfig
 
 
@@ -61,10 +61,7 @@ class MarkdownToJsonl(TextToJsonl):
             jsonl_data = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries)
 
             # Compress JSONL formatted Data
-            if output_file.suffix == ".gz":
-                compress_jsonl_data(jsonl_data, output_file)
-            elif output_file.suffix == ".jsonl":
-                dump_jsonl(jsonl_data, output_file)
+            compress_jsonl_data(jsonl_data, output_file)
 
         return entries_with_ids
 
diff --git a/src/khoj/processor/notion/notion_to_jsonl.py b/src/khoj/processor/notion/notion_to_jsonl.py
index d4cd78f3..489f0341 100644
--- a/src/khoj/processor/notion/notion_to_jsonl.py
+++ b/src/khoj/processor/notion/notion_to_jsonl.py
@@ -8,7 +8,7 @@ import requests
 from khoj.utils.helpers import timer
 from khoj.utils.rawconfig import Entry, NotionContentConfig
 from khoj.processor.text_to_jsonl import TextToJsonl
-from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
+from khoj.utils.jsonl import compress_jsonl_data
 from khoj.utils.rawconfig import Entry
 
 from enum import Enum
@@ -250,9 +250,6 @@ class NotionToJsonl(TextToJsonl):
             jsonl_data = TextToJsonl.convert_text_maps_to_jsonl(entries)
 
             # Compress JSONL formatted Data
-            if self.config.compressed_jsonl.suffix == ".gz":
-                compress_jsonl_data(jsonl_data, self.config.compressed_jsonl)
-            elif self.config.compressed_jsonl.suffix == ".jsonl":
-                dump_jsonl(jsonl_data, self.config.compressed_jsonl)
+            compress_jsonl_data(jsonl_data, self.config.compressed_jsonl)
 
         return entries_with_ids
diff --git a/src/khoj/processor/org_mode/org_to_jsonl.py b/src/khoj/processor/org_mode/org_to_jsonl.py
index b00a6c50..b3bc06fd 100644
--- a/src/khoj/processor/org_mode/org_to_jsonl.py
+++ b/src/khoj/processor/org_mode/org_to_jsonl.py
@@ -8,7 +8,7 @@ from typing import Iterable, List
 from khoj.processor.org_mode import orgnode
 from khoj.processor.text_to_jsonl import TextToJsonl
 from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer
-from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
+from khoj.utils.jsonl import compress_jsonl_data
 from khoj.utils.rawconfig import Entry, TextContentConfig
 from khoj.utils import state
 
@@ -62,10 +62,7 @@ class OrgToJsonl(TextToJsonl):
             jsonl_data = self.convert_org_entries_to_jsonl(entries)
 
             # Compress JSONL formatted Data
-            if output_file.suffix == ".gz":
-                compress_jsonl_data(jsonl_data, output_file)
-            elif output_file.suffix == ".jsonl":
-                dump_jsonl(jsonl_data, output_file)
+            compress_jsonl_data(jsonl_data, output_file)
 
         return entries_with_ids
 
diff --git a/src/khoj/processor/pdf/pdf_to_jsonl.py b/src/khoj/processor/pdf/pdf_to_jsonl.py
index e41fd976..f8a20692 100644
--- a/src/khoj/processor/pdf/pdf_to_jsonl.py
+++ b/src/khoj/processor/pdf/pdf_to_jsonl.py
@@ -10,7 +10,7 @@ from langchain.document_loaders import PyPDFLoader
 # Internal Packages
 from khoj.processor.text_to_jsonl import TextToJsonl
 from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer
-from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
+from khoj.utils.jsonl import compress_jsonl_data
 from khoj.utils.rawconfig import Entry
 
 
@@ -55,10 +55,7 @@ class PdfToJsonl(TextToJsonl):
             jsonl_data = PdfToJsonl.convert_pdf_maps_to_jsonl(entries)
 
             # Compress JSONL formatted Data
-            if output_file.suffix == ".gz":
-                compress_jsonl_data(jsonl_data, output_file)
-            elif output_file.suffix == ".jsonl":
-                dump_jsonl(jsonl_data, output_file)
+            compress_jsonl_data(jsonl_data, output_file)
 
         return entries_with_ids
 
diff --git a/src/khoj/utils/jsonl.py b/src/khoj/utils/jsonl.py
index c9576810..ed779e79 100644
--- a/src/khoj/utils/jsonl.py
+++ b/src/khoj/utils/jsonl.py
@@ -20,7 +20,7 @@ def load_jsonl(input_path):
     # Open JSONL file
     if input_path.suffix == ".gz":
         jsonl_file = gzip.open(get_absolute_path(input_path), "rt", encoding="utf-8")
-    elif input_path.suffix == ".jsonl":
+    else:
         jsonl_file = open(get_absolute_path(input_path), "r", encoding="utf-8")
 
     # Read JSONL file
@@ -36,17 +36,6 @@ def load_jsonl(input_path):
     return data
 
 
-def dump_jsonl(jsonl_data, output_path):
-    "Write List of JSON objects to JSON line file"
-    # Create output directory, if it doesn't exist
-    output_path.parent.mkdir(parents=True, exist_ok=True)
-
-    with open(output_path, "w", encoding="utf-8") as f:
-        f.write(jsonl_data)
-
-    logger.debug(f"Wrote jsonl data to {output_path}")
-
-
 def compress_jsonl_data(jsonl_data, output_path):
     # Create output directory, if it doesn't exist
     output_path.parent.mkdir(parents=True, exist_ok=True)
diff --git a/tests/conftest.py b/tests/conftest.py
index a92d33ca..07c5156f 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -90,7 +90,7 @@ def content_config(tmp_path_factory, search_models: SearchModels, search_config:
     content_config.org = TextContentConfig(
         input_files=None,
         input_filter=["tests/data/org/*.org"],
-        compressed_jsonl=content_dir.joinpath("notes.jsonl"),
+        compressed_jsonl=content_dir.joinpath("notes.jsonl.gz"),
         embeddings_file=content_dir.joinpath("note_embeddings.pt"),
     )
 
@@ -101,7 +101,7 @@ def content_config(tmp_path_factory, search_models: SearchModels, search_config:
 
     content_config.plugins = {
         "plugin1": TextContentConfig(
-            input_files=[content_dir.joinpath("notes.jsonl")],
+            input_files=[content_dir.joinpath("notes.jsonl.gz")],
             input_filter=None,
             compressed_jsonl=content_dir.joinpath("plugin.jsonl.gz"),
             embeddings_file=content_dir.joinpath("plugin_embeddings.pt"),
@@ -142,7 +142,7 @@ def md_content_config(tmp_path_factory):
     content_config.markdown = TextContentConfig(
         input_files=None,
         input_filter=["tests/data/markdown/*.markdown"],
-        compressed_jsonl=content_dir.joinpath("markdown.jsonl"),
+        compressed_jsonl=content_dir.joinpath("markdown.jsonl.gz"),
         embeddings_file=content_dir.joinpath("markdown_embeddings.pt"),
     )
 

From 7669b85da648ec02be508685a35d78bbdde4ff1d Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sun, 16 Jul 2023 00:47:11 -0700
Subject: [PATCH 08/16] Test index is stable sorted on regenerate with new
 entry

---
 tests/test_text_search.py | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/tests/test_text_search.py b/tests/test_text_search.py
index 3e8f7d3d..9a692cf9 100644
--- a/tests/test_text_search.py
+++ b/tests/test_text_search.py
@@ -122,7 +122,9 @@ def test_entry_chunking_by_max_tokens(org_config_with_only_new_file: TextContent
 
 
 # ----------------------------------------------------------------------------------------------------
-def test_asymmetric_reload(content_config: ContentConfig, search_models: SearchModels, new_org_file: Path):
+def test_regenerate_index_with_new_entry(
+    content_config: ContentConfig, search_models: SearchModels, new_org_file: Path
+):
     # Arrange
     initial_notes_model = text_search.setup(
         OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=True
@@ -136,25 +138,20 @@ def test_asymmetric_reload(content_config: ContentConfig, search_models: SearchM
     with open(new_org_file, "w") as f:
         f.write("\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n")
 
+    # Act
     # regenerate notes jsonl, model embeddings and model to include entry from new file
     regenerated_notes_model = text_search.setup(
         OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=True
     )
 
-    # Act
-    # reload embeddings, entries, notes model from previously generated notes jsonl and model embeddings files
-    initial_notes_model = text_search.setup(
-        OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=False
-    )
-
     # Assert
     assert len(regenerated_notes_model.entries) == 11
     assert len(regenerated_notes_model.corpus_embeddings) == 11
 
-    # Assert
-    # verify new entry loaded from updated embeddings, entries
-    assert len(initial_notes_model.entries) == 11
-    assert len(initial_notes_model.corpus_embeddings) == 11
+    # verify new entry appended to index, without disrupting order or content of existing entries
+    error_details = compare_index(initial_notes_model, regenerated_notes_model)
+    if error_details:
+        pytest.fail(error_details, False)
 
     # Cleanup
     # reset input_files in config to empty list

From 6a0297cc86b898646d406d2a4fae732539a04658 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sat, 15 Jul 2023 23:58:13 -0700
Subject: [PATCH 09/16] Stable sort new entries when marking entries for update

---
 src/khoj/processor/text_to_jsonl.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/src/khoj/processor/text_to_jsonl.py b/src/khoj/processor/text_to_jsonl.py
index ff169fac..f92ab7b1 100644
--- a/src/khoj/processor/text_to_jsonl.py
+++ b/src/khoj/processor/text_to_jsonl.py
@@ -78,16 +78,23 @@ class TextToJsonl(ABC):
             # All entries that exist in both current and previous sets are kept
             existing_entry_hashes = set(current_entry_hashes) & set(previous_entry_hashes)
 
+            # load new entries in the order in which they are processed for a stable sort
+            new_entries = [
+                (current_entry_hashes.index(entry_hash), hash_to_current_entries[entry_hash])
+                for entry_hash in new_entry_hashes
+            ]
+            new_entries_sorted = sorted(new_entries, key=lambda e: e[0])
             # Mark new entries with -1 id to flag for later embeddings generation
-            new_entries = [(-1, hash_to_current_entries[entry_hash]) for entry_hash in new_entry_hashes]
+            new_entries_sorted = [(-1, entry[1]) for entry in new_entries_sorted]
+
             # Set id of existing entries to their previous ids to reuse their existing encoded embeddings
             existing_entries = [
                 (previous_entry_hashes.index(entry_hash), hash_to_previous_entries[entry_hash])
                 for entry_hash in existing_entry_hashes
             ]
-
             existing_entries_sorted = sorted(existing_entries, key=lambda e: e[0])
-            entries_with_ids = existing_entries_sorted + new_entries
+
+            entries_with_ids = existing_entries_sorted + new_entries_sorted
 
         return entries_with_ids
 

From 89c7819cb79cd2a51a73c611f83e51916be24337 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sun, 16 Jul 2023 00:22:14 -0700
Subject: [PATCH 10/16] Unify logic to generate embeddings from scratch and
 incrementally

This simplifies the `compute_embeddings' method and avoids potential
later divergence in handling the index regenerate vs update scenarios
---
 src/khoj/search_type/text_search.py | 40 +++++++++++++----------------
 1 file changed, 18 insertions(+), 22 deletions(-)

diff --git a/src/khoj/search_type/text_search.py b/src/khoj/search_type/text_search.py
index edc735f2..ed3be33c 100644
--- a/src/khoj/search_type/text_search.py
+++ b/src/khoj/search_type/text_search.py
@@ -62,36 +62,32 @@ def compute_embeddings(
 ):
     "Compute (and Save) Embeddings or Load Pre-Computed Embeddings"
     new_entries = []
+    create_index_msg = ""
     # Load pre-computed embeddings from file if exists and update them if required
     if embeddings_file.exists() and not regenerate:
         corpus_embeddings: torch.Tensor = torch.load(get_absolute_path(embeddings_file), map_location=state.device)
         logger.debug(f"Loaded {len(corpus_embeddings)} text embeddings from {embeddings_file}")
-
-        # Encode any new entries in the corpus and update corpus embeddings
-        new_entries = [entry.compiled for id, entry in entries_with_ids if id == -1]
-        if new_entries:
-            logger.info(f"📩 Indexing {len(new_entries)} text entries.")
-            new_embeddings = bi_encoder.encode(
-                new_entries, convert_to_tensor=True, device=state.device, show_progress_bar=True
-            )
-            existing_entry_ids = [id for id, _ in entries_with_ids if id != -1]
-            if existing_entry_ids:
-                existing_embeddings = torch.index_select(
-                    corpus_embeddings, 0, torch.tensor(existing_entry_ids, device=state.device)
-                )
-            else:
-                existing_embeddings = torch.tensor([], device=state.device)
-            corpus_embeddings = torch.cat([existing_embeddings, new_embeddings], dim=0)
-    # Else compute the corpus embeddings from scratch
     else:
-        new_entries = [entry.compiled for _, entry in entries_with_ids]
-        logger.info(f"📩 Indexing {len(new_entries)} text entries. Creating index from scratch.")
-        corpus_embeddings = bi_encoder.encode(
+        corpus_embeddings = torch.tensor([], device=state.device)
+        create_index_msg = " Creating index from scratch."
+
+    # Encode any new entries in the corpus and update corpus embeddings
+    new_entries = [entry.compiled for id, entry in entries_with_ids if id == -1]
+    if new_entries:
+        logger.info(f"📩 Indexing {len(new_entries)} text entries.{create_index_msg}")
+        new_embeddings = bi_encoder.encode(
             new_entries, convert_to_tensor=True, device=state.device, show_progress_bar=True
         )
+        existing_entry_ids = [id for id, _ in entries_with_ids if id != -1]
+        if existing_entry_ids:
+            existing_embeddings = torch.index_select(
+                corpus_embeddings, 0, torch.tensor(existing_entry_ids, device=state.device)
+            )
+        else:
+            existing_embeddings = torch.tensor([], device=state.device)
+        corpus_embeddings = torch.cat([existing_embeddings, new_embeddings], dim=0)
 
-    # Save regenerated or updated embeddings to file
-    if new_entries:
+        # Save regenerated or updated embeddings to file
         corpus_embeddings = util.normalize_embeddings(corpus_embeddings)
         torch.save(corpus_embeddings, embeddings_file)
         logger.info(f"📩 Saved computed text embeddings to {embeddings_file}")

From b02323ade6713b97857c7073f9ccee94aa5e1303 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sun, 16 Jul 2023 01:23:22 -0700
Subject: [PATCH 11/16] Improve name of text search test functions

"Asymmetric" was the older name, used to differentiate between symmetric
and asymmetric search.

Now that text search only uses asymmetric search, stick to the simpler name.
---
 tests/test_text_search.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/test_text_search.py b/tests/test_text_search.py
index 9a692cf9..bdd1c5c4 100644
--- a/tests/test_text_search.py
+++ b/tests/test_text_search.py
@@ -18,7 +18,7 @@ from khoj.processor.github.github_to_jsonl import GithubToJsonl
 
 # Test
 # ----------------------------------------------------------------------------------------------------
-def test_asymmetric_setup_with_missing_file_raises_error(
+def test_text_search_setup_with_missing_file_raises_error(
     org_config_with_only_new_file: TextContentConfig, search_config: SearchConfig
 ):
     # Arrange
@@ -33,7 +33,7 @@ def test_asymmetric_setup_with_missing_file_raises_error(
 
 
 # ----------------------------------------------------------------------------------------------------
-def test_asymmetric_setup_with_empty_file_raises_error(
+def test_text_search_setup_with_empty_file_raises_error(
     org_config_with_only_new_file: TextContentConfig, search_config: SearchConfig
 ):
     # Act
@@ -43,7 +43,7 @@ def test_asymmetric_setup_with_empty_file_raises_error(
 
 
 # ----------------------------------------------------------------------------------------------------
-def test_asymmetric_setup(content_config: ContentConfig, search_models: SearchModels):
+def test_text_search_setup(content_config: ContentConfig, search_models: SearchModels):
     # Act
     # Regenerate notes embeddings during asymmetric setup
     notes_model = text_search.setup(
@@ -56,7 +56,7 @@ def test_asymmetric_setup(content_config: ContentConfig, search_models: SearchMo
 
 
 # ----------------------------------------------------------------------------------------------------
-def test_text_content_index_only_updates_on_changes(content_config: ContentConfig, search_models: SearchModels, caplog):
+def test_text_index_same_if_content_unchanged(content_config: ContentConfig, search_models: SearchModels, caplog):
     # Arrange
     caplog.set_level(logging.INFO, logger="khoj")
 
@@ -193,7 +193,7 @@ def test_update_index_with_duplicate_entries_in_stable_order(
 
 
 # ----------------------------------------------------------------------------------------------------
-def test_incremental_update(content_config: ContentConfig, search_models: SearchModels, new_org_file: Path):
+def test_update_index_with_new_entry(content_config: ContentConfig, search_models: SearchModels, new_org_file: Path):
     # Arrange
     initial_notes_model = text_search.setup(
         OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=True

From 1482fd4d4d6e14008871bea185964754d1092706 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sun, 16 Jul 2023 01:24:03 -0700
Subject: [PATCH 12/16] Test index is stable sorted on incremental update with
 new entry

Ensure order of new embedding, entry insertion on incremental update
is stable
---
 tests/test_text_search.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/tests/test_text_search.py b/tests/test_text_search.py
index bdd1c5c4..6496a80d 100644
--- a/tests/test_text_search.py
+++ b/tests/test_text_search.py
@@ -199,24 +199,27 @@ def test_update_index_with_new_entry(content_config: ContentConfig, search_model
         OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=True
     )
 
-    assert len(initial_notes_model.entries) == 10
-    assert len(initial_notes_model.corpus_embeddings) == 10
-
     # append org-mode entry to first org input file in config
     with open(new_org_file, "w") as f:
-        f.write("\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n")
+        new_entry = "\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n"
+        f.write(new_entry)
 
     # Act
     # update embeddings, entries with the newly added note
     content_config.org.input_files = [f"{new_org_file}"]
-    initial_notes_model = text_search.setup(
+    final_notes_model = text_search.setup(
         OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=False
     )
 
     # Assert
-    # verify new entry added in updated embeddings, entries
-    assert len(initial_notes_model.entries) == 11
-    assert len(initial_notes_model.corpus_embeddings) == 11
+    assert len(final_notes_model.entries) == len(initial_notes_model.entries) + 1
+    assert len(final_notes_model.corpus_embeddings) == len(initial_notes_model.corpus_embeddings) + 1
+
+    # verify new entry appended to index, without disrupting order or content of existing entries
+    error_details = compare_index(initial_notes_model, final_notes_model)
+    if error_details:
+        # fails at embeddings index 4, 7. These are not swapped with the new entry embedding or each other
+        pytest.fail(error_details, False)
 
     # Cleanup
     # reset input_files in config to empty list

From ad41ef39918120463f6af7e53de0a5b50a1a8230 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sun, 16 Jul 2023 02:16:33 -0700
Subject: [PATCH 13/16] Make normalizing embeddings configurable

---
 src/khoj/search_type/text_search.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/khoj/search_type/text_search.py b/src/khoj/search_type/text_search.py
index ed3be33c..c123974a 100644
--- a/src/khoj/search_type/text_search.py
+++ b/src/khoj/search_type/text_search.py
@@ -58,7 +58,11 @@ def extract_entries(jsonl_file) -> List[Entry]:
 
 
 def compute_embeddings(
-    entries_with_ids: List[Tuple[int, Entry]], bi_encoder: BaseEncoder, embeddings_file: Path, regenerate=False
+    entries_with_ids: List[Tuple[int, Entry]],
+    bi_encoder: BaseEncoder,
+    embeddings_file: Path,
+    regenerate=False,
+    normalize=True,
 ):
     "Compute (and Save) Embeddings or Load Pre-Computed Embeddings"
     new_entries = []
@@ -87,8 +91,11 @@ def compute_embeddings(
             existing_embeddings = torch.tensor([], device=state.device)
         corpus_embeddings = torch.cat([existing_embeddings, new_embeddings], dim=0)
 
+        if normalize:
+            # Normalize embeddings for faster lookup via dot product when querying
+            corpus_embeddings = util.normalize_embeddings(corpus_embeddings)
+
         # Save regenerated or updated embeddings to file
-        corpus_embeddings = util.normalize_embeddings(corpus_embeddings)
         torch.save(corpus_embeddings, embeddings_file)
         logger.info(f"📩 Saved computed text embeddings to {embeddings_file}")
 
@@ -169,6 +176,7 @@ def setup(
     bi_encoder: BaseEncoder,
     regenerate: bool,
     filters: List[BaseFilter] = [],
+    normalize: bool = True,
 ) -> TextContent:
     # Map notes in text files to (compressed) JSONL formatted file
     config.compressed_jsonl = resolve_absolute_path(config.compressed_jsonl)
@@ -186,7 +194,7 @@ def setup(
     # Compute or Load Embeddings
     config.embeddings_file = resolve_absolute_path(config.embeddings_file)
     corpus_embeddings = compute_embeddings(
-        entries_with_indices, bi_encoder, config.embeddings_file, regenerate=regenerate
+        entries_with_indices, bi_encoder, config.embeddings_file, regenerate=regenerate, normalize=normalize
     )
 
     for filter in filters:

From c73feebf257442325d193e47645901271d436a2f Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sun, 16 Jul 2023 02:16:58 -0700
Subject: [PATCH 14/16] Test index embeddings are stable on incremental update
 & no norm

Ensure order of new embedding insertion on incremental update
does not affect the order and value of existing embeddings when
normalization is turned off
---
 tests/test_text_search.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/test_text_search.py b/tests/test_text_search.py
index 6496a80d..830b0da5 100644
--- a/tests/test_text_search.py
+++ b/tests/test_text_search.py
@@ -196,7 +196,7 @@ def test_update_index_with_duplicate_entries_in_stable_order(
 def test_update_index_with_new_entry(content_config: ContentConfig, search_models: SearchModels, new_org_file: Path):
     # Arrange
     initial_notes_model = text_search.setup(
-        OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=True
+        OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=True, normalize=False
     )
 
     # append org-mode entry to first org input file in config
@@ -208,7 +208,7 @@ def test_update_index_with_new_entry(content_config: ContentConfig, search_model
     # update embeddings, entries with the newly added note
     content_config.org.input_files = [f"{new_org_file}"]
     final_notes_model = text_search.setup(
-        OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=False
+        OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=False, normalize=False
     )
 
     # Assert
@@ -218,7 +218,6 @@ def test_update_index_with_new_entry(content_config: ContentConfig, search_model
     # verify new entry appended to index, without disrupting order or content of existing entries
     error_details = compare_index(initial_notes_model, final_notes_model)
     if error_details:
-        # fails at embeddings index 4, 7. These are not swapped with the new entry embedding or each other
         pytest.fail(error_details, False)
 
     # Cleanup

From ef6a0044f45aa26b89f40415d1fb19006799a35a Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sun, 16 Jul 2023 03:47:05 -0700
Subject: [PATCH 15/16] Drop embeddings of deleted text entries from index

Previously the deleted embeddings would continue to be in the index,
even after the entry was deleted
---
 src/khoj/search_type/text_search.py | 32 ++++++++++++-----------
 tests/test_text_search.py           | 39 +++++++++++++++++++++++++++--
 2 files changed, 54 insertions(+), 17 deletions(-)

diff --git a/src/khoj/search_type/text_search.py b/src/khoj/search_type/text_search.py
index c123974a..09174186 100644
--- a/src/khoj/search_type/text_search.py
+++ b/src/khoj/search_type/text_search.py
@@ -65,7 +65,8 @@ def compute_embeddings(
     normalize=True,
 ):
     "Compute (and Save) Embeddings or Load Pre-Computed Embeddings"
-    new_entries = []
+    new_embeddings = torch.tensor([], device=state.device)
+    existing_embeddings = torch.tensor([], device=state.device)
     create_index_msg = ""
     # Load pre-computed embeddings from file if exists and update them if required
     if embeddings_file.exists() and not regenerate:
@@ -82,22 +83,23 @@ def compute_embeddings(
         new_embeddings = bi_encoder.encode(
             new_entries, convert_to_tensor=True, device=state.device, show_progress_bar=True
         )
-        existing_entry_ids = [id for id, _ in entries_with_ids if id != -1]
-        if existing_entry_ids:
-            existing_embeddings = torch.index_select(
-                corpus_embeddings, 0, torch.tensor(existing_entry_ids, device=state.device)
-            )
-        else:
-            existing_embeddings = torch.tensor([], device=state.device)
-        corpus_embeddings = torch.cat([existing_embeddings, new_embeddings], dim=0)
 
-        if normalize:
-            # Normalize embeddings for faster lookup via dot product when querying
-            corpus_embeddings = util.normalize_embeddings(corpus_embeddings)
+    # Extract existing embeddings from previous corpus embeddings
+    existing_entry_ids = [id for id, _ in entries_with_ids if id != -1]
+    if existing_entry_ids:
+        existing_embeddings = torch.index_select(
+            corpus_embeddings, 0, torch.tensor(existing_entry_ids, device=state.device)
+        )
 
-        # Save regenerated or updated embeddings to file
-        torch.save(corpus_embeddings, embeddings_file)
-        logger.info(f"📩 Saved computed text embeddings to {embeddings_file}")
+    # Set corpus embeddings to merger of existing and new embeddings
+    corpus_embeddings = torch.cat([existing_embeddings, new_embeddings], dim=0)
+    if normalize:
+        # Normalize embeddings for faster lookup via dot product when querying
+        corpus_embeddings = util.normalize_embeddings(corpus_embeddings)
+
+    # Save regenerated or updated embeddings to file
+    torch.save(corpus_embeddings, embeddings_file)
+    logger.info(f"📩 Saved computed text embeddings to {embeddings_file}")
 
     return corpus_embeddings
 
diff --git a/tests/test_text_search.py b/tests/test_text_search.py
index 830b0da5..1ae7e770 100644
--- a/tests/test_text_search.py
+++ b/tests/test_text_search.py
@@ -71,8 +71,8 @@ def test_text_index_same_if_content_unchanged(content_config: ContentConfig, sea
     final_logs = caplog.text
 
     # Assert
-    assert "📩 Saved computed text embeddings to" in initial_logs
-    assert "📩 Saved computed text embeddings to" not in final_logs
+    assert "Creating index from scratch." in initial_logs
+    assert "Creating index from scratch." not in final_logs
 
 
 # ----------------------------------------------------------------------------------------------------
@@ -192,6 +192,41 @@ def test_update_index_with_duplicate_entries_in_stable_order(
         pytest.fail(error_details)
 
 
+# ----------------------------------------------------------------------------------------------------
+def test_update_index_with_deleted_entry(org_config_with_only_new_file: TextContentConfig, search_models: SearchModels):
+    # Arrange
+    new_file_to_index = Path(org_config_with_only_new_file.input_files[0])
+
+    # Insert two org-mode entries into new org file; the second differs only by a trailing line
+    new_entry = "* TODO A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n"
+    with open(new_file_to_index, "w") as f:
+        f.write(f"{new_entry}{new_entry} -- Tatooine")
+
+    # load embeddings, entries, notes model after adding new org file with 2 entries
+    initial_index = text_search.setup(
+        OrgToJsonl, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=True
+    )
+
+    # update embeddings, entries, notes model after removing an entry from the org file
+    with open(new_file_to_index, "w") as f:
+        f.write(f"{new_entry}")
+
+    # Act
+    updated_index = text_search.setup(
+        OrgToJsonl, org_config_with_only_new_file, search_models.text_search.bi_encoder, regenerate=False
+    )
+
+    # Assert
+    # verify exactly 1 entry (and its embedding) was dropped from the index
+    assert len(initial_index.entries) == len(updated_index.entries) + 1
+    assert len(initial_index.corpus_embeddings) == len(updated_index.corpus_embeddings) + 1
+
+    # verify the remaining entry is unchanged in the updated index after the deletion
+    error_details = compare_index(updated_index, initial_index)
+    if error_details:
+        pytest.fail(error_details)
+
+
 # ----------------------------------------------------------------------------------------------------
 def test_update_index_with_new_entry(content_config: ContentConfig, search_models: SearchModels, new_org_file: Path):
     # Arrange

From 3e3a1ecbc87c43ff20945b7233671cc37dd48746 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Mon, 17 Jul 2023 14:33:02 -0700
Subject: [PATCH 16/16] Start app even if server init fails to let user fix it

Show stacktrace on error to help debugging
---
 src/khoj/configure.py   | 3 +--
 src/khoj/routers/api.py | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/khoj/configure.py b/src/khoj/configure.py
index 680c5417..74f7bd9b 100644
--- a/src/khoj/configure.py
+++ b/src/khoj/configure.py
@@ -54,8 +54,7 @@ def initialize_server(
     try:
         configure_server(config, regenerate, type)
     except Exception as e:
-        logger.error(f"🚨 Failed to configure server on app load: {e}")
-        raise e
+        logger.error(f"🚨 Failed to configure server on app load: {e}", exc_info=True)
 
 
 def configure_server(config: FullConfig, regenerate: bool, search_type: Optional[SearchType] = None):
diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py
index 0269987d..834e8997 100644
--- a/src/khoj/routers/api.py
+++ b/src/khoj/routers/api.py
@@ -532,7 +532,7 @@ def update(
         configure_server(state.config, regenerate=force or False, search_type=t)
     except Exception as e:
         error_msg = f"🚨 Failed to update server via API: {e}"
-        logger.error(error_msg)
+        logger.error(error_msg, exc_info=True)
         raise HTTPException(status_code=500, detail=error_msg)
     else:
         components = []