From 2fe37a090fce8294a16929b71d58bb83df82286b Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Fri, 6 Jan 2023 15:58:03 -0300 Subject: [PATCH 1/5] Make type of encoder to use for embeddings configurable via khoj.yml - Previously `model_type' was set in the setup of each `search_type' - All encoders were of type `SentenceTransformer' - All cross_encoders were of type `CrossEncoder' - Now `encoder-type' can be configured via the new `encoder_type' field in `TextSearchConfig' under `search-type` in `khoj.yml`. - All the specified `encoder-type' class needs is an `encode' method that takes entries and returns embedding vectors --- src/search_type/image_search.py | 2 +- src/search_type/text_search.py | 2 +- src/utils/helpers.py | 14 +++++++++++--- src/utils/rawconfig.py | 2 ++ 4 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/search_type/image_search.py b/src/search_type/image_search.py index e04bbe49..10d429c3 100644 --- a/src/search_type/image_search.py +++ b/src/search_type/image_search.py @@ -36,7 +36,7 @@ def initialize_model(search_config: ImageSearchConfig): encoder = load_model( model_dir = search_config.model_directory, model_name = search_config.encoder, - model_type = SentenceTransformer) + model_type = search_config.encoder_type or SentenceTransformer) return encoder diff --git a/src/search_type/text_search.py b/src/search_type/text_search.py index 53eb3c3d..5fd04470 100644 --- a/src/search_type/text_search.py +++ b/src/search_type/text_search.py @@ -37,7 +37,7 @@ def initialize_model(search_config: TextSearchConfig): bi_encoder = load_model( model_dir = search_config.model_directory, model_name = search_config.encoder, - model_type = SentenceTransformer, + model_type = search_config.encoder_type or SentenceTransformer, device=f'{state.device}') # The cross-encoder re-ranks the results to improve quality diff --git a/src/utils/helpers.py b/src/utils/helpers.py index 2285018d..38181ad6 100644 --- a/src/utils/helpers.py +++ 
b/src/utils/helpers.py @@ -1,5 +1,6 @@ # Standard Packages from pathlib import Path +from importlib import import_module import sys from os.path import join from collections import OrderedDict @@ -44,17 +45,18 @@ def merge_dicts(priority_dict: dict, default_dict: dict): return merged_dict -def load_model(model_name, model_dir, model_type, device:str=None): +def load_model(model_name: str, model_type, model_dir=None, device:str=None): "Load model from disk or huggingface" # Construct model path model_path = join(model_dir, model_name.replace("/", "_")) if model_dir is not None else None # Load model from model_path if it exists there + model_type_class = get_class_by_name(model_type) if isinstance(model_type, str) else model_type if model_path is not None and resolve_absolute_path(model_path).exists(): - model = model_type(get_absolute_path(model_path), device=device) + model = model_type_class(get_absolute_path(model_path), device=device) # Else load the model from the model_name else: - model = model_type(model_name, device=device) + model = model_type_class(model_name, device=device) if model_path is not None: model.save(model_path) @@ -66,6 +68,12 @@ def is_pyinstaller_app(): return getattr(sys, 'frozen', False) and hasattr(sys, '_MEIPASS') +def get_class_by_name(name: str) -> object: + "Returns the class object from name string" + module_name, class_name = name.rsplit('.', 1) + return getattr(import_module(module_name), class_name) + + class LRU(OrderedDict): def __init__(self, *args, capacity=128, **kwargs): self.capacity = capacity diff --git a/src/utils/rawconfig.py b/src/utils/rawconfig.py index 165be0d1..5ed3a9eb 100644 --- a/src/utils/rawconfig.py +++ b/src/utils/rawconfig.py @@ -50,10 +50,12 @@ class ContentConfig(ConfigBase): class TextSearchConfig(ConfigBase): encoder: str cross_encoder: str + encoder_type: Optional[str] model_directory: Optional[Path] class ImageSearchConfig(ConfigBase): encoder: str + encoder_type: Optional[str] model_directory: 
Optional[Path] class SearchConfig(ConfigBase): From 6a30a133265dbb177f72c5ee2360b8ca89422f10 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Fri, 6 Jan 2023 16:57:59 -0300 Subject: [PATCH 2/5] Only create model directory if the optional field is set in SearchConfig --- src/search_type/text_search.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/search_type/text_search.py b/src/search_type/text_search.py index 5fd04470..816972cc 100644 --- a/src/search_type/text_search.py +++ b/src/search_type/text_search.py @@ -27,11 +27,12 @@ def initialize_model(search_config: TextSearchConfig): # Number of entries we want to retrieve with the bi-encoder top_k = 15 - # Convert model directory to absolute path - search_config.model_directory = resolve_absolute_path(search_config.model_directory) - - # Create model directory if it doesn't exist - search_config.model_directory.parent.mkdir(parents=True, exist_ok=True) + # If model directory is configured + if search_config.model_directory: + # Convert model directory to absolute path + search_config.model_directory = resolve_absolute_path(search_config.model_directory) + # Create model directory if it doesn't exist + search_config.model_directory.parent.mkdir(parents=True, exist_ok=True) # The bi-encoder encodes all entries to use for semantic search bi_encoder = load_model( From 826f9dc05455d04431cb253f82f3a3993e5de891 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 7 Jan 2023 16:59:33 -0300 Subject: [PATCH 3/5] Drop long words from compiled entries to be within max token limit of models Long words (>500 characters) provide less useful context to models. 
Dropping very long words allows models to create better embeddings by passing more of the useful context from the entry to the model --- src/processor/text_to_jsonl.py | 4 +++- tests/test_org_to_jsonl.py | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/processor/text_to_jsonl.py b/src/processor/text_to_jsonl.py index 0eb60e6c..4d88a612 100644 --- a/src/processor/text_to_jsonl.py +++ b/src/processor/text_to_jsonl.py @@ -24,11 +24,13 @@ class TextToJsonl(ABC): return lambda entry: hashlib.md5(bytes(getattr(entry, key), encoding='utf-8')).hexdigest() @staticmethod - def split_entries_by_max_tokens(entries: list[Entry], max_tokens: int=256) -> list[Entry]: + def split_entries_by_max_tokens(entries: list[Entry], max_tokens: int=256, max_word_length: int=500) -> list[Entry]: "Split entries if compiled entry length exceeds the max tokens supported by the ML model." chunked_entries: list[Entry] = [] for entry in entries: compiled_entry_words = entry.compiled.split() + # Drop long words instead of having entry truncated to maintain quality of entry processed by models + compiled_entry_words = [word for word in compiled_entry_words if len(word) <= max_word_length] for chunk_index in range(0, len(compiled_entry_words), max_tokens): compiled_entry_words_chunk = compiled_entry_words[chunk_index:chunk_index + max_tokens] compiled_entry_chunk = ' '.join(compiled_entry_words_chunk) diff --git a/tests/test_org_to_jsonl.py b/tests/test_org_to_jsonl.py index fe64cc67..3f30b7fc 100644 --- a/tests/test_org_to_jsonl.py +++ b/tests/test_org_to_jsonl.py @@ -5,6 +5,7 @@ import json from src.processor.org_mode.org_to_jsonl import OrgToJsonl from src.processor.text_to_jsonl import TextToJsonl from src.utils.helpers import is_none_or_empty +from src.utils.rawconfig import Entry def test_configure_heading_entry_to_jsonl(tmp_path): @@ -61,6 +62,24 @@ def test_entry_split_when_exceeds_max_words(tmp_path): assert len(jsonl_data) == 2 +def 
test_entry_split_drops_large_words(tmp_path): + "Ensure entries drops words larger than specified max word length from compiled version." + # Arrange + entry_text = f'''*** Heading + \t\r + Body Line 1 + ''' + entry = Entry(raw=entry_text, compiled=entry_text) + + # Act + # Split entry by max words and drop words larger than max word length + processed_entry = TextToJsonl.split_entries_by_max_tokens([entry], max_word_length = 5)[0] + + # Assert + # "Heading" dropped from compiled version because it's over the set max word limit + assert len(processed_entry.compiled.split()) == len(entry_text.split()) - 1 + + def test_entry_with_body_to_jsonl(tmp_path): "Ensure entries with valid body text are loaded." # Arrange From c0ae8eee9952d0a21ece56387dd7cff069a6fd5f Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 7 Jan 2023 17:08:59 -0300 Subject: [PATCH 4/5] Allow using OpenAI models for search in Khoj - Init processor before search to instantiate `openai_api_key' from `khoj.yml'. The key is used to configure search with openai models - To use OpenAI models for search in Khoj - Set `encoder' to name of an OpenAI model. 
E.g text-embedding-ada-002 - Set `encoder-type' in `khoj.yml' to `src.utils.models.OpenAI' - Set `model-directory' to `null', as online model cannot be stored on disk --- src/configure.py | 6 +++--- src/utils/models.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 3 deletions(-) create mode 100644 src/utils/models.py diff --git a/src/configure.py b/src/configure.py index 1ddc9f76..c1132e8c 100644 --- a/src/configure.py +++ b/src/configure.py @@ -34,14 +34,14 @@ def configure_server(args, required=False): else: state.config = args.config + # Initialize Processor from Config + state.processor_config = configure_processor(args.config.processor) + # Initialize the search model from Config state.search_index_lock.acquire() state.model = configure_search(state.model, state.config, args.regenerate) state.search_index_lock.release() - # Initialize Processor from Config - state.processor_config = configure_processor(args.config.processor) - @schedule.repeat(schedule.every(1).hour) def update_search_index(): diff --git a/src/utils/models.py b/src/utils/models.py new file mode 100644 index 00000000..64197fe1 --- /dev/null +++ b/src/utils/models.py @@ -0,0 +1,28 @@ +# External Packages +import openai +import torch +from tqdm import trange + +# Internal Packages +from src.utils.state import processor_config + + +class OpenAI: + def __init__(self, model_name, device=None): + self.model_name = model_name + openai.api_key = processor_config.conversation.openai_api_key + self.embedding_dimensions = 1536 # Default to embedding dimensions of text-embedding-ada-002 model + + def encode(self, entries, device=None, **kwargs): + embedding_tensors = [] + for index in trange(0, len(entries)): + try: + response = openai.Embedding.create(input=entries[index], model=self.model_name) + embedding_tensors += [torch.tensor(response.data[0].embedding, device=device)] + self.embedding_dimensions = len(response.data[0].embedding) # Set embedding dimensions to this model's + 
except Exception as e: + print(f"Failed to encode entry {index} of length: {len(entries[index])}\n\n{entries[index][:1000]}...\n\n{e}") + embedding_tensors += [torch.zeros(self.embedding_dimensions, device=device)] + return torch.stack(embedding_tensors) + + From 61190058384cca2ae2b9b4cc5168eb493268d40e Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 7 Jan 2023 21:56:00 -0300 Subject: [PATCH 5/5] Improve comments, exceptions, typing and init of OpenAI model code --- src/utils/models.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/src/utils/models.py b/src/utils/models.py index 64197fe1..9ba204b7 100644 --- a/src/utils/models.py +++ b/src/utils/models.py @@ -4,25 +4,35 @@ import torch from tqdm import trange # Internal Packages -from src.utils.state import processor_config +from src.utils.state import processor_config, config_file class OpenAI: def __init__(self, model_name, device=None): self.model_name = model_name + if not processor_config or not processor_config.conversation or not processor_config.conversation.openai_api_key: + raise Exception(f"Set OpenAI API key under processor-config > conversation > openai-api-key in config file: {config_file}") openai.api_key = processor_config.conversation.openai_api_key - self.embedding_dimensions = 1536 # Default to embedding dimensions of text-embedding-ada-002 model + self.embedding_dimensions = None - def encode(self, entries, device=None, **kwargs): + def encode(self, entries: list[str], device=None, **kwargs): embedding_tensors = [] + for index in trange(0, len(entries)): + # OpenAI models create better embeddings for entries without newlines + processed_entry = entries[index].replace('\n', ' ') + try: - response = openai.Embedding.create(input=entries[index], model=self.model_name) + response = openai.Embedding.create(input=processed_entry, model=self.model_name) embedding_tensors += [torch.tensor(response.data[0].embedding, device=device)] - 
self.embedding_dimensions = len(response.data[0].embedding) # Set embedding dimensions to this model's + # Infer embedding dimensions of the model from its first successful response + # Used to create zero vectors of matching dimension for entries that fail to encode + self.embedding_dimensions = self.embedding_dimensions or len(response.data[0].embedding) except Exception as e: print(f"Failed to encode entry {index} of length: {len(entries[index])}\n\n{entries[index][:1000]}...\n\n{e}") + # Use zero embedding vector for entries with failed embeddings + # This ensures entry embeddings match the order of the source entries + # And they have minimal similarity to other entries (as zero vectors are always orthogonal to all other vectors) embedding_tensors += [torch.zeros(self.embedding_dimensions, device=device)] - return torch.stack(embedding_tensors) - + return torch.stack(embedding_tensors) \ No newline at end of file