Move to a push-first model for retrieving embeddings from local files (#457)

* Initial version - setup a file-push architecture for generating embeddings with Khoj
* Update unit tests to fix with new application design
* Allow configure server to be called without regenerating the index; this no longer works because the API for indexing files is not up in time for the server to send a request
* Use state.host and state.port for configuring the URL for the indexer
* On application startup, load in embeddings from configurations files, rather than regenerating the corpus based on file system
This commit is contained in:
sabaimran
2023-08-31 12:55:17 -07:00
committed by GitHub
parent 92cbfef7ab
commit 4854258047
23 changed files with 990 additions and 508 deletions

View File

@@ -9,6 +9,7 @@ import pytest
from khoj.main import app
from khoj.configure import configure_processor, configure_routes, configure_search_types
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl
from khoj.search_type import image_search, text_search
from khoj.utils.config import SearchModels
from khoj.utils.helpers import resolve_absolute_path
@@ -97,7 +98,12 @@ def content_config(tmp_path_factory, search_models: SearchModels, search_config:
filters = [DateFilter(), WordFilter(), FileFilter()]
text_search.setup(
OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=False, filters=filters
OrgToJsonl,
get_sample_data("org"),
content_config.org,
search_models.text_search.bi_encoder,
regenerate=False,
filters=filters,
)
content_config.plugins = {
@@ -109,6 +115,20 @@ def content_config(tmp_path_factory, search_models: SearchModels, search_config:
)
}
if os.getenv("GITHUB_PAT_TOKEN"):
content_config.github = GithubContentConfig(
pat_token=os.getenv("GITHUB_PAT_TOKEN", ""),
repos=[
GithubRepoConfig(
owner="khoj-ai",
name="lantern",
branch="master",
)
],
compressed_jsonl=content_dir.joinpath("github.jsonl.gz"),
embeddings_file=content_dir.joinpath("github_embeddings.pt"),
)
content_config.plaintext = TextContentConfig(
input_files=None,
input_filter=["tests/data/plaintext/*.txt", "tests/data/plaintext/*.md", "tests/data/plaintext/*.html"],
@@ -132,6 +152,7 @@ def content_config(tmp_path_factory, search_models: SearchModels, search_config:
filters = [DateFilter(), WordFilter(), FileFilter()]
text_search.setup(
JsonlToJsonl,
None,
content_config.plugins["plugin1"],
search_models.text_search.bi_encoder,
regenerate=False,
@@ -203,6 +224,7 @@ def chat_client(md_content_config: ContentConfig, search_config: SearchConfig, p
state.search_models.text_search = text_search.initialize_model(search_config.asymmetric)
state.content_index.markdown = text_search.setup(
MarkdownToJsonl,
get_sample_data("markdown"),
md_content_config.markdown,
state.search_models.text_search.bi_encoder,
regenerate=False,
@@ -226,11 +248,22 @@ def client(content_config: ContentConfig, search_config: SearchConfig, processor
state.search_models.text_search = text_search.initialize_model(search_config.asymmetric)
state.search_models.image_search = image_search.initialize_model(search_config.image)
state.content_index.org = text_search.setup(
OrgToJsonl, content_config.org, state.search_models.text_search.bi_encoder, regenerate=False
OrgToJsonl,
get_sample_data("org"),
content_config.org,
state.search_models.text_search.bi_encoder,
regenerate=False,
)
state.content_index.image = image_search.setup(
content_config.image, state.search_models.image_search, regenerate=False
)
state.content_index.plaintext = text_search.setup(
PlaintextToJsonl,
get_sample_data("plaintext"),
content_config.plaintext,
state.search_models.text_search.bi_encoder,
regenerate=False,
)
state.processor_config = configure_processor(processor_config)
@@ -250,8 +283,21 @@ def client_offline_chat(
# Index Markdown Content for Search
filters = [DateFilter(), WordFilter(), FileFilter()]
state.search_models.text_search = text_search.initialize_model(search_config.asymmetric)
state.search_models.image_search = image_search.initialize_model(search_config.image)
state.content_index.org = text_search.setup(
OrgToJsonl,
get_sample_data("org"),
content_config.org,
state.search_models.text_search.bi_encoder,
regenerate=False,
)
state.content_index.image = image_search.setup(
content_config.image, state.search_models.image_search, regenerate=False
)
state.content_index.markdown = text_search.setup(
MarkdownToJsonl,
get_sample_data("markdown"),
md_content_config.markdown,
state.search_models.text_search.bi_encoder,
regenerate=False,
@@ -284,3 +330,69 @@ def org_config_with_only_new_file(content_config: ContentConfig, new_org_file: P
new_org_config.input_files = [f"{new_org_file}"]
new_org_config.input_filter = None
return new_org_config
@pytest.fixture(scope="function")
def sample_org_data():
return get_sample_data("org")
def get_sample_data(type):
sample_data = {
"org": {
"readme.org": """
* Khoj
/Allow natural language search on user content like notes, images using transformer based models/
All data is processed locally. User can interface with khoj app via [[./interface/emacs/khoj.el][Emacs]], API or Commandline
** Dependencies
- Python3
- [[https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links][Miniconda]]
** Install
#+begin_src shell
git clone https://github.com/khoj-ai/khoj && cd khoj
conda env create -f environment.yml
conda activate khoj
#+end_src"""
},
"markdown": {
"readme.markdown": """
# Khoj
Allow natural language search on user content like notes, images using transformer based models
All data is processed locally. User can interface with khoj app via [Emacs](./interface/emacs/khoj.el), API or Commandline
## Dependencies
- Python3
- [Miniconda](https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links)
## Install
```shell
git clone
conda env create -f environment.yml
conda activate khoj
```
"""
},
"plaintext": {
"readme.txt": """
Khoj
Allow natural language search on user content like notes, images using transformer based models
All data is processed locally. User can interface with khoj app via Emacs, API or Commandline
Dependencies
- Python3
- Miniconda
Install
git clone
conda env create -f environment.yml
conda activate khoj
"""
},
}
return sample_data[type]