mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 21:19:12 +00:00
Move to a push-first model for retrieving embeddings from local files (#457)
* Initial version - setup a file-push architecture for generating embeddings with Khoj * Update unit tests to fix with new application design * Allow configure server to be called without regenerating the index; this no longer works because the API for indexing files is not up in time for the server to send a request * Use state.host and state.port for configuring the URL for the indexer * On application startup, load in embeddings from configurations files, rather than regenerating the corpus based on file system
This commit is contained in:
@@ -9,6 +9,7 @@ import pytest
|
||||
from khoj.main import app
|
||||
from khoj.configure import configure_processor, configure_routes, configure_search_types
|
||||
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
|
||||
from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl
|
||||
from khoj.search_type import image_search, text_search
|
||||
from khoj.utils.config import SearchModels
|
||||
from khoj.utils.helpers import resolve_absolute_path
|
||||
@@ -97,7 +98,12 @@ def content_config(tmp_path_factory, search_models: SearchModels, search_config:
|
||||
|
||||
filters = [DateFilter(), WordFilter(), FileFilter()]
|
||||
text_search.setup(
|
||||
OrgToJsonl, content_config.org, search_models.text_search.bi_encoder, regenerate=False, filters=filters
|
||||
OrgToJsonl,
|
||||
get_sample_data("org"),
|
||||
content_config.org,
|
||||
search_models.text_search.bi_encoder,
|
||||
regenerate=False,
|
||||
filters=filters,
|
||||
)
|
||||
|
||||
content_config.plugins = {
|
||||
@@ -109,6 +115,20 @@ def content_config(tmp_path_factory, search_models: SearchModels, search_config:
|
||||
)
|
||||
}
|
||||
|
||||
if os.getenv("GITHUB_PAT_TOKEN"):
|
||||
content_config.github = GithubContentConfig(
|
||||
pat_token=os.getenv("GITHUB_PAT_TOKEN", ""),
|
||||
repos=[
|
||||
GithubRepoConfig(
|
||||
owner="khoj-ai",
|
||||
name="lantern",
|
||||
branch="master",
|
||||
)
|
||||
],
|
||||
compressed_jsonl=content_dir.joinpath("github.jsonl.gz"),
|
||||
embeddings_file=content_dir.joinpath("github_embeddings.pt"),
|
||||
)
|
||||
|
||||
content_config.plaintext = TextContentConfig(
|
||||
input_files=None,
|
||||
input_filter=["tests/data/plaintext/*.txt", "tests/data/plaintext/*.md", "tests/data/plaintext/*.html"],
|
||||
@@ -132,6 +152,7 @@ def content_config(tmp_path_factory, search_models: SearchModels, search_config:
|
||||
filters = [DateFilter(), WordFilter(), FileFilter()]
|
||||
text_search.setup(
|
||||
JsonlToJsonl,
|
||||
None,
|
||||
content_config.plugins["plugin1"],
|
||||
search_models.text_search.bi_encoder,
|
||||
regenerate=False,
|
||||
@@ -203,6 +224,7 @@ def chat_client(md_content_config: ContentConfig, search_config: SearchConfig, p
|
||||
state.search_models.text_search = text_search.initialize_model(search_config.asymmetric)
|
||||
state.content_index.markdown = text_search.setup(
|
||||
MarkdownToJsonl,
|
||||
get_sample_data("markdown"),
|
||||
md_content_config.markdown,
|
||||
state.search_models.text_search.bi_encoder,
|
||||
regenerate=False,
|
||||
@@ -226,11 +248,22 @@ def client(content_config: ContentConfig, search_config: SearchConfig, processor
|
||||
state.search_models.text_search = text_search.initialize_model(search_config.asymmetric)
|
||||
state.search_models.image_search = image_search.initialize_model(search_config.image)
|
||||
state.content_index.org = text_search.setup(
|
||||
OrgToJsonl, content_config.org, state.search_models.text_search.bi_encoder, regenerate=False
|
||||
OrgToJsonl,
|
||||
get_sample_data("org"),
|
||||
content_config.org,
|
||||
state.search_models.text_search.bi_encoder,
|
||||
regenerate=False,
|
||||
)
|
||||
state.content_index.image = image_search.setup(
|
||||
content_config.image, state.search_models.image_search, regenerate=False
|
||||
)
|
||||
state.content_index.plaintext = text_search.setup(
|
||||
PlaintextToJsonl,
|
||||
get_sample_data("plaintext"),
|
||||
content_config.plaintext,
|
||||
state.search_models.text_search.bi_encoder,
|
||||
regenerate=False,
|
||||
)
|
||||
|
||||
state.processor_config = configure_processor(processor_config)
|
||||
|
||||
@@ -250,8 +283,21 @@ def client_offline_chat(
|
||||
# Index Markdown Content for Search
|
||||
filters = [DateFilter(), WordFilter(), FileFilter()]
|
||||
state.search_models.text_search = text_search.initialize_model(search_config.asymmetric)
|
||||
state.search_models.image_search = image_search.initialize_model(search_config.image)
|
||||
state.content_index.org = text_search.setup(
|
||||
OrgToJsonl,
|
||||
get_sample_data("org"),
|
||||
content_config.org,
|
||||
state.search_models.text_search.bi_encoder,
|
||||
regenerate=False,
|
||||
)
|
||||
state.content_index.image = image_search.setup(
|
||||
content_config.image, state.search_models.image_search, regenerate=False
|
||||
)
|
||||
|
||||
state.content_index.markdown = text_search.setup(
|
||||
MarkdownToJsonl,
|
||||
get_sample_data("markdown"),
|
||||
md_content_config.markdown,
|
||||
state.search_models.text_search.bi_encoder,
|
||||
regenerate=False,
|
||||
@@ -284,3 +330,69 @@ def org_config_with_only_new_file(content_config: ContentConfig, new_org_file: P
|
||||
new_org_config.input_files = [f"{new_org_file}"]
|
||||
new_org_config.input_filter = None
|
||||
return new_org_config
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def sample_org_data():
|
||||
return get_sample_data("org")
|
||||
|
||||
|
||||
def get_sample_data(type):
|
||||
sample_data = {
|
||||
"org": {
|
||||
"readme.org": """
|
||||
* Khoj
|
||||
/Allow natural language search on user content like notes, images using transformer based models/
|
||||
|
||||
All data is processed locally. User can interface with khoj app via [[./interface/emacs/khoj.el][Emacs]], API or Commandline
|
||||
|
||||
** Dependencies
|
||||
- Python3
|
||||
- [[https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links][Miniconda]]
|
||||
|
||||
** Install
|
||||
#+begin_src shell
|
||||
git clone https://github.com/khoj-ai/khoj && cd khoj
|
||||
conda env create -f environment.yml
|
||||
conda activate khoj
|
||||
#+end_src"""
|
||||
},
|
||||
"markdown": {
|
||||
"readme.markdown": """
|
||||
# Khoj
|
||||
Allow natural language search on user content like notes, images using transformer based models
|
||||
|
||||
All data is processed locally. User can interface with khoj app via [Emacs](./interface/emacs/khoj.el), API or Commandline
|
||||
|
||||
## Dependencies
|
||||
- Python3
|
||||
- [Miniconda](https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links)
|
||||
|
||||
## Install
|
||||
```shell
|
||||
git clone
|
||||
conda env create -f environment.yml
|
||||
conda activate khoj
|
||||
```
|
||||
"""
|
||||
},
|
||||
"plaintext": {
|
||||
"readme.txt": """
|
||||
Khoj
|
||||
Allow natural language search on user content like notes, images using transformer based models
|
||||
|
||||
All data is processed locally. User can interface with khoj app via Emacs, API or Commandline
|
||||
|
||||
Dependencies
|
||||
- Python3
|
||||
- Miniconda
|
||||
|
||||
Install
|
||||
git clone
|
||||
conda env create -f environment.yml
|
||||
conda activate khoj
|
||||
"""
|
||||
},
|
||||
}
|
||||
|
||||
return sample_data[type]
|
||||
|
||||
Reference in New Issue
Block a user