mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 13:18:18 +00:00
Update test setup to index test data after old indexing code removed
- Delete tests testing deprecated server side indexing flows
- Delete `Local(Plaintext|Org|Markdown|Pdf)Config' methods, files and
references in tests
- Index test data via new helper method, `get_index_files'
- It is modelled after the old `get_org_files' variants in main app
- It passes the test data in required format to `configure_content'
Allows maintaining the more realistic tests from before while
using the new indexing mechanism (rather than the deprecated
server side indexing mechanism).
This commit is contained in:
@@ -1,6 +1,3 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from fastapi import FastAPI
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
@@ -11,6 +8,7 @@ from khoj.configure import (
|
||||
configure_routes,
|
||||
configure_search_types,
|
||||
)
|
||||
from khoj.database.adapters import get_default_search_model
|
||||
from khoj.database.models import (
|
||||
Agent,
|
||||
ChatModel,
|
||||
@@ -19,21 +17,14 @@ from khoj.database.models import (
|
||||
GithubRepoConfig,
|
||||
KhojApiUser,
|
||||
KhojUser,
|
||||
LocalMarkdownConfig,
|
||||
LocalOrgConfig,
|
||||
LocalPdfConfig,
|
||||
LocalPlaintextConfig,
|
||||
)
|
||||
from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
|
||||
from khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries
|
||||
from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel
|
||||
from khoj.routers.api_content import configure_content
|
||||
from khoj.search_type import text_search
|
||||
from khoj.utils import fs_syncer, state
|
||||
from khoj.utils.config import SearchModels
|
||||
from khoj.utils import state
|
||||
from khoj.utils.constants import web_directory
|
||||
from khoj.utils.helpers import resolve_absolute_path
|
||||
from khoj.utils.rawconfig import ContentConfig, SearchConfig
|
||||
from tests.helpers import (
|
||||
AiModelApiFactory,
|
||||
ChatModelFactory,
|
||||
@@ -43,6 +34,8 @@ from tests.helpers import (
|
||||
UserFactory,
|
||||
get_chat_api_key,
|
||||
get_chat_provider,
|
||||
get_index_files,
|
||||
get_sample_data,
|
||||
)
|
||||
|
||||
|
||||
@@ -59,17 +52,16 @@ def django_db_setup(django_db_setup, django_db_blocker):
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def search_config() -> SearchConfig:
|
||||
def search_config():
|
||||
search_model = get_default_search_model()
|
||||
state.embeddings_model = dict()
|
||||
state.embeddings_model["default"] = EmbeddingsModel()
|
||||
state.embeddings_model["default"] = EmbeddingsModel(
|
||||
model_name=search_model.bi_encoder, model_kwargs=search_model.bi_encoder_model_config
|
||||
)
|
||||
state.cross_encoder_model = dict()
|
||||
state.cross_encoder_model["default"] = CrossEncoderModel()
|
||||
|
||||
model_dir = resolve_absolute_path("~/.khoj/search")
|
||||
model_dir.mkdir(parents=True, exist_ok=True)
|
||||
search_config = SearchConfig()
|
||||
|
||||
return search_config
|
||||
state.cross_encoder_model["default"] = CrossEncoderModel(
|
||||
model_name=search_model.cross_encoder, model_kwargs=search_model.cross_encoder_model_config
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
@@ -201,13 +193,6 @@ def openai_agent():
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def search_models(search_config: SearchConfig):
|
||||
search_models = SearchModels()
|
||||
|
||||
return search_models
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
@pytest.fixture
|
||||
def default_process_lock():
|
||||
@@ -219,72 +204,23 @@ def anyio_backend():
|
||||
return "asyncio"
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
@pytest.fixture(scope="function")
|
||||
def content_config(tmp_path_factory, search_models: SearchModels, default_user: KhojUser):
|
||||
content_dir = tmp_path_factory.mktemp("content")
|
||||
|
||||
# Generate Image Embeddings from Test Images
|
||||
content_config = ContentConfig()
|
||||
|
||||
LocalOrgConfig.objects.create(
|
||||
input_files=None,
|
||||
input_filter=["tests/data/org/*.org"],
|
||||
index_heading_entries=False,
|
||||
user=default_user,
|
||||
)
|
||||
|
||||
text_search.setup(OrgToEntries, get_sample_data("org"), regenerate=False, user=default_user)
|
||||
|
||||
if os.getenv("GITHUB_PAT_TOKEN"):
|
||||
GithubConfig.objects.create(
|
||||
pat_token=os.getenv("GITHUB_PAT_TOKEN"),
|
||||
user=default_user,
|
||||
)
|
||||
|
||||
GithubRepoConfig.objects.create(
|
||||
owner="khoj-ai",
|
||||
name="lantern",
|
||||
branch="master",
|
||||
github_config=GithubConfig.objects.get(user=default_user),
|
||||
)
|
||||
|
||||
LocalPlaintextConfig.objects.create(
|
||||
input_files=None,
|
||||
input_filter=["tests/data/plaintext/*.txt", "tests/data/plaintext/*.md", "tests/data/plaintext/*.html"],
|
||||
user=default_user,
|
||||
)
|
||||
|
||||
return content_config
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def md_content_config():
|
||||
markdown_config = LocalMarkdownConfig.objects.create(
|
||||
input_files=None,
|
||||
input_filter=["tests/data/markdown/*.markdown"],
|
||||
)
|
||||
|
||||
return markdown_config
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def chat_client(search_config: SearchConfig, default_user2: KhojUser):
|
||||
def chat_client(search_config, default_user2: KhojUser):
|
||||
return chat_client_builder(search_config, default_user2, require_auth=False)
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def chat_client_with_auth(search_config: SearchConfig, default_user2: KhojUser):
|
||||
def chat_client_with_auth(search_config, default_user2: KhojUser):
|
||||
return chat_client_builder(search_config, default_user2, require_auth=True)
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def chat_client_no_background(search_config: SearchConfig, default_user2: KhojUser):
|
||||
def chat_client_no_background(search_config, default_user2: KhojUser):
|
||||
return chat_client_builder(search_config, default_user2, index_content=False, require_auth=False)
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def chat_client_with_large_kb(search_config: SearchConfig, default_user2: KhojUser):
|
||||
def chat_client_with_large_kb(search_config, default_user2: KhojUser):
|
||||
"""
|
||||
Chat client fixture that creates a large knowledge base with many files
|
||||
for stress testing atomic agent updates.
|
||||
@@ -298,15 +234,11 @@ def chat_client_builder(search_config, user, index_content=True, require_auth=Fa
|
||||
state.SearchType = configure_search_types()
|
||||
|
||||
if index_content:
|
||||
LocalMarkdownConfig.objects.create(
|
||||
input_files=None,
|
||||
input_filter=["tests/data/markdown/*.markdown"],
|
||||
user=user,
|
||||
)
|
||||
file_type = "markdown"
|
||||
files_to_index = {file_type: get_index_files(input_filters=[f"tests/data/{file_type}/*.{file_type}"])}
|
||||
|
||||
# Index Markdown Content for Search
|
||||
all_files = fs_syncer.collect_files(user=user)
|
||||
configure_content(user, all_files)
|
||||
configure_content(user, files_to_index)
|
||||
|
||||
# Initialize Processor from Config
|
||||
chat_provider = get_chat_provider()
|
||||
@@ -346,12 +278,13 @@ def large_kb_chat_client_builder(search_config, user):
|
||||
|
||||
# Create temporary directory for large number of test files
|
||||
temp_dir = tempfile.mkdtemp(prefix="khoj_test_large_kb_")
|
||||
file_type = "markdown"
|
||||
large_file_list = []
|
||||
|
||||
try:
|
||||
# Generate 300 test files with substantial content
|
||||
for i in range(300):
|
||||
file_path = os.path.join(temp_dir, f"test_file_{i:03d}.markdown")
|
||||
file_path = os.path.join(temp_dir, f"test_file_{i:03d}.{file_type}")
|
||||
content = f"""
|
||||
# Test File {i}
|
||||
|
||||
@@ -401,16 +334,9 @@ End of file {i}.
|
||||
f.write(content)
|
||||
large_file_list.append(file_path)
|
||||
|
||||
# Create LocalMarkdownConfig with all the generated files
|
||||
LocalMarkdownConfig.objects.create(
|
||||
input_files=large_file_list,
|
||||
input_filter=None,
|
||||
user=user,
|
||||
)
|
||||
|
||||
# Index all the files into the user's knowledge base
|
||||
all_files = fs_syncer.collect_files(user=user)
|
||||
configure_content(user, all_files)
|
||||
# Index all generated files into the user's knowledge base
|
||||
files_to_index = {file_type: get_index_files(input_files=large_file_list, input_filters=None)}
|
||||
configure_content(user, files_to_index)
|
||||
|
||||
# Verify we have a substantial knowledge base
|
||||
file_count = FileObject.objects.filter(user=user, agent=None).count()
|
||||
@@ -493,139 +419,18 @@ def client(
|
||||
return TestClient(app)
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def new_org_file(default_user: KhojUser, content_config: ContentConfig):
|
||||
# Setup
|
||||
org_config = LocalOrgConfig.objects.filter(user=default_user).first()
|
||||
input_filters = org_config.input_filter
|
||||
new_org_file = Path(input_filters[0]).parent / "new_file.org"
|
||||
new_org_file.touch()
|
||||
|
||||
yield new_org_file
|
||||
|
||||
# Cleanup
|
||||
if new_org_file.exists():
|
||||
new_org_file.unlink()
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def org_config_with_only_new_file(new_org_file: Path, default_user: KhojUser):
|
||||
LocalOrgConfig.objects.update(input_files=[str(new_org_file)], input_filter=None)
|
||||
return LocalOrgConfig.objects.filter(user=default_user).first()
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def pdf_configured_user1(default_user: KhojUser):
|
||||
LocalPdfConfig.objects.create(
|
||||
input_files=None,
|
||||
input_filter=["tests/data/pdf/singlepage.pdf"],
|
||||
user=default_user,
|
||||
)
|
||||
# Index Markdown Content for Search
|
||||
all_files = fs_syncer.collect_files(user=default_user)
|
||||
configure_content(default_user, all_files)
|
||||
# Read data from pdf file at tests/data/pdf/singlepage.pdf
|
||||
pdf_file_path = "tests/data/pdf/singlepage.pdf"
|
||||
with open(pdf_file_path, "rb") as pdf_file:
|
||||
pdf_data = pdf_file.read()
|
||||
|
||||
knowledge_base = {"pdf": {"singlepage.pdf": pdf_data}}
|
||||
# Index Content for Search
|
||||
configure_content(default_user, knowledge_base)
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def sample_org_data():
|
||||
return get_sample_data("org")
|
||||
|
||||
|
||||
def get_sample_data(type):
|
||||
sample_data = {
|
||||
"org": {
|
||||
"elisp.org": """
|
||||
* Emacs Khoj
|
||||
/An Emacs interface for [[https://github.com/khoj-ai/khoj][khoj]]/
|
||||
|
||||
** Requirements
|
||||
- Install and Run [[https://github.com/khoj-ai/khoj][khoj]]
|
||||
|
||||
** Installation
|
||||
*** Direct
|
||||
- Put ~khoj.el~ in your Emacs load path. For e.g. ~/.emacs.d/lisp
|
||||
- Load via ~use-package~ in your ~/.emacs.d/init.el or .emacs file by adding below snippet
|
||||
#+begin_src elisp
|
||||
;; Khoj Package
|
||||
(use-package khoj
|
||||
:load-path "~/.emacs.d/lisp/khoj.el"
|
||||
:bind ("C-c s" . 'khoj))
|
||||
#+end_src
|
||||
|
||||
*** Using [[https://github.com/quelpa/quelpa#installation][Quelpa]]
|
||||
- Ensure [[https://github.com/quelpa/quelpa#installation][Quelpa]], [[https://github.com/quelpa/quelpa-use-package#installation][quelpa-use-package]] are installed
|
||||
- Add below snippet to your ~/.emacs.d/init.el or .emacs config file and execute it.
|
||||
#+begin_src elisp
|
||||
;; Khoj Package
|
||||
(use-package khoj
|
||||
:quelpa (khoj :fetcher url :url "https://raw.githubusercontent.com/khoj-ai/khoj/master/interface/emacs/khoj.el")
|
||||
:bind ("C-c s" . 'khoj))
|
||||
#+end_src
|
||||
|
||||
** Usage
|
||||
1. Call ~khoj~ using keybinding ~C-c s~ or ~M-x khoj~
|
||||
2. Enter Query in Natural Language
|
||||
e.g. "What is the meaning of life?" "What are my life goals?"
|
||||
3. Wait for results
|
||||
*Note: It takes about 15s on a Mac M1 and a ~100K lines corpus of org-mode files*
|
||||
4. (Optional) Narrow down results further
|
||||
Include/Exclude specific words from results by adding to query
|
||||
e.g. "What is the meaning of life? -god +none"
|
||||
|
||||
""",
|
||||
"readme.org": """
|
||||
* Khoj
|
||||
/Allow natural language search on user content like notes, images using transformer based models/
|
||||
|
||||
All data is processed locally. User can interface with khoj app via [[./interface/emacs/khoj.el][Emacs]], API or Commandline
|
||||
|
||||
** Dependencies
|
||||
- Python3
|
||||
- [[https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links][Miniconda]]
|
||||
|
||||
** Install
|
||||
#+begin_src shell
|
||||
git clone https://github.com/khoj-ai/khoj && cd khoj
|
||||
conda env create -f environment.yml
|
||||
conda activate khoj
|
||||
#+end_src""",
|
||||
},
|
||||
"markdown": {
|
||||
"readme.markdown": """
|
||||
# Khoj
|
||||
Allow natural language search on user content like notes, images using transformer based models
|
||||
|
||||
All data is processed locally. User can interface with khoj app via [Emacs](./interface/emacs/khoj.el), API or Commandline
|
||||
|
||||
## Dependencies
|
||||
- Python3
|
||||
- [Miniconda](https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links)
|
||||
|
||||
## Install
|
||||
```shell
|
||||
git clone
|
||||
conda env create -f environment.yml
|
||||
conda activate khoj
|
||||
```
|
||||
"""
|
||||
},
|
||||
"plaintext": {
|
||||
"readme.txt": """
|
||||
Khoj
|
||||
Allow natural language search on user content like notes, images using transformer based models
|
||||
|
||||
All data is processed locally. User can interface with khoj app via Emacs, API or Commandline
|
||||
|
||||
Dependencies
|
||||
- Python3
|
||||
- Miniconda
|
||||
|
||||
Install
|
||||
git clone
|
||||
conda env create -f environment.yml
|
||||
conda activate khoj
|
||||
"""
|
||||
},
|
||||
}
|
||||
|
||||
return sample_data[type]
|
||||
|
||||
Reference in New Issue
Block a user