Update test setup to index test data after old indexing code removed

- Delete tests testing deprecated server side indexing flows
- Delete `Local(Plaintext|Org|Markdown|Pdf)Config' methods, files and
  references in tests
- Index test data via new helper method, `get_index_files'
  - It is modelled after the old `get_org_files' variants in the main app
  - It passes the test data in the required format to `configure_content'.
    This allows maintaining the more realistic tests from before while
    using the new indexing mechanism (rather than the deprecated server
    side indexing mechanism)
This commit is contained in:
Debanjum
2025-07-11 14:35:05 -07:00
parent d9d24dd638
commit 892d57314e
12 changed files with 295 additions and 604 deletions

View File

@@ -1,6 +1,3 @@
import os
from pathlib import Path
import pytest
from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles
@@ -11,6 +8,7 @@ from khoj.configure import (
configure_routes,
configure_search_types,
)
from khoj.database.adapters import get_default_search_model
from khoj.database.models import (
Agent,
ChatModel,
@@ -19,21 +17,14 @@ from khoj.database.models import (
GithubRepoConfig,
KhojApiUser,
KhojUser,
LocalMarkdownConfig,
LocalOrgConfig,
LocalPdfConfig,
LocalPlaintextConfig,
)
from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
from khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries
from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel
from khoj.routers.api_content import configure_content
from khoj.search_type import text_search
from khoj.utils import fs_syncer, state
from khoj.utils.config import SearchModels
from khoj.utils import state
from khoj.utils.constants import web_directory
from khoj.utils.helpers import resolve_absolute_path
from khoj.utils.rawconfig import ContentConfig, SearchConfig
from tests.helpers import (
AiModelApiFactory,
ChatModelFactory,
@@ -43,6 +34,8 @@ from tests.helpers import (
UserFactory,
get_chat_api_key,
get_chat_provider,
get_index_files,
get_sample_data,
)
@@ -59,17 +52,16 @@ def django_db_setup(django_db_setup, django_db_blocker):
@pytest.fixture(scope="session")
def search_config() -> SearchConfig:
def search_config():
search_model = get_default_search_model()
state.embeddings_model = dict()
state.embeddings_model["default"] = EmbeddingsModel()
state.embeddings_model["default"] = EmbeddingsModel(
model_name=search_model.bi_encoder, model_kwargs=search_model.bi_encoder_model_config
)
state.cross_encoder_model = dict()
state.cross_encoder_model["default"] = CrossEncoderModel()
model_dir = resolve_absolute_path("~/.khoj/search")
model_dir.mkdir(parents=True, exist_ok=True)
search_config = SearchConfig()
return search_config
state.cross_encoder_model["default"] = CrossEncoderModel(
model_name=search_model.cross_encoder, model_kwargs=search_model.cross_encoder_model_config
)
@pytest.mark.django_db
@@ -201,13 +193,6 @@ def openai_agent():
)
@pytest.fixture(scope="session")
def search_models(search_config: SearchConfig):
search_models = SearchModels()
return search_models
@pytest.mark.django_db
@pytest.fixture
def default_process_lock():
@@ -219,72 +204,23 @@ def anyio_backend():
return "asyncio"
@pytest.mark.django_db
@pytest.fixture(scope="function")
def content_config(tmp_path_factory, search_models: SearchModels, default_user: KhojUser):
content_dir = tmp_path_factory.mktemp("content")
# Generate Image Embeddings from Test Images
content_config = ContentConfig()
LocalOrgConfig.objects.create(
input_files=None,
input_filter=["tests/data/org/*.org"],
index_heading_entries=False,
user=default_user,
)
text_search.setup(OrgToEntries, get_sample_data("org"), regenerate=False, user=default_user)
if os.getenv("GITHUB_PAT_TOKEN"):
GithubConfig.objects.create(
pat_token=os.getenv("GITHUB_PAT_TOKEN"),
user=default_user,
)
GithubRepoConfig.objects.create(
owner="khoj-ai",
name="lantern",
branch="master",
github_config=GithubConfig.objects.get(user=default_user),
)
LocalPlaintextConfig.objects.create(
input_files=None,
input_filter=["tests/data/plaintext/*.txt", "tests/data/plaintext/*.md", "tests/data/plaintext/*.html"],
user=default_user,
)
return content_config
@pytest.fixture(scope="session")
def md_content_config():
markdown_config = LocalMarkdownConfig.objects.create(
input_files=None,
input_filter=["tests/data/markdown/*.markdown"],
)
return markdown_config
@pytest.fixture(scope="function")
def chat_client(search_config: SearchConfig, default_user2: KhojUser):
def chat_client(search_config, default_user2: KhojUser):
return chat_client_builder(search_config, default_user2, require_auth=False)
@pytest.fixture(scope="function")
def chat_client_with_auth(search_config: SearchConfig, default_user2: KhojUser):
def chat_client_with_auth(search_config, default_user2: KhojUser):
return chat_client_builder(search_config, default_user2, require_auth=True)
@pytest.fixture(scope="function")
def chat_client_no_background(search_config: SearchConfig, default_user2: KhojUser):
def chat_client_no_background(search_config, default_user2: KhojUser):
return chat_client_builder(search_config, default_user2, index_content=False, require_auth=False)
@pytest.fixture(scope="function")
def chat_client_with_large_kb(search_config: SearchConfig, default_user2: KhojUser):
def chat_client_with_large_kb(search_config, default_user2: KhojUser):
"""
Chat client fixture that creates a large knowledge base with many files
for stress testing atomic agent updates.
@@ -298,15 +234,11 @@ def chat_client_builder(search_config, user, index_content=True, require_auth=Fa
state.SearchType = configure_search_types()
if index_content:
LocalMarkdownConfig.objects.create(
input_files=None,
input_filter=["tests/data/markdown/*.markdown"],
user=user,
)
file_type = "markdown"
files_to_index = {file_type: get_index_files(input_filters=[f"tests/data/{file_type}/*.{file_type}"])}
# Index Markdown Content for Search
all_files = fs_syncer.collect_files(user=user)
configure_content(user, all_files)
configure_content(user, files_to_index)
# Initialize Processor from Config
chat_provider = get_chat_provider()
@@ -346,12 +278,13 @@ def large_kb_chat_client_builder(search_config, user):
# Create temporary directory for large number of test files
temp_dir = tempfile.mkdtemp(prefix="khoj_test_large_kb_")
file_type = "markdown"
large_file_list = []
try:
# Generate 300 test files with substantial content
for i in range(300):
file_path = os.path.join(temp_dir, f"test_file_{i:03d}.markdown")
file_path = os.path.join(temp_dir, f"test_file_{i:03d}.{file_type}")
content = f"""
# Test File {i}
@@ -401,16 +334,9 @@ End of file {i}.
f.write(content)
large_file_list.append(file_path)
# Create LocalMarkdownConfig with all the generated files
LocalMarkdownConfig.objects.create(
input_files=large_file_list,
input_filter=None,
user=user,
)
# Index all the files into the user's knowledge base
all_files = fs_syncer.collect_files(user=user)
configure_content(user, all_files)
# Index all generated files into the user's knowledge base
files_to_index = {file_type: get_index_files(input_files=large_file_list, input_filters=None)}
configure_content(user, files_to_index)
# Verify we have a substantial knowledge base
file_count = FileObject.objects.filter(user=user, agent=None).count()
@@ -493,139 +419,18 @@ def client(
return TestClient(app)
@pytest.fixture(scope="function")
def new_org_file(default_user: KhojUser, content_config: ContentConfig):
# Setup
org_config = LocalOrgConfig.objects.filter(user=default_user).first()
input_filters = org_config.input_filter
new_org_file = Path(input_filters[0]).parent / "new_file.org"
new_org_file.touch()
yield new_org_file
# Cleanup
if new_org_file.exists():
new_org_file.unlink()
@pytest.fixture(scope="function")
def org_config_with_only_new_file(new_org_file: Path, default_user: KhojUser):
LocalOrgConfig.objects.update(input_files=[str(new_org_file)], input_filter=None)
return LocalOrgConfig.objects.filter(user=default_user).first()
@pytest.fixture(scope="function")
def pdf_configured_user1(default_user: KhojUser):
LocalPdfConfig.objects.create(
input_files=None,
input_filter=["tests/data/pdf/singlepage.pdf"],
user=default_user,
)
# Index Markdown Content for Search
all_files = fs_syncer.collect_files(user=default_user)
configure_content(default_user, all_files)
# Read data from pdf file at tests/data/pdf/singlepage.pdf
pdf_file_path = "tests/data/pdf/singlepage.pdf"
with open(pdf_file_path, "rb") as pdf_file:
pdf_data = pdf_file.read()
knowledge_base = {"pdf": {"singlepage.pdf": pdf_data}}
# Index Content for Search
configure_content(default_user, knowledge_base)
@pytest.fixture(scope="function")
def sample_org_data():
return get_sample_data("org")
def get_sample_data(type):
sample_data = {
"org": {
"elisp.org": """
* Emacs Khoj
/An Emacs interface for [[https://github.com/khoj-ai/khoj][khoj]]/
** Requirements
- Install and Run [[https://github.com/khoj-ai/khoj][khoj]]
** Installation
*** Direct
- Put ~khoj.el~ in your Emacs load path. For e.g. ~/.emacs.d/lisp
- Load via ~use-package~ in your ~/.emacs.d/init.el or .emacs file by adding below snippet
#+begin_src elisp
;; Khoj Package
(use-package khoj
:load-path "~/.emacs.d/lisp/khoj.el"
:bind ("C-c s" . 'khoj))
#+end_src
*** Using [[https://github.com/quelpa/quelpa#installation][Quelpa]]
- Ensure [[https://github.com/quelpa/quelpa#installation][Quelpa]], [[https://github.com/quelpa/quelpa-use-package#installation][quelpa-use-package]] are installed
- Add below snippet to your ~/.emacs.d/init.el or .emacs config file and execute it.
#+begin_src elisp
;; Khoj Package
(use-package khoj
:quelpa (khoj :fetcher url :url "https://raw.githubusercontent.com/khoj-ai/khoj/master/interface/emacs/khoj.el")
:bind ("C-c s" . 'khoj))
#+end_src
** Usage
1. Call ~khoj~ using keybinding ~C-c s~ or ~M-x khoj~
2. Enter Query in Natural Language
e.g. "What is the meaning of life?" "What are my life goals?"
3. Wait for results
*Note: It takes about 15s on a Mac M1 and a ~100K lines corpus of org-mode files*
4. (Optional) Narrow down results further
Include/Exclude specific words from results by adding to query
e.g. "What is the meaning of life? -god +none"
""",
"readme.org": """
* Khoj
/Allow natural language search on user content like notes, images using transformer based models/
All data is processed locally. User can interface with khoj app via [[./interface/emacs/khoj.el][Emacs]], API or Commandline
** Dependencies
- Python3
- [[https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links][Miniconda]]
** Install
#+begin_src shell
git clone https://github.com/khoj-ai/khoj && cd khoj
conda env create -f environment.yml
conda activate khoj
#+end_src""",
},
"markdown": {
"readme.markdown": """
# Khoj
Allow natural language search on user content like notes, images using transformer based models
All data is processed locally. User can interface with khoj app via [Emacs](./interface/emacs/khoj.el), API or Commandline
## Dependencies
- Python3
- [Miniconda](https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links)
## Install
```shell
git clone
conda env create -f environment.yml
conda activate khoj
```
"""
},
"plaintext": {
"readme.txt": """
Khoj
Allow natural language search on user content like notes, images using transformer based models
All data is processed locally. User can interface with khoj app via Emacs, API or Commandline
Dependencies
- Python3
- Miniconda
Install
git clone
conda env create -f environment.yml
conda activate khoj
"""
},
}
return sample_data[type]