mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 13:18:18 +00:00
Update test setup to index test data after old indexing code removed
- Delete tests testing deprecated server side indexing flows
- Delete `Local(Plaintext|Org|Markdown|Pdf)Config' methods, files and
references in tests
- Index test data via new helper method, `get_index_files'
- It is modelled after the old `get_org_files' variants in main app
- It passes the test data in required format to `configure_content'
This allows maintaining the more realistic tests from before while
using the new indexing mechanism (rather than the deprecated
server-side indexing mechanism).
This commit is contained in:
@@ -1,6 +1,3 @@
|
|||||||
import os
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from fastapi import FastAPI
|
from fastapi import FastAPI
|
||||||
from fastapi.staticfiles import StaticFiles
|
from fastapi.staticfiles import StaticFiles
|
||||||
@@ -11,6 +8,7 @@ from khoj.configure import (
|
|||||||
configure_routes,
|
configure_routes,
|
||||||
configure_search_types,
|
configure_search_types,
|
||||||
)
|
)
|
||||||
|
from khoj.database.adapters import get_default_search_model
|
||||||
from khoj.database.models import (
|
from khoj.database.models import (
|
||||||
Agent,
|
Agent,
|
||||||
ChatModel,
|
ChatModel,
|
||||||
@@ -19,21 +17,14 @@ from khoj.database.models import (
|
|||||||
GithubRepoConfig,
|
GithubRepoConfig,
|
||||||
KhojApiUser,
|
KhojApiUser,
|
||||||
KhojUser,
|
KhojUser,
|
||||||
LocalMarkdownConfig,
|
|
||||||
LocalOrgConfig,
|
|
||||||
LocalPdfConfig,
|
|
||||||
LocalPlaintextConfig,
|
|
||||||
)
|
)
|
||||||
from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
|
from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
|
||||||
from khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries
|
from khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries
|
||||||
from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel
|
from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel
|
||||||
from khoj.routers.api_content import configure_content
|
from khoj.routers.api_content import configure_content
|
||||||
from khoj.search_type import text_search
|
from khoj.search_type import text_search
|
||||||
from khoj.utils import fs_syncer, state
|
from khoj.utils import state
|
||||||
from khoj.utils.config import SearchModels
|
|
||||||
from khoj.utils.constants import web_directory
|
from khoj.utils.constants import web_directory
|
||||||
from khoj.utils.helpers import resolve_absolute_path
|
|
||||||
from khoj.utils.rawconfig import ContentConfig, SearchConfig
|
|
||||||
from tests.helpers import (
|
from tests.helpers import (
|
||||||
AiModelApiFactory,
|
AiModelApiFactory,
|
||||||
ChatModelFactory,
|
ChatModelFactory,
|
||||||
@@ -43,6 +34,8 @@ from tests.helpers import (
|
|||||||
UserFactory,
|
UserFactory,
|
||||||
get_chat_api_key,
|
get_chat_api_key,
|
||||||
get_chat_provider,
|
get_chat_provider,
|
||||||
|
get_index_files,
|
||||||
|
get_sample_data,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -59,17 +52,16 @@ def django_db_setup(django_db_setup, django_db_blocker):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def search_config() -> SearchConfig:
|
def search_config():
|
||||||
|
search_model = get_default_search_model()
|
||||||
state.embeddings_model = dict()
|
state.embeddings_model = dict()
|
||||||
state.embeddings_model["default"] = EmbeddingsModel()
|
state.embeddings_model["default"] = EmbeddingsModel(
|
||||||
|
model_name=search_model.bi_encoder, model_kwargs=search_model.bi_encoder_model_config
|
||||||
|
)
|
||||||
state.cross_encoder_model = dict()
|
state.cross_encoder_model = dict()
|
||||||
state.cross_encoder_model["default"] = CrossEncoderModel()
|
state.cross_encoder_model["default"] = CrossEncoderModel(
|
||||||
|
model_name=search_model.cross_encoder, model_kwargs=search_model.cross_encoder_model_config
|
||||||
model_dir = resolve_absolute_path("~/.khoj/search")
|
)
|
||||||
model_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
search_config = SearchConfig()
|
|
||||||
|
|
||||||
return search_config
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.django_db
|
@pytest.mark.django_db
|
||||||
@@ -201,13 +193,6 @@ def openai_agent():
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
|
||||||
def search_models(search_config: SearchConfig):
|
|
||||||
search_models = SearchModels()
|
|
||||||
|
|
||||||
return search_models
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.django_db
|
@pytest.mark.django_db
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def default_process_lock():
|
def default_process_lock():
|
||||||
@@ -219,72 +204,23 @@ def anyio_backend():
|
|||||||
return "asyncio"
|
return "asyncio"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.django_db
|
|
||||||
@pytest.fixture(scope="function")
|
@pytest.fixture(scope="function")
|
||||||
def content_config(tmp_path_factory, search_models: SearchModels, default_user: KhojUser):
|
def chat_client(search_config, default_user2: KhojUser):
|
||||||
content_dir = tmp_path_factory.mktemp("content")
|
|
||||||
|
|
||||||
# Generate Image Embeddings from Test Images
|
|
||||||
content_config = ContentConfig()
|
|
||||||
|
|
||||||
LocalOrgConfig.objects.create(
|
|
||||||
input_files=None,
|
|
||||||
input_filter=["tests/data/org/*.org"],
|
|
||||||
index_heading_entries=False,
|
|
||||||
user=default_user,
|
|
||||||
)
|
|
||||||
|
|
||||||
text_search.setup(OrgToEntries, get_sample_data("org"), regenerate=False, user=default_user)
|
|
||||||
|
|
||||||
if os.getenv("GITHUB_PAT_TOKEN"):
|
|
||||||
GithubConfig.objects.create(
|
|
||||||
pat_token=os.getenv("GITHUB_PAT_TOKEN"),
|
|
||||||
user=default_user,
|
|
||||||
)
|
|
||||||
|
|
||||||
GithubRepoConfig.objects.create(
|
|
||||||
owner="khoj-ai",
|
|
||||||
name="lantern",
|
|
||||||
branch="master",
|
|
||||||
github_config=GithubConfig.objects.get(user=default_user),
|
|
||||||
)
|
|
||||||
|
|
||||||
LocalPlaintextConfig.objects.create(
|
|
||||||
input_files=None,
|
|
||||||
input_filter=["tests/data/plaintext/*.txt", "tests/data/plaintext/*.md", "tests/data/plaintext/*.html"],
|
|
||||||
user=default_user,
|
|
||||||
)
|
|
||||||
|
|
||||||
return content_config
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
|
||||||
def md_content_config():
|
|
||||||
markdown_config = LocalMarkdownConfig.objects.create(
|
|
||||||
input_files=None,
|
|
||||||
input_filter=["tests/data/markdown/*.markdown"],
|
|
||||||
)
|
|
||||||
|
|
||||||
return markdown_config
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="function")
|
|
||||||
def chat_client(search_config: SearchConfig, default_user2: KhojUser):
|
|
||||||
return chat_client_builder(search_config, default_user2, require_auth=False)
|
return chat_client_builder(search_config, default_user2, require_auth=False)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="function")
|
@pytest.fixture(scope="function")
|
||||||
def chat_client_with_auth(search_config: SearchConfig, default_user2: KhojUser):
|
def chat_client_with_auth(search_config, default_user2: KhojUser):
|
||||||
return chat_client_builder(search_config, default_user2, require_auth=True)
|
return chat_client_builder(search_config, default_user2, require_auth=True)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="function")
|
@pytest.fixture(scope="function")
|
||||||
def chat_client_no_background(search_config: SearchConfig, default_user2: KhojUser):
|
def chat_client_no_background(search_config, default_user2: KhojUser):
|
||||||
return chat_client_builder(search_config, default_user2, index_content=False, require_auth=False)
|
return chat_client_builder(search_config, default_user2, index_content=False, require_auth=False)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="function")
|
@pytest.fixture(scope="function")
|
||||||
def chat_client_with_large_kb(search_config: SearchConfig, default_user2: KhojUser):
|
def chat_client_with_large_kb(search_config, default_user2: KhojUser):
|
||||||
"""
|
"""
|
||||||
Chat client fixture that creates a large knowledge base with many files
|
Chat client fixture that creates a large knowledge base with many files
|
||||||
for stress testing atomic agent updates.
|
for stress testing atomic agent updates.
|
||||||
@@ -298,15 +234,11 @@ def chat_client_builder(search_config, user, index_content=True, require_auth=Fa
|
|||||||
state.SearchType = configure_search_types()
|
state.SearchType = configure_search_types()
|
||||||
|
|
||||||
if index_content:
|
if index_content:
|
||||||
LocalMarkdownConfig.objects.create(
|
file_type = "markdown"
|
||||||
input_files=None,
|
files_to_index = {file_type: get_index_files(input_filters=[f"tests/data/{file_type}/*.{file_type}"])}
|
||||||
input_filter=["tests/data/markdown/*.markdown"],
|
|
||||||
user=user,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Index Markdown Content for Search
|
# Index Markdown Content for Search
|
||||||
all_files = fs_syncer.collect_files(user=user)
|
configure_content(user, files_to_index)
|
||||||
configure_content(user, all_files)
|
|
||||||
|
|
||||||
# Initialize Processor from Config
|
# Initialize Processor from Config
|
||||||
chat_provider = get_chat_provider()
|
chat_provider = get_chat_provider()
|
||||||
@@ -346,12 +278,13 @@ def large_kb_chat_client_builder(search_config, user):
|
|||||||
|
|
||||||
# Create temporary directory for large number of test files
|
# Create temporary directory for large number of test files
|
||||||
temp_dir = tempfile.mkdtemp(prefix="khoj_test_large_kb_")
|
temp_dir = tempfile.mkdtemp(prefix="khoj_test_large_kb_")
|
||||||
|
file_type = "markdown"
|
||||||
large_file_list = []
|
large_file_list = []
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Generate 200 test files with substantial content
|
# Generate 200 test files with substantial content
|
||||||
for i in range(300):
|
for i in range(300):
|
||||||
file_path = os.path.join(temp_dir, f"test_file_{i:03d}.markdown")
|
file_path = os.path.join(temp_dir, f"test_file_{i:03d}.{file_type}")
|
||||||
content = f"""
|
content = f"""
|
||||||
# Test File {i}
|
# Test File {i}
|
||||||
|
|
||||||
@@ -401,16 +334,9 @@ End of file {i}.
|
|||||||
f.write(content)
|
f.write(content)
|
||||||
large_file_list.append(file_path)
|
large_file_list.append(file_path)
|
||||||
|
|
||||||
# Create LocalMarkdownConfig with all the generated files
|
# Index all generated files into the user's knowledge base
|
||||||
LocalMarkdownConfig.objects.create(
|
files_to_index = {file_type: get_index_files(input_files=large_file_list, input_filters=None)}
|
||||||
input_files=large_file_list,
|
configure_content(user, files_to_index)
|
||||||
input_filter=None,
|
|
||||||
user=user,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Index all the files into the user's knowledge base
|
|
||||||
all_files = fs_syncer.collect_files(user=user)
|
|
||||||
configure_content(user, all_files)
|
|
||||||
|
|
||||||
# Verify we have a substantial knowledge base
|
# Verify we have a substantial knowledge base
|
||||||
file_count = FileObject.objects.filter(user=user, agent=None).count()
|
file_count = FileObject.objects.filter(user=user, agent=None).count()
|
||||||
@@ -493,139 +419,18 @@ def client(
|
|||||||
return TestClient(app)
|
return TestClient(app)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="function")
|
|
||||||
def new_org_file(default_user: KhojUser, content_config: ContentConfig):
|
|
||||||
# Setup
|
|
||||||
org_config = LocalOrgConfig.objects.filter(user=default_user).first()
|
|
||||||
input_filters = org_config.input_filter
|
|
||||||
new_org_file = Path(input_filters[0]).parent / "new_file.org"
|
|
||||||
new_org_file.touch()
|
|
||||||
|
|
||||||
yield new_org_file
|
|
||||||
|
|
||||||
# Cleanup
|
|
||||||
if new_org_file.exists():
|
|
||||||
new_org_file.unlink()
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="function")
|
|
||||||
def org_config_with_only_new_file(new_org_file: Path, default_user: KhojUser):
|
|
||||||
LocalOrgConfig.objects.update(input_files=[str(new_org_file)], input_filter=None)
|
|
||||||
return LocalOrgConfig.objects.filter(user=default_user).first()
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="function")
|
@pytest.fixture(scope="function")
|
||||||
def pdf_configured_user1(default_user: KhojUser):
|
def pdf_configured_user1(default_user: KhojUser):
|
||||||
LocalPdfConfig.objects.create(
|
# Read data from pdf file at tests/data/pdf/singlepage.pdf
|
||||||
input_files=None,
|
pdf_file_path = "tests/data/pdf/singlepage.pdf"
|
||||||
input_filter=["tests/data/pdf/singlepage.pdf"],
|
with open(pdf_file_path, "rb") as pdf_file:
|
||||||
user=default_user,
|
pdf_data = pdf_file.read()
|
||||||
)
|
|
||||||
# Index Markdown Content for Search
|
knowledge_base = {"pdf": {"singlepage.pdf": pdf_data}}
|
||||||
all_files = fs_syncer.collect_files(user=default_user)
|
# Index Content for Search
|
||||||
configure_content(default_user, all_files)
|
configure_content(default_user, knowledge_base)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="function")
|
@pytest.fixture(scope="function")
|
||||||
def sample_org_data():
|
def sample_org_data():
|
||||||
return get_sample_data("org")
|
return get_sample_data("org")
|
||||||
|
|
||||||
|
|
||||||
def get_sample_data(type):
|
|
||||||
sample_data = {
|
|
||||||
"org": {
|
|
||||||
"elisp.org": """
|
|
||||||
* Emacs Khoj
|
|
||||||
/An Emacs interface for [[https://github.com/khoj-ai/khoj][khoj]]/
|
|
||||||
|
|
||||||
** Requirements
|
|
||||||
- Install and Run [[https://github.com/khoj-ai/khoj][khoj]]
|
|
||||||
|
|
||||||
** Installation
|
|
||||||
*** Direct
|
|
||||||
- Put ~khoj.el~ in your Emacs load path. For e.g. ~/.emacs.d/lisp
|
|
||||||
- Load via ~use-package~ in your ~/.emacs.d/init.el or .emacs file by adding below snippet
|
|
||||||
#+begin_src elisp
|
|
||||||
;; Khoj Package
|
|
||||||
(use-package khoj
|
|
||||||
:load-path "~/.emacs.d/lisp/khoj.el"
|
|
||||||
:bind ("C-c s" . 'khoj))
|
|
||||||
#+end_src
|
|
||||||
|
|
||||||
*** Using [[https://github.com/quelpa/quelpa#installation][Quelpa]]
|
|
||||||
- Ensure [[https://github.com/quelpa/quelpa#installation][Quelpa]], [[https://github.com/quelpa/quelpa-use-package#installation][quelpa-use-package]] are installed
|
|
||||||
- Add below snippet to your ~/.emacs.d/init.el or .emacs config file and execute it.
|
|
||||||
#+begin_src elisp
|
|
||||||
;; Khoj Package
|
|
||||||
(use-package khoj
|
|
||||||
:quelpa (khoj :fetcher url :url "https://raw.githubusercontent.com/khoj-ai/khoj/master/interface/emacs/khoj.el")
|
|
||||||
:bind ("C-c s" . 'khoj))
|
|
||||||
#+end_src
|
|
||||||
|
|
||||||
** Usage
|
|
||||||
1. Call ~khoj~ using keybinding ~C-c s~ or ~M-x khoj~
|
|
||||||
2. Enter Query in Natural Language
|
|
||||||
e.g. "What is the meaning of life?" "What are my life goals?"
|
|
||||||
3. Wait for results
|
|
||||||
*Note: It takes about 15s on a Mac M1 and a ~100K lines corpus of org-mode files*
|
|
||||||
4. (Optional) Narrow down results further
|
|
||||||
Include/Exclude specific words from results by adding to query
|
|
||||||
e.g. "What is the meaning of life? -god +none"
|
|
||||||
|
|
||||||
""",
|
|
||||||
"readme.org": """
|
|
||||||
* Khoj
|
|
||||||
/Allow natural language search on user content like notes, images using transformer based models/
|
|
||||||
|
|
||||||
All data is processed locally. User can interface with khoj app via [[./interface/emacs/khoj.el][Emacs]], API or Commandline
|
|
||||||
|
|
||||||
** Dependencies
|
|
||||||
- Python3
|
|
||||||
- [[https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links][Miniconda]]
|
|
||||||
|
|
||||||
** Install
|
|
||||||
#+begin_src shell
|
|
||||||
git clone https://github.com/khoj-ai/khoj && cd khoj
|
|
||||||
conda env create -f environment.yml
|
|
||||||
conda activate khoj
|
|
||||||
#+end_src""",
|
|
||||||
},
|
|
||||||
"markdown": {
|
|
||||||
"readme.markdown": """
|
|
||||||
# Khoj
|
|
||||||
Allow natural language search on user content like notes, images using transformer based models
|
|
||||||
|
|
||||||
All data is processed locally. User can interface with khoj app via [Emacs](./interface/emacs/khoj.el), API or Commandline
|
|
||||||
|
|
||||||
## Dependencies
|
|
||||||
- Python3
|
|
||||||
- [Miniconda](https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links)
|
|
||||||
|
|
||||||
## Install
|
|
||||||
```shell
|
|
||||||
git clone
|
|
||||||
conda env create -f environment.yml
|
|
||||||
conda activate khoj
|
|
||||||
```
|
|
||||||
"""
|
|
||||||
},
|
|
||||||
"plaintext": {
|
|
||||||
"readme.txt": """
|
|
||||||
Khoj
|
|
||||||
Allow natural language search on user content like notes, images using transformer based models
|
|
||||||
|
|
||||||
All data is processed locally. User can interface with khoj app via Emacs, API or Commandline
|
|
||||||
|
|
||||||
Dependencies
|
|
||||||
- Python3
|
|
||||||
- Miniconda
|
|
||||||
|
|
||||||
Install
|
|
||||||
git clone
|
|
||||||
conda env create -f environment.yml
|
|
||||||
conda activate khoj
|
|
||||||
"""
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
return sample_data[type]
|
|
||||||
|
|||||||
139
tests/helpers.py
139
tests/helpers.py
@@ -1,3 +1,5 @@
|
|||||||
|
import glob
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
@@ -17,6 +19,9 @@ from khoj.database.models import (
|
|||||||
UserConversationConfig,
|
UserConversationConfig,
|
||||||
)
|
)
|
||||||
from khoj.processor.conversation.utils import message_to_log
|
from khoj.processor.conversation.utils import message_to_log
|
||||||
|
from khoj.utils.helpers import get_absolute_path, is_none_or_empty
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def get_chat_provider(default: ChatModel.ModelType | None = ChatModel.ModelType.GOOGLE):
|
def get_chat_provider(default: ChatModel.ModelType | None = ChatModel.ModelType.GOOGLE):
|
||||||
@@ -61,6 +66,140 @@ def generate_chat_history(message_list):
|
|||||||
return chat_history
|
return chat_history
|
||||||
|
|
||||||
|
|
||||||
|
def get_sample_data(type):
|
||||||
|
sample_data = {
|
||||||
|
"org": {
|
||||||
|
"elisp.org": """
|
||||||
|
* Emacs Khoj
|
||||||
|
/An Emacs interface for [[https://github.com/khoj-ai/khoj][khoj]]/
|
||||||
|
|
||||||
|
** Requirements
|
||||||
|
- Install and Run [[https://github.com/khoj-ai/khoj][khoj]]
|
||||||
|
|
||||||
|
** Installation
|
||||||
|
*** Direct
|
||||||
|
- Put ~khoj.el~ in your Emacs load path. For e.g. ~/.emacs.d/lisp
|
||||||
|
- Load via ~use-package~ in your ~/.emacs.d/init.el or .emacs file by adding below snippet
|
||||||
|
#+begin_src elisp
|
||||||
|
;; Khoj Package
|
||||||
|
(use-package khoj
|
||||||
|
:load-path "~/.emacs.d/lisp/khoj.el"
|
||||||
|
:bind ("C-c s" . 'khoj))
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
*** Using [[https://github.com/quelpa/quelpa#installation][Quelpa]]
|
||||||
|
- Ensure [[https://github.com/quelpa/quelpa#installation][Quelpa]], [[https://github.com/quelpa/quelpa-use-package#installation][quelpa-use-package]] are installed
|
||||||
|
- Add below snippet to your ~/.emacs.d/init.el or .emacs config file and execute it.
|
||||||
|
#+begin_src elisp
|
||||||
|
;; Khoj Package
|
||||||
|
(use-package khoj
|
||||||
|
:quelpa (khoj :fetcher url :url "https://raw.githubusercontent.com/khoj-ai/khoj/master/interface/emacs/khoj.el")
|
||||||
|
:bind ("C-c s" . 'khoj))
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
** Usage
|
||||||
|
1. Call ~khoj~ using keybinding ~C-c s~ or ~M-x khoj~
|
||||||
|
2. Enter Query in Natural Language
|
||||||
|
e.g. "What is the meaning of life?" "What are my life goals?"
|
||||||
|
3. Wait for results
|
||||||
|
*Note: It takes about 15s on a Mac M1 and a ~100K lines corpus of org-mode files*
|
||||||
|
4. (Optional) Narrow down results further
|
||||||
|
Include/Exclude specific words from results by adding to query
|
||||||
|
e.g. "What is the meaning of life? -god +none"
|
||||||
|
|
||||||
|
""",
|
||||||
|
"readme.org": """
|
||||||
|
* Khoj
|
||||||
|
/Allow natural language search on user content like notes, images using transformer based models/
|
||||||
|
|
||||||
|
All data is processed locally. User can interface with khoj app via [[./interface/emacs/khoj.el][Emacs]], API or Commandline
|
||||||
|
|
||||||
|
** Dependencies
|
||||||
|
- Python3
|
||||||
|
- [[https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links][Miniconda]]
|
||||||
|
|
||||||
|
** Install
|
||||||
|
#+begin_src shell
|
||||||
|
git clone https://github.com/khoj-ai/khoj && cd khoj
|
||||||
|
conda env create -f environment.yml
|
||||||
|
conda activate khoj
|
||||||
|
#+end_src""",
|
||||||
|
},
|
||||||
|
"markdown": {
|
||||||
|
"readme.markdown": """
|
||||||
|
# Khoj
|
||||||
|
Allow natural language search on user content like notes, images using transformer based models
|
||||||
|
|
||||||
|
All data is processed locally. User can interface with khoj app via [Emacs](./interface/emacs/khoj.el), API or Commandline
|
||||||
|
|
||||||
|
## Dependencies
|
||||||
|
- Python3
|
||||||
|
- [Miniconda](https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links)
|
||||||
|
|
||||||
|
## Install
|
||||||
|
```shell
|
||||||
|
git clone
|
||||||
|
conda env create -f environment.yml
|
||||||
|
conda activate khoj
|
||||||
|
```
|
||||||
|
"""
|
||||||
|
},
|
||||||
|
"plaintext": {
|
||||||
|
"readme.txt": """
|
||||||
|
Khoj
|
||||||
|
Allow natural language search on user content like notes, images using transformer based models
|
||||||
|
|
||||||
|
All data is processed locally. User can interface with khoj app via Emacs, API or Commandline
|
||||||
|
|
||||||
|
Dependencies
|
||||||
|
- Python3
|
||||||
|
- Miniconda
|
||||||
|
|
||||||
|
Install
|
||||||
|
git clone
|
||||||
|
conda env create -f environment.yml
|
||||||
|
conda activate khoj
|
||||||
|
"""
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
return sample_data[type]
|
||||||
|
|
||||||
|
|
||||||
|
def get_index_files(
    input_files: list[str] | None = None, input_filters: list[str] | None = ["tests/data/org/*.org"]
) -> dict[str, str]:
    """Read test data files from disk into a file-path-to-content mapping.

    Test fixtures wrap the returned mapping by file type and hand it to
    `configure_content` to index the test data.

    Args:
        input_files: Explicit file paths to read. Each path is resolved with
            `get_absolute_path`; existence is not checked up front.
        input_filters: Glob patterns (recursive) whose matching regular files
            are read. Defaults to the org test data. Pass `None` or an empty
            list to read only `input_files`.

    Returns:
        Mapping from each resolved file path to that file's text content.
        Files that cannot be opened or decoded as UTF-8 are logged and left
        out. Empty when neither `input_files` nor `input_filters` is given.
    """
    # NOTE: the default `input_filters` list is only read, never mutated,
    # so sharing it across calls is safe.

    # Input Validation
    if is_none_or_empty(input_files) and is_none_or_empty(input_filters):
        logger.debug("At least one of input_files or input_filters is required to be specified")
        return {}

    # Get files to process
    absolute_files = {get_absolute_path(input_file) for input_file in input_files or []}
    filtered_files = {
        filtered_file
        for file_filter in input_filters or []
        for filtered_file in glob.glob(get_absolute_path(file_filter), recursive=True)
        if os.path.isfile(filtered_file)
    }

    # Sort so the mapping is built in a stable order across runs
    all_files = sorted(absolute_files | filtered_files)

    filename_to_content_map = {}
    for file in all_files:
        # Open inside the try block: explicitly listed files may not exist, and
        # a failure to open should skip the file just like a failure to read.
        try:
            with open(file, "r", encoding="utf8") as f:
                filename_to_content_map[file] = f.read()
        except (OSError, ValueError):
            # ValueError covers UnicodeDecodeError raised for non UTF-8 content
            logger.warning("Unable to read file: %s. Skipping file.", file, exc_info=True)

    return filename_to_content_map
|
||||||
|
|
||||||
|
|
||||||
class UserFactory(factory.django.DjangoModelFactory):
|
class UserFactory(factory.django.DjangoModelFactory):
|
||||||
class Meta:
|
class Meta:
|
||||||
model = KhojUser
|
model = KhojUser
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ from tests.helpers import ChatModelFactory
|
|||||||
def test_create_default_agent(default_user: KhojUser):
|
def test_create_default_agent(default_user: KhojUser):
|
||||||
ChatModelFactory()
|
ChatModelFactory()
|
||||||
|
|
||||||
agent = AgentAdapters.create_default_agent(default_user)
|
agent = AgentAdapters.create_default_agent()
|
||||||
assert agent is not None
|
assert agent is not None
|
||||||
assert agent.input_tools == []
|
assert agent.input_tools == []
|
||||||
assert agent.output_modes == []
|
assert agent.output_modes == []
|
||||||
|
|||||||
@@ -1,49 +1,15 @@
|
|||||||
# Standard Modules
|
# Standard Modules
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from random import random
|
|
||||||
|
|
||||||
from khoj.utils.cli import cli
|
from khoj.utils.cli import cli
|
||||||
from khoj.utils.helpers import resolve_absolute_path
|
|
||||||
|
|
||||||
|
|
||||||
# Test
|
# Test
|
||||||
# ----------------------------------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------------------------------
|
||||||
def test_cli_minimal_default():
|
def test_cli_minimal_default():
|
||||||
# Act
|
# Act
|
||||||
actual_args = cli([])
|
actual_args = cli(["-vvv"])
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert actual_args.config_file == resolve_absolute_path(Path("~/.khoj/khoj.yml"))
|
assert actual_args.log_file == Path("~/.khoj/khoj.log")
|
||||||
assert actual_args.regenerate == False
|
|
||||||
assert actual_args.verbose == 0
|
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------------------------------------------------------------------------------
|
|
||||||
def test_cli_invalid_config_file_path():
|
|
||||||
# Arrange
|
|
||||||
non_existent_config_file = f"non-existent-khoj-{random()}.yml"
|
|
||||||
|
|
||||||
# Act
|
|
||||||
actual_args = cli([f"--config-file={non_existent_config_file}"])
|
|
||||||
|
|
||||||
# Assert
|
|
||||||
assert actual_args.config_file == resolve_absolute_path(non_existent_config_file)
|
|
||||||
assert actual_args.config == None
|
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------------------------------------------------------------------------------
|
|
||||||
def test_cli_config_from_file():
|
|
||||||
# Act
|
|
||||||
actual_args = cli(["--config-file=tests/data/config.yml", "--regenerate", "-vvv"])
|
|
||||||
|
|
||||||
# Assert
|
|
||||||
assert actual_args.config_file == resolve_absolute_path(Path("tests/data/config.yml"))
|
|
||||||
assert actual_args.regenerate == True
|
|
||||||
assert actual_args.config is not None
|
|
||||||
assert actual_args.verbose == 3
|
assert actual_args.verbose == 3
|
||||||
|
|
||||||
# Ensure content config is loaded from file
|
|
||||||
assert actual_args.config.content_type.org.input_files == [
|
|
||||||
Path("~/first_from_config.org"),
|
|
||||||
Path("~/second_from_config.org"),
|
|
||||||
]
|
|
||||||
|
|||||||
@@ -13,7 +13,6 @@ from khoj.database.models import KhojApiUser, KhojUser
|
|||||||
from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
|
from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
|
||||||
from khoj.search_type import text_search
|
from khoj.search_type import text_search
|
||||||
from khoj.utils import state
|
from khoj.utils import state
|
||||||
from khoj.utils.rawconfig import ContentConfig, SearchConfig
|
|
||||||
|
|
||||||
|
|
||||||
# Test
|
# Test
|
||||||
@@ -296,7 +295,7 @@ def test_get_configured_types_with_no_content_config(fastapi_app: FastAPI):
|
|||||||
|
|
||||||
# ----------------------------------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------------------------------
|
||||||
@pytest.mark.django_db(transaction=True)
|
@pytest.mark.django_db(transaction=True)
|
||||||
def test_notes_search(client, search_config: SearchConfig, sample_org_data, default_user: KhojUser):
|
def test_notes_search(client, search_config, sample_org_data, default_user: KhojUser):
|
||||||
# Arrange
|
# Arrange
|
||||||
headers = {"Authorization": "Bearer kk-secret"}
|
headers = {"Authorization": "Bearer kk-secret"}
|
||||||
text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
|
text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
|
||||||
@@ -315,7 +314,7 @@ def test_notes_search(client, search_config: SearchConfig, sample_org_data, defa
|
|||||||
|
|
||||||
# ----------------------------------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------------------------------
|
||||||
@pytest.mark.django_db(transaction=True)
|
@pytest.mark.django_db(transaction=True)
|
||||||
def test_notes_search_no_results(client, search_config: SearchConfig, sample_org_data, default_user: KhojUser):
|
def test_notes_search_no_results(client, search_config, sample_org_data, default_user: KhojUser):
|
||||||
# Arrange
|
# Arrange
|
||||||
headers = {"Authorization": "Bearer kk-secret"}
|
headers = {"Authorization": "Bearer kk-secret"}
|
||||||
text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
|
text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
|
||||||
@@ -331,9 +330,7 @@ def test_notes_search_no_results(client, search_config: SearchConfig, sample_org
|
|||||||
|
|
||||||
# ----------------------------------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------------------------------
|
||||||
@pytest.mark.django_db(transaction=True)
|
@pytest.mark.django_db(transaction=True)
|
||||||
def test_notes_search_with_only_filters(
|
def test_notes_search_with_only_filters(client, sample_org_data, default_user: KhojUser):
|
||||||
client, content_config: ContentConfig, search_config: SearchConfig, sample_org_data, default_user: KhojUser
|
|
||||||
):
|
|
||||||
# Arrange
|
# Arrange
|
||||||
headers = {"Authorization": "Bearer kk-secret"}
|
headers = {"Authorization": "Bearer kk-secret"}
|
||||||
text_search.setup(
|
text_search.setup(
|
||||||
@@ -397,9 +394,7 @@ def test_notes_search_with_exclude_filter(client, sample_org_data, default_user:
|
|||||||
|
|
||||||
# ----------------------------------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------------------------------
|
||||||
@pytest.mark.django_db(transaction=True)
|
@pytest.mark.django_db(transaction=True)
|
||||||
def test_notes_search_requires_parent_context(
|
def test_notes_search_requires_parent_context(client, search_config, sample_org_data, default_user: KhojUser):
|
||||||
client, search_config: SearchConfig, sample_org_data, default_user: KhojUser
|
|
||||||
):
|
|
||||||
# Arrange
|
# Arrange
|
||||||
headers = {"Authorization": "Bearer kk-secret"}
|
headers = {"Authorization": "Bearer kk-secret"}
|
||||||
text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
|
text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
|
||||||
|
|||||||
@@ -1,6 +1,13 @@
|
|||||||
# Application Packages
|
# Application Packages
|
||||||
from khoj.search_filter.file_filter import FileFilter
|
from khoj.search_filter.file_filter import FileFilter
|
||||||
from khoj.utils.rawconfig import Entry
|
|
||||||
|
|
||||||
|
# Mock Entry class for testing
|
||||||
|
class Entry:
    """Minimal mock of an indexed entry for use in these tests.

    Holds only the three attributes passed to the constructor, each
    defaulting to an empty string.
    """

    def __init__(self, compiled="", raw="", file=""):
        self.compiled = compiled
        self.raw = raw
        self.file = file

    def __repr__(self):
        # Readable representation so failing assertions show the entry's fields
        return f"{type(self).__name__}(compiled={self.compiled!r}, raw={self.raw!r}, file={self.file!r})"
|
||||||
|
|
||||||
|
|
||||||
def test_can_filter_no_file_filter():
|
def test_can_filter_no_file_filter():
|
||||||
|
|||||||
@@ -3,8 +3,6 @@ import re
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from khoj.processor.content.markdown.markdown_to_entries import MarkdownToEntries
|
from khoj.processor.content.markdown.markdown_to_entries import MarkdownToEntries
|
||||||
from khoj.utils.fs_syncer import get_markdown_files
|
|
||||||
from khoj.utils.rawconfig import TextContentConfig
|
|
||||||
|
|
||||||
|
|
||||||
def test_extract_markdown_with_no_headings(tmp_path):
|
def test_extract_markdown_with_no_headings(tmp_path):
|
||||||
@@ -212,43 +210,6 @@ longer body line 2.1
|
|||||||
), "Third entry is second entries child heading"
|
), "Third entry is second entries child heading"
|
||||||
|
|
||||||
|
|
||||||
def test_get_markdown_files(tmp_path):
|
|
||||||
"Ensure Markdown files specified via input-filter, input-files extracted"
|
|
||||||
# Arrange
|
|
||||||
# Include via input-filter globs
|
|
||||||
group1_file1 = create_file(tmp_path, filename="group1-file1.md")
|
|
||||||
group1_file2 = create_file(tmp_path, filename="group1-file2.md")
|
|
||||||
group2_file1 = create_file(tmp_path, filename="group2-file1.markdown")
|
|
||||||
group2_file2 = create_file(tmp_path, filename="group2-file2.markdown")
|
|
||||||
# Include via input-file field
|
|
||||||
file1 = create_file(tmp_path, filename="notes.md")
|
|
||||||
# Not included by any filter
|
|
||||||
create_file(tmp_path, filename="not-included-markdown.md")
|
|
||||||
create_file(tmp_path, filename="not-included-text.txt")
|
|
||||||
|
|
||||||
expected_files = set(
|
|
||||||
[os.path.join(tmp_path, file.name) for file in [group1_file1, group1_file2, group2_file1, group2_file2, file1]]
|
|
||||||
)
|
|
||||||
|
|
||||||
# Setup input-files, input-filters
|
|
||||||
input_files = [tmp_path / "notes.md"]
|
|
||||||
input_filter = [tmp_path / "group1*.md", tmp_path / "group2*.markdown"]
|
|
||||||
|
|
||||||
markdown_config = TextContentConfig(
|
|
||||||
input_files=input_files,
|
|
||||||
input_filter=[str(filter) for filter in input_filter],
|
|
||||||
compressed_jsonl=tmp_path / "test.jsonl",
|
|
||||||
embeddings_file=tmp_path / "test_embeddings.jsonl",
|
|
||||||
)
|
|
||||||
|
|
||||||
# Act
|
|
||||||
extracted_org_files = get_markdown_files(markdown_config)
|
|
||||||
|
|
||||||
# Assert
|
|
||||||
assert len(extracted_org_files) == 5
|
|
||||||
assert set(extracted_org_files.keys()) == expected_files
|
|
||||||
|
|
||||||
|
|
||||||
def test_line_number_tracking_in_recursive_split():
|
def test_line_number_tracking_in_recursive_split():
|
||||||
"Ensure line numbers in URIs are correct after recursive splitting by checking against the actual file."
|
"Ensure line numbers in URIs are correct after recursive splitting by checking against the actual file."
|
||||||
# Arrange
|
# Arrange
|
||||||
|
|||||||
@@ -4,9 +4,8 @@ import time
|
|||||||
|
|
||||||
from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
|
from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
|
||||||
from khoj.processor.content.text_to_entries import TextToEntries
|
from khoj.processor.content.text_to_entries import TextToEntries
|
||||||
from khoj.utils.fs_syncer import get_org_files
|
|
||||||
from khoj.utils.helpers import is_none_or_empty
|
from khoj.utils.helpers import is_none_or_empty
|
||||||
from khoj.utils.rawconfig import Entry, TextContentConfig
|
from khoj.utils.rawconfig import Entry
|
||||||
|
|
||||||
|
|
||||||
def test_configure_indexing_heading_only_entries(tmp_path):
|
def test_configure_indexing_heading_only_entries(tmp_path):
|
||||||
@@ -330,46 +329,6 @@ def test_file_with_no_headings_to_entry(tmp_path):
|
|||||||
assert len(entries[1]) == 1
|
assert len(entries[1]) == 1
|
||||||
|
|
||||||
|
|
||||||
def test_get_org_files(tmp_path):
|
|
||||||
"Ensure Org files specified via input-filter, input-files extracted"
|
|
||||||
# Arrange
|
|
||||||
# Include via input-filter globs
|
|
||||||
group1_file1 = create_file(tmp_path, filename="group1-file1.org")
|
|
||||||
group1_file2 = create_file(tmp_path, filename="group1-file2.org")
|
|
||||||
group2_file1 = create_file(tmp_path, filename="group2-file1.org")
|
|
||||||
group2_file2 = create_file(tmp_path, filename="group2-file2.org")
|
|
||||||
# Include via input-file field
|
|
||||||
orgfile1 = create_file(tmp_path, filename="orgfile1.org")
|
|
||||||
# Not included by any filter
|
|
||||||
create_file(tmp_path, filename="orgfile2.org")
|
|
||||||
create_file(tmp_path, filename="text1.txt")
|
|
||||||
|
|
||||||
expected_files = set(
|
|
||||||
[
|
|
||||||
os.path.join(tmp_path, file.name)
|
|
||||||
for file in [group1_file1, group1_file2, group2_file1, group2_file2, orgfile1]
|
|
||||||
]
|
|
||||||
)
|
|
||||||
|
|
||||||
# Setup input-files, input-filters
|
|
||||||
input_files = [tmp_path / "orgfile1.org"]
|
|
||||||
input_filter = [tmp_path / "group1*.org", tmp_path / "group2*.org"]
|
|
||||||
|
|
||||||
org_config = TextContentConfig(
|
|
||||||
input_files=input_files,
|
|
||||||
input_filter=[str(filter) for filter in input_filter],
|
|
||||||
compressed_jsonl=tmp_path / "test.jsonl",
|
|
||||||
embeddings_file=tmp_path / "test_embeddings.jsonl",
|
|
||||||
)
|
|
||||||
|
|
||||||
# Act
|
|
||||||
extracted_org_files = get_org_files(org_config)
|
|
||||||
|
|
||||||
# Assert
|
|
||||||
assert len(extracted_org_files) == 5
|
|
||||||
assert set(extracted_org_files.keys()) == expected_files
|
|
||||||
|
|
||||||
|
|
||||||
def test_extract_entries_with_different_level_headings(tmp_path):
|
def test_extract_entries_with_different_level_headings(tmp_path):
|
||||||
"Extract org entries with different level headings."
|
"Extract org entries with different level headings."
|
||||||
# Arrange
|
# Arrange
|
||||||
|
|||||||
@@ -4,8 +4,6 @@ import re
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries
|
from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries
|
||||||
from khoj.utils.fs_syncer import get_pdf_files
|
|
||||||
from khoj.utils.rawconfig import TextContentConfig
|
|
||||||
|
|
||||||
|
|
||||||
def test_single_page_pdf_to_jsonl():
|
def test_single_page_pdf_to_jsonl():
|
||||||
@@ -61,43 +59,6 @@ def test_ocr_page_pdf_to_jsonl():
|
|||||||
assert re.search(expected_str_with_variable_spaces, raw_entry) is not None
|
assert re.search(expected_str_with_variable_spaces, raw_entry) is not None
|
||||||
|
|
||||||
|
|
||||||
def test_get_pdf_files(tmp_path):
|
|
||||||
"Ensure Pdf files specified via input-filter, input-files extracted"
|
|
||||||
# Arrange
|
|
||||||
# Include via input-filter globs
|
|
||||||
group1_file1 = create_file(tmp_path, filename="group1-file1.pdf")
|
|
||||||
group1_file2 = create_file(tmp_path, filename="group1-file2.pdf")
|
|
||||||
group2_file1 = create_file(tmp_path, filename="group2-file1.pdf")
|
|
||||||
group2_file2 = create_file(tmp_path, filename="group2-file2.pdf")
|
|
||||||
# Include via input-file field
|
|
||||||
file1 = create_file(tmp_path, filename="document.pdf")
|
|
||||||
# Not included by any filter
|
|
||||||
create_file(tmp_path, filename="not-included-document.pdf")
|
|
||||||
create_file(tmp_path, filename="not-included-text.txt")
|
|
||||||
|
|
||||||
expected_files = set(
|
|
||||||
[os.path.join(tmp_path, file.name) for file in [group1_file1, group1_file2, group2_file1, group2_file2, file1]]
|
|
||||||
)
|
|
||||||
|
|
||||||
# Setup input-files, input-filters
|
|
||||||
input_files = [tmp_path / "document.pdf"]
|
|
||||||
input_filter = [tmp_path / "group1*.pdf", tmp_path / "group2*.pdf"]
|
|
||||||
|
|
||||||
pdf_config = TextContentConfig(
|
|
||||||
input_files=input_files,
|
|
||||||
input_filter=[str(path) for path in input_filter],
|
|
||||||
compressed_jsonl=tmp_path / "test.jsonl",
|
|
||||||
embeddings_file=tmp_path / "test_embeddings.jsonl",
|
|
||||||
)
|
|
||||||
|
|
||||||
# Act
|
|
||||||
extracted_pdf_files = get_pdf_files(pdf_config)
|
|
||||||
|
|
||||||
# Assert
|
|
||||||
assert len(extracted_pdf_files) == 5
|
|
||||||
assert set(extracted_pdf_files.keys()) == expected_files
|
|
||||||
|
|
||||||
|
|
||||||
# Helper Functions
|
# Helper Functions
|
||||||
def create_file(tmp_path, entry=None, filename="document.pdf"):
|
def create_file(tmp_path, entry=None, filename="document.pdf"):
|
||||||
pdf_file = tmp_path / filename
|
pdf_file = tmp_path / filename
|
||||||
|
|||||||
@@ -1,27 +1,20 @@
|
|||||||
import os
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from textwrap import dedent
|
||||||
|
|
||||||
from khoj.database.models import KhojUser, LocalPlaintextConfig
|
|
||||||
from khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries
|
from khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries
|
||||||
from khoj.utils.fs_syncer import get_plaintext_files
|
|
||||||
from khoj.utils.rawconfig import TextContentConfig
|
|
||||||
|
|
||||||
|
|
||||||
def test_plaintext_file(tmp_path):
|
def test_plaintext_file():
|
||||||
"Convert files with no heading to jsonl."
|
"Convert files with no heading to jsonl."
|
||||||
# Arrange
|
# Arrange
|
||||||
raw_entry = f"""
|
raw_entry = f"""
|
||||||
Hi, I am a plaintext file and I have some plaintext words.
|
Hi, I am a plaintext file and I have some plaintext words.
|
||||||
"""
|
"""
|
||||||
plaintextfile = create_file(tmp_path, raw_entry)
|
plaintextfile = "test.txt"
|
||||||
|
data = {plaintextfile: raw_entry}
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
# Extract Entries from specified plaintext files
|
# Extract Entries from specified plaintext files
|
||||||
|
|
||||||
data = {
|
|
||||||
f"{plaintextfile}": raw_entry,
|
|
||||||
}
|
|
||||||
|
|
||||||
entries = PlaintextToEntries.extract_plaintext_entries(data)
|
entries = PlaintextToEntries.extract_plaintext_entries(data)
|
||||||
|
|
||||||
# Convert each entry.file to absolute path to make them JSON serializable
|
# Convert each entry.file to absolute path to make them JSON serializable
|
||||||
@@ -37,59 +30,20 @@ def test_plaintext_file(tmp_path):
|
|||||||
assert entries[1][0].compiled == f"{plaintextfile}\n{raw_entry}"
|
assert entries[1][0].compiled == f"{plaintextfile}\n{raw_entry}"
|
||||||
|
|
||||||
|
|
||||||
def test_get_plaintext_files(tmp_path):
|
def test_parse_html_plaintext_file(tmp_path):
|
||||||
"Ensure Plaintext files specified via input-filter, input-files extracted"
|
|
||||||
# Arrange
|
|
||||||
# Include via input-filter globs
|
|
||||||
group1_file1 = create_file(tmp_path, filename="group1-file1.md")
|
|
||||||
group1_file2 = create_file(tmp_path, filename="group1-file2.md")
|
|
||||||
|
|
||||||
group2_file1 = create_file(tmp_path, filename="group2-file1.markdown")
|
|
||||||
group2_file2 = create_file(tmp_path, filename="group2-file2.markdown")
|
|
||||||
group2_file4 = create_file(tmp_path, filename="group2-file4.html")
|
|
||||||
# Include via input-file field
|
|
||||||
file1 = create_file(tmp_path, filename="notes.txt")
|
|
||||||
# Include unsupported file types
|
|
||||||
create_file(tmp_path, filename="group2-unincluded.py")
|
|
||||||
create_file(tmp_path, filename="group2-unincluded.csv")
|
|
||||||
create_file(tmp_path, filename="group2-unincluded.csv")
|
|
||||||
create_file(tmp_path, filename="group2-file3.mbox")
|
|
||||||
# Not included by any filter
|
|
||||||
create_file(tmp_path, filename="not-included-markdown.md")
|
|
||||||
create_file(tmp_path, filename="not-included-text.txt")
|
|
||||||
|
|
||||||
expected_files = set(
|
|
||||||
[
|
|
||||||
os.path.join(tmp_path, file.name)
|
|
||||||
for file in [group1_file1, group1_file2, group2_file1, group2_file2, group2_file4, file1]
|
|
||||||
]
|
|
||||||
)
|
|
||||||
|
|
||||||
# Setup input-files, input-filters
|
|
||||||
input_files = [tmp_path / "notes.txt"]
|
|
||||||
input_filter = [tmp_path / "group1*.md", tmp_path / "group2*.*"]
|
|
||||||
|
|
||||||
plaintext_config = TextContentConfig(
|
|
||||||
input_files=input_files,
|
|
||||||
input_filter=[str(filter) for filter in input_filter],
|
|
||||||
compressed_jsonl=tmp_path / "test.jsonl",
|
|
||||||
embeddings_file=tmp_path / "test_embeddings.jsonl",
|
|
||||||
)
|
|
||||||
|
|
||||||
# Act
|
|
||||||
extracted_plaintext_files = get_plaintext_files(plaintext_config)
|
|
||||||
|
|
||||||
# Assert
|
|
||||||
assert len(extracted_plaintext_files) == len(expected_files)
|
|
||||||
assert set(extracted_plaintext_files.keys()) == set(expected_files)
|
|
||||||
|
|
||||||
|
|
||||||
def test_parse_html_plaintext_file(content_config, default_user: KhojUser):
|
|
||||||
"Ensure HTML files are parsed correctly"
|
"Ensure HTML files are parsed correctly"
|
||||||
# Arrange
|
# Arrange
|
||||||
# Setup input-files, input-filters
|
raw_entry = dedent(
|
||||||
config = LocalPlaintextConfig.objects.filter(user=default_user).first()
|
f"""
|
||||||
extracted_plaintext_files = get_plaintext_files(config=config)
|
<html>
|
||||||
|
<head><title>Test HTML</title></head>
|
||||||
|
<body>
|
||||||
|
<div>Test content</div>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
extracted_plaintext_files = {"test.html": raw_entry}
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
entries = PlaintextToEntries.extract_plaintext_entries(extracted_plaintext_files)
|
entries = PlaintextToEntries.extract_plaintext_entries(extracted_plaintext_files)
|
||||||
|
|||||||
@@ -2,23 +2,16 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from khoj.database.adapters import EntryAdapters
|
from khoj.database.adapters import EntryAdapters
|
||||||
from khoj.database.models import Entry, GithubConfig, KhojUser, LocalOrgConfig
|
from khoj.database.models import Entry, GithubConfig, KhojUser
|
||||||
from khoj.processor.content.docx.docx_to_entries import DocxToEntries
|
|
||||||
from khoj.processor.content.github.github_to_entries import GithubToEntries
|
from khoj.processor.content.github.github_to_entries import GithubToEntries
|
||||||
from khoj.processor.content.images.image_to_entries import ImageToEntries
|
|
||||||
from khoj.processor.content.markdown.markdown_to_entries import MarkdownToEntries
|
|
||||||
from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
|
from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
|
||||||
from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries
|
|
||||||
from khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries
|
|
||||||
from khoj.processor.content.text_to_entries import TextToEntries
|
from khoj.processor.content.text_to_entries import TextToEntries
|
||||||
from khoj.search_type import text_search
|
from khoj.search_type import text_search
|
||||||
from khoj.utils.fs_syncer import collect_files, get_org_files
|
from tests.helpers import get_index_files, get_sample_data
|
||||||
from khoj.utils.rawconfig import ContentConfig, SearchConfig
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -26,53 +19,20 @@ logger = logging.getLogger(__name__)
|
|||||||
# Test
|
# Test
|
||||||
# ----------------------------------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------------------------------
|
||||||
@pytest.mark.django_db
|
@pytest.mark.django_db
|
||||||
def test_text_search_setup_with_missing_file_raises_error(org_config_with_only_new_file: LocalOrgConfig):
|
def test_text_search_setup_with_empty_file_creates_no_entries(search_config, default_user: KhojUser):
|
||||||
# Arrange
|
|
||||||
# Ensure file mentioned in org.input-files is missing
|
|
||||||
single_new_file = Path(org_config_with_only_new_file.input_files[0])
|
|
||||||
single_new_file.unlink()
|
|
||||||
|
|
||||||
# Act
|
|
||||||
# Generate notes embeddings during asymmetric setup
|
|
||||||
with pytest.raises(FileNotFoundError):
|
|
||||||
get_org_files(org_config_with_only_new_file)
|
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------------------------------------------------------------------------------
|
|
||||||
@pytest.mark.django_db
|
|
||||||
def test_get_org_files_with_org_suffixed_dir_doesnt_raise_error(tmp_path, default_user: KhojUser):
|
|
||||||
# Arrange
|
|
||||||
orgfile = tmp_path / "directory.org" / "file.org"
|
|
||||||
orgfile.parent.mkdir()
|
|
||||||
with open(orgfile, "w") as f:
|
|
||||||
f.write("* Heading\n- List item\n")
|
|
||||||
|
|
||||||
LocalOrgConfig.objects.create(
|
|
||||||
input_filter=[f"{tmp_path}/**/*"],
|
|
||||||
input_files=None,
|
|
||||||
user=default_user,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Act
|
|
||||||
org_files = collect_files(user=default_user)["org"]
|
|
||||||
|
|
||||||
# Assert
|
|
||||||
# should return orgfile and not raise IsADirectoryError
|
|
||||||
assert org_files == {f"{orgfile}": "* Heading\n- List item\n"}
|
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------------------------------------------------------------------------------
|
|
||||||
@pytest.mark.django_db
|
|
||||||
def test_text_search_setup_with_empty_file_creates_no_entries(
|
|
||||||
org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser
|
|
||||||
):
|
|
||||||
# Arrange
|
# Arrange
|
||||||
|
initial_data = {
|
||||||
|
"test.org": "* First heading\nFirst content",
|
||||||
|
"test2.org": "* Second heading\nSecond content",
|
||||||
|
}
|
||||||
|
text_search.setup(OrgToEntries, initial_data, regenerate=True, user=default_user)
|
||||||
existing_entries = Entry.objects.filter(user=default_user).count()
|
existing_entries = Entry.objects.filter(user=default_user).count()
|
||||||
data = get_org_files(org_config_with_only_new_file)
|
|
||||||
|
final_data = {"new_file.org": ""}
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
# Generate notes embeddings during asymmetric setup
|
# Generate notes embeddings during asymmetric setup
|
||||||
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
|
text_search.setup(OrgToEntries, final_data, regenerate=True, user=default_user)
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
updated_entries = Entry.objects.filter(user=default_user).count()
|
updated_entries = Entry.objects.filter(user=default_user).count()
|
||||||
@@ -84,13 +44,14 @@ def test_text_search_setup_with_empty_file_creates_no_entries(
|
|||||||
|
|
||||||
# ----------------------------------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------------------------------
|
||||||
@pytest.mark.django_db
|
@pytest.mark.django_db
|
||||||
def test_text_indexer_deletes_embedding_before_regenerate(
|
def test_text_indexer_deletes_embedding_before_regenerate(search_config, default_user: KhojUser, caplog):
|
||||||
content_config: ContentConfig, default_user: KhojUser, caplog
|
|
||||||
):
|
|
||||||
# Arrange
|
# Arrange
|
||||||
|
data = {
|
||||||
|
"test1.org": "* Test heading\nTest content",
|
||||||
|
"test2.org": "* Another heading\nAnother content",
|
||||||
|
}
|
||||||
|
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
|
||||||
existing_entries = Entry.objects.filter(user=default_user).count()
|
existing_entries = Entry.objects.filter(user=default_user).count()
|
||||||
org_config = LocalOrgConfig.objects.filter(user=default_user).first()
|
|
||||||
data = get_org_files(org_config)
|
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
# Generate notes embeddings during asymmetric setup
|
# Generate notes embeddings during asymmetric setup
|
||||||
@@ -107,11 +68,10 @@ def test_text_indexer_deletes_embedding_before_regenerate(
|
|||||||
|
|
||||||
# ----------------------------------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------------------------------
|
||||||
@pytest.mark.django_db
|
@pytest.mark.django_db
|
||||||
def test_text_index_same_if_content_unchanged(content_config: ContentConfig, default_user: KhojUser, caplog):
|
def test_text_index_same_if_content_unchanged(search_config, default_user: KhojUser, caplog):
|
||||||
# Arrange
|
# Arrange
|
||||||
existing_entries = Entry.objects.filter(user=default_user)
|
existing_entries = Entry.objects.filter(user=default_user)
|
||||||
org_config = LocalOrgConfig.objects.filter(user=default_user).first()
|
data = {"test.org": "* Test heading\nTest content"}
|
||||||
data = get_org_files(org_config)
|
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
# Generate initial notes embeddings during asymmetric setup
|
# Generate initial notes embeddings during asymmetric setup
|
||||||
@@ -136,20 +96,14 @@ def test_text_index_same_if_content_unchanged(content_config: ContentConfig, def
|
|||||||
|
|
||||||
# ----------------------------------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------------------------------
|
||||||
@pytest.mark.django_db
|
@pytest.mark.django_db
|
||||||
@pytest.mark.anyio
|
@pytest.mark.asyncio
|
||||||
# @pytest.mark.asyncio
|
async def test_text_search(search_config):
|
||||||
async def test_text_search(search_config: SearchConfig):
|
|
||||||
# Arrange
|
# Arrange
|
||||||
default_user = await KhojUser.objects.acreate(
|
default_user, _ = await KhojUser.objects.aget_or_create(
|
||||||
username="test_user", password="test_password", email="test@example.com"
|
username="test_user", password="test_password", email="test@example.com"
|
||||||
)
|
)
|
||||||
org_config = await LocalOrgConfig.objects.acreate(
|
# Get some sample org data to index
|
||||||
input_files=None,
|
data = get_sample_data("org")
|
||||||
input_filter=["tests/data/org/*.org"],
|
|
||||||
index_heading_entries=False,
|
|
||||||
user=default_user,
|
|
||||||
)
|
|
||||||
data = get_org_files(org_config)
|
|
||||||
|
|
||||||
loop = asyncio.get_event_loop()
|
loop = asyncio.get_event_loop()
|
||||||
await loop.run_in_executor(
|
await loop.run_in_executor(
|
||||||
@@ -175,17 +129,15 @@ async def test_text_search(search_config: SearchConfig):
|
|||||||
|
|
||||||
# ----------------------------------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------------------------------
|
||||||
@pytest.mark.django_db
|
@pytest.mark.django_db
|
||||||
def test_entry_chunking_by_max_tokens(org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser, caplog):
|
def test_entry_chunking_by_max_tokens(tmp_path, search_config, default_user: KhojUser, caplog):
|
||||||
# Arrange
|
# Arrange
|
||||||
# Insert org-mode entry with size exceeding max token limit to new org file
|
# Insert org-mode entry with size exceeding max token limit to new org file
|
||||||
max_tokens = 256
|
max_tokens = 256
|
||||||
new_file_to_index = Path(org_config_with_only_new_file.input_files[0])
|
new_file_to_index = tmp_path / "test.org"
|
||||||
with open(new_file_to_index, "w") as f:
|
content = f"* Entry more than {max_tokens} words\n"
|
||||||
f.write(f"* Entry more than {max_tokens} words\n")
|
for index in range(max_tokens + 1):
|
||||||
for index in range(max_tokens + 1):
|
content += f"{index} "
|
||||||
f.write(f"{index} ")
|
data = {str(new_file_to_index): content}
|
||||||
|
|
||||||
data = get_org_files(org_config_with_only_new_file)
|
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
# reload embeddings, entries, notes model after adding new org-mode file
|
# reload embeddings, entries, notes model after adding new org-mode file
|
||||||
@@ -200,9 +152,7 @@ def test_entry_chunking_by_max_tokens(org_config_with_only_new_file: LocalOrgCon
|
|||||||
|
|
||||||
# ----------------------------------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------------------------------
|
||||||
@pytest.mark.django_db
|
@pytest.mark.django_db
|
||||||
def test_entry_chunking_by_max_tokens_not_full_corpus(
|
def test_entry_chunking_by_max_tokens_not_full_corpus(tmp_path, search_config, default_user: KhojUser, caplog):
|
||||||
org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser, caplog
|
|
||||||
):
|
|
||||||
# Arrange
|
# Arrange
|
||||||
# Insert org-mode entry with size exceeding max token limit to new org file
|
# Insert org-mode entry with size exceeding max token limit to new org file
|
||||||
data = {
|
data = {
|
||||||
@@ -231,13 +181,11 @@ conda activate khoj
|
|||||||
)
|
)
|
||||||
|
|
||||||
max_tokens = 256
|
max_tokens = 256
|
||||||
new_file_to_index = Path(org_config_with_only_new_file.input_files[0])
|
new_file_to_index = tmp_path / "test.org"
|
||||||
with open(new_file_to_index, "w") as f:
|
content = f"* Entry more than {max_tokens} words\n"
|
||||||
f.write(f"* Entry more than {max_tokens} words\n")
|
for index in range(max_tokens + 1):
|
||||||
for index in range(max_tokens + 1):
|
content += f"{index} "
|
||||||
f.write(f"{index} ")
|
data = {str(new_file_to_index): content}
|
||||||
|
|
||||||
data = get_org_files(org_config_with_only_new_file)
|
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
# reload embeddings, entries, notes model after adding new org-mode file
|
# reload embeddings, entries, notes model after adding new org-mode file
|
||||||
@@ -257,34 +205,34 @@ conda activate khoj
|
|||||||
|
|
||||||
# ----------------------------------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------------------------------
|
||||||
@pytest.mark.django_db
|
@pytest.mark.django_db
|
||||||
def test_regenerate_index_with_new_entry(content_config: ContentConfig, new_org_file: Path, default_user: KhojUser):
|
def test_regenerate_index_with_new_entry(search_config, default_user: KhojUser):
|
||||||
# Arrange
|
# Arrange
|
||||||
|
# Initial indexed files
|
||||||
|
text_search.setup(OrgToEntries, get_sample_data("org"), regenerate=True, user=default_user)
|
||||||
existing_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
|
existing_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
|
||||||
org_config = LocalOrgConfig.objects.filter(user=default_user).first()
|
|
||||||
initial_data = get_org_files(org_config)
|
|
||||||
|
|
||||||
# append org-mode entry to first org input file in config
|
# Regenerate index with only files from test data set
|
||||||
org_config.input_files = [f"{new_org_file}"]
|
files_to_index = get_index_files()
|
||||||
with open(new_org_file, "w") as f:
|
text_search.setup(OrgToEntries, files_to_index, regenerate=True, user=default_user)
|
||||||
f.write("\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n")
|
|
||||||
|
|
||||||
final_data = get_org_files(org_config)
|
|
||||||
|
|
||||||
# Act
|
|
||||||
text_search.setup(OrgToEntries, initial_data, regenerate=True, user=default_user)
|
|
||||||
updated_entries1 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
|
updated_entries1 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
|
||||||
|
|
||||||
|
# Act
|
||||||
|
# Update index with the new file
|
||||||
|
new_file = "test.org"
|
||||||
|
new_entry = "\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n"
|
||||||
|
files_to_index[new_file] = new_entry
|
||||||
|
|
||||||
# regenerate notes jsonl, model embeddings and model to include entry from new file
|
# regenerate notes jsonl, model embeddings and model to include entry from new file
|
||||||
text_search.setup(OrgToEntries, final_data, regenerate=True, user=default_user)
|
text_search.setup(OrgToEntries, files_to_index, regenerate=True, user=default_user)
|
||||||
updated_entries2 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
|
updated_entries2 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
for entry in updated_entries1:
|
for entry in updated_entries1:
|
||||||
assert entry in updated_entries2
|
assert entry in updated_entries2
|
||||||
|
|
||||||
assert not any([new_org_file.name in entry for entry in updated_entries1])
|
assert not any([new_file in entry for entry in updated_entries1])
|
||||||
assert not any([new_org_file.name in entry for entry in existing_entries])
|
assert not any([new_file in entry for entry in existing_entries])
|
||||||
assert any([new_org_file.name in entry for entry in updated_entries2])
|
assert any([new_file in entry for entry in updated_entries2])
|
||||||
|
|
||||||
assert any(
|
assert any(
|
||||||
["Saw a super cute video of a chihuahua doing the Tango on Youtube" in entry for entry in updated_entries2]
|
["Saw a super cute video of a chihuahua doing the Tango on Youtube" in entry for entry in updated_entries2]
|
||||||
@@ -294,28 +242,24 @@ def test_regenerate_index_with_new_entry(content_config: ContentConfig, new_org_
|
|||||||
|
|
||||||
# ----------------------------------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------------------------------
|
||||||
@pytest.mark.django_db
|
@pytest.mark.django_db
|
||||||
def test_update_index_with_duplicate_entries_in_stable_order(
|
def test_update_index_with_duplicate_entries_in_stable_order(tmp_path, search_config, default_user: KhojUser):
|
||||||
org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser
|
|
||||||
):
|
|
||||||
# Arrange
|
# Arrange
|
||||||
|
initial_data = get_sample_data("org")
|
||||||
|
text_search.setup(OrgToEntries, initial_data, regenerate=True, user=default_user)
|
||||||
existing_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
|
existing_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
|
||||||
new_file_to_index = Path(org_config_with_only_new_file.input_files[0])
|
|
||||||
|
|
||||||
# Insert org-mode entries with same compiled form into new org file
|
# Insert org-mode entries with same compiled form into new org file
|
||||||
|
new_file_to_index = tmp_path / "test.org"
|
||||||
new_entry = "* TODO A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n"
|
new_entry = "* TODO A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n"
|
||||||
with open(new_file_to_index, "w") as f:
|
# Initial data with duplicate entries
|
||||||
f.write(f"{new_entry}{new_entry}")
|
data = {str(new_file_to_index): f"{new_entry}{new_entry}"}
|
||||||
|
|
||||||
data = get_org_files(org_config_with_only_new_file)
|
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
# generate embeddings, entries, notes model from scratch after adding new org-mode file
|
# generate embeddings, entries, notes model from scratch after adding new org-mode file
|
||||||
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
|
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
|
||||||
updated_entries1 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
|
updated_entries1 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
|
||||||
|
|
||||||
data = get_org_files(org_config_with_only_new_file)
|
# idempotent indexing when data unchanged
|
||||||
|
|
||||||
# update embeddings, entries, notes model with no new changes
|
|
||||||
text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
|
text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
|
||||||
updated_entries2 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
|
updated_entries2 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
|
||||||
|
|
||||||
@@ -324,6 +268,7 @@ def test_update_index_with_duplicate_entries_in_stable_order(
|
|||||||
for entry in existing_entries:
|
for entry in existing_entries:
|
||||||
assert entry not in updated_entries1
|
assert entry not in updated_entries1
|
||||||
|
|
||||||
|
# verify the second indexing update has same entries and ordering as first
|
||||||
for entry in updated_entries1:
|
for entry in updated_entries1:
|
||||||
assert entry in updated_entries2
|
assert entry in updated_entries2
|
||||||
|
|
||||||
@@ -334,22 +279,17 @@ def test_update_index_with_duplicate_entries_in_stable_order(
|
|||||||
|
|
||||||
# ----------------------------------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------------------------------
|
||||||
@pytest.mark.django_db
|
@pytest.mark.django_db
|
||||||
def test_update_index_with_deleted_entry(org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser):
|
def test_update_index_with_deleted_entry(tmp_path, search_config, default_user: KhojUser):
|
||||||
# Arrange
|
# Arrange
|
||||||
existing_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
|
existing_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
|
||||||
new_file_to_index = Path(org_config_with_only_new_file.input_files[0])
|
|
||||||
|
|
||||||
# Insert org-mode entries with same compiled form into new org file
|
new_file_to_index = tmp_path / "test.org"
|
||||||
new_entry = "* TODO A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n"
|
new_entry = "* TODO A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n"
|
||||||
with open(new_file_to_index, "w") as f:
|
|
||||||
f.write(f"{new_entry}{new_entry} -- Tatooine")
|
|
||||||
initial_data = get_org_files(org_config_with_only_new_file)
|
|
||||||
|
|
||||||
# update embeddings, entries, notes model after removing an entry from the org file
|
# Initial data with two entries
|
||||||
with open(new_file_to_index, "w") as f:
|
initial_data = {str(new_file_to_index): f"{new_entry}{new_entry} -- Tatooine"}
|
||||||
f.write(f"{new_entry}")
|
# Final data with only first entry, with second entry removed
|
||||||
|
final_data = {str(new_file_to_index): f"{new_entry}"}
|
||||||
final_data = get_org_files(org_config_with_only_new_file)
|
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
# load embeddings, entries, notes model after adding new org file with 2 entries
|
# load embeddings, entries, notes model after adding new org file with 2 entries
|
||||||
@@ -375,29 +315,29 @@ def test_update_index_with_deleted_entry(org_config_with_only_new_file: LocalOrg
|
|||||||
|
|
||||||
# ----------------------------------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------------------------------
|
||||||
@pytest.mark.django_db
|
@pytest.mark.django_db
|
||||||
def test_update_index_with_new_entry(content_config: ContentConfig, new_org_file: Path, default_user: KhojUser):
|
def test_update_index_with_new_entry(search_config, default_user: KhojUser):
|
||||||
# Arrange
|
# Arrange
|
||||||
existing_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
|
# Initial indexed files
|
||||||
org_config = LocalOrgConfig.objects.filter(user=default_user).first()
|
text_search.setup(OrgToEntries, get_sample_data("org"), regenerate=True, user=default_user)
|
||||||
data = get_org_files(org_config)
|
old_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
|
||||||
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
|
|
||||||
|
|
||||||
# append org-mode entry to first org input file in config
|
# Regenerate index with only files from test data set
|
||||||
with open(new_org_file, "w") as f:
|
files_to_index = get_index_files()
|
||||||
new_entry = "\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n"
|
new_entries = text_search.setup(OrgToEntries, files_to_index, regenerate=True, user=default_user)
|
||||||
f.write(new_entry)
|
|
||||||
|
|
||||||
data = get_org_files(org_config)
|
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
# update embeddings, entries with the newly added note
|
# Update index with the new file
|
||||||
text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
|
new_file = "test.org"
|
||||||
updated_entries1 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
|
new_entry = "\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n"
|
||||||
|
final_data = {new_file: new_entry}
|
||||||
|
|
||||||
|
text_search.setup(OrgToEntries, final_data, regenerate=False, user=default_user)
|
||||||
|
updated_new_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
for entry in existing_entries:
|
for old_entry in old_entries:
|
||||||
assert entry not in updated_entries1
|
assert old_entry not in updated_new_entries
|
||||||
assert len(updated_entries1) == len(existing_entries) + 1
|
assert len(updated_new_entries) == len(new_entries) + 1
|
||||||
verify_embeddings(3, default_user)
|
verify_embeddings(3, default_user)
|
||||||
|
|
||||||
|
|
||||||
@@ -409,9 +349,7 @@ def test_update_index_with_new_entry(content_config: ContentConfig, new_org_file
|
|||||||
(OrgToEntries),
|
(OrgToEntries),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_update_index_with_deleted_file(
|
def test_update_index_with_deleted_file(text_to_entries: TextToEntries, search_config, default_user: KhojUser):
|
||||||
org_config_with_only_new_file: LocalOrgConfig, text_to_entries: TextToEntries, default_user: KhojUser
|
|
||||||
):
|
|
||||||
"Delete entries associated with new file when file path with empty content passed."
|
"Delete entries associated with new file when file path with empty content passed."
|
||||||
# Arrange
|
# Arrange
|
||||||
file_to_index = "test"
|
file_to_index = "test"
|
||||||
@@ -446,7 +384,7 @@ def test_update_index_with_deleted_file(
|
|||||||
|
|
||||||
# ----------------------------------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------------------------------
|
||||||
@pytest.mark.skipif(os.getenv("GITHUB_PAT_TOKEN") is None, reason="GITHUB_PAT_TOKEN not set")
|
@pytest.mark.skipif(os.getenv("GITHUB_PAT_TOKEN") is None, reason="GITHUB_PAT_TOKEN not set")
|
||||||
def test_text_search_setup_github(content_config: ContentConfig, default_user: KhojUser):
|
def test_text_search_setup_github(search_config, default_user: KhojUser):
|
||||||
# Arrange
|
# Arrange
|
||||||
github_config = GithubConfig.objects.filter(user=default_user).first()
|
github_config = GithubConfig.objects.filter(user=default_user).first()
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,12 @@
|
|||||||
# Application Packages
|
# Application Packages
|
||||||
from khoj.search_filter.word_filter import WordFilter
|
from khoj.search_filter.word_filter import WordFilter
|
||||||
from khoj.utils.rawconfig import Entry
|
|
||||||
|
|
||||||
|
# Mock Entry class for testing
|
||||||
|
class Entry:
|
||||||
|
def __init__(self, compiled="", raw=""):
|
||||||
|
self.compiled = compiled
|
||||||
|
self.raw = raw
|
||||||
|
|
||||||
|
|
||||||
# Test
|
# Test
|
||||||
|
|||||||
Reference in New Issue
Block a user