Update test setup to index test data after old indexing code removed

- Delete tests testing deprecated server side indexing flows
- Delete `Local(Plaintext|Org|Markdown|Pdf)Config` methods, files and
  references in tests
- Index test data via new helper method, `get_index_files`
  - It is modelled after the old `get_org_files` variants in the main app
  - It passes the test data in the required format to `configure_content`.
    This allows maintaining the more realistic tests from before while
    using the new indexing mechanism (rather than the deprecated
    server-side indexing mechanism)
This commit is contained in:
Debanjum
2025-07-11 14:35:05 -07:00
parent d9d24dd638
commit 892d57314e
12 changed files with 295 additions and 604 deletions

View File

@@ -1,6 +1,3 @@
import os
from pathlib import Path
import pytest
from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles
@@ -11,6 +8,7 @@ from khoj.configure import (
configure_routes,
configure_search_types,
)
from khoj.database.adapters import get_default_search_model
from khoj.database.models import (
Agent,
ChatModel,
@@ -19,21 +17,14 @@ from khoj.database.models import (
GithubRepoConfig,
KhojApiUser,
KhojUser,
LocalMarkdownConfig,
LocalOrgConfig,
LocalPdfConfig,
LocalPlaintextConfig,
)
from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
from khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries
from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel
from khoj.routers.api_content import configure_content
from khoj.search_type import text_search
from khoj.utils import fs_syncer, state
from khoj.utils.config import SearchModels
from khoj.utils import state
from khoj.utils.constants import web_directory
from khoj.utils.helpers import resolve_absolute_path
from khoj.utils.rawconfig import ContentConfig, SearchConfig
from tests.helpers import (
AiModelApiFactory,
ChatModelFactory,
@@ -43,6 +34,8 @@ from tests.helpers import (
UserFactory,
get_chat_api_key,
get_chat_provider,
get_index_files,
get_sample_data,
)
@@ -59,17 +52,16 @@ def django_db_setup(django_db_setup, django_db_blocker):
@pytest.fixture(scope="session")
def search_config() -> SearchConfig:
def search_config():
search_model = get_default_search_model()
state.embeddings_model = dict()
state.embeddings_model["default"] = EmbeddingsModel()
state.embeddings_model["default"] = EmbeddingsModel(
model_name=search_model.bi_encoder, model_kwargs=search_model.bi_encoder_model_config
)
state.cross_encoder_model = dict()
state.cross_encoder_model["default"] = CrossEncoderModel()
model_dir = resolve_absolute_path("~/.khoj/search")
model_dir.mkdir(parents=True, exist_ok=True)
search_config = SearchConfig()
return search_config
state.cross_encoder_model["default"] = CrossEncoderModel(
model_name=search_model.cross_encoder, model_kwargs=search_model.cross_encoder_model_config
)
@pytest.mark.django_db
@@ -201,13 +193,6 @@ def openai_agent():
)
@pytest.fixture(scope="session")
def search_models(search_config: SearchConfig):
search_models = SearchModels()
return search_models
@pytest.mark.django_db
@pytest.fixture
def default_process_lock():
@@ -219,72 +204,23 @@ def anyio_backend():
return "asyncio"
@pytest.mark.django_db
@pytest.fixture(scope="function")
def content_config(tmp_path_factory, search_models: SearchModels, default_user: KhojUser):
content_dir = tmp_path_factory.mktemp("content")
# Generate Image Embeddings from Test Images
content_config = ContentConfig()
LocalOrgConfig.objects.create(
input_files=None,
input_filter=["tests/data/org/*.org"],
index_heading_entries=False,
user=default_user,
)
text_search.setup(OrgToEntries, get_sample_data("org"), regenerate=False, user=default_user)
if os.getenv("GITHUB_PAT_TOKEN"):
GithubConfig.objects.create(
pat_token=os.getenv("GITHUB_PAT_TOKEN"),
user=default_user,
)
GithubRepoConfig.objects.create(
owner="khoj-ai",
name="lantern",
branch="master",
github_config=GithubConfig.objects.get(user=default_user),
)
LocalPlaintextConfig.objects.create(
input_files=None,
input_filter=["tests/data/plaintext/*.txt", "tests/data/plaintext/*.md", "tests/data/plaintext/*.html"],
user=default_user,
)
return content_config
@pytest.fixture(scope="session")
def md_content_config():
markdown_config = LocalMarkdownConfig.objects.create(
input_files=None,
input_filter=["tests/data/markdown/*.markdown"],
)
return markdown_config
@pytest.fixture(scope="function")
def chat_client(search_config: SearchConfig, default_user2: KhojUser):
def chat_client(search_config, default_user2: KhojUser):
return chat_client_builder(search_config, default_user2, require_auth=False)
@pytest.fixture(scope="function")
def chat_client_with_auth(search_config: SearchConfig, default_user2: KhojUser):
def chat_client_with_auth(search_config, default_user2: KhojUser):
return chat_client_builder(search_config, default_user2, require_auth=True)
@pytest.fixture(scope="function")
def chat_client_no_background(search_config: SearchConfig, default_user2: KhojUser):
def chat_client_no_background(search_config, default_user2: KhojUser):
return chat_client_builder(search_config, default_user2, index_content=False, require_auth=False)
@pytest.fixture(scope="function")
def chat_client_with_large_kb(search_config: SearchConfig, default_user2: KhojUser):
def chat_client_with_large_kb(search_config, default_user2: KhojUser):
"""
Chat client fixture that creates a large knowledge base with many files
for stress testing atomic agent updates.
@@ -298,15 +234,11 @@ def chat_client_builder(search_config, user, index_content=True, require_auth=Fa
state.SearchType = configure_search_types()
if index_content:
LocalMarkdownConfig.objects.create(
input_files=None,
input_filter=["tests/data/markdown/*.markdown"],
user=user,
)
file_type = "markdown"
files_to_index = {file_type: get_index_files(input_filters=[f"tests/data/{file_type}/*.{file_type}"])}
# Index Markdown Content for Search
all_files = fs_syncer.collect_files(user=user)
configure_content(user, all_files)
configure_content(user, files_to_index)
# Initialize Processor from Config
chat_provider = get_chat_provider()
@@ -346,12 +278,13 @@ def large_kb_chat_client_builder(search_config, user):
# Create temporary directory for large number of test files
temp_dir = tempfile.mkdtemp(prefix="khoj_test_large_kb_")
file_type = "markdown"
large_file_list = []
try:
# Generate 200 test files with substantial content
for i in range(300):
file_path = os.path.join(temp_dir, f"test_file_{i:03d}.markdown")
file_path = os.path.join(temp_dir, f"test_file_{i:03d}.{file_type}")
content = f"""
# Test File {i}
@@ -401,16 +334,9 @@ End of file {i}.
f.write(content)
large_file_list.append(file_path)
# Create LocalMarkdownConfig with all the generated files
LocalMarkdownConfig.objects.create(
input_files=large_file_list,
input_filter=None,
user=user,
)
# Index all the files into the user's knowledge base
all_files = fs_syncer.collect_files(user=user)
configure_content(user, all_files)
# Index all generated files into the user's knowledge base
files_to_index = {file_type: get_index_files(input_files=large_file_list, input_filters=None)}
configure_content(user, files_to_index)
# Verify we have a substantial knowledge base
file_count = FileObject.objects.filter(user=user, agent=None).count()
@@ -493,139 +419,18 @@ def client(
return TestClient(app)
@pytest.fixture(scope="function")
def new_org_file(default_user: KhojUser, content_config: ContentConfig):
# Setup
org_config = LocalOrgConfig.objects.filter(user=default_user).first()
input_filters = org_config.input_filter
new_org_file = Path(input_filters[0]).parent / "new_file.org"
new_org_file.touch()
yield new_org_file
# Cleanup
if new_org_file.exists():
new_org_file.unlink()
@pytest.fixture(scope="function")
def org_config_with_only_new_file(new_org_file: Path, default_user: KhojUser):
LocalOrgConfig.objects.update(input_files=[str(new_org_file)], input_filter=None)
return LocalOrgConfig.objects.filter(user=default_user).first()
@pytest.fixture(scope="function")
def pdf_configured_user1(default_user: KhojUser):
LocalPdfConfig.objects.create(
input_files=None,
input_filter=["tests/data/pdf/singlepage.pdf"],
user=default_user,
)
# Index Markdown Content for Search
all_files = fs_syncer.collect_files(user=default_user)
configure_content(default_user, all_files)
# Read data from pdf file at tests/data/pdf/singlepage.pdf
pdf_file_path = "tests/data/pdf/singlepage.pdf"
with open(pdf_file_path, "rb") as pdf_file:
pdf_data = pdf_file.read()
knowledge_base = {"pdf": {"singlepage.pdf": pdf_data}}
# Index Content for Search
configure_content(default_user, knowledge_base)
@pytest.fixture(scope="function")
def sample_org_data():
return get_sample_data("org")
def get_sample_data(type: str) -> dict[str, str]:
    """Return sample test content for the given content `type`.

    Maps filename to raw file content, in the format the text indexing
    helpers (e.g. `text_search.setup`) expect. Supported types: "org",
    "markdown", "plaintext"; any other key raises KeyError.

    NOTE(review): parameter name `type` shadows the builtin; kept as-is
    for backward compatibility with existing callers.
    """
    sample_data = {
        "org": {
            "elisp.org": """
* Emacs Khoj
/An Emacs interface for [[https://github.com/khoj-ai/khoj][khoj]]/
** Requirements
- Install and Run [[https://github.com/khoj-ai/khoj][khoj]]
** Installation
*** Direct
- Put ~khoj.el~ in your Emacs load path. For e.g. ~/.emacs.d/lisp
- Load via ~use-package~ in your ~/.emacs.d/init.el or .emacs file by adding below snippet
#+begin_src elisp
;; Khoj Package
(use-package khoj
:load-path "~/.emacs.d/lisp/khoj.el"
:bind ("C-c s" . 'khoj))
#+end_src
*** Using [[https://github.com/quelpa/quelpa#installation][Quelpa]]
- Ensure [[https://github.com/quelpa/quelpa#installation][Quelpa]], [[https://github.com/quelpa/quelpa-use-package#installation][quelpa-use-package]] are installed
- Add below snippet to your ~/.emacs.d/init.el or .emacs config file and execute it.
#+begin_src elisp
;; Khoj Package
(use-package khoj
:quelpa (khoj :fetcher url :url "https://raw.githubusercontent.com/khoj-ai/khoj/master/interface/emacs/khoj.el")
:bind ("C-c s" . 'khoj))
#+end_src
** Usage
1. Call ~khoj~ using keybinding ~C-c s~ or ~M-x khoj~
2. Enter Query in Natural Language
e.g. "What is the meaning of life?" "What are my life goals?"
3. Wait for results
*Note: It takes about 15s on a Mac M1 and a ~100K lines corpus of org-mode files*
4. (Optional) Narrow down results further
Include/Exclude specific words from results by adding to query
e.g. "What is the meaning of life? -god +none"
""",
            "readme.org": """
* Khoj
/Allow natural language search on user content like notes, images using transformer based models/
All data is processed locally. User can interface with khoj app via [[./interface/emacs/khoj.el][Emacs]], API or Commandline
** Dependencies
- Python3
- [[https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links][Miniconda]]
** Install
#+begin_src shell
git clone https://github.com/khoj-ai/khoj && cd khoj
conda env create -f environment.yml
conda activate khoj
#+end_src""",
        },
        "markdown": {
            "readme.markdown": """
# Khoj
Allow natural language search on user content like notes, images using transformer based models
All data is processed locally. User can interface with khoj app via [Emacs](./interface/emacs/khoj.el), API or Commandline
## Dependencies
- Python3
- [Miniconda](https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links)
## Install
```shell
git clone
conda env create -f environment.yml
conda activate khoj
```
"""
        },
        "plaintext": {
            "readme.txt": """
Khoj
Allow natural language search on user content like notes, images using transformer based models
All data is processed locally. User can interface with khoj app via Emacs, API or Commandline
Dependencies
- Python3
- Miniconda
Install
git clone
conda env create -f environment.yml
conda activate khoj
"""
        },
    }
    return sample_data[type]

View File

@@ -1,3 +1,5 @@
import glob
import logging
import os
from datetime import datetime
@@ -17,6 +19,9 @@ from khoj.database.models import (
UserConversationConfig,
)
from khoj.processor.conversation.utils import message_to_log
from khoj.utils.helpers import get_absolute_path, is_none_or_empty
logger = logging.getLogger(__name__)
def get_chat_provider(default: ChatModel.ModelType | None = ChatModel.ModelType.GOOGLE):
@@ -61,6 +66,140 @@ def generate_chat_history(message_list):
return chat_history
def get_sample_data(type: str) -> dict[str, str]:
    """Return sample test content for the given content `type`.

    Maps filename to raw file content, in the format the text indexing
    helpers (e.g. `text_search.setup`) expect. Supported types: "org",
    "markdown", "plaintext"; any other key raises KeyError.

    NOTE(review): parameter name `type` shadows the builtin; kept as-is
    for backward compatibility with existing callers.
    """
    sample_data = {
        "org": {
            "elisp.org": """
* Emacs Khoj
/An Emacs interface for [[https://github.com/khoj-ai/khoj][khoj]]/
** Requirements
- Install and Run [[https://github.com/khoj-ai/khoj][khoj]]
** Installation
*** Direct
- Put ~khoj.el~ in your Emacs load path. For e.g. ~/.emacs.d/lisp
- Load via ~use-package~ in your ~/.emacs.d/init.el or .emacs file by adding below snippet
#+begin_src elisp
;; Khoj Package
(use-package khoj
:load-path "~/.emacs.d/lisp/khoj.el"
:bind ("C-c s" . 'khoj))
#+end_src
*** Using [[https://github.com/quelpa/quelpa#installation][Quelpa]]
- Ensure [[https://github.com/quelpa/quelpa#installation][Quelpa]], [[https://github.com/quelpa/quelpa-use-package#installation][quelpa-use-package]] are installed
- Add below snippet to your ~/.emacs.d/init.el or .emacs config file and execute it.
#+begin_src elisp
;; Khoj Package
(use-package khoj
:quelpa (khoj :fetcher url :url "https://raw.githubusercontent.com/khoj-ai/khoj/master/interface/emacs/khoj.el")
:bind ("C-c s" . 'khoj))
#+end_src
** Usage
1. Call ~khoj~ using keybinding ~C-c s~ or ~M-x khoj~
2. Enter Query in Natural Language
e.g. "What is the meaning of life?" "What are my life goals?"
3. Wait for results
*Note: It takes about 15s on a Mac M1 and a ~100K lines corpus of org-mode files*
4. (Optional) Narrow down results further
Include/Exclude specific words from results by adding to query
e.g. "What is the meaning of life? -god +none"
""",
            "readme.org": """
* Khoj
/Allow natural language search on user content like notes, images using transformer based models/
All data is processed locally. User can interface with khoj app via [[./interface/emacs/khoj.el][Emacs]], API or Commandline
** Dependencies
- Python3
- [[https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links][Miniconda]]
** Install
#+begin_src shell
git clone https://github.com/khoj-ai/khoj && cd khoj
conda env create -f environment.yml
conda activate khoj
#+end_src""",
        },
        "markdown": {
            "readme.markdown": """
# Khoj
Allow natural language search on user content like notes, images using transformer based models
All data is processed locally. User can interface with khoj app via [Emacs](./interface/emacs/khoj.el), API or Commandline
## Dependencies
- Python3
- [Miniconda](https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links)
## Install
```shell
git clone
conda env create -f environment.yml
conda activate khoj
```
"""
        },
        "plaintext": {
            "readme.txt": """
Khoj
Allow natural language search on user content like notes, images using transformer based models
All data is processed locally. User can interface with khoj app via Emacs, API or Commandline
Dependencies
- Python3
- Miniconda
Install
git clone
conda env create -f environment.yml
conda activate khoj
"""
        },
    }
    return sample_data[type]
# Immutable default so the function avoids the mutable-default-argument pitfall
# while preserving the original default behavior (index the org test data).
_DEFAULT_INPUT_FILTERS = ("tests/data/org/*.org",)


def get_index_files(
    input_files: list[str] | None = None,
    input_filters: list[str] | tuple[str, ...] | None = _DEFAULT_INPUT_FILTERS,
) -> dict[str, str]:
    """Collect test files to index, as a map of absolute file path to content.

    Modelled after the old `get_org_files` variants in the main app; the
    returned dict is in the format expected by `configure_content`.

    Args:
        input_files: Explicit file paths to include.
        input_filters: Glob patterns whose file matches are included.
            Defaults to the org-mode test data.

    Returns:
        Mapping of absolute file path to its text content. Files that
        cannot be opened or decoded are logged and skipped. Empty dict
        when neither input_files nor input_filters is specified.
    """
    # Input Validation: need at least one source of files
    if is_none_or_empty(input_files) and is_none_or_empty(input_filters):
        logger.debug("At least one of input_files or input_filters is required to be specified")
        return {}

    # Resolve explicit files and glob matches into one de-duplicated set
    absolute_files: set[str] = set()
    filtered_files: set[str] = set()
    if input_files:
        absolute_files = {get_absolute_path(input_file) for input_file in input_files}
    if input_filters:
        filtered_files = {
            filtered_file
            for file_filter in input_filters
            for filtered_file in glob.glob(get_absolute_path(file_filter), recursive=True)
            if os.path.isfile(filtered_file)
        }
    all_files = sorted(absolute_files | filtered_files)

    # Read file contents; wrap open() too, so a missing/unreadable file is
    # skipped with a warning instead of crashing the fixture setup
    filename_to_content_map: dict[str, str] = {}
    for file in all_files:
        try:
            with open(file, "r", encoding="utf8") as f:
                filename_to_content_map[file] = f.read()
        except Exception as e:
            logger.warning(f"Unable to read file: {file}. Skipping file.")
            logger.warning(e, exc_info=True)
    return filename_to_content_map
class UserFactory(factory.django.DjangoModelFactory):
class Meta:
model = KhojUser

View File

@@ -15,7 +15,7 @@ from tests.helpers import ChatModelFactory
def test_create_default_agent(default_user: KhojUser):
ChatModelFactory()
agent = AgentAdapters.create_default_agent(default_user)
agent = AgentAdapters.create_default_agent()
assert agent is not None
assert agent.input_tools == []
assert agent.output_modes == []

View File

@@ -1,49 +1,15 @@
# Standard Modules
from pathlib import Path
from random import random
from khoj.utils.cli import cli
from khoj.utils.helpers import resolve_absolute_path
# Test
# ----------------------------------------------------------------------------------------------------
def test_cli_minimal_default():
# Act
actual_args = cli([])
actual_args = cli(["-vvv"])
# Assert
assert actual_args.config_file == resolve_absolute_path(Path("~/.khoj/khoj.yml"))
assert actual_args.regenerate == False
assert actual_args.verbose == 0
# ----------------------------------------------------------------------------------------------------
def test_cli_invalid_config_file_path():
# Arrange
non_existent_config_file = f"non-existent-khoj-{random()}.yml"
# Act
actual_args = cli([f"--config-file={non_existent_config_file}"])
# Assert
assert actual_args.config_file == resolve_absolute_path(non_existent_config_file)
assert actual_args.config == None
# ----------------------------------------------------------------------------------------------------
def test_cli_config_from_file():
# Act
actual_args = cli(["--config-file=tests/data/config.yml", "--regenerate", "-vvv"])
# Assert
assert actual_args.config_file == resolve_absolute_path(Path("tests/data/config.yml"))
assert actual_args.regenerate == True
assert actual_args.config is not None
assert actual_args.log_file == Path("~/.khoj/khoj.log")
assert actual_args.verbose == 3
# Ensure content config is loaded from file
assert actual_args.config.content_type.org.input_files == [
Path("~/first_from_config.org"),
Path("~/second_from_config.org"),
]

View File

@@ -13,7 +13,6 @@ from khoj.database.models import KhojApiUser, KhojUser
from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
from khoj.search_type import text_search
from khoj.utils import state
from khoj.utils.rawconfig import ContentConfig, SearchConfig
# Test
@@ -296,7 +295,7 @@ def test_get_configured_types_with_no_content_config(fastapi_app: FastAPI):
# ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db(transaction=True)
def test_notes_search(client, search_config: SearchConfig, sample_org_data, default_user: KhojUser):
def test_notes_search(client, search_config, sample_org_data, default_user: KhojUser):
# Arrange
headers = {"Authorization": "Bearer kk-secret"}
text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
@@ -315,7 +314,7 @@ def test_notes_search(client, search_config: SearchConfig, sample_org_data, defa
# ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db(transaction=True)
def test_notes_search_no_results(client, search_config: SearchConfig, sample_org_data, default_user: KhojUser):
def test_notes_search_no_results(client, search_config, sample_org_data, default_user: KhojUser):
# Arrange
headers = {"Authorization": "Bearer kk-secret"}
text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
@@ -331,9 +330,7 @@ def test_notes_search_no_results(client, search_config: SearchConfig, sample_org
# ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db(transaction=True)
def test_notes_search_with_only_filters(
client, content_config: ContentConfig, search_config: SearchConfig, sample_org_data, default_user: KhojUser
):
def test_notes_search_with_only_filters(client, sample_org_data, default_user: KhojUser):
# Arrange
headers = {"Authorization": "Bearer kk-secret"}
text_search.setup(
@@ -397,9 +394,7 @@ def test_notes_search_with_exclude_filter(client, sample_org_data, default_user:
# ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db(transaction=True)
def test_notes_search_requires_parent_context(
client, search_config: SearchConfig, sample_org_data, default_user: KhojUser
):
def test_notes_search_requires_parent_context(client, search_config, sample_org_data, default_user: KhojUser):
# Arrange
headers = {"Authorization": "Bearer kk-secret"}
text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)

View File

@@ -1,6 +1,13 @@
# Application Packages
from khoj.search_filter.file_filter import FileFilter
from khoj.utils.rawconfig import Entry
# Mock Entry class for testing
class Entry:
    """Minimal stand-in for khoj's Entry model.

    Exposes only the attributes the file-filter tests read:
    `compiled`, `raw`, and `file`. All default to empty strings.
    """

    def __init__(self, compiled="", raw="", file=""):
        self.raw = raw
        self.file = file
        self.compiled = compiled
def test_can_filter_no_file_filter():

View File

@@ -3,8 +3,6 @@ import re
from pathlib import Path
from khoj.processor.content.markdown.markdown_to_entries import MarkdownToEntries
from khoj.utils.fs_syncer import get_markdown_files
from khoj.utils.rawconfig import TextContentConfig
def test_extract_markdown_with_no_headings(tmp_path):
@@ -212,43 +210,6 @@ longer body line 2.1
), "Third entry is second entries child heading"
def test_get_markdown_files(tmp_path):
"Ensure Markdown files specified via input-filter, input-files extracted"
# Arrange
# Include via input-filter globs
group1_file1 = create_file(tmp_path, filename="group1-file1.md")
group1_file2 = create_file(tmp_path, filename="group1-file2.md")
group2_file1 = create_file(tmp_path, filename="group2-file1.markdown")
group2_file2 = create_file(tmp_path, filename="group2-file2.markdown")
# Include via input-file field
file1 = create_file(tmp_path, filename="notes.md")
# Not included by any filter
create_file(tmp_path, filename="not-included-markdown.md")
create_file(tmp_path, filename="not-included-text.txt")
expected_files = set(
[os.path.join(tmp_path, file.name) for file in [group1_file1, group1_file2, group2_file1, group2_file2, file1]]
)
# Setup input-files, input-filters
input_files = [tmp_path / "notes.md"]
input_filter = [tmp_path / "group1*.md", tmp_path / "group2*.markdown"]
markdown_config = TextContentConfig(
input_files=input_files,
input_filter=[str(filter) for filter in input_filter],
compressed_jsonl=tmp_path / "test.jsonl",
embeddings_file=tmp_path / "test_embeddings.jsonl",
)
# Act
extracted_org_files = get_markdown_files(markdown_config)
# Assert
assert len(extracted_org_files) == 5
assert set(extracted_org_files.keys()) == expected_files
def test_line_number_tracking_in_recursive_split():
"Ensure line numbers in URIs are correct after recursive splitting by checking against the actual file."
# Arrange

View File

@@ -4,9 +4,8 @@ import time
from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
from khoj.processor.content.text_to_entries import TextToEntries
from khoj.utils.fs_syncer import get_org_files
from khoj.utils.helpers import is_none_or_empty
from khoj.utils.rawconfig import Entry, TextContentConfig
from khoj.utils.rawconfig import Entry
def test_configure_indexing_heading_only_entries(tmp_path):
@@ -330,46 +329,6 @@ def test_file_with_no_headings_to_entry(tmp_path):
assert len(entries[1]) == 1
def test_get_org_files(tmp_path):
"Ensure Org files specified via input-filter, input-files extracted"
# Arrange
# Include via input-filter globs
group1_file1 = create_file(tmp_path, filename="group1-file1.org")
group1_file2 = create_file(tmp_path, filename="group1-file2.org")
group2_file1 = create_file(tmp_path, filename="group2-file1.org")
group2_file2 = create_file(tmp_path, filename="group2-file2.org")
# Include via input-file field
orgfile1 = create_file(tmp_path, filename="orgfile1.org")
# Not included by any filter
create_file(tmp_path, filename="orgfile2.org")
create_file(tmp_path, filename="text1.txt")
expected_files = set(
[
os.path.join(tmp_path, file.name)
for file in [group1_file1, group1_file2, group2_file1, group2_file2, orgfile1]
]
)
# Setup input-files, input-filters
input_files = [tmp_path / "orgfile1.org"]
input_filter = [tmp_path / "group1*.org", tmp_path / "group2*.org"]
org_config = TextContentConfig(
input_files=input_files,
input_filter=[str(filter) for filter in input_filter],
compressed_jsonl=tmp_path / "test.jsonl",
embeddings_file=tmp_path / "test_embeddings.jsonl",
)
# Act
extracted_org_files = get_org_files(org_config)
# Assert
assert len(extracted_org_files) == 5
assert set(extracted_org_files.keys()) == expected_files
def test_extract_entries_with_different_level_headings(tmp_path):
"Extract org entries with different level headings."
# Arrange

View File

@@ -4,8 +4,6 @@ import re
import pytest
from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries
from khoj.utils.fs_syncer import get_pdf_files
from khoj.utils.rawconfig import TextContentConfig
def test_single_page_pdf_to_jsonl():
@@ -61,43 +59,6 @@ def test_ocr_page_pdf_to_jsonl():
assert re.search(expected_str_with_variable_spaces, raw_entry) is not None
def test_get_pdf_files(tmp_path):
"Ensure Pdf files specified via input-filter, input-files extracted"
# Arrange
# Include via input-filter globs
group1_file1 = create_file(tmp_path, filename="group1-file1.pdf")
group1_file2 = create_file(tmp_path, filename="group1-file2.pdf")
group2_file1 = create_file(tmp_path, filename="group2-file1.pdf")
group2_file2 = create_file(tmp_path, filename="group2-file2.pdf")
# Include via input-file field
file1 = create_file(tmp_path, filename="document.pdf")
# Not included by any filter
create_file(tmp_path, filename="not-included-document.pdf")
create_file(tmp_path, filename="not-included-text.txt")
expected_files = set(
[os.path.join(tmp_path, file.name) for file in [group1_file1, group1_file2, group2_file1, group2_file2, file1]]
)
# Setup input-files, input-filters
input_files = [tmp_path / "document.pdf"]
input_filter = [tmp_path / "group1*.pdf", tmp_path / "group2*.pdf"]
pdf_config = TextContentConfig(
input_files=input_files,
input_filter=[str(path) for path in input_filter],
compressed_jsonl=tmp_path / "test.jsonl",
embeddings_file=tmp_path / "test_embeddings.jsonl",
)
# Act
extracted_pdf_files = get_pdf_files(pdf_config)
# Assert
assert len(extracted_pdf_files) == 5
assert set(extracted_pdf_files.keys()) == expected_files
# Helper Functions
def create_file(tmp_path, entry=None, filename="document.pdf"):
pdf_file = tmp_path / filename

View File

@@ -1,27 +1,20 @@
import os
from pathlib import Path
from textwrap import dedent
from khoj.database.models import KhojUser, LocalPlaintextConfig
from khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries
from khoj.utils.fs_syncer import get_plaintext_files
from khoj.utils.rawconfig import TextContentConfig
def test_plaintext_file(tmp_path):
def test_plaintext_file():
"Convert files with no heading to jsonl."
# Arrange
raw_entry = f"""
Hi, I am a plaintext file and I have some plaintext words.
"""
plaintextfile = create_file(tmp_path, raw_entry)
plaintextfile = "test.txt"
data = {plaintextfile: raw_entry}
# Act
# Extract Entries from specified plaintext files
data = {
f"{plaintextfile}": raw_entry,
}
entries = PlaintextToEntries.extract_plaintext_entries(data)
# Convert each entry.file to absolute path to make them JSON serializable
@@ -37,59 +30,20 @@ def test_plaintext_file(tmp_path):
assert entries[1][0].compiled == f"{plaintextfile}\n{raw_entry}"
def test_get_plaintext_files(tmp_path):
"Ensure Plaintext files specified via input-filter, input-files extracted"
# Arrange
# Include via input-filter globs
group1_file1 = create_file(tmp_path, filename="group1-file1.md")
group1_file2 = create_file(tmp_path, filename="group1-file2.md")
group2_file1 = create_file(tmp_path, filename="group2-file1.markdown")
group2_file2 = create_file(tmp_path, filename="group2-file2.markdown")
group2_file4 = create_file(tmp_path, filename="group2-file4.html")
# Include via input-file field
file1 = create_file(tmp_path, filename="notes.txt")
# Include unsupported file types
create_file(tmp_path, filename="group2-unincluded.py")
create_file(tmp_path, filename="group2-unincluded.csv")
create_file(tmp_path, filename="group2-unincluded.csv")
create_file(tmp_path, filename="group2-file3.mbox")
# Not included by any filter
create_file(tmp_path, filename="not-included-markdown.md")
create_file(tmp_path, filename="not-included-text.txt")
expected_files = set(
[
os.path.join(tmp_path, file.name)
for file in [group1_file1, group1_file2, group2_file1, group2_file2, group2_file4, file1]
]
)
# Setup input-files, input-filters
input_files = [tmp_path / "notes.txt"]
input_filter = [tmp_path / "group1*.md", tmp_path / "group2*.*"]
plaintext_config = TextContentConfig(
input_files=input_files,
input_filter=[str(filter) for filter in input_filter],
compressed_jsonl=tmp_path / "test.jsonl",
embeddings_file=tmp_path / "test_embeddings.jsonl",
)
# Act
extracted_plaintext_files = get_plaintext_files(plaintext_config)
# Assert
assert len(extracted_plaintext_files) == len(expected_files)
assert set(extracted_plaintext_files.keys()) == set(expected_files)
def test_parse_html_plaintext_file(content_config, default_user: KhojUser):
def test_parse_html_plaintext_file(tmp_path):
"Ensure HTML files are parsed correctly"
# Arrange
# Setup input-files, input-filters
config = LocalPlaintextConfig.objects.filter(user=default_user).first()
extracted_plaintext_files = get_plaintext_files(config=config)
raw_entry = dedent(
f"""
<html>
<head><title>Test HTML</title></head>
<body>
<div>Test content</div>
</body>
</html>
"""
)
extracted_plaintext_files = {"test.html": raw_entry}
# Act
entries = PlaintextToEntries.extract_plaintext_entries(extracted_plaintext_files)

View File

@@ -2,23 +2,16 @@
import asyncio
import logging
import os
from pathlib import Path
import pytest
from khoj.database.adapters import EntryAdapters
from khoj.database.models import Entry, GithubConfig, KhojUser, LocalOrgConfig
from khoj.processor.content.docx.docx_to_entries import DocxToEntries
from khoj.database.models import Entry, GithubConfig, KhojUser
from khoj.processor.content.github.github_to_entries import GithubToEntries
from khoj.processor.content.images.image_to_entries import ImageToEntries
from khoj.processor.content.markdown.markdown_to_entries import MarkdownToEntries
from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries
from khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries
from khoj.processor.content.text_to_entries import TextToEntries
from khoj.search_type import text_search
from khoj.utils.fs_syncer import collect_files, get_org_files
from khoj.utils.rawconfig import ContentConfig, SearchConfig
from tests.helpers import get_index_files, get_sample_data
logger = logging.getLogger(__name__)
@@ -26,53 +19,20 @@ logger = logging.getLogger(__name__)
# Test
# ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db
def test_text_search_setup_with_missing_file_raises_error(org_config_with_only_new_file: LocalOrgConfig):
# Arrange
# Ensure file mentioned in org.input-files is missing
single_new_file = Path(org_config_with_only_new_file.input_files[0])
single_new_file.unlink()
# Act
# Generate notes embeddings during asymmetric setup
with pytest.raises(FileNotFoundError):
get_org_files(org_config_with_only_new_file)
# ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db
def test_get_org_files_with_org_suffixed_dir_doesnt_raise_error(tmp_path, default_user: KhojUser):
# Arrange
orgfile = tmp_path / "directory.org" / "file.org"
orgfile.parent.mkdir()
with open(orgfile, "w") as f:
f.write("* Heading\n- List item\n")
LocalOrgConfig.objects.create(
input_filter=[f"{tmp_path}/**/*"],
input_files=None,
user=default_user,
)
# Act
org_files = collect_files(user=default_user)["org"]
# Assert
# should return orgfile and not raise IsADirectoryError
assert org_files == {f"{orgfile}": "* Heading\n- List item\n"}
# ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db
def test_text_search_setup_with_empty_file_creates_no_entries(
org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser
):
def test_text_search_setup_with_empty_file_creates_no_entries(search_config, default_user: KhojUser):
# Arrange
initial_data = {
"test.org": "* First heading\nFirst content",
"test2.org": "* Second heading\nSecond content",
}
text_search.setup(OrgToEntries, initial_data, regenerate=True, user=default_user)
existing_entries = Entry.objects.filter(user=default_user).count()
data = get_org_files(org_config_with_only_new_file)
final_data = {"new_file.org": ""}
# Act
# Generate notes embeddings during asymmetric setup
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
text_search.setup(OrgToEntries, final_data, regenerate=True, user=default_user)
# Assert
updated_entries = Entry.objects.filter(user=default_user).count()
@@ -84,13 +44,14 @@ def test_text_search_setup_with_empty_file_creates_no_entries(
# ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db
def test_text_indexer_deletes_embedding_before_regenerate(
content_config: ContentConfig, default_user: KhojUser, caplog
):
def test_text_indexer_deletes_embedding_before_regenerate(search_config, default_user: KhojUser, caplog):
# Arrange
data = {
"test1.org": "* Test heading\nTest content",
"test2.org": "* Another heading\nAnother content",
}
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
existing_entries = Entry.objects.filter(user=default_user).count()
org_config = LocalOrgConfig.objects.filter(user=default_user).first()
data = get_org_files(org_config)
# Act
# Generate notes embeddings during asymmetric setup
@@ -107,11 +68,10 @@ def test_text_indexer_deletes_embedding_before_regenerate(
# ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db
def test_text_index_same_if_content_unchanged(content_config: ContentConfig, default_user: KhojUser, caplog):
def test_text_index_same_if_content_unchanged(search_config, default_user: KhojUser, caplog):
# Arrange
existing_entries = Entry.objects.filter(user=default_user)
org_config = LocalOrgConfig.objects.filter(user=default_user).first()
data = get_org_files(org_config)
data = {"test.org": "* Test heading\nTest content"}
# Act
# Generate initial notes embeddings during asymmetric setup
@@ -136,20 +96,14 @@ def test_text_index_same_if_content_unchanged(content_config: ContentConfig, def
# ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db
@pytest.mark.anyio
# @pytest.mark.asyncio
async def test_text_search(search_config: SearchConfig):
@pytest.mark.asyncio
async def test_text_search(search_config):
# Arrange
default_user = await KhojUser.objects.acreate(
default_user, _ = await KhojUser.objects.aget_or_create(
username="test_user", password="test_password", email="test@example.com"
)
org_config = await LocalOrgConfig.objects.acreate(
input_files=None,
input_filter=["tests/data/org/*.org"],
index_heading_entries=False,
user=default_user,
)
data = get_org_files(org_config)
# Get some sample org data to index
data = get_sample_data("org")
loop = asyncio.get_event_loop()
await loop.run_in_executor(
@@ -175,17 +129,15 @@ async def test_text_search(search_config: SearchConfig):
# ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db
def test_entry_chunking_by_max_tokens(org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser, caplog):
def test_entry_chunking_by_max_tokens(tmp_path, search_config, default_user: KhojUser, caplog):
# Arrange
# Insert org-mode entry with size exceeding max token limit to new org file
max_tokens = 256
new_file_to_index = Path(org_config_with_only_new_file.input_files[0])
with open(new_file_to_index, "w") as f:
f.write(f"* Entry more than {max_tokens} words\n")
for index in range(max_tokens + 1):
f.write(f"{index} ")
data = get_org_files(org_config_with_only_new_file)
new_file_to_index = tmp_path / "test.org"
content = f"* Entry more than {max_tokens} words\n"
for index in range(max_tokens + 1):
content += f"{index} "
data = {str(new_file_to_index): content}
# Act
# reload embeddings, entries, notes model after adding new org-mode file
@@ -200,9 +152,7 @@ def test_entry_chunking_by_max_tokens(org_config_with_only_new_file: LocalOrgCon
# ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db
def test_entry_chunking_by_max_tokens_not_full_corpus(
org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser, caplog
):
def test_entry_chunking_by_max_tokens_not_full_corpus(tmp_path, search_config, default_user: KhojUser, caplog):
# Arrange
# Insert org-mode entry with size exceeding max token limit to new org file
data = {
@@ -231,13 +181,11 @@ conda activate khoj
)
max_tokens = 256
new_file_to_index = Path(org_config_with_only_new_file.input_files[0])
with open(new_file_to_index, "w") as f:
f.write(f"* Entry more than {max_tokens} words\n")
for index in range(max_tokens + 1):
f.write(f"{index} ")
data = get_org_files(org_config_with_only_new_file)
new_file_to_index = tmp_path / "test.org"
content = f"* Entry more than {max_tokens} words\n"
for index in range(max_tokens + 1):
content += f"{index} "
data = {str(new_file_to_index): content}
# Act
# reload embeddings, entries, notes model after adding new org-mode file
@@ -257,34 +205,34 @@ conda activate khoj
# ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db
def test_regenerate_index_with_new_entry(content_config: ContentConfig, new_org_file: Path, default_user: KhojUser):
def test_regenerate_index_with_new_entry(search_config, default_user: KhojUser):
# Arrange
# Initial indexed files
text_search.setup(OrgToEntries, get_sample_data("org"), regenerate=True, user=default_user)
existing_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
org_config = LocalOrgConfig.objects.filter(user=default_user).first()
initial_data = get_org_files(org_config)
# append org-mode entry to first org input file in config
org_config.input_files = [f"{new_org_file}"]
with open(new_org_file, "w") as f:
f.write("\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n")
final_data = get_org_files(org_config)
# Act
text_search.setup(OrgToEntries, initial_data, regenerate=True, user=default_user)
# Regenerate index with only files from test data set
files_to_index = get_index_files()
text_search.setup(OrgToEntries, files_to_index, regenerate=True, user=default_user)
updated_entries1 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
# Act
# Update index with the new file
new_file = "test.org"
new_entry = "\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n"
files_to_index[new_file] = new_entry
# regenerate notes jsonl, model embeddings and model to include entry from new file
text_search.setup(OrgToEntries, final_data, regenerate=True, user=default_user)
text_search.setup(OrgToEntries, files_to_index, regenerate=True, user=default_user)
updated_entries2 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
# Assert
for entry in updated_entries1:
assert entry in updated_entries2
assert not any([new_org_file.name in entry for entry in updated_entries1])
assert not any([new_org_file.name in entry for entry in existing_entries])
assert any([new_org_file.name in entry for entry in updated_entries2])
assert not any([new_file in entry for entry in updated_entries1])
assert not any([new_file in entry for entry in existing_entries])
assert any([new_file in entry for entry in updated_entries2])
assert any(
["Saw a super cute video of a chihuahua doing the Tango on Youtube" in entry for entry in updated_entries2]
@@ -294,28 +242,24 @@ def test_regenerate_index_with_new_entry(content_config: ContentConfig, new_org_
# ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db
def test_update_index_with_duplicate_entries_in_stable_order(
org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser
):
def test_update_index_with_duplicate_entries_in_stable_order(tmp_path, search_config, default_user: KhojUser):
# Arrange
initial_data = get_sample_data("org")
text_search.setup(OrgToEntries, initial_data, regenerate=True, user=default_user)
existing_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
new_file_to_index = Path(org_config_with_only_new_file.input_files[0])
# Insert org-mode entries with same compiled form into new org file
new_file_to_index = tmp_path / "test.org"
new_entry = "* TODO A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n"
with open(new_file_to_index, "w") as f:
f.write(f"{new_entry}{new_entry}")
data = get_org_files(org_config_with_only_new_file)
# Initial data with duplicate entries
data = {str(new_file_to_index): f"{new_entry}{new_entry}"}
# Act
# generate embeddings, entries, notes model from scratch after adding new org-mode file
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
updated_entries1 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
data = get_org_files(org_config_with_only_new_file)
# update embeddings, entries, notes model with no new changes
# idempotent indexing when data unchanged
text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
updated_entries2 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
@@ -324,6 +268,7 @@ def test_update_index_with_duplicate_entries_in_stable_order(
for entry in existing_entries:
assert entry not in updated_entries1
# verify the second indexing update has same entries and ordering as first
for entry in updated_entries1:
assert entry in updated_entries2
@@ -334,22 +279,17 @@ def test_update_index_with_duplicate_entries_in_stable_order(
# ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db
def test_update_index_with_deleted_entry(org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser):
def test_update_index_with_deleted_entry(tmp_path, search_config, default_user: KhojUser):
# Arrange
existing_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
new_file_to_index = Path(org_config_with_only_new_file.input_files[0])
# Insert org-mode entries with same compiled form into new org file
new_file_to_index = tmp_path / "test.org"
new_entry = "* TODO A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n"
with open(new_file_to_index, "w") as f:
f.write(f"{new_entry}{new_entry} -- Tatooine")
initial_data = get_org_files(org_config_with_only_new_file)
# update embeddings, entries, notes model after removing an entry from the org file
with open(new_file_to_index, "w") as f:
f.write(f"{new_entry}")
final_data = get_org_files(org_config_with_only_new_file)
# Initial data with two entries
initial_data = {str(new_file_to_index): f"{new_entry}{new_entry} -- Tatooine"}
# Final data with only first entry, with second entry removed
final_data = {str(new_file_to_index): f"{new_entry}"}
# Act
# load embeddings, entries, notes model after adding new org file with 2 entries
@@ -375,29 +315,29 @@ def test_update_index_with_deleted_entry(org_config_with_only_new_file: LocalOrg
# ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db
def test_update_index_with_new_entry(content_config: ContentConfig, new_org_file: Path, default_user: KhojUser):
def test_update_index_with_new_entry(search_config, default_user: KhojUser):
# Arrange
existing_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
org_config = LocalOrgConfig.objects.filter(user=default_user).first()
data = get_org_files(org_config)
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
# Initial indexed files
text_search.setup(OrgToEntries, get_sample_data("org"), regenerate=True, user=default_user)
old_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
# append org-mode entry to first org input file in config
with open(new_org_file, "w") as f:
new_entry = "\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n"
f.write(new_entry)
data = get_org_files(org_config)
# Regenerate index with only files from test data set
files_to_index = get_index_files()
new_entries = text_search.setup(OrgToEntries, files_to_index, regenerate=True, user=default_user)
# Act
# update embeddings, entries with the newly added note
text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
updated_entries1 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
# Update index with the new file
new_file = "test.org"
new_entry = "\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n"
final_data = {new_file: new_entry}
text_search.setup(OrgToEntries, final_data, regenerate=False, user=default_user)
updated_new_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
# Assert
for entry in existing_entries:
assert entry not in updated_entries1
assert len(updated_entries1) == len(existing_entries) + 1
for old_entry in old_entries:
assert old_entry not in updated_new_entries
assert len(updated_new_entries) == len(new_entries) + 1
verify_embeddings(3, default_user)
@@ -409,9 +349,7 @@ def test_update_index_with_new_entry(content_config: ContentConfig, new_org_file
(OrgToEntries),
],
)
def test_update_index_with_deleted_file(
org_config_with_only_new_file: LocalOrgConfig, text_to_entries: TextToEntries, default_user: KhojUser
):
def test_update_index_with_deleted_file(text_to_entries: TextToEntries, search_config, default_user: KhojUser):
"Delete entries associated with new file when file path with empty content passed."
# Arrange
file_to_index = "test"
@@ -446,7 +384,7 @@ def test_update_index_with_deleted_file(
# ----------------------------------------------------------------------------------------------------
@pytest.mark.skipif(os.getenv("GITHUB_PAT_TOKEN") is None, reason="GITHUB_PAT_TOKEN not set")
def test_text_search_setup_github(content_config: ContentConfig, default_user: KhojUser):
def test_text_search_setup_github(search_config, default_user: KhojUser):
# Arrange
github_config = GithubConfig.objects.filter(user=default_user).first()

View File

@@ -1,6 +1,12 @@
# Application Packages
from khoj.search_filter.word_filter import WordFilter
from khoj.utils.rawconfig import Entry
# Mock Entry class for testing
class Entry:
def __init__(self, compiled="", raw=""):
self.compiled = compiled
self.raw = raw
# Test