From 892d57314e3a047a3f63deb775aaf09374d8ed8e Mon Sep 17 00:00:00 2001 From: Debanjum Date: Fri, 11 Jul 2025 14:35:05 -0700 Subject: [PATCH] Update test setup to index test data after old indexing code removed - Delete tests testing deprecated server side indexing flows - Delete `Local(Plaintext|Org|Markdown|Pdf)Config' methods, files and references in tests - Index test data via new helper method, `get_index_files' - It is modelled after the old `get_org_files' variants in main app - It passes the test data in required format to `configure_content' Allows maintaining the more realistic tests from before while using new indexing mechanism (rather than the deprecated server side indexing mechanism) --- tests/conftest.py | 259 ++++------------------------- tests/helpers.py | 139 ++++++++++++++++ tests/test_agents.py | 2 +- tests/test_cli.py | 38 +---- tests/test_client.py | 13 +- tests/test_file_filter.py | 9 +- tests/test_markdown_to_entries.py | 39 ----- tests/test_org_to_entries.py | 43 +---- tests/test_pdf_to_entries.py | 39 ----- tests/test_plaintext_to_entries.py | 78 ++------- tests/test_text_search.py | 232 ++++++++++---------------- tests/test_word_filter.py | 8 +- 12 files changed, 295 insertions(+), 604 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 097a0ab0..dd448bd1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,3 @@ -import os -from pathlib import Path - import pytest from fastapi import FastAPI from fastapi.staticfiles import StaticFiles @@ -11,6 +8,7 @@ from khoj.configure import ( configure_routes, configure_search_types, ) +from khoj.database.adapters import get_default_search_model from khoj.database.models import ( Agent, ChatModel, @@ -19,21 +17,14 @@ from khoj.database.models import ( GithubRepoConfig, KhojApiUser, KhojUser, - LocalMarkdownConfig, - LocalOrgConfig, - LocalPdfConfig, - LocalPlaintextConfig, ) from khoj.processor.content.org_mode.org_to_entries import OrgToEntries from 
khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel from khoj.routers.api_content import configure_content from khoj.search_type import text_search -from khoj.utils import fs_syncer, state -from khoj.utils.config import SearchModels +from khoj.utils import state from khoj.utils.constants import web_directory -from khoj.utils.helpers import resolve_absolute_path -from khoj.utils.rawconfig import ContentConfig, SearchConfig from tests.helpers import ( AiModelApiFactory, ChatModelFactory, @@ -43,6 +34,8 @@ from tests.helpers import ( UserFactory, get_chat_api_key, get_chat_provider, + get_index_files, + get_sample_data, ) @@ -59,17 +52,16 @@ def django_db_setup(django_db_setup, django_db_blocker): @pytest.fixture(scope="session") -def search_config() -> SearchConfig: +def search_config(): + search_model = get_default_search_model() state.embeddings_model = dict() - state.embeddings_model["default"] = EmbeddingsModel() + state.embeddings_model["default"] = EmbeddingsModel( + model_name=search_model.bi_encoder, model_kwargs=search_model.bi_encoder_model_config + ) state.cross_encoder_model = dict() - state.cross_encoder_model["default"] = CrossEncoderModel() - - model_dir = resolve_absolute_path("~/.khoj/search") - model_dir.mkdir(parents=True, exist_ok=True) - search_config = SearchConfig() - - return search_config + state.cross_encoder_model["default"] = CrossEncoderModel( + model_name=search_model.cross_encoder, model_kwargs=search_model.cross_encoder_model_config + ) @pytest.mark.django_db @@ -201,13 +193,6 @@ def openai_agent(): ) -@pytest.fixture(scope="session") -def search_models(search_config: SearchConfig): - search_models = SearchModels() - - return search_models - - @pytest.mark.django_db @pytest.fixture def default_process_lock(): @@ -219,72 +204,23 @@ def anyio_backend(): return "asyncio" -@pytest.mark.django_db @pytest.fixture(scope="function") -def 
content_config(tmp_path_factory, search_models: SearchModels, default_user: KhojUser): - content_dir = tmp_path_factory.mktemp("content") - - # Generate Image Embeddings from Test Images - content_config = ContentConfig() - - LocalOrgConfig.objects.create( - input_files=None, - input_filter=["tests/data/org/*.org"], - index_heading_entries=False, - user=default_user, - ) - - text_search.setup(OrgToEntries, get_sample_data("org"), regenerate=False, user=default_user) - - if os.getenv("GITHUB_PAT_TOKEN"): - GithubConfig.objects.create( - pat_token=os.getenv("GITHUB_PAT_TOKEN"), - user=default_user, - ) - - GithubRepoConfig.objects.create( - owner="khoj-ai", - name="lantern", - branch="master", - github_config=GithubConfig.objects.get(user=default_user), - ) - - LocalPlaintextConfig.objects.create( - input_files=None, - input_filter=["tests/data/plaintext/*.txt", "tests/data/plaintext/*.md", "tests/data/plaintext/*.html"], - user=default_user, - ) - - return content_config - - -@pytest.fixture(scope="session") -def md_content_config(): - markdown_config = LocalMarkdownConfig.objects.create( - input_files=None, - input_filter=["tests/data/markdown/*.markdown"], - ) - - return markdown_config - - -@pytest.fixture(scope="function") -def chat_client(search_config: SearchConfig, default_user2: KhojUser): +def chat_client(search_config, default_user2: KhojUser): return chat_client_builder(search_config, default_user2, require_auth=False) @pytest.fixture(scope="function") -def chat_client_with_auth(search_config: SearchConfig, default_user2: KhojUser): +def chat_client_with_auth(search_config, default_user2: KhojUser): return chat_client_builder(search_config, default_user2, require_auth=True) @pytest.fixture(scope="function") -def chat_client_no_background(search_config: SearchConfig, default_user2: KhojUser): +def chat_client_no_background(search_config, default_user2: KhojUser): return chat_client_builder(search_config, default_user2, index_content=False, 
require_auth=False) @pytest.fixture(scope="function") -def chat_client_with_large_kb(search_config: SearchConfig, default_user2: KhojUser): +def chat_client_with_large_kb(search_config, default_user2: KhojUser): """ Chat client fixture that creates a large knowledge base with many files for stress testing atomic agent updates. @@ -298,15 +234,11 @@ def chat_client_builder(search_config, user, index_content=True, require_auth=Fa state.SearchType = configure_search_types() if index_content: - LocalMarkdownConfig.objects.create( - input_files=None, - input_filter=["tests/data/markdown/*.markdown"], - user=user, - ) + file_type = "markdown" + files_to_index = {file_type: get_index_files(input_filters=[f"tests/data/{file_type}/*.{file_type}"])} # Index Markdown Content for Search - all_files = fs_syncer.collect_files(user=user) - configure_content(user, all_files) + configure_content(user, files_to_index) # Initialize Processor from Config chat_provider = get_chat_provider() @@ -346,12 +278,13 @@ def large_kb_chat_client_builder(search_config, user): # Create temporary directory for large number of test files temp_dir = tempfile.mkdtemp(prefix="khoj_test_large_kb_") + file_type = "markdown" large_file_list = [] try: # Generate 200 test files with substantial content for i in range(300): - file_path = os.path.join(temp_dir, f"test_file_{i:03d}.markdown") + file_path = os.path.join(temp_dir, f"test_file_{i:03d}.{file_type}") content = f""" # Test File {i} @@ -401,16 +334,9 @@ End of file {i}. 
f.write(content) large_file_list.append(file_path) - # Create LocalMarkdownConfig with all the generated files - LocalMarkdownConfig.objects.create( - input_files=large_file_list, - input_filter=None, - user=user, - ) - - # Index all the files into the user's knowledge base - all_files = fs_syncer.collect_files(user=user) - configure_content(user, all_files) + # Index all generated files into the user's knowledge base + files_to_index = {file_type: get_index_files(input_files=large_file_list, input_filters=None)} + configure_content(user, files_to_index) # Verify we have a substantial knowledge base file_count = FileObject.objects.filter(user=user, agent=None).count() @@ -493,139 +419,18 @@ def client( return TestClient(app) -@pytest.fixture(scope="function") -def new_org_file(default_user: KhojUser, content_config: ContentConfig): - # Setup - org_config = LocalOrgConfig.objects.filter(user=default_user).first() - input_filters = org_config.input_filter - new_org_file = Path(input_filters[0]).parent / "new_file.org" - new_org_file.touch() - - yield new_org_file - - # Cleanup - if new_org_file.exists(): - new_org_file.unlink() - - -@pytest.fixture(scope="function") -def org_config_with_only_new_file(new_org_file: Path, default_user: KhojUser): - LocalOrgConfig.objects.update(input_files=[str(new_org_file)], input_filter=None) - return LocalOrgConfig.objects.filter(user=default_user).first() - - @pytest.fixture(scope="function") def pdf_configured_user1(default_user: KhojUser): - LocalPdfConfig.objects.create( - input_files=None, - input_filter=["tests/data/pdf/singlepage.pdf"], - user=default_user, - ) - # Index Markdown Content for Search - all_files = fs_syncer.collect_files(user=default_user) - configure_content(default_user, all_files) + # Read data from pdf file at tests/data/pdf/singlepage.pdf + pdf_file_path = "tests/data/pdf/singlepage.pdf" + with open(pdf_file_path, "rb") as pdf_file: + pdf_data = pdf_file.read() + + knowledge_base = {"pdf": 
{"singlepage.pdf": pdf_data}} + # Index Content for Search + configure_content(default_user, knowledge_base) @pytest.fixture(scope="function") def sample_org_data(): return get_sample_data("org") - - -def get_sample_data(type): - sample_data = { - "org": { - "elisp.org": """ -* Emacs Khoj - /An Emacs interface for [[https://github.com/khoj-ai/khoj][khoj]]/ - -** Requirements - - Install and Run [[https://github.com/khoj-ai/khoj][khoj]] - -** Installation -*** Direct - - Put ~khoj.el~ in your Emacs load path. For e.g. ~/.emacs.d/lisp - - Load via ~use-package~ in your ~/.emacs.d/init.el or .emacs file by adding below snippet - #+begin_src elisp - ;; Khoj Package - (use-package khoj - :load-path "~/.emacs.d/lisp/khoj.el" - :bind ("C-c s" . 'khoj)) - #+end_src - -*** Using [[https://github.com/quelpa/quelpa#installation][Quelpa]] - - Ensure [[https://github.com/quelpa/quelpa#installation][Quelpa]], [[https://github.com/quelpa/quelpa-use-package#installation][quelpa-use-package]] are installed - - Add below snippet to your ~/.emacs.d/init.el or .emacs config file and execute it. - #+begin_src elisp - ;; Khoj Package - (use-package khoj - :quelpa (khoj :fetcher url :url "https://raw.githubusercontent.com/khoj-ai/khoj/master/interface/emacs/khoj.el") - :bind ("C-c s" . 'khoj)) - #+end_src - -** Usage - 1. Call ~khoj~ using keybinding ~C-c s~ or ~M-x khoj~ - 2. Enter Query in Natural Language - e.g. "What is the meaning of life?" "What are my life goals?" - 3. Wait for results - *Note: It takes about 15s on a Mac M1 and a ~100K lines corpus of org-mode files* - 4. (Optional) Narrow down results further - Include/Exclude specific words from results by adding to query - e.g. "What is the meaning of life? -god +none" - -""", - "readme.org": """ -* Khoj - /Allow natural language search on user content like notes, images using transformer based models/ - - All data is processed locally. 
User can interface with khoj app via [[./interface/emacs/khoj.el][Emacs]], API or Commandline - -** Dependencies - - Python3 - - [[https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links][Miniconda]] - -** Install - #+begin_src shell - git clone https://github.com/khoj-ai/khoj && cd khoj - conda env create -f environment.yml - conda activate khoj - #+end_src""", - }, - "markdown": { - "readme.markdown": """ -# Khoj -Allow natural language search on user content like notes, images using transformer based models - -All data is processed locally. User can interface with khoj app via [Emacs](./interface/emacs/khoj.el), API or Commandline - -## Dependencies -- Python3 -- [Miniconda](https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links) - -## Install -```shell -git clone -conda env create -f environment.yml -conda activate khoj -``` -""" - }, - "plaintext": { - "readme.txt": """ -Khoj -Allow natural language search on user content like notes, images using transformer based models - -All data is processed locally. 
User can interface with khoj app via Emacs, API or Commandline - -Dependencies -- Python3 -- Miniconda - -Install -git clone -conda env create -f environment.yml -conda activate khoj -""" - }, - } - - return sample_data[type] diff --git a/tests/helpers.py b/tests/helpers.py index 53ce4ea6..6edb0946 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -1,3 +1,5 @@ +import glob +import logging import os from datetime import datetime @@ -17,6 +19,9 @@ from khoj.database.models import ( UserConversationConfig, ) from khoj.processor.conversation.utils import message_to_log +from khoj.utils.helpers import get_absolute_path, is_none_or_empty + +logger = logging.getLogger(__name__) def get_chat_provider(default: ChatModel.ModelType | None = ChatModel.ModelType.GOOGLE): @@ -61,6 +66,140 @@ def generate_chat_history(message_list): return chat_history +def get_sample_data(type): + sample_data = { + "org": { + "elisp.org": """ +* Emacs Khoj + /An Emacs interface for [[https://github.com/khoj-ai/khoj][khoj]]/ + +** Requirements + - Install and Run [[https://github.com/khoj-ai/khoj][khoj]] + +** Installation +*** Direct + - Put ~khoj.el~ in your Emacs load path. For e.g. ~/.emacs.d/lisp + - Load via ~use-package~ in your ~/.emacs.d/init.el or .emacs file by adding below snippet + #+begin_src elisp + ;; Khoj Package + (use-package khoj + :load-path "~/.emacs.d/lisp/khoj.el" + :bind ("C-c s" . 'khoj)) + #+end_src + +*** Using [[https://github.com/quelpa/quelpa#installation][Quelpa]] + - Ensure [[https://github.com/quelpa/quelpa#installation][Quelpa]], [[https://github.com/quelpa/quelpa-use-package#installation][quelpa-use-package]] are installed + - Add below snippet to your ~/.emacs.d/init.el or .emacs config file and execute it. + #+begin_src elisp + ;; Khoj Package + (use-package khoj + :quelpa (khoj :fetcher url :url "https://raw.githubusercontent.com/khoj-ai/khoj/master/interface/emacs/khoj.el") + :bind ("C-c s" . 'khoj)) + #+end_src + +** Usage + 1. 
Call ~khoj~ using keybinding ~C-c s~ or ~M-x khoj~ + 2. Enter Query in Natural Language + e.g. "What is the meaning of life?" "What are my life goals?" + 3. Wait for results + *Note: It takes about 15s on a Mac M1 and a ~100K lines corpus of org-mode files* + 4. (Optional) Narrow down results further + Include/Exclude specific words from results by adding to query + e.g. "What is the meaning of life? -god +none" + +""", + "readme.org": """ +* Khoj + /Allow natural language search on user content like notes, images using transformer based models/ + + All data is processed locally. User can interface with khoj app via [[./interface/emacs/khoj.el][Emacs]], API or Commandline + +** Dependencies + - Python3 + - [[https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links][Miniconda]] + +** Install + #+begin_src shell + git clone https://github.com/khoj-ai/khoj && cd khoj + conda env create -f environment.yml + conda activate khoj + #+end_src""", + }, + "markdown": { + "readme.markdown": """ +# Khoj +Allow natural language search on user content like notes, images using transformer based models + +All data is processed locally. User can interface with khoj app via [Emacs](./interface/emacs/khoj.el), API or Commandline + +## Dependencies +- Python3 +- [Miniconda](https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links) + +## Install +```shell +git clone +conda env create -f environment.yml +conda activate khoj +``` +""" + }, + "plaintext": { + "readme.txt": """ +Khoj +Allow natural language search on user content like notes, images using transformer based models + +All data is processed locally. 
User can interface with khoj app via Emacs, API or Commandline + +Dependencies +- Python3 +- Miniconda + +Install +git clone +conda env create -f environment.yml +conda activate khoj +""" + }, + } + + return sample_data[type] + + +def get_index_files( + input_files: list[str] = None, input_filters: list[str] | None = ["tests/data/org/*.org"] +) -> dict[str, str]: + # Input Validation + if is_none_or_empty(input_files) and is_none_or_empty(input_filters): + logger.debug("At least one of input_files or input_filter is required to be specified") + return {} + + # Get files to process + absolute_files, filtered_files = set(), set() + if input_files: + absolute_files = {get_absolute_path(input_file) for input_file in input_files} + if input_filters: + filtered_files = { + filtered_file + for file_filter in input_filters + for filtered_file in glob.glob(get_absolute_path(file_filter), recursive=True) + if os.path.isfile(filtered_file) + } + + all_files = sorted(absolute_files | filtered_files) + + filename_to_content_map = {} + for file in all_files: + with open(file, "r", encoding="utf8") as f: + try: + filename_to_content_map[file] = f.read() + except Exception as e: + logger.warning(f"Unable to read file: {file}. 
Skipping file.") + logger.warning(e, exc_info=True) + + return filename_to_content_map + + class UserFactory(factory.django.DjangoModelFactory): class Meta: model = KhojUser diff --git a/tests/test_agents.py b/tests/test_agents.py index 21a242ef..1d3b96ec 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -15,7 +15,7 @@ from tests.helpers import ChatModelFactory def test_create_default_agent(default_user: KhojUser): ChatModelFactory() - agent = AgentAdapters.create_default_agent(default_user) + agent = AgentAdapters.create_default_agent() assert agent is not None assert agent.input_tools == [] assert agent.output_modes == [] diff --git a/tests/test_cli.py b/tests/test_cli.py index 211ff38e..15908653 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,49 +1,15 @@ # Standard Modules from pathlib import Path -from random import random from khoj.utils.cli import cli -from khoj.utils.helpers import resolve_absolute_path # Test # ---------------------------------------------------------------------------------------------------- def test_cli_minimal_default(): # Act - actual_args = cli([]) + actual_args = cli(["-vvv"]) # Assert - assert actual_args.config_file == resolve_absolute_path(Path("~/.khoj/khoj.yml")) - assert actual_args.regenerate == False - assert actual_args.verbose == 0 - - -# ---------------------------------------------------------------------------------------------------- -def test_cli_invalid_config_file_path(): - # Arrange - non_existent_config_file = f"non-existent-khoj-{random()}.yml" - - # Act - actual_args = cli([f"--config-file={non_existent_config_file}"]) - - # Assert - assert actual_args.config_file == resolve_absolute_path(non_existent_config_file) - assert actual_args.config == None - - -# ---------------------------------------------------------------------------------------------------- -def test_cli_config_from_file(): - # Act - actual_args = cli(["--config-file=tests/data/config.yml", "--regenerate", "-vvv"]) - - # 
Assert - assert actual_args.config_file == resolve_absolute_path(Path("tests/data/config.yml")) - assert actual_args.regenerate == True - assert actual_args.config is not None + assert actual_args.log_file == Path("~/.khoj/khoj.log") assert actual_args.verbose == 3 - - # Ensure content config is loaded from file - assert actual_args.config.content_type.org.input_files == [ - Path("~/first_from_config.org"), - Path("~/second_from_config.org"), - ] diff --git a/tests/test_client.py b/tests/test_client.py index b7341d0e..46732a86 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -13,7 +13,6 @@ from khoj.database.models import KhojApiUser, KhojUser from khoj.processor.content.org_mode.org_to_entries import OrgToEntries from khoj.search_type import text_search from khoj.utils import state -from khoj.utils.rawconfig import ContentConfig, SearchConfig # Test @@ -296,7 +295,7 @@ def test_get_configured_types_with_no_content_config(fastapi_app: FastAPI): # ---------------------------------------------------------------------------------------------------- @pytest.mark.django_db(transaction=True) -def test_notes_search(client, search_config: SearchConfig, sample_org_data, default_user: KhojUser): +def test_notes_search(client, search_config, sample_org_data, default_user: KhojUser): # Arrange headers = {"Authorization": "Bearer kk-secret"} text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user) @@ -315,7 +314,7 @@ def test_notes_search(client, search_config: SearchConfig, sample_org_data, defa # ---------------------------------------------------------------------------------------------------- @pytest.mark.django_db(transaction=True) -def test_notes_search_no_results(client, search_config: SearchConfig, sample_org_data, default_user: KhojUser): +def test_notes_search_no_results(client, search_config, sample_org_data, default_user: KhojUser): # Arrange headers = {"Authorization": "Bearer kk-secret"} text_search.setup(OrgToEntries, 
sample_org_data, regenerate=False, user=default_user) @@ -331,9 +330,7 @@ def test_notes_search_no_results(client, search_config: SearchConfig, sample_org # ---------------------------------------------------------------------------------------------------- @pytest.mark.django_db(transaction=True) -def test_notes_search_with_only_filters( - client, content_config: ContentConfig, search_config: SearchConfig, sample_org_data, default_user: KhojUser -): +def test_notes_search_with_only_filters(client, sample_org_data, default_user: KhojUser): # Arrange headers = {"Authorization": "Bearer kk-secret"} text_search.setup( @@ -397,9 +394,7 @@ def test_notes_search_with_exclude_filter(client, sample_org_data, default_user: # ---------------------------------------------------------------------------------------------------- @pytest.mark.django_db(transaction=True) -def test_notes_search_requires_parent_context( - client, search_config: SearchConfig, sample_org_data, default_user: KhojUser -): +def test_notes_search_requires_parent_context(client, search_config, sample_org_data, default_user: KhojUser): # Arrange headers = {"Authorization": "Bearer kk-secret"} text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user) diff --git a/tests/test_file_filter.py b/tests/test_file_filter.py index 9a36bd57..21f198ea 100644 --- a/tests/test_file_filter.py +++ b/tests/test_file_filter.py @@ -1,6 +1,13 @@ # Application Packages from khoj.search_filter.file_filter import FileFilter -from khoj.utils.rawconfig import Entry + + +# Mock Entry class for testing +class Entry: + def __init__(self, compiled="", raw="", file=""): + self.compiled = compiled + self.raw = raw + self.file = file def test_can_filter_no_file_filter(): diff --git a/tests/test_markdown_to_entries.py b/tests/test_markdown_to_entries.py index 30813555..b8ab37a7 100644 --- a/tests/test_markdown_to_entries.py +++ b/tests/test_markdown_to_entries.py @@ -3,8 +3,6 @@ import re from pathlib import 
Path from khoj.processor.content.markdown.markdown_to_entries import MarkdownToEntries -from khoj.utils.fs_syncer import get_markdown_files -from khoj.utils.rawconfig import TextContentConfig def test_extract_markdown_with_no_headings(tmp_path): @@ -212,43 +210,6 @@ longer body line 2.1 ), "Third entry is second entries child heading" -def test_get_markdown_files(tmp_path): - "Ensure Markdown files specified via input-filter, input-files extracted" - # Arrange - # Include via input-filter globs - group1_file1 = create_file(tmp_path, filename="group1-file1.md") - group1_file2 = create_file(tmp_path, filename="group1-file2.md") - group2_file1 = create_file(tmp_path, filename="group2-file1.markdown") - group2_file2 = create_file(tmp_path, filename="group2-file2.markdown") - # Include via input-file field - file1 = create_file(tmp_path, filename="notes.md") - # Not included by any filter - create_file(tmp_path, filename="not-included-markdown.md") - create_file(tmp_path, filename="not-included-text.txt") - - expected_files = set( - [os.path.join(tmp_path, file.name) for file in [group1_file1, group1_file2, group2_file1, group2_file2, file1]] - ) - - # Setup input-files, input-filters - input_files = [tmp_path / "notes.md"] - input_filter = [tmp_path / "group1*.md", tmp_path / "group2*.markdown"] - - markdown_config = TextContentConfig( - input_files=input_files, - input_filter=[str(filter) for filter in input_filter], - compressed_jsonl=tmp_path / "test.jsonl", - embeddings_file=tmp_path / "test_embeddings.jsonl", - ) - - # Act - extracted_org_files = get_markdown_files(markdown_config) - - # Assert - assert len(extracted_org_files) == 5 - assert set(extracted_org_files.keys()) == expected_files - - def test_line_number_tracking_in_recursive_split(): "Ensure line numbers in URIs are correct after recursive splitting by checking against the actual file." 
# Arrange diff --git a/tests/test_org_to_entries.py b/tests/test_org_to_entries.py index d5dcdbd2..0196ef6c 100644 --- a/tests/test_org_to_entries.py +++ b/tests/test_org_to_entries.py @@ -4,9 +4,8 @@ import time from khoj.processor.content.org_mode.org_to_entries import OrgToEntries from khoj.processor.content.text_to_entries import TextToEntries -from khoj.utils.fs_syncer import get_org_files from khoj.utils.helpers import is_none_or_empty -from khoj.utils.rawconfig import Entry, TextContentConfig +from khoj.utils.rawconfig import Entry def test_configure_indexing_heading_only_entries(tmp_path): @@ -330,46 +329,6 @@ def test_file_with_no_headings_to_entry(tmp_path): assert len(entries[1]) == 1 -def test_get_org_files(tmp_path): - "Ensure Org files specified via input-filter, input-files extracted" - # Arrange - # Include via input-filter globs - group1_file1 = create_file(tmp_path, filename="group1-file1.org") - group1_file2 = create_file(tmp_path, filename="group1-file2.org") - group2_file1 = create_file(tmp_path, filename="group2-file1.org") - group2_file2 = create_file(tmp_path, filename="group2-file2.org") - # Include via input-file field - orgfile1 = create_file(tmp_path, filename="orgfile1.org") - # Not included by any filter - create_file(tmp_path, filename="orgfile2.org") - create_file(tmp_path, filename="text1.txt") - - expected_files = set( - [ - os.path.join(tmp_path, file.name) - for file in [group1_file1, group1_file2, group2_file1, group2_file2, orgfile1] - ] - ) - - # Setup input-files, input-filters - input_files = [tmp_path / "orgfile1.org"] - input_filter = [tmp_path / "group1*.org", tmp_path / "group2*.org"] - - org_config = TextContentConfig( - input_files=input_files, - input_filter=[str(filter) for filter in input_filter], - compressed_jsonl=tmp_path / "test.jsonl", - embeddings_file=tmp_path / "test_embeddings.jsonl", - ) - - # Act - extracted_org_files = get_org_files(org_config) - - # Assert - assert len(extracted_org_files) == 5 - assert 
set(extracted_org_files.keys()) == expected_files - - def test_extract_entries_with_different_level_headings(tmp_path): "Extract org entries with different level headings." # Arrange diff --git a/tests/test_pdf_to_entries.py b/tests/test_pdf_to_entries.py index a62eca8b..d7336fdc 100644 --- a/tests/test_pdf_to_entries.py +++ b/tests/test_pdf_to_entries.py @@ -4,8 +4,6 @@ import re import pytest from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries -from khoj.utils.fs_syncer import get_pdf_files -from khoj.utils.rawconfig import TextContentConfig def test_single_page_pdf_to_jsonl(): @@ -61,43 +59,6 @@ def test_ocr_page_pdf_to_jsonl(): assert re.search(expected_str_with_variable_spaces, raw_entry) is not None -def test_get_pdf_files(tmp_path): - "Ensure Pdf files specified via input-filter, input-files extracted" - # Arrange - # Include via input-filter globs - group1_file1 = create_file(tmp_path, filename="group1-file1.pdf") - group1_file2 = create_file(tmp_path, filename="group1-file2.pdf") - group2_file1 = create_file(tmp_path, filename="group2-file1.pdf") - group2_file2 = create_file(tmp_path, filename="group2-file2.pdf") - # Include via input-file field - file1 = create_file(tmp_path, filename="document.pdf") - # Not included by any filter - create_file(tmp_path, filename="not-included-document.pdf") - create_file(tmp_path, filename="not-included-text.txt") - - expected_files = set( - [os.path.join(tmp_path, file.name) for file in [group1_file1, group1_file2, group2_file1, group2_file2, file1]] - ) - - # Setup input-files, input-filters - input_files = [tmp_path / "document.pdf"] - input_filter = [tmp_path / "group1*.pdf", tmp_path / "group2*.pdf"] - - pdf_config = TextContentConfig( - input_files=input_files, - input_filter=[str(path) for path in input_filter], - compressed_jsonl=tmp_path / "test.jsonl", - embeddings_file=tmp_path / "test_embeddings.jsonl", - ) - - # Act - extracted_pdf_files = get_pdf_files(pdf_config) - - # Assert - assert 
len(extracted_pdf_files) == 5 - assert set(extracted_pdf_files.keys()) == expected_files - - # Helper Functions def create_file(tmp_path, entry=None, filename="document.pdf"): pdf_file = tmp_path / filename diff --git a/tests/test_plaintext_to_entries.py b/tests/test_plaintext_to_entries.py index a085b2b5..558832d3 100644 --- a/tests/test_plaintext_to_entries.py +++ b/tests/test_plaintext_to_entries.py @@ -1,27 +1,20 @@ -import os from pathlib import Path +from textwrap import dedent -from khoj.database.models import KhojUser, LocalPlaintextConfig from khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries -from khoj.utils.fs_syncer import get_plaintext_files -from khoj.utils.rawconfig import TextContentConfig -def test_plaintext_file(tmp_path): +def test_plaintext_file(): "Convert files with no heading to jsonl." # Arrange raw_entry = f""" Hi, I am a plaintext file and I have some plaintext words. """ - plaintextfile = create_file(tmp_path, raw_entry) + plaintextfile = "test.txt" + data = {plaintextfile: raw_entry} # Act # Extract Entries from specified plaintext files - - data = { - f"{plaintextfile}": raw_entry, - } - entries = PlaintextToEntries.extract_plaintext_entries(data) # Convert each entry.file to absolute path to make them JSON serializable @@ -37,59 +30,20 @@ def test_plaintext_file(tmp_path): assert entries[1][0].compiled == f"{plaintextfile}\n{raw_entry}" -def test_get_plaintext_files(tmp_path): - "Ensure Plaintext files specified via input-filter, input-files extracted" - # Arrange - # Include via input-filter globs - group1_file1 = create_file(tmp_path, filename="group1-file1.md") - group1_file2 = create_file(tmp_path, filename="group1-file2.md") - - group2_file1 = create_file(tmp_path, filename="group2-file1.markdown") - group2_file2 = create_file(tmp_path, filename="group2-file2.markdown") - group2_file4 = create_file(tmp_path, filename="group2-file4.html") - # Include via input-file field - file1 = create_file(tmp_path, 
filename="notes.txt") - # Include unsupported file types - create_file(tmp_path, filename="group2-unincluded.py") - create_file(tmp_path, filename="group2-unincluded.csv") - create_file(tmp_path, filename="group2-unincluded.csv") - create_file(tmp_path, filename="group2-file3.mbox") - # Not included by any filter - create_file(tmp_path, filename="not-included-markdown.md") - create_file(tmp_path, filename="not-included-text.txt") - - expected_files = set( - [ - os.path.join(tmp_path, file.name) - for file in [group1_file1, group1_file2, group2_file1, group2_file2, group2_file4, file1] - ] - ) - - # Setup input-files, input-filters - input_files = [tmp_path / "notes.txt"] - input_filter = [tmp_path / "group1*.md", tmp_path / "group2*.*"] - - plaintext_config = TextContentConfig( - input_files=input_files, - input_filter=[str(filter) for filter in input_filter], - compressed_jsonl=tmp_path / "test.jsonl", - embeddings_file=tmp_path / "test_embeddings.jsonl", - ) - - # Act - extracted_plaintext_files = get_plaintext_files(plaintext_config) - - # Assert - assert len(extracted_plaintext_files) == len(expected_files) - assert set(extracted_plaintext_files.keys()) == set(expected_files) - - -def test_parse_html_plaintext_file(content_config, default_user: KhojUser): +def test_parse_html_plaintext_file(tmp_path): "Ensure HTML files are parsed correctly" # Arrange - # Setup input-files, input-filters - config = LocalPlaintextConfig.objects.filter(user=default_user).first() - extracted_plaintext_files = get_plaintext_files(config=config) + raw_entry = dedent( + f""" + + Test HTML + +
Test content
+ + + """ + ) + extracted_plaintext_files = {"test.html": raw_entry} # Act entries = PlaintextToEntries.extract_plaintext_entries(extracted_plaintext_files) diff --git a/tests/test_text_search.py b/tests/test_text_search.py index 712f4aba..9e532429 100644 --- a/tests/test_text_search.py +++ b/tests/test_text_search.py @@ -2,23 +2,16 @@ import asyncio import logging import os -from pathlib import Path import pytest from khoj.database.adapters import EntryAdapters -from khoj.database.models import Entry, GithubConfig, KhojUser, LocalOrgConfig -from khoj.processor.content.docx.docx_to_entries import DocxToEntries +from khoj.database.models import Entry, GithubConfig, KhojUser from khoj.processor.content.github.github_to_entries import GithubToEntries -from khoj.processor.content.images.image_to_entries import ImageToEntries -from khoj.processor.content.markdown.markdown_to_entries import MarkdownToEntries from khoj.processor.content.org_mode.org_to_entries import OrgToEntries -from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries -from khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries from khoj.processor.content.text_to_entries import TextToEntries from khoj.search_type import text_search -from khoj.utils.fs_syncer import collect_files, get_org_files -from khoj.utils.rawconfig import ContentConfig, SearchConfig +from tests.helpers import get_index_files, get_sample_data logger = logging.getLogger(__name__) @@ -26,53 +19,20 @@ logger = logging.getLogger(__name__) # Test # ---------------------------------------------------------------------------------------------------- @pytest.mark.django_db -def test_text_search_setup_with_missing_file_raises_error(org_config_with_only_new_file: LocalOrgConfig): - # Arrange - # Ensure file mentioned in org.input-files is missing - single_new_file = Path(org_config_with_only_new_file.input_files[0]) - single_new_file.unlink() - - # Act - # Generate notes embeddings during asymmetric setup - 
with pytest.raises(FileNotFoundError): - get_org_files(org_config_with_only_new_file) - - -# ---------------------------------------------------------------------------------------------------- -@pytest.mark.django_db -def test_get_org_files_with_org_suffixed_dir_doesnt_raise_error(tmp_path, default_user: KhojUser): - # Arrange - orgfile = tmp_path / "directory.org" / "file.org" - orgfile.parent.mkdir() - with open(orgfile, "w") as f: - f.write("* Heading\n- List item\n") - - LocalOrgConfig.objects.create( - input_filter=[f"{tmp_path}/**/*"], - input_files=None, - user=default_user, - ) - - # Act - org_files = collect_files(user=default_user)["org"] - - # Assert - # should return orgfile and not raise IsADirectoryError - assert org_files == {f"{orgfile}": "* Heading\n- List item\n"} - - -# ---------------------------------------------------------------------------------------------------- -@pytest.mark.django_db -def test_text_search_setup_with_empty_file_creates_no_entries( - org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser -): +def test_text_search_setup_with_empty_file_creates_no_entries(search_config, default_user: KhojUser): # Arrange + initial_data = { + "test.org": "* First heading\nFirst content", + "test2.org": "* Second heading\nSecond content", + } + text_search.setup(OrgToEntries, initial_data, regenerate=True, user=default_user) existing_entries = Entry.objects.filter(user=default_user).count() - data = get_org_files(org_config_with_only_new_file) + + final_data = {"new_file.org": ""} # Act # Generate notes embeddings during asymmetric setup - text_search.setup(OrgToEntries, data, regenerate=True, user=default_user) + text_search.setup(OrgToEntries, final_data, regenerate=True, user=default_user) # Assert updated_entries = Entry.objects.filter(user=default_user).count() @@ -84,13 +44,14 @@ def test_text_search_setup_with_empty_file_creates_no_entries( # 
---------------------------------------------------------------------------------------------------- @pytest.mark.django_db -def test_text_indexer_deletes_embedding_before_regenerate( - content_config: ContentConfig, default_user: KhojUser, caplog -): +def test_text_indexer_deletes_embedding_before_regenerate(search_config, default_user: KhojUser, caplog): # Arrange + data = { + "test1.org": "* Test heading\nTest content", + "test2.org": "* Another heading\nAnother content", + } + text_search.setup(OrgToEntries, data, regenerate=True, user=default_user) existing_entries = Entry.objects.filter(user=default_user).count() - org_config = LocalOrgConfig.objects.filter(user=default_user).first() - data = get_org_files(org_config) # Act # Generate notes embeddings during asymmetric setup @@ -107,11 +68,10 @@ def test_text_indexer_deletes_embedding_before_regenerate( # ---------------------------------------------------------------------------------------------------- @pytest.mark.django_db -def test_text_index_same_if_content_unchanged(content_config: ContentConfig, default_user: KhojUser, caplog): +def test_text_index_same_if_content_unchanged(search_config, default_user: KhojUser, caplog): # Arrange existing_entries = Entry.objects.filter(user=default_user) - org_config = LocalOrgConfig.objects.filter(user=default_user).first() - data = get_org_files(org_config) + data = {"test.org": "* Test heading\nTest content"} # Act # Generate initial notes embeddings during asymmetric setup @@ -136,20 +96,14 @@ def test_text_index_same_if_content_unchanged(content_config: ContentConfig, def # ---------------------------------------------------------------------------------------------------- @pytest.mark.django_db -@pytest.mark.anyio -# @pytest.mark.asyncio -async def test_text_search(search_config: SearchConfig): +@pytest.mark.asyncio +async def test_text_search(search_config): # Arrange - default_user = await KhojUser.objects.acreate( + default_user, _ = await 
KhojUser.objects.aget_or_create( username="test_user", password="test_password", email="test@example.com" ) - org_config = await LocalOrgConfig.objects.acreate( - input_files=None, - input_filter=["tests/data/org/*.org"], - index_heading_entries=False, - user=default_user, - ) - data = get_org_files(org_config) + # Get some sample org data to index + data = get_sample_data("org") loop = asyncio.get_event_loop() await loop.run_in_executor( @@ -175,17 +129,15 @@ async def test_text_search(search_config: SearchConfig): # ---------------------------------------------------------------------------------------------------- @pytest.mark.django_db -def test_entry_chunking_by_max_tokens(org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser, caplog): +def test_entry_chunking_by_max_tokens(tmp_path, search_config, default_user: KhojUser, caplog): # Arrange # Insert org-mode entry with size exceeding max token limit to new org file max_tokens = 256 - new_file_to_index = Path(org_config_with_only_new_file.input_files[0]) - with open(new_file_to_index, "w") as f: - f.write(f"* Entry more than {max_tokens} words\n") - for index in range(max_tokens + 1): - f.write(f"{index} ") - - data = get_org_files(org_config_with_only_new_file) + new_file_to_index = tmp_path / "test.org" + content = f"* Entry more than {max_tokens} words\n" + for index in range(max_tokens + 1): + content += f"{index} " + data = {str(new_file_to_index): content} # Act # reload embeddings, entries, notes model after adding new org-mode file @@ -200,9 +152,7 @@ def test_entry_chunking_by_max_tokens(org_config_with_only_new_file: LocalOrgCon # ---------------------------------------------------------------------------------------------------- @pytest.mark.django_db -def test_entry_chunking_by_max_tokens_not_full_corpus( - org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser, caplog -): +def test_entry_chunking_by_max_tokens_not_full_corpus(tmp_path, search_config, default_user: 
KhojUser, caplog): # Arrange # Insert org-mode entry with size exceeding max token limit to new org file data = { @@ -231,13 +181,11 @@ conda activate khoj ) max_tokens = 256 - new_file_to_index = Path(org_config_with_only_new_file.input_files[0]) - with open(new_file_to_index, "w") as f: - f.write(f"* Entry more than {max_tokens} words\n") - for index in range(max_tokens + 1): - f.write(f"{index} ") - - data = get_org_files(org_config_with_only_new_file) + new_file_to_index = tmp_path / "test.org" + content = f"* Entry more than {max_tokens} words\n" + for index in range(max_tokens + 1): + content += f"{index} " + data = {str(new_file_to_index): content} # Act # reload embeddings, entries, notes model after adding new org-mode file @@ -257,34 +205,34 @@ conda activate khoj # ---------------------------------------------------------------------------------------------------- @pytest.mark.django_db -def test_regenerate_index_with_new_entry(content_config: ContentConfig, new_org_file: Path, default_user: KhojUser): +def test_regenerate_index_with_new_entry(search_config, default_user: KhojUser): # Arrange + # Initial indexed files + text_search.setup(OrgToEntries, get_sample_data("org"), regenerate=True, user=default_user) existing_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True)) - org_config = LocalOrgConfig.objects.filter(user=default_user).first() - initial_data = get_org_files(org_config) - # append org-mode entry to first org input file in config - org_config.input_files = [f"{new_org_file}"] - with open(new_org_file, "w") as f: - f.write("\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n") - - final_data = get_org_files(org_config) - - # Act - text_search.setup(OrgToEntries, initial_data, regenerate=True, user=default_user) + # Regenerate index with only files from test data set + files_to_index = get_index_files() + text_search.setup(OrgToEntries, files_to_index, 
regenerate=True, user=default_user) updated_entries1 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True)) + # Act + # Update index with the new file + new_file = "test.org" + new_entry = "\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n" + files_to_index[new_file] = new_entry + # regenerate notes jsonl, model embeddings and model to include entry from new file - text_search.setup(OrgToEntries, final_data, regenerate=True, user=default_user) + text_search.setup(OrgToEntries, files_to_index, regenerate=True, user=default_user) updated_entries2 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True)) # Assert for entry in updated_entries1: assert entry in updated_entries2 - assert not any([new_org_file.name in entry for entry in updated_entries1]) - assert not any([new_org_file.name in entry for entry in existing_entries]) - assert any([new_org_file.name in entry for entry in updated_entries2]) + assert not any([new_file in entry for entry in updated_entries1]) + assert not any([new_file in entry for entry in existing_entries]) + assert any([new_file in entry for entry in updated_entries2]) assert any( ["Saw a super cute video of a chihuahua doing the Tango on Youtube" in entry for entry in updated_entries2] @@ -294,28 +242,24 @@ def test_regenerate_index_with_new_entry(content_config: ContentConfig, new_org_ # ---------------------------------------------------------------------------------------------------- @pytest.mark.django_db -def test_update_index_with_duplicate_entries_in_stable_order( - org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser -): +def test_update_index_with_duplicate_entries_in_stable_order(tmp_path, search_config, default_user: KhojUser): # Arrange + initial_data = get_sample_data("org") + text_search.setup(OrgToEntries, initial_data, regenerate=True, user=default_user) existing_entries = 
list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True)) - new_file_to_index = Path(org_config_with_only_new_file.input_files[0]) # Insert org-mode entries with same compiled form into new org file + new_file_to_index = tmp_path / "test.org" new_entry = "* TODO A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n" - with open(new_file_to_index, "w") as f: - f.write(f"{new_entry}{new_entry}") - - data = get_org_files(org_config_with_only_new_file) + # Initial data with duplicate entries + data = {str(new_file_to_index): f"{new_entry}{new_entry}"} # Act # generate embeddings, entries, notes model from scratch after adding new org-mode file text_search.setup(OrgToEntries, data, regenerate=True, user=default_user) updated_entries1 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True)) - data = get_org_files(org_config_with_only_new_file) - - # update embeddings, entries, notes model with no new changes + # idempotent indexing when data unchanged text_search.setup(OrgToEntries, data, regenerate=False, user=default_user) updated_entries2 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True)) @@ -324,6 +268,7 @@ def test_update_index_with_duplicate_entries_in_stable_order( for entry in existing_entries: assert entry not in updated_entries1 + # verify the second indexing update has same entries and ordering as first for entry in updated_entries1: assert entry in updated_entries2 @@ -334,22 +279,17 @@ def test_update_index_with_duplicate_entries_in_stable_order( # ---------------------------------------------------------------------------------------------------- @pytest.mark.django_db -def test_update_index_with_deleted_entry(org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser): +def test_update_index_with_deleted_entry(tmp_path, search_config, default_user: KhojUser): # Arrange existing_entries = 
list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True)) - new_file_to_index = Path(org_config_with_only_new_file.input_files[0]) - # Insert org-mode entries with same compiled form into new org file + new_file_to_index = tmp_path / "test.org" new_entry = "* TODO A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n" - with open(new_file_to_index, "w") as f: - f.write(f"{new_entry}{new_entry} -- Tatooine") - initial_data = get_org_files(org_config_with_only_new_file) - # update embeddings, entries, notes model after removing an entry from the org file - with open(new_file_to_index, "w") as f: - f.write(f"{new_entry}") - - final_data = get_org_files(org_config_with_only_new_file) + # Initial data with two entries + initial_data = {str(new_file_to_index): f"{new_entry}{new_entry} -- Tatooine"} + # Final data with only first entry, with second entry removed + final_data = {str(new_file_to_index): f"{new_entry}"} # Act # load embeddings, entries, notes model after adding new org file with 2 entries @@ -375,29 +315,29 @@ def test_update_index_with_deleted_entry(org_config_with_only_new_file: LocalOrg # ---------------------------------------------------------------------------------------------------- @pytest.mark.django_db -def test_update_index_with_new_entry(content_config: ContentConfig, new_org_file: Path, default_user: KhojUser): +def test_update_index_with_new_entry(search_config, default_user: KhojUser): # Arrange - existing_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True)) - org_config = LocalOrgConfig.objects.filter(user=default_user).first() - data = get_org_files(org_config) - text_search.setup(OrgToEntries, data, regenerate=True, user=default_user) + # Initial indexed files + text_search.setup(OrgToEntries, get_sample_data("org"), regenerate=True, user=default_user) + old_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", 
flat=True)) - # append org-mode entry to first org input file in config - with open(new_org_file, "w") as f: - new_entry = "\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n" - f.write(new_entry) - - data = get_org_files(org_config) + # Regenerate index with only files from test data set + files_to_index = get_index_files() + new_entries = text_search.setup(OrgToEntries, files_to_index, regenerate=True, user=default_user) # Act - # update embeddings, entries with the newly added note - text_search.setup(OrgToEntries, data, regenerate=False, user=default_user) - updated_entries1 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True)) + # Update index with the new file + new_file = "test.org" + new_entry = "\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n" + final_data = {new_file: new_entry} + + text_search.setup(OrgToEntries, final_data, regenerate=False, user=default_user) + updated_new_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True)) # Assert - for entry in existing_entries: - assert entry not in updated_entries1 - assert len(updated_entries1) == len(existing_entries) + 1 + for old_entry in old_entries: + assert old_entry not in updated_new_entries + assert len(updated_new_entries) == len(new_entries) + 1 verify_embeddings(3, default_user) @@ -409,9 +349,7 @@ def test_update_index_with_new_entry(content_config: ContentConfig, new_org_file (OrgToEntries), ], ) -def test_update_index_with_deleted_file( - org_config_with_only_new_file: LocalOrgConfig, text_to_entries: TextToEntries, default_user: KhojUser -): +def test_update_index_with_deleted_file(text_to_entries: TextToEntries, search_config, default_user: KhojUser): "Delete entries associated with new file when file path with empty content passed." 
# Arrange file_to_index = "test" @@ -446,7 +384,7 @@ def test_update_index_with_deleted_file( # ---------------------------------------------------------------------------------------------------- @pytest.mark.skipif(os.getenv("GITHUB_PAT_TOKEN") is None, reason="GITHUB_PAT_TOKEN not set") -def test_text_search_setup_github(content_config: ContentConfig, default_user: KhojUser): +def test_text_search_setup_github(search_config, default_user: KhojUser): # Arrange github_config = GithubConfig.objects.filter(user=default_user).first() diff --git a/tests/test_word_filter.py b/tests/test_word_filter.py index ebd6cccf..5333e17f 100644 --- a/tests/test_word_filter.py +++ b/tests/test_word_filter.py @@ -1,6 +1,12 @@ # Application Packages from khoj.search_filter.word_filter import WordFilter -from khoj.utils.rawconfig import Entry + + +# Mock Entry class for testing +class Entry: + def __init__(self, compiled="", raw=""): + self.compiled = compiled + self.raw = raw # Test