Update test setup to index test data after old indexing code removed

- Delete tests testing deprecated server side indexing flows
- Delete `Local(Plaintext|Org|Markdown|Pdf)Config' methods, files and references in tests
- Index test data via new helper method, `get_index_files'
  - It is modelled after the old `get_org_files' variants in main app
  - It passes the test data in required format to `configure_content'

Allows maintaining the more realistic tests from before while using the new indexing mechanism (rather than the deprecated server side indexing mechanism).
2026-03-02 13:18:18 +00:00 · 2025-07-11 14:35:05 -07:00
parent d9d24dd638
commit 892d57314e
12 changed files with 295 additions and 604 deletions
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,6 +1,3 @@
 import os
 from pathlib import Path
 import pytest
 from fastapi import FastAPI
 from fastapi.staticfiles import StaticFiles
@@ -11,6 +8,7 @@ from khoj.configure import (
    configure_routes,
    configure_search_types,
 )
 from khoj.database.adapters import get_default_search_model
 from khoj.database.models import (
    Agent,
    ChatModel,
@@ -19,21 +17,14 @@ from khoj.database.models import (
    GithubRepoConfig,
    KhojApiUser,
    KhojUser,
    LocalMarkdownConfig,
    LocalOrgConfig,
    LocalPdfConfig,
    LocalPlaintextConfig,
 )
 from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
 from khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries
 from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel
 from khoj.routers.api_content import configure_content
 from khoj.search_type import text_search
-from khoj.utils import fs_syncer, state
+from khoj.utils import state
 from khoj.utils.config import SearchModels
 from khoj.utils.constants import web_directory
 from khoj.utils.helpers import resolve_absolute_path
 from khoj.utils.rawconfig import ContentConfig, SearchConfig
 from tests.helpers import (
    AiModelApiFactory,
    ChatModelFactory,
@@ -43,6 +34,8 @@ from tests.helpers import (
    UserFactory,
    get_chat_api_key,
    get_chat_provider,
    get_index_files,
    get_sample_data,
 )
@@ -59,17 +52,16 @@ def django_db_setup(django_db_setup, django_db_blocker):
@pytest.fixture(scope="session")
-def search_config() -> SearchConfig:
+def search_config():
    search_model = get_default_search_model()
    state.embeddings_model = dict()
-    state.embeddings_model["default"] = EmbeddingsModel()
+    state.embeddings_model["default"] = EmbeddingsModel(
        model_name=search_model.bi_encoder, model_kwargs=search_model.bi_encoder_model_config
    )
    state.cross_encoder_model = dict()
-    state.cross_encoder_model["default"] = CrossEncoderModel()
+    state.cross_encoder_model["default"] = CrossEncoderModel(
-
+        model_name=search_model.cross_encoder, model_kwargs=search_model.cross_encoder_model_config
-    model_dir = resolve_absolute_path("~/.khoj/search")
+    )
    model_dir.mkdir(parents=True, exist_ok=True)
    search_config = SearchConfig()
    return search_config
@pytest.mark.django_db
@@ -201,13 +193,6 @@ def openai_agent():
    )
@pytest.fixture(scope="session")
 def search_models(search_config: SearchConfig):
    search_models = SearchModels()
    return search_models
@pytest.mark.django_db
@pytest.fixture
 def default_process_lock():
@@ -219,72 +204,23 @@ def anyio_backend():
    return "asyncio"
@pytest.mark.django_db
@pytest.fixture(scope="function")
-def content_config(tmp_path_factory, search_models: SearchModels, default_user: KhojUser):
+def chat_client(search_config, default_user2: KhojUser):
    content_dir = tmp_path_factory.mktemp("content")
    # Generate Image Embeddings from Test Images
    content_config = ContentConfig()
    LocalOrgConfig.objects.create(
        input_files=None,
        input_filter=["tests/data/org/*.org"],
        index_heading_entries=False,
        user=default_user,
    )
    text_search.setup(OrgToEntries, get_sample_data("org"), regenerate=False, user=default_user)
    if os.getenv("GITHUB_PAT_TOKEN"):
        GithubConfig.objects.create(
            pat_token=os.getenv("GITHUB_PAT_TOKEN"),
            user=default_user,
        )
        GithubRepoConfig.objects.create(
            owner="khoj-ai",
            name="lantern",
            branch="master",
            github_config=GithubConfig.objects.get(user=default_user),
        )
    LocalPlaintextConfig.objects.create(
        input_files=None,
        input_filter=["tests/data/plaintext/*.txt", "tests/data/plaintext/*.md", "tests/data/plaintext/*.html"],
        user=default_user,
    )
    return content_config
@pytest.fixture(scope="session")
 def md_content_config():
    markdown_config = LocalMarkdownConfig.objects.create(
        input_files=None,
        input_filter=["tests/data/markdown/*.markdown"],
    )
    return markdown_config
@pytest.fixture(scope="function")
 def chat_client(search_config: SearchConfig, default_user2: KhojUser):
    return chat_client_builder(search_config, default_user2, require_auth=False)
@pytest.fixture(scope="function")
-def chat_client_with_auth(search_config: SearchConfig, default_user2: KhojUser):
+def chat_client_with_auth(search_config, default_user2: KhojUser):
    return chat_client_builder(search_config, default_user2, require_auth=True)
@pytest.fixture(scope="function")
-def chat_client_no_background(search_config: SearchConfig, default_user2: KhojUser):
+def chat_client_no_background(search_config, default_user2: KhojUser):
    return chat_client_builder(search_config, default_user2, index_content=False, require_auth=False)
@pytest.fixture(scope="function")
-def chat_client_with_large_kb(search_config: SearchConfig, default_user2: KhojUser):
+def chat_client_with_large_kb(search_config, default_user2: KhojUser):
    """
    Chat client fixture that creates a large knowledge base with many files
    for stress testing atomic agent updates.
@@ -298,15 +234,11 @@ def chat_client_builder(search_config, user, index_content=True, require_auth=Fa
    state.SearchType = configure_search_types()
    if index_content:
-        LocalMarkdownConfig.objects.create(
+        file_type = "markdown"
-            input_files=None,
+        files_to_index = {file_type: get_index_files(input_filters=[f"tests/data/{file_type}/*.{file_type}"])}
            input_filter=["tests/data/markdown/*.markdown"],
            user=user,
        )
        # Index Markdown Content for Search
-        all_files = fs_syncer.collect_files(user=user)
+        configure_content(user, files_to_index)
        configure_content(user, all_files)
    # Initialize Processor from Config
    chat_provider = get_chat_provider()
@@ -346,12 +278,13 @@ def large_kb_chat_client_builder(search_config, user):
    # Create temporary directory for large number of test files
    temp_dir = tempfile.mkdtemp(prefix="khoj_test_large_kb_")
    file_type = "markdown"
    large_file_list = []
    try:
        # Generate 300 test files with substantial content
        for i in range(300):
-            file_path = os.path.join(temp_dir, f"test_file_{i:03d}.markdown")
+            file_path = os.path.join(temp_dir, f"test_file_{i:03d}.{file_type}")
            content = f"""
 # Test File {i}
@@ -401,16 +334,9 @@ End of file {i}.
                f.write(content)
            large_file_list.append(file_path)
-        # Create LocalMarkdownConfig with all the generated files
+        # Index all generated files into the user's knowledge base
-        LocalMarkdownConfig.objects.create(
+        files_to_index = {file_type: get_index_files(input_files=large_file_list, input_filters=None)}
-            input_files=large_file_list,
+        configure_content(user, files_to_index)
            input_filter=None,
            user=user,
        )
        # Index all the files into the user's knowledge base
        all_files = fs_syncer.collect_files(user=user)
        configure_content(user, all_files)
        # Verify we have a substantial knowledge base
        file_count = FileObject.objects.filter(user=user, agent=None).count()
@@ -493,139 +419,18 @@ def client(
    return TestClient(app)
@pytest.fixture(scope="function")
 def new_org_file(default_user: KhojUser, content_config: ContentConfig):
    # Setup
    org_config = LocalOrgConfig.objects.filter(user=default_user).first()
    input_filters = org_config.input_filter
    new_org_file = Path(input_filters[0]).parent / "new_file.org"
    new_org_file.touch()
    yield new_org_file
    # Cleanup
    if new_org_file.exists():
        new_org_file.unlink()
@pytest.fixture(scope="function")
 def org_config_with_only_new_file(new_org_file: Path, default_user: KhojUser):
    LocalOrgConfig.objects.update(input_files=[str(new_org_file)], input_filter=None)
    return LocalOrgConfig.objects.filter(user=default_user).first()
@pytest.fixture(scope="function")
 def pdf_configured_user1(default_user: KhojUser):
-    LocalPdfConfig.objects.create(
+    # Read data from pdf file at tests/data/pdf/singlepage.pdf
-        input_files=None,
+    pdf_file_path = "tests/data/pdf/singlepage.pdf"
-        input_filter=["tests/data/pdf/singlepage.pdf"],
+    with open(pdf_file_path, "rb") as pdf_file:
-        user=default_user,
+        pdf_data = pdf_file.read()
-    )
+
-    # Index Markdown Content for Search
+    knowledge_base = {"pdf": {"singlepage.pdf": pdf_data}}
-    all_files = fs_syncer.collect_files(user=default_user)
+    # Index Content for Search
-    configure_content(default_user, all_files)
+    configure_content(default_user, knowledge_base)
@pytest.fixture(scope="function")
 def sample_org_data():
    return get_sample_data("org")
 def get_sample_data(type):
    sample_data = {
        "org": {
            "elisp.org": """
 * Emacs Khoj
  /An Emacs interface for [[https://github.com/khoj-ai/khoj][khoj]]/
 ** Requirements
   - Install and Run [[https://github.com/khoj-ai/khoj][khoj]]
 ** Installation
 *** Direct
     - Put ~khoj.el~ in your Emacs load path. For e.g. ~/.emacs.d/lisp
     - Load via ~use-package~ in your ~/.emacs.d/init.el or .emacs file by adding below snippet
       #+begin_src elisp
         ;; Khoj Package
         (use-package khoj
           :load-path "~/.emacs.d/lisp/khoj.el"
           :bind ("C-c s" . 'khoj))
       #+end_src
 *** Using [[https://github.com/quelpa/quelpa#installation][Quelpa]]
     - Ensure [[https://github.com/quelpa/quelpa#installation][Quelpa]], [[https://github.com/quelpa/quelpa-use-package#installation][quelpa-use-package]] are installed
     - Add below snippet to your ~/.emacs.d/init.el or .emacs config file and execute it.
       #+begin_src elisp
         ;; Khoj Package
         (use-package khoj
           :quelpa (khoj :fetcher url :url "https://raw.githubusercontent.com/khoj-ai/khoj/master/interface/emacs/khoj.el")
           :bind ("C-c s" . 'khoj))
       #+end_src
 ** Usage
   1. Call ~khoj~ using keybinding ~C-c s~ or ~M-x khoj~
   2. Enter Query in Natural Language
      e.g. "What is the meaning of life?" "What are my life goals?"
   3. Wait for results
      *Note: It takes about 15s on a Mac M1 and a ~100K lines corpus of org-mode files*
   4. (Optional) Narrow down results further
      Include/Exclude specific words from results by adding to query
      e.g. "What is the meaning of life? -god +none"
 """,
            "readme.org": """
 * Khoj
  /Allow natural language search on user content like notes, images using transformer based models/
  All data is processed locally. User can interface with khoj app via [[./interface/emacs/khoj.el][Emacs]], API or Commandline
 ** Dependencies
   - Python3
   - [[https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links][Miniconda]]
 ** Install
   #+begin_src shell
   git clone https://github.com/khoj-ai/khoj && cd khoj
   conda env create -f environment.yml
   conda activate khoj
   #+end_src""",
        },
        "markdown": {
            "readme.markdown": """
 # Khoj
 Allow natural language search on user content like notes, images using transformer based models
 All data is processed locally. User can interface with khoj app via [Emacs](./interface/emacs/khoj.el), API or Commandline
 ## Dependencies
 - Python3
 - [Miniconda](https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links)
 ## Install
 ```shell
 git clone
 conda env create -f environment.yml
 conda activate khoj
 ```
 """
        },
        "plaintext": {
            "readme.txt": """
 Khoj
 Allow natural language search on user content like notes, images using transformer based models
 All data is processed locally. User can interface with khoj app via Emacs, API or Commandline
 Dependencies
 - Python3
 - Miniconda
 Install
 git clone
 conda env create -f environment.yml
 conda activate khoj
 """
        },
    }
    return sample_data[type]
--- a/tests/helpers.py
+++ b/tests/helpers.py
@@ -1,3 +1,5 @@
 import glob
 import logging
 import os
 from datetime import datetime
@@ -17,6 +19,9 @@ from khoj.database.models import (
    UserConversationConfig,
 )
 from khoj.processor.conversation.utils import message_to_log
 from khoj.utils.helpers import get_absolute_path, is_none_or_empty
 logger = logging.getLogger(__name__)
 def get_chat_provider(default: ChatModel.ModelType | None = ChatModel.ModelType.GOOGLE):
@@ -61,6 +66,140 @@ def generate_chat_history(message_list):
    return chat_history
 def get_sample_data(type):
    """
    Return in-memory sample documents for the requested content type.

    The result maps a file name to that file's text content. Supported
    values of `type` are "org", "markdown" and "plaintext"; any other value
    raises KeyError because the lookup is a plain dict index.

    NOTE(review): the parameter name `type` shadows the builtin inside this
    function; it is kept as is because callers may pass it by keyword.
    """
    sample_data = {
        "org": {
            "elisp.org": """
 * Emacs Khoj
  /An Emacs interface for [[https://github.com/khoj-ai/khoj][khoj]]/
 ** Requirements
   - Install and Run [[https://github.com/khoj-ai/khoj][khoj]]
 ** Installation
 *** Direct
     - Put ~khoj.el~ in your Emacs load path. For e.g. ~/.emacs.d/lisp
     - Load via ~use-package~ in your ~/.emacs.d/init.el or .emacs file by adding below snippet
       #+begin_src elisp
         ;; Khoj Package
         (use-package khoj
           :load-path "~/.emacs.d/lisp/khoj.el"
           :bind ("C-c s" . 'khoj))
       #+end_src
 *** Using [[https://github.com/quelpa/quelpa#installation][Quelpa]]
     - Ensure [[https://github.com/quelpa/quelpa#installation][Quelpa]], [[https://github.com/quelpa/quelpa-use-package#installation][quelpa-use-package]] are installed
     - Add below snippet to your ~/.emacs.d/init.el or .emacs config file and execute it.
       #+begin_src elisp
         ;; Khoj Package
         (use-package khoj
           :quelpa (khoj :fetcher url :url "https://raw.githubusercontent.com/khoj-ai/khoj/master/interface/emacs/khoj.el")
           :bind ("C-c s" . 'khoj))
       #+end_src
 ** Usage
   1. Call ~khoj~ using keybinding ~C-c s~ or ~M-x khoj~
   2. Enter Query in Natural Language
      e.g. "What is the meaning of life?" "What are my life goals?"
   3. Wait for results
      *Note: It takes about 15s on a Mac M1 and a ~100K lines corpus of org-mode files*
   4. (Optional) Narrow down results further
      Include/Exclude specific words from results by adding to query
      e.g. "What is the meaning of life? -god +none"
 """,
            "readme.org": """
 * Khoj
  /Allow natural language search on user content like notes, images using transformer based models/
  All data is processed locally. User can interface with khoj app via [[./interface/emacs/khoj.el][Emacs]], API or Commandline
 ** Dependencies
   - Python3
   - [[https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links][Miniconda]]
 ** Install
   #+begin_src shell
   git clone https://github.com/khoj-ai/khoj && cd khoj
   conda env create -f environment.yml
   conda activate khoj
   #+end_src""",
        },
        "markdown": {
            "readme.markdown": """
 # Khoj
 Allow natural language search on user content like notes, images using transformer based models
 All data is processed locally. User can interface with khoj app via [Emacs](./interface/emacs/khoj.el), API or Commandline
 ## Dependencies
 - Python3
 - [Miniconda](https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links)
 ## Install
 ```shell
 git clone
 conda env create -f environment.yml
 conda activate khoj
 ```
 """
        },
        "plaintext": {
            "readme.txt": """
 Khoj
 Allow natural language search on user content like notes, images using transformer based models
 All data is processed locally. User can interface with khoj app via Emacs, API or Commandline
 Dependencies
 - Python3
 - Miniconda
 Install
 git clone
 conda env create -f environment.yml
 conda activate khoj
 """
        },
    }
    # Plain dict index: an unsupported content type surfaces as KeyError.
    return sample_data[type]
 def get_index_files(
    input_files: list[str] = None, input_filters: list[str] | None = ["tests/data/org/*.org"]
 ) -> dict[str, str]:
    """
    Load test data files from disk into a path -> text content mapping.

    Args:
        input_files: Explicit file paths; each is passed through
            `get_absolute_path` before being read.
        input_filters: Glob patterns; each is passed through
            `get_absolute_path` and expanded with `glob.glob(..., recursive=True)`.
            Matches that are not regular files are dropped.

    Returns:
        A dict keyed by file path with the file's UTF-8 decoded text as the
        value, filled in sorted path order. An empty dict is returned, and a
        debug message logged, when `is_none_or_empty` reports both arguments
        as none or empty.

    Note:
        Only failures while reading an already opened file are logged and
        skipped. `open()` sits outside the `try`, so a path that cannot be
        opened raises to the caller.
    """
    nothing_requested = is_none_or_empty(input_files) and is_none_or_empty(input_filters)
    if nothing_requested:
        logger.debug("At least one of input_files or input_filter is required to be specified")
        return {}

    # Gather every candidate path into one set so duplicates collapse
    paths_to_read = set()
    for explicit_path in input_files or []:
        paths_to_read.add(get_absolute_path(explicit_path))
    for pattern in input_filters or []:
        for matched_path in glob.glob(get_absolute_path(pattern), recursive=True):
            if os.path.isfile(matched_path):
                paths_to_read.add(matched_path)

    # Read the files in sorted order so the resulting mapping is deterministic
    contents_by_path = {}
    for path in sorted(paths_to_read):
        with open(path, "r", encoding="utf8") as handle:
            try:
                contents_by_path[path] = handle.read()
            except Exception as read_error:
                logger.warning(f"Unable to read file: {path}. Skipping file.")
                logger.warning(read_error, exc_info=True)

    return contents_by_path
 class UserFactory(factory.django.DjangoModelFactory):
    class Meta:
        model = KhojUser
--- a/tests/test_agents.py
+++ b/tests/test_agents.py
@@ -15,7 +15,7 @@ from tests.helpers import ChatModelFactory
 def test_create_default_agent(default_user: KhojUser):
    ChatModelFactory()
-    agent = AgentAdapters.create_default_agent(default_user)
+    agent = AgentAdapters.create_default_agent()
    assert agent is not None
    assert agent.input_tools == []
    assert agent.output_modes == []
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1,49 +1,15 @@
 # Standard Modules
 from pathlib import Path
 from random import random
 from khoj.utils.cli import cli
 from khoj.utils.helpers import resolve_absolute_path
 # Test
 # ----------------------------------------------------------------------------------------------------
 def test_cli_minimal_default():
    # Act
-    actual_args = cli([])
+    actual_args = cli(["-vvv"])
    # Assert
-    assert actual_args.config_file == resolve_absolute_path(Path("~/.khoj/khoj.yml"))
+    assert actual_args.log_file == Path("~/.khoj/khoj.log")
    assert actual_args.regenerate == False
    assert actual_args.verbose == 0
 # ----------------------------------------------------------------------------------------------------
 def test_cli_invalid_config_file_path():
    # Arrange
    non_existent_config_file = f"non-existent-khoj-{random()}.yml"
    # Act
    actual_args = cli([f"--config-file={non_existent_config_file}"])
    # Assert
    assert actual_args.config_file == resolve_absolute_path(non_existent_config_file)
    assert actual_args.config == None
 # ----------------------------------------------------------------------------------------------------
 def test_cli_config_from_file():
    # Act
    actual_args = cli(["--config-file=tests/data/config.yml", "--regenerate", "-vvv"])
    # Assert
    assert actual_args.config_file == resolve_absolute_path(Path("tests/data/config.yml"))
    assert actual_args.regenerate == True
    assert actual_args.config is not None
    assert actual_args.verbose == 3
    # Ensure content config is loaded from file
    assert actual_args.config.content_type.org.input_files == [
        Path("~/first_from_config.org"),
        Path("~/second_from_config.org"),
    ]
--- a/tests/test_client.py
+++ b/tests/test_client.py
@@ -13,7 +13,6 @@ from khoj.database.models import KhojApiUser, KhojUser
 from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
 from khoj.search_type import text_search
 from khoj.utils import state
 from khoj.utils.rawconfig import ContentConfig, SearchConfig
 # Test
@@ -296,7 +295,7 @@ def test_get_configured_types_with_no_content_config(fastapi_app: FastAPI):
 # ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db(transaction=True)
-def test_notes_search(client, search_config: SearchConfig, sample_org_data, default_user: KhojUser):
+def test_notes_search(client, search_config, sample_org_data, default_user: KhojUser):
    # Arrange
    headers = {"Authorization": "Bearer kk-secret"}
    text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
@@ -315,7 +314,7 @@ def test_notes_search(client, search_config: SearchConfig, sample_org_data, defa
 # ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db(transaction=True)
-def test_notes_search_no_results(client, search_config: SearchConfig, sample_org_data, default_user: KhojUser):
+def test_notes_search_no_results(client, search_config, sample_org_data, default_user: KhojUser):
    # Arrange
    headers = {"Authorization": "Bearer kk-secret"}
    text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
@@ -331,9 +330,7 @@ def test_notes_search_no_results(client, search_config: SearchConfig, sample_org
 # ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db(transaction=True)
-def test_notes_search_with_only_filters(
+def test_notes_search_with_only_filters(client, sample_org_data, default_user: KhojUser):
    client, content_config: ContentConfig, search_config: SearchConfig, sample_org_data, default_user: KhojUser
 ):
    # Arrange
    headers = {"Authorization": "Bearer kk-secret"}
    text_search.setup(
@@ -397,9 +394,7 @@ def test_notes_search_with_exclude_filter(client, sample_org_data, default_user:
 # ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db(transaction=True)
-def test_notes_search_requires_parent_context(
+def test_notes_search_requires_parent_context(client, search_config, sample_org_data, default_user: KhojUser):
    client, search_config: SearchConfig, sample_org_data, default_user: KhojUser
 ):
    # Arrange
    headers = {"Authorization": "Bearer kk-secret"}
    text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
--- a/tests/test_file_filter.py
+++ b/tests/test_file_filter.py
@@ -1,6 +1,13 @@
 # Application Packages
 from khoj.search_filter.file_filter import FileFilter
-from khoj.utils.rawconfig import Entry
+
 # Mock Entry class for testing
 class Entry:
    """Lightweight test double exposing the `compiled`, `raw` and `file` attributes."""

    def __init__(self, compiled="", raw="", file=""):
        # Store the three fields as given; each defaults to an empty string.
        self.compiled, self.raw, self.file = compiled, raw, file
 def test_can_filter_no_file_filter():
--- a/tests/test_markdown_to_entries.py
+++ b/tests/test_markdown_to_entries.py
@@ -3,8 +3,6 @@ import re
 from pathlib import Path
 from khoj.processor.content.markdown.markdown_to_entries import MarkdownToEntries
 from khoj.utils.fs_syncer import get_markdown_files
 from khoj.utils.rawconfig import TextContentConfig
 def test_extract_markdown_with_no_headings(tmp_path):
@@ -212,43 +210,6 @@ longer body line 2.1
    ), "Third entry is second entries child heading"
 def test_get_markdown_files(tmp_path):
    "Ensure Markdown files specified via input-filter, input-files extracted"
    # Arrange
    # Include via input-filter globs
    group1_file1 = create_file(tmp_path, filename="group1-file1.md")
    group1_file2 = create_file(tmp_path, filename="group1-file2.md")
    group2_file1 = create_file(tmp_path, filename="group2-file1.markdown")
    group2_file2 = create_file(tmp_path, filename="group2-file2.markdown")
    # Include via input-file field
    file1 = create_file(tmp_path, filename="notes.md")
    # Not included by any filter
    create_file(tmp_path, filename="not-included-markdown.md")
    create_file(tmp_path, filename="not-included-text.txt")
    expected_files = set(
        [os.path.join(tmp_path, file.name) for file in [group1_file1, group1_file2, group2_file1, group2_file2, file1]]
    )
    # Setup input-files, input-filters
    input_files = [tmp_path / "notes.md"]
    input_filter = [tmp_path / "group1*.md", tmp_path / "group2*.markdown"]
    markdown_config = TextContentConfig(
        input_files=input_files,
        input_filter=[str(filter) for filter in input_filter],
        compressed_jsonl=tmp_path / "test.jsonl",
        embeddings_file=tmp_path / "test_embeddings.jsonl",
    )
    # Act
    extracted_org_files = get_markdown_files(markdown_config)
    # Assert
    assert len(extracted_org_files) == 5
    assert set(extracted_org_files.keys()) == expected_files
 def test_line_number_tracking_in_recursive_split():
    "Ensure line numbers in URIs are correct after recursive splitting by checking against the actual file."
    # Arrange
--- a/tests/test_org_to_entries.py
+++ b/tests/test_org_to_entries.py
@@ -4,9 +4,8 @@ import time
 from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
 from khoj.processor.content.text_to_entries import TextToEntries
 from khoj.utils.fs_syncer import get_org_files
 from khoj.utils.helpers import is_none_or_empty
-from khoj.utils.rawconfig import Entry, TextContentConfig
+from khoj.utils.rawconfig import Entry
 def test_configure_indexing_heading_only_entries(tmp_path):
@@ -330,46 +329,6 @@ def test_file_with_no_headings_to_entry(tmp_path):
    assert len(entries[1]) == 1
 def test_get_org_files(tmp_path):
    "Ensure Org files specified via input-filter, input-files extracted"
    # Arrange
    # Include via input-filter globs
    group1_file1 = create_file(tmp_path, filename="group1-file1.org")
    group1_file2 = create_file(tmp_path, filename="group1-file2.org")
    group2_file1 = create_file(tmp_path, filename="group2-file1.org")
    group2_file2 = create_file(tmp_path, filename="group2-file2.org")
    # Include via input-file field
    orgfile1 = create_file(tmp_path, filename="orgfile1.org")
    # Not included by any filter
    create_file(tmp_path, filename="orgfile2.org")
    create_file(tmp_path, filename="text1.txt")
    expected_files = set(
        [
            os.path.join(tmp_path, file.name)
            for file in [group1_file1, group1_file2, group2_file1, group2_file2, orgfile1]
        ]
    )
    # Setup input-files, input-filters
    input_files = [tmp_path / "orgfile1.org"]
    input_filter = [tmp_path / "group1*.org", tmp_path / "group2*.org"]
    org_config = TextContentConfig(
        input_files=input_files,
        input_filter=[str(filter) for filter in input_filter],
        compressed_jsonl=tmp_path / "test.jsonl",
        embeddings_file=tmp_path / "test_embeddings.jsonl",
    )
    # Act
    extracted_org_files = get_org_files(org_config)
    # Assert
    assert len(extracted_org_files) == 5
    assert set(extracted_org_files.keys()) == expected_files
 def test_extract_entries_with_different_level_headings(tmp_path):
    "Extract org entries with different level headings."
    # Arrange
--- a/tests/test_pdf_to_entries.py
+++ b/tests/test_pdf_to_entries.py
@@ -4,8 +4,6 @@ import re
 import pytest
 from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries
 from khoj.utils.fs_syncer import get_pdf_files
 from khoj.utils.rawconfig import TextContentConfig
 def test_single_page_pdf_to_jsonl():
@@ -61,43 +59,6 @@ def test_ocr_page_pdf_to_jsonl():
    assert re.search(expected_str_with_variable_spaces, raw_entry) is not None
 def test_get_pdf_files(tmp_path):
    "Ensure Pdf files specified via input-filter, input-files extracted"
    # Arrange
    # Include via input-filter globs
    group1_file1 = create_file(tmp_path, filename="group1-file1.pdf")
    group1_file2 = create_file(tmp_path, filename="group1-file2.pdf")
    group2_file1 = create_file(tmp_path, filename="group2-file1.pdf")
    group2_file2 = create_file(tmp_path, filename="group2-file2.pdf")
    # Include via input-file field
    file1 = create_file(tmp_path, filename="document.pdf")
    # Not included by any filter
    create_file(tmp_path, filename="not-included-document.pdf")
    create_file(tmp_path, filename="not-included-text.txt")
    expected_files = set(
        [os.path.join(tmp_path, file.name) for file in [group1_file1, group1_file2, group2_file1, group2_file2, file1]]
    )
    # Setup input-files, input-filters
    input_files = [tmp_path / "document.pdf"]
    input_filter = [tmp_path / "group1*.pdf", tmp_path / "group2*.pdf"]
    pdf_config = TextContentConfig(
        input_files=input_files,
        input_filter=[str(path) for path in input_filter],
        compressed_jsonl=tmp_path / "test.jsonl",
        embeddings_file=tmp_path / "test_embeddings.jsonl",
    )
    # Act
    extracted_pdf_files = get_pdf_files(pdf_config)
    # Assert
    assert len(extracted_pdf_files) == 5
    assert set(extracted_pdf_files.keys()) == expected_files
 # Helper Functions
 def create_file(tmp_path, entry=None, filename="document.pdf"):
    pdf_file = tmp_path / filename
--- a/tests/test_plaintext_to_entries.py
+++ b/tests/test_plaintext_to_entries.py
@@ -1,27 +1,20 @@
 import os
 from pathlib import Path
 from textwrap import dedent
 from khoj.database.models import KhojUser, LocalPlaintextConfig
 from khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries
 from khoj.utils.fs_syncer import get_plaintext_files
 from khoj.utils.rawconfig import TextContentConfig
-def test_plaintext_file(tmp_path):
+def test_plaintext_file():
    "Convert files with no heading to jsonl."
    # Arrange
    raw_entry = f"""
    Hi, I am a plaintext file and I have some plaintext words.
    """
-    plaintextfile = create_file(tmp_path, raw_entry)
+    plaintextfile = "test.txt"
    data = {plaintextfile: raw_entry}
    # Act
    # Extract Entries from specified plaintext files
    data = {
        f"{plaintextfile}": raw_entry,
    }
    entries = PlaintextToEntries.extract_plaintext_entries(data)
    # Convert each entry.file to absolute path to make them JSON serializable
@@ -37,59 +30,20 @@ def test_plaintext_file(tmp_path):
    assert entries[1][0].compiled == f"{plaintextfile}\n{raw_entry}"
-def test_get_plaintext_files(tmp_path):
+def test_parse_html_plaintext_file(tmp_path):
    "Ensure Plaintext files specified via input-filter, input-files extracted"
    # Arrange
    # Include via input-filter globs
    group1_file1 = create_file(tmp_path, filename="group1-file1.md")
    group1_file2 = create_file(tmp_path, filename="group1-file2.md")
    group2_file1 = create_file(tmp_path, filename="group2-file1.markdown")
    group2_file2 = create_file(tmp_path, filename="group2-file2.markdown")
    group2_file4 = create_file(tmp_path, filename="group2-file4.html")
    # Include via input-file field
    file1 = create_file(tmp_path, filename="notes.txt")
    # Include unsupported file types
    create_file(tmp_path, filename="group2-unincluded.py")
    create_file(tmp_path, filename="group2-unincluded.csv")
    create_file(tmp_path, filename="group2-unincluded.csv")
    create_file(tmp_path, filename="group2-file3.mbox")
    # Not included by any filter
    create_file(tmp_path, filename="not-included-markdown.md")
    create_file(tmp_path, filename="not-included-text.txt")
    expected_files = set(
        [
            os.path.join(tmp_path, file.name)
            for file in [group1_file1, group1_file2, group2_file1, group2_file2, group2_file4, file1]
        ]
    )
    # Setup input-files, input-filters
    input_files = [tmp_path / "notes.txt"]
    input_filter = [tmp_path / "group1*.md", tmp_path / "group2*.*"]
    plaintext_config = TextContentConfig(
        input_files=input_files,
        input_filter=[str(filter) for filter in input_filter],
        compressed_jsonl=tmp_path / "test.jsonl",
        embeddings_file=tmp_path / "test_embeddings.jsonl",
    )
    # Act
    extracted_plaintext_files = get_plaintext_files(plaintext_config)
    # Assert
    assert len(extracted_plaintext_files) == len(expected_files)
    assert set(extracted_plaintext_files.keys()) == set(expected_files)
 def test_parse_html_plaintext_file(content_config, default_user: KhojUser):
    "Ensure HTML files are parsed correctly"
    # Arrange
-    # Setup input-files, input-filters
+    raw_entry = dedent(
-    config = LocalPlaintextConfig.objects.filter(user=default_user).first()
+        f"""
-    extracted_plaintext_files = get_plaintext_files(config=config)
+        <html>
        <head><title>Test HTML</title></head>
        <body>
        <div>Test content</div>
        </body>
        </html>
        """
    )
    extracted_plaintext_files = {"test.html": raw_entry}
    # Act
    entries = PlaintextToEntries.extract_plaintext_entries(extracted_plaintext_files)
--- a/tests/test_text_search.py
+++ b/tests/test_text_search.py
@@ -2,23 +2,16 @@
 import asyncio
 import logging
 import os
 from pathlib import Path
 import pytest
 from khoj.database.adapters import EntryAdapters
-from khoj.database.models import Entry, GithubConfig, KhojUser, LocalOrgConfig
+from khoj.database.models import Entry, GithubConfig, KhojUser
 from khoj.processor.content.docx.docx_to_entries import DocxToEntries
 from khoj.processor.content.github.github_to_entries import GithubToEntries
 from khoj.processor.content.images.image_to_entries import ImageToEntries
 from khoj.processor.content.markdown.markdown_to_entries import MarkdownToEntries
 from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
 from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries
 from khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries
 from khoj.processor.content.text_to_entries import TextToEntries
 from khoj.search_type import text_search
-from khoj.utils.fs_syncer import collect_files, get_org_files
+from tests.helpers import get_index_files, get_sample_data
 from khoj.utils.rawconfig import ContentConfig, SearchConfig
 logger = logging.getLogger(__name__)
@@ -26,53 +19,20 @@ logger = logging.getLogger(__name__)
 # Test
 # ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db
-def test_text_search_setup_with_missing_file_raises_error(org_config_with_only_new_file: LocalOrgConfig):
+def test_text_search_setup_with_empty_file_creates_no_entries(search_config, default_user: KhojUser):
    # Arrange
    # Ensure file mentioned in org.input-files is missing
    single_new_file = Path(org_config_with_only_new_file.input_files[0])
    single_new_file.unlink()
    # Act
    # Generate notes embeddings during asymmetric setup
    with pytest.raises(FileNotFoundError):
        get_org_files(org_config_with_only_new_file)
 # ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db
 def test_get_org_files_with_org_suffixed_dir_doesnt_raise_error(tmp_path, default_user: KhojUser):
    # Arrange
    orgfile = tmp_path / "directory.org" / "file.org"
    orgfile.parent.mkdir()
    with open(orgfile, "w") as f:
        f.write("* Heading\n- List item\n")
    LocalOrgConfig.objects.create(
        input_filter=[f"{tmp_path}/**/*"],
        input_files=None,
        user=default_user,
    )
    # Act
    org_files = collect_files(user=default_user)["org"]
    # Assert
    # should return orgfile and not raise IsADirectoryError
    assert org_files == {f"{orgfile}": "* Heading\n- List item\n"}
 # ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db
 def test_text_search_setup_with_empty_file_creates_no_entries(
    org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser
 ):
    # Arrange
    initial_data = {
        "test.org": "* First heading\nFirst content",
        "test2.org": "* Second heading\nSecond content",
    }
    text_search.setup(OrgToEntries, initial_data, regenerate=True, user=default_user)
    existing_entries = Entry.objects.filter(user=default_user).count()
-    data = get_org_files(org_config_with_only_new_file)
+
    final_data = {"new_file.org": ""}
    # Act
    # Generate notes embeddings during asymmetric setup
-    text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
+    text_search.setup(OrgToEntries, final_data, regenerate=True, user=default_user)
    # Assert
    updated_entries = Entry.objects.filter(user=default_user).count()
@@ -84,13 +44,14 @@ def test_text_search_setup_with_empty_file_creates_no_entries(
 # ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db
-def test_text_indexer_deletes_embedding_before_regenerate(
+def test_text_indexer_deletes_embedding_before_regenerate(search_config, default_user: KhojUser, caplog):
    content_config: ContentConfig, default_user: KhojUser, caplog
 ):
    # Arrange
    data = {
        "test1.org": "* Test heading\nTest content",
        "test2.org": "* Another heading\nAnother content",
    }
    text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
    existing_entries = Entry.objects.filter(user=default_user).count()
    org_config = LocalOrgConfig.objects.filter(user=default_user).first()
    data = get_org_files(org_config)
    # Act
    # Generate notes embeddings during asymmetric setup
@@ -107,11 +68,10 @@ def test_text_indexer_deletes_embedding_before_regenerate(
 # ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db
-def test_text_index_same_if_content_unchanged(content_config: ContentConfig, default_user: KhojUser, caplog):
+def test_text_index_same_if_content_unchanged(search_config, default_user: KhojUser, caplog):
    # Arrange
    existing_entries = Entry.objects.filter(user=default_user)
-    org_config = LocalOrgConfig.objects.filter(user=default_user).first()
+    data = {"test.org": "* Test heading\nTest content"}
    data = get_org_files(org_config)
    # Act
    # Generate initial notes embeddings during asymmetric setup
@@ -136,20 +96,14 @@ def test_text_index_same_if_content_unchanged(content_config: ContentConfig, def
 # ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db
-@pytest.mark.anyio
+@pytest.mark.asyncio
-# @pytest.mark.asyncio
+async def test_text_search(search_config):
 async def test_text_search(search_config: SearchConfig):
    # Arrange
-    default_user = await KhojUser.objects.acreate(
+    default_user, _ = await KhojUser.objects.aget_or_create(
        username="test_user", password="test_password", email="test@example.com"
    )
-    org_config = await LocalOrgConfig.objects.acreate(
+    # Get some sample org data to index
-        input_files=None,
+    data = get_sample_data("org")
        input_filter=["tests/data/org/*.org"],
        index_heading_entries=False,
        user=default_user,
    )
    data = get_org_files(org_config)
    loop = asyncio.get_event_loop()
    await loop.run_in_executor(
@@ -175,17 +129,15 @@ async def test_text_search(search_config: SearchConfig):
 # ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db
-def test_entry_chunking_by_max_tokens(org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser, caplog):
+def test_entry_chunking_by_max_tokens(tmp_path, search_config, default_user: KhojUser, caplog):
    # Arrange
    # Insert org-mode entry with size exceeding max token limit to new org file
    max_tokens = 256
-    new_file_to_index = Path(org_config_with_only_new_file.input_files[0])
+    new_file_to_index = tmp_path / "test.org"
-    with open(new_file_to_index, "w") as f:
+    content = f"* Entry more than {max_tokens} words\n"
-        f.write(f"* Entry more than {max_tokens} words\n")
+    for index in range(max_tokens + 1):
-        for index in range(max_tokens + 1):
+        content += f"{index} "
-            f.write(f"{index} ")
+    data = {str(new_file_to_index): content}
    data = get_org_files(org_config_with_only_new_file)
    # Act
    # reload embeddings, entries, notes model after adding new org-mode file
@@ -200,9 +152,7 @@ def test_entry_chunking_by_max_tokens(org_config_with_only_new_file: LocalOrgCon
 # ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db
-def test_entry_chunking_by_max_tokens_not_full_corpus(
+def test_entry_chunking_by_max_tokens_not_full_corpus(tmp_path, search_config, default_user: KhojUser, caplog):
    org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser, caplog
 ):
    # Arrange
    # Insert org-mode entry with size exceeding max token limit to new org file
    data = {
@@ -231,13 +181,11 @@ conda activate khoj
    )
    max_tokens = 256
-    new_file_to_index = Path(org_config_with_only_new_file.input_files[0])
+    new_file_to_index = tmp_path / "test.org"
-    with open(new_file_to_index, "w") as f:
+    content = f"* Entry more than {max_tokens} words\n"
-        f.write(f"* Entry more than {max_tokens} words\n")
+    for index in range(max_tokens + 1):
-        for index in range(max_tokens + 1):
+        content += f"{index} "
-            f.write(f"{index} ")
+    data = {str(new_file_to_index): content}
    data = get_org_files(org_config_with_only_new_file)
    # Act
    # reload embeddings, entries, notes model after adding new org-mode file
@@ -257,34 +205,34 @@ conda activate khoj
 # ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db
-def test_regenerate_index_with_new_entry(content_config: ContentConfig, new_org_file: Path, default_user: KhojUser):
+def test_regenerate_index_with_new_entry(search_config, default_user: KhojUser):
    # Arrange
    # Initial indexed files
    text_search.setup(OrgToEntries, get_sample_data("org"), regenerate=True, user=default_user)
    existing_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
    org_config = LocalOrgConfig.objects.filter(user=default_user).first()
    initial_data = get_org_files(org_config)
-    # append org-mode entry to first org input file in config
+    # Regenerate index with only files from test data set
-    org_config.input_files = [f"{new_org_file}"]
+    files_to_index = get_index_files()
-    with open(new_org_file, "w") as f:
+    text_search.setup(OrgToEntries, files_to_index, regenerate=True, user=default_user)
        f.write("\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n")
    final_data = get_org_files(org_config)
    # Act
    text_search.setup(OrgToEntries, initial_data, regenerate=True, user=default_user)
    updated_entries1 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
    # Act
    # Update index with the new file
    new_file = "test.org"
    new_entry = "\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n"
    files_to_index[new_file] = new_entry
    # regenerate notes jsonl, model embeddings and model to include entry from new file
-    text_search.setup(OrgToEntries, final_data, regenerate=True, user=default_user)
+    text_search.setup(OrgToEntries, files_to_index, regenerate=True, user=default_user)
    updated_entries2 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
    # Assert
    for entry in updated_entries1:
        assert entry in updated_entries2
-    assert not any([new_org_file.name in entry for entry in updated_entries1])
+    assert not any([new_file in entry for entry in updated_entries1])
-    assert not any([new_org_file.name in entry for entry in existing_entries])
+    assert not any([new_file in entry for entry in existing_entries])
-    assert any([new_org_file.name in entry for entry in updated_entries2])
+    assert any([new_file in entry for entry in updated_entries2])
    assert any(
        ["Saw a super cute video of a chihuahua doing the Tango on Youtube" in entry for entry in updated_entries2]
@@ -294,28 +242,24 @@ def test_regenerate_index_with_new_entry(content_config: ContentConfig, new_org_
 # ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db
-def test_update_index_with_duplicate_entries_in_stable_order(
+def test_update_index_with_duplicate_entries_in_stable_order(tmp_path, search_config, default_user: KhojUser):
    org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser
 ):
    # Arrange
    initial_data = get_sample_data("org")
    text_search.setup(OrgToEntries, initial_data, regenerate=True, user=default_user)
    existing_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
    new_file_to_index = Path(org_config_with_only_new_file.input_files[0])
    # Insert org-mode entries with same compiled form into new org file
    new_file_to_index = tmp_path / "test.org"
    new_entry = "* TODO A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n"
-    with open(new_file_to_index, "w") as f:
+    # Initial data with duplicate entries
-        f.write(f"{new_entry}{new_entry}")
+    data = {str(new_file_to_index): f"{new_entry}{new_entry}"}
    data = get_org_files(org_config_with_only_new_file)
    # Act
    # generate embeddings, entries, notes model from scratch after adding new org-mode file
    text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
    updated_entries1 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
-    data = get_org_files(org_config_with_only_new_file)
+    # idempotent indexing when data unchanged
    # update embeddings, entries, notes model with no new changes
    text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
    updated_entries2 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
@@ -324,6 +268,7 @@ def test_update_index_with_duplicate_entries_in_stable_order(
    for entry in existing_entries:
        assert entry not in updated_entries1
    # verify the second indexing update has same entries and ordering as first
    for entry in updated_entries1:
        assert entry in updated_entries2
@@ -334,22 +279,17 @@ def test_update_index_with_duplicate_entries_in_stable_order(
 # ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db
-def test_update_index_with_deleted_entry(org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser):
+def test_update_index_with_deleted_entry(tmp_path, search_config, default_user: KhojUser):
    # Arrange
    existing_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
    new_file_to_index = Path(org_config_with_only_new_file.input_files[0])
-    # Insert org-mode entries with same compiled form into new org file
+    new_file_to_index = tmp_path / "test.org"
    new_entry = "* TODO A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n"
    with open(new_file_to_index, "w") as f:
        f.write(f"{new_entry}{new_entry} -- Tatooine")
    initial_data = get_org_files(org_config_with_only_new_file)
-    # update embeddings, entries, notes model after removing an entry from the org file
+    # Initial data with two entries
-    with open(new_file_to_index, "w") as f:
+    initial_data = {str(new_file_to_index): f"{new_entry}{new_entry} -- Tatooine"}
-        f.write(f"{new_entry}")
+    # Final data with only first entry, with second entry removed
-
+    final_data = {str(new_file_to_index): f"{new_entry}"}
    final_data = get_org_files(org_config_with_only_new_file)
    # Act
    # load embeddings, entries, notes model after adding new org file with 2 entries
@@ -375,29 +315,29 @@ def test_update_index_with_deleted_entry(org_config_with_only_new_file: LocalOrg
 # ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db
-def test_update_index_with_new_entry(content_config: ContentConfig, new_org_file: Path, default_user: KhojUser):
+def test_update_index_with_new_entry(search_config, default_user: KhojUser):
    # Arrange
-    existing_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
+    # Initial indexed files
-    org_config = LocalOrgConfig.objects.filter(user=default_user).first()
+    text_search.setup(OrgToEntries, get_sample_data("org"), regenerate=True, user=default_user)
-    data = get_org_files(org_config)
+    old_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
    text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
-    # append org-mode entry to first org input file in config
+    # Regenerate index with only files from test data set
-    with open(new_org_file, "w") as f:
+    files_to_index = get_index_files()
-        new_entry = "\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n"
+    new_entries = text_search.setup(OrgToEntries, files_to_index, regenerate=True, user=default_user)
        f.write(new_entry)
    data = get_org_files(org_config)
    # Act
-    # update embeddings, entries with the newly added note
+    # Update index with the new file
-    text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
+    new_file = "test.org"
-    updated_entries1 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
+    new_entry = "\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n"
    final_data = {new_file: new_entry}
    text_search.setup(OrgToEntries, final_data, regenerate=False, user=default_user)
    updated_new_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
    # Assert
-    for entry in existing_entries:
+    for old_entry in old_entries:
-        assert entry not in updated_entries1
+        assert old_entry not in updated_new_entries
-    assert len(updated_entries1) == len(existing_entries) + 1
+    assert len(updated_new_entries) == len(new_entries) + 1
    verify_embeddings(3, default_user)
@@ -409,9 +349,7 @@ def test_update_index_with_new_entry(content_config: ContentConfig, new_org_file
        (OrgToEntries),
    ],
 )
-def test_update_index_with_deleted_file(
+def test_update_index_with_deleted_file(text_to_entries: TextToEntries, search_config, default_user: KhojUser):
    org_config_with_only_new_file: LocalOrgConfig, text_to_entries: TextToEntries, default_user: KhojUser
 ):
    "Delete entries associated with new file when file path with empty content passed."
    # Arrange
    file_to_index = "test"
@@ -446,7 +384,7 @@ def test_update_index_with_deleted_file(
 # ----------------------------------------------------------------------------------------------------
@pytest.mark.skipif(os.getenv("GITHUB_PAT_TOKEN") is None, reason="GITHUB_PAT_TOKEN not set")
-def test_text_search_setup_github(content_config: ContentConfig, default_user: KhojUser):
+def test_text_search_setup_github(search_config, default_user: KhojUser):
    # Arrange
    github_config = GithubConfig.objects.filter(user=default_user).first()
--- a/tests/test_word_filter.py
+++ b/tests/test_word_filter.py
@@ -1,6 +1,12 @@
 # Application Packages
 from khoj.search_filter.word_filter import WordFilter
-from khoj.utils.rawconfig import Entry
+
 # Mock Entry class for testing
 class Entry:
    def __init__(self, compiled="", raw=""):
        self.compiled = compiled
        self.raw = raw
 # Test