From 892d57314e3a047a3f63deb775aaf09374d8ed8e Mon Sep 17 00:00:00 2001
From: Debanjum <debanjum@gmail.com>
Date: Fri, 11 Jul 2025 14:35:05 -0700
Subject: [PATCH] Update test setup to index test data after old indexing code
 removed

- Delete tests testing deprecated server side indexing flows
- Delete `Local(Plaintext|Org|Markdown|Pdf)Config' methods, files and
  references in tests
- Index test data via new helper method, `get_index_files'
  - It is modelled after the old `get_org_files' variants in main app
  - It passes the test data in the required format to `configure_content'.
    This allows maintaining the more realistic tests from before while
    using the new indexing mechanism (rather than the deprecated server
    side indexing mechanism).
---
 tests/conftest.py                  | 259 ++++-------------------------
 tests/helpers.py                   | 139 ++++++++++++++++
 tests/test_agents.py               |   2 +-
 tests/test_cli.py                  |  38 +----
 tests/test_client.py               |  13 +-
 tests/test_file_filter.py          |   9 +-
 tests/test_markdown_to_entries.py  |  39 -----
 tests/test_org_to_entries.py       |  43 +----
 tests/test_pdf_to_entries.py       |  39 -----
 tests/test_plaintext_to_entries.py |  78 ++-------
 tests/test_text_search.py          | 232 ++++++++++----------------
 tests/test_word_filter.py          |   8 +-
 12 files changed, 295 insertions(+), 604 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 097a0ab0..dd448bd1 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,6 +1,3 @@
-import os
-from pathlib import Path
-
 import pytest
 from fastapi import FastAPI
 from fastapi.staticfiles import StaticFiles
@@ -11,6 +8,7 @@ from khoj.configure import (
     configure_routes,
     configure_search_types,
 )
+from khoj.database.adapters import get_default_search_model
 from khoj.database.models import (
     Agent,
     ChatModel,
@@ -19,21 +17,14 @@ from khoj.database.models import (
     GithubRepoConfig,
     KhojApiUser,
     KhojUser,
-    LocalMarkdownConfig,
-    LocalOrgConfig,
-    LocalPdfConfig,
-    LocalPlaintextConfig,
 )
 from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
 from khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries
 from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel
 from khoj.routers.api_content import configure_content
 from khoj.search_type import text_search
-from khoj.utils import fs_syncer, state
-from khoj.utils.config import SearchModels
+from khoj.utils import state
 from khoj.utils.constants import web_directory
-from khoj.utils.helpers import resolve_absolute_path
-from khoj.utils.rawconfig import ContentConfig, SearchConfig
 from tests.helpers import (
     AiModelApiFactory,
     ChatModelFactory,
@@ -43,6 +34,8 @@ from tests.helpers import (
     UserFactory,
     get_chat_api_key,
     get_chat_provider,
+    get_index_files,
+    get_sample_data,
 )
 
 
@@ -59,17 +52,16 @@ def django_db_setup(django_db_setup, django_db_blocker):
 
 
 @pytest.fixture(scope="session")
-def search_config() -> SearchConfig:
+def search_config():
+    search_model = get_default_search_model()
     state.embeddings_model = dict()
-    state.embeddings_model["default"] = EmbeddingsModel()
+    state.embeddings_model["default"] = EmbeddingsModel(
+        model_name=search_model.bi_encoder, model_kwargs=search_model.bi_encoder_model_config
+    )
     state.cross_encoder_model = dict()
-    state.cross_encoder_model["default"] = CrossEncoderModel()
-
-    model_dir = resolve_absolute_path("~/.khoj/search")
-    model_dir.mkdir(parents=True, exist_ok=True)
-    search_config = SearchConfig()
-
-    return search_config
+    state.cross_encoder_model["default"] = CrossEncoderModel(
+        model_name=search_model.cross_encoder, model_kwargs=search_model.cross_encoder_model_config
+    )
 
 
 @pytest.mark.django_db
@@ -201,13 +193,6 @@ def openai_agent():
     )
 
 
-@pytest.fixture(scope="session")
-def search_models(search_config: SearchConfig):
-    search_models = SearchModels()
-
-    return search_models
-
-
 @pytest.mark.django_db
 @pytest.fixture
 def default_process_lock():
@@ -219,72 +204,23 @@ def anyio_backend():
     return "asyncio"
 
 
-@pytest.mark.django_db
 @pytest.fixture(scope="function")
-def content_config(tmp_path_factory, search_models: SearchModels, default_user: KhojUser):
-    content_dir = tmp_path_factory.mktemp("content")
-
-    # Generate Image Embeddings from Test Images
-    content_config = ContentConfig()
-
-    LocalOrgConfig.objects.create(
-        input_files=None,
-        input_filter=["tests/data/org/*.org"],
-        index_heading_entries=False,
-        user=default_user,
-    )
-
-    text_search.setup(OrgToEntries, get_sample_data("org"), regenerate=False, user=default_user)
-
-    if os.getenv("GITHUB_PAT_TOKEN"):
-        GithubConfig.objects.create(
-            pat_token=os.getenv("GITHUB_PAT_TOKEN"),
-            user=default_user,
-        )
-
-        GithubRepoConfig.objects.create(
-            owner="khoj-ai",
-            name="lantern",
-            branch="master",
-            github_config=GithubConfig.objects.get(user=default_user),
-        )
-
-    LocalPlaintextConfig.objects.create(
-        input_files=None,
-        input_filter=["tests/data/plaintext/*.txt", "tests/data/plaintext/*.md", "tests/data/plaintext/*.html"],
-        user=default_user,
-    )
-
-    return content_config
-
-
-@pytest.fixture(scope="session")
-def md_content_config():
-    markdown_config = LocalMarkdownConfig.objects.create(
-        input_files=None,
-        input_filter=["tests/data/markdown/*.markdown"],
-    )
-
-    return markdown_config
-
-
-@pytest.fixture(scope="function")
-def chat_client(search_config: SearchConfig, default_user2: KhojUser):
+def chat_client(search_config, default_user2: KhojUser):
     return chat_client_builder(search_config, default_user2, require_auth=False)
 
 
 @pytest.fixture(scope="function")
-def chat_client_with_auth(search_config: SearchConfig, default_user2: KhojUser):
+def chat_client_with_auth(search_config, default_user2: KhojUser):
     return chat_client_builder(search_config, default_user2, require_auth=True)
 
 
 @pytest.fixture(scope="function")
-def chat_client_no_background(search_config: SearchConfig, default_user2: KhojUser):
+def chat_client_no_background(search_config, default_user2: KhojUser):
     return chat_client_builder(search_config, default_user2, index_content=False, require_auth=False)
 
 
 @pytest.fixture(scope="function")
-def chat_client_with_large_kb(search_config: SearchConfig, default_user2: KhojUser):
+def chat_client_with_large_kb(search_config, default_user2: KhojUser):
     """
     Chat client fixture that creates a large knowledge base with many files
     for stress testing atomic agent updates.
@@ -298,15 +234,11 @@ def chat_client_builder(search_config, user, index_content=True, require_auth=Fa
     state.SearchType = configure_search_types()
 
     if index_content:
-        LocalMarkdownConfig.objects.create(
-            input_files=None,
-            input_filter=["tests/data/markdown/*.markdown"],
-            user=user,
-        )
+        file_type = "markdown"
+        files_to_index = {file_type: get_index_files(input_filters=[f"tests/data/{file_type}/*.{file_type}"])}
 
         # Index Markdown Content for Search
-        all_files = fs_syncer.collect_files(user=user)
-        configure_content(user, all_files)
+        configure_content(user, files_to_index)
 
     # Initialize Processor from Config
     chat_provider = get_chat_provider()
@@ -346,12 +278,13 @@ def large_kb_chat_client_builder(search_config, user):
 
     # Create temporary directory for large number of test files
     temp_dir = tempfile.mkdtemp(prefix="khoj_test_large_kb_")
+    file_type = "markdown"
     large_file_list = []
 
     try:
         # Generate 200 test files with substantial content
         for i in range(300):
-            file_path = os.path.join(temp_dir, f"test_file_{i:03d}.markdown")
+            file_path = os.path.join(temp_dir, f"test_file_{i:03d}.{file_type}")
             content = f"""
 # Test File {i}
 
@@ -401,16 +334,9 @@ End of file {i}.
                 f.write(content)
             large_file_list.append(file_path)
 
-        # Create LocalMarkdownConfig with all the generated files
-        LocalMarkdownConfig.objects.create(
-            input_files=large_file_list,
-            input_filter=None,
-            user=user,
-        )
-
-        # Index all the files into the user's knowledge base
-        all_files = fs_syncer.collect_files(user=user)
-        configure_content(user, all_files)
+        # Index all generated files into the user's knowledge base
+        files_to_index = {file_type: get_index_files(input_files=large_file_list, input_filters=None)}
+        configure_content(user, files_to_index)
 
         # Verify we have a substantial knowledge base
         file_count = FileObject.objects.filter(user=user, agent=None).count()
@@ -493,139 +419,18 @@ def client(
     return TestClient(app)
 
 
-@pytest.fixture(scope="function")
-def new_org_file(default_user: KhojUser, content_config: ContentConfig):
-    # Setup
-    org_config = LocalOrgConfig.objects.filter(user=default_user).first()
-    input_filters = org_config.input_filter
-    new_org_file = Path(input_filters[0]).parent / "new_file.org"
-    new_org_file.touch()
-
-    yield new_org_file
-
-    # Cleanup
-    if new_org_file.exists():
-        new_org_file.unlink()
-
-
-@pytest.fixture(scope="function")
-def org_config_with_only_new_file(new_org_file: Path, default_user: KhojUser):
-    LocalOrgConfig.objects.update(input_files=[str(new_org_file)], input_filter=None)
-    return LocalOrgConfig.objects.filter(user=default_user).first()
-
-
 @pytest.fixture(scope="function")
 def pdf_configured_user1(default_user: KhojUser):
-    LocalPdfConfig.objects.create(
-        input_files=None,
-        input_filter=["tests/data/pdf/singlepage.pdf"],
-        user=default_user,
-    )
-    # Index Markdown Content for Search
-    all_files = fs_syncer.collect_files(user=default_user)
-    configure_content(default_user, all_files)
+    # Read data from pdf file at tests/data/pdf/singlepage.pdf
+    pdf_file_path = "tests/data/pdf/singlepage.pdf"
+    with open(pdf_file_path, "rb") as pdf_file:
+        pdf_data = pdf_file.read()
+
+    knowledge_base = {"pdf": {"singlepage.pdf": pdf_data}}
+    # Index Content for Search
+    configure_content(default_user, knowledge_base)
 
 
 @pytest.fixture(scope="function")
 def sample_org_data():
     return get_sample_data("org")
-
-
-def get_sample_data(type):
-    sample_data = {
-        "org": {
-            "elisp.org": """
-* Emacs Khoj
-  /An Emacs interface for [[https://github.com/khoj-ai/khoj][khoj]]/
-
-** Requirements
-   - Install and Run [[https://github.com/khoj-ai/khoj][khoj]]
-
-** Installation
-*** Direct
-     - Put ~khoj.el~ in your Emacs load path. For e.g. ~/.emacs.d/lisp
-     - Load via ~use-package~ in your ~/.emacs.d/init.el or .emacs file by adding below snippet
-       #+begin_src elisp
-         ;; Khoj Package
-         (use-package khoj
-           :load-path "~/.emacs.d/lisp/khoj.el"
-           :bind ("C-c s" . 'khoj))
-       #+end_src
-
-*** Using [[https://github.com/quelpa/quelpa#installation][Quelpa]]
-     - Ensure [[https://github.com/quelpa/quelpa#installation][Quelpa]], [[https://github.com/quelpa/quelpa-use-package#installation][quelpa-use-package]] are installed
-     - Add below snippet to your ~/.emacs.d/init.el or .emacs config file and execute it.
-       #+begin_src elisp
-         ;; Khoj Package
-         (use-package khoj
-           :quelpa (khoj :fetcher url :url "https://raw.githubusercontent.com/khoj-ai/khoj/master/interface/emacs/khoj.el")
-           :bind ("C-c s" . 'khoj))
-       #+end_src
-
-** Usage
-   1. Call ~khoj~ using keybinding ~C-c s~ or ~M-x khoj~
-   2. Enter Query in Natural Language
-      e.g. "What is the meaning of life?" "What are my life goals?"
-   3. Wait for results
-      *Note: It takes about 15s on a Mac M1 and a ~100K lines corpus of org-mode files*
-   4. (Optional) Narrow down results further
-      Include/Exclude specific words from results by adding to query
-      e.g. "What is the meaning of life? -god +none"
-
-""",
-            "readme.org": """
-* Khoj
-  /Allow natural language search on user content like notes, images using transformer based models/
-
-  All data is processed locally. User can interface with khoj app via [[./interface/emacs/khoj.el][Emacs]], API or Commandline
-
-** Dependencies
-   - Python3
-   - [[https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links][Miniconda]]
-
-** Install
-   #+begin_src shell
-   git clone https://github.com/khoj-ai/khoj && cd khoj
-   conda env create -f environment.yml
-   conda activate khoj
-   #+end_src""",
-        },
-        "markdown": {
-            "readme.markdown": """
-# Khoj
-Allow natural language search on user content like notes, images using transformer based models
-
-All data is processed locally. User can interface with khoj app via [Emacs](./interface/emacs/khoj.el), API or Commandline
-
-## Dependencies
-- Python3
-- [Miniconda](https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links)
-
-## Install
-```shell
-git clone
-conda env create -f environment.yml
-conda activate khoj
-```
-"""
-        },
-        "plaintext": {
-            "readme.txt": """
-Khoj
-Allow natural language search on user content like notes, images using transformer based models
-
-All data is processed locally. User can interface with khoj app via Emacs, API or Commandline
-
-Dependencies
-- Python3
-- Miniconda
-
-Install
-git clone
-conda env create -f environment.yml
-conda activate khoj
-"""
-        },
-    }
-
-    return sample_data[type]
diff --git a/tests/helpers.py b/tests/helpers.py
index 53ce4ea6..6edb0946 100644
--- a/tests/helpers.py
+++ b/tests/helpers.py
@@ -1,3 +1,5 @@
+import glob
+import logging
 import os
 from datetime import datetime
 
@@ -17,6 +19,9 @@ from khoj.database.models import (
     UserConversationConfig,
 )
 from khoj.processor.conversation.utils import message_to_log
+from khoj.utils.helpers import get_absolute_path, is_none_or_empty
+
+logger = logging.getLogger(__name__)
 
 
 def get_chat_provider(default: ChatModel.ModelType | None = ChatModel.ModelType.GOOGLE):
@@ -61,6 +66,140 @@ def generate_chat_history(message_list):
     return chat_history
 
 
+def get_sample_data(type):
+    sample_data = {
+        "org": {
+            "elisp.org": """
+* Emacs Khoj
+  /An Emacs interface for [[https://github.com/khoj-ai/khoj][khoj]]/
+
+** Requirements
+   - Install and Run [[https://github.com/khoj-ai/khoj][khoj]]
+
+** Installation
+*** Direct
+     - Put ~khoj.el~ in your Emacs load path. For e.g. ~/.emacs.d/lisp
+     - Load via ~use-package~ in your ~/.emacs.d/init.el or .emacs file by adding below snippet
+       #+begin_src elisp
+         ;; Khoj Package
+         (use-package khoj
+           :load-path "~/.emacs.d/lisp/khoj.el"
+           :bind ("C-c s" . 'khoj))
+       #+end_src
+
+*** Using [[https://github.com/quelpa/quelpa#installation][Quelpa]]
+     - Ensure [[https://github.com/quelpa/quelpa#installation][Quelpa]], [[https://github.com/quelpa/quelpa-use-package#installation][quelpa-use-package]] are installed
+     - Add below snippet to your ~/.emacs.d/init.el or .emacs config file and execute it.
+       #+begin_src elisp
+         ;; Khoj Package
+         (use-package khoj
+           :quelpa (khoj :fetcher url :url "https://raw.githubusercontent.com/khoj-ai/khoj/master/interface/emacs/khoj.el")
+           :bind ("C-c s" . 'khoj))
+       #+end_src
+
+** Usage
+   1. Call ~khoj~ using keybinding ~C-c s~ or ~M-x khoj~
+   2. Enter Query in Natural Language
+      e.g. "What is the meaning of life?" "What are my life goals?"
+   3. Wait for results
+      *Note: It takes about 15s on a Mac M1 and a ~100K lines corpus of org-mode files*
+   4. (Optional) Narrow down results further
+      Include/Exclude specific words from results by adding to query
+      e.g. "What is the meaning of life? -god +none"
+
+""",
+            "readme.org": """
+* Khoj
+  /Allow natural language search on user content like notes, images using transformer based models/
+
+  All data is processed locally. User can interface with khoj app via [[./interface/emacs/khoj.el][Emacs]], API or Commandline
+
+** Dependencies
+   - Python3
+   - [[https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links][Miniconda]]
+
+** Install
+   #+begin_src shell
+   git clone https://github.com/khoj-ai/khoj && cd khoj
+   conda env create -f environment.yml
+   conda activate khoj
+   #+end_src""",
+        },
+        "markdown": {
+            "readme.markdown": """
+# Khoj
+Allow natural language search on user content like notes, images using transformer based models
+
+All data is processed locally. User can interface with khoj app via [Emacs](./interface/emacs/khoj.el), API or Commandline
+
+## Dependencies
+- Python3
+- [Miniconda](https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links)
+
+## Install
+```shell
+git clone
+conda env create -f environment.yml
+conda activate khoj
+```
+"""
+        },
+        "plaintext": {
+            "readme.txt": """
+Khoj
+Allow natural language search on user content like notes, images using transformer based models
+
+All data is processed locally. User can interface with khoj app via Emacs, API or Commandline
+
+Dependencies
+- Python3
+- Miniconda
+
+Install
+git clone
+conda env create -f environment.yml
+conda activate khoj
+"""
+        },
+    }
+
+    return sample_data[type]
+
+
+def get_index_files(
+    input_files: list[str] = None, input_filters: list[str] | None = ["tests/data/org/*.org"]
+) -> dict[str, str]:
+    # Input Validation
+    if is_none_or_empty(input_files) and is_none_or_empty(input_filters):
+        logger.debug("At least one of input_files or input_filter is required to be specified")
+        return {}
+
+    # Get files to process
+    absolute_files, filtered_files = set(), set()
+    if input_files:
+        absolute_files = {get_absolute_path(input_file) for input_file in input_files}
+    if input_filters:
+        filtered_files = {
+            filtered_file
+            for file_filter in input_filters
+            for filtered_file in glob.glob(get_absolute_path(file_filter), recursive=True)
+            if os.path.isfile(filtered_file)
+        }
+
+    all_files = sorted(absolute_files | filtered_files)
+
+    filename_to_content_map = {}
+    for file in all_files:
+        with open(file, "r", encoding="utf8") as f:
+            try:
+                filename_to_content_map[file] = f.read()
+            except Exception as e:
+                logger.warning(f"Unable to read file: {file}. Skipping file.")
+                logger.warning(e, exc_info=True)
+
+    return filename_to_content_map
+
+
 class UserFactory(factory.django.DjangoModelFactory):
     class Meta:
         model = KhojUser
diff --git a/tests/test_agents.py b/tests/test_agents.py
index 21a242ef..1d3b96ec 100644
--- a/tests/test_agents.py
+++ b/tests/test_agents.py
@@ -15,7 +15,7 @@ from tests.helpers import ChatModelFactory
 def test_create_default_agent(default_user: KhojUser):
     ChatModelFactory()
 
-    agent = AgentAdapters.create_default_agent(default_user)
+    agent = AgentAdapters.create_default_agent()
     assert agent is not None
     assert agent.input_tools == []
     assert agent.output_modes == []
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 211ff38e..15908653 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1,49 +1,15 @@
 # Standard Modules
 from pathlib import Path
-from random import random
 
 from khoj.utils.cli import cli
-from khoj.utils.helpers import resolve_absolute_path
 
 
 # Test
 # ----------------------------------------------------------------------------------------------------
 def test_cli_minimal_default():
     # Act
-    actual_args = cli([])
+    actual_args = cli(["-vvv"])
 
     # Assert
-    assert actual_args.config_file == resolve_absolute_path(Path("~/.khoj/khoj.yml"))
-    assert actual_args.regenerate == False
-    assert actual_args.verbose == 0
-
-
-# ----------------------------------------------------------------------------------------------------
-def test_cli_invalid_config_file_path():
-    # Arrange
-    non_existent_config_file = f"non-existent-khoj-{random()}.yml"
-
-    # Act
-    actual_args = cli([f"--config-file={non_existent_config_file}"])
-
-    # Assert
-    assert actual_args.config_file == resolve_absolute_path(non_existent_config_file)
-    assert actual_args.config == None
-
-
-# ----------------------------------------------------------------------------------------------------
-def test_cli_config_from_file():
-    # Act
-    actual_args = cli(["--config-file=tests/data/config.yml", "--regenerate", "-vvv"])
-
-    # Assert
-    assert actual_args.config_file == resolve_absolute_path(Path("tests/data/config.yml"))
-    assert actual_args.regenerate == True
-    assert actual_args.config is not None
+    assert actual_args.log_file == Path("~/.khoj/khoj.log")
     assert actual_args.verbose == 3
-
-    # Ensure content config is loaded from file
-    assert actual_args.config.content_type.org.input_files == [
-        Path("~/first_from_config.org"),
-        Path("~/second_from_config.org"),
-    ]
diff --git a/tests/test_client.py b/tests/test_client.py
index b7341d0e..46732a86 100644
--- a/tests/test_client.py
+++ b/tests/test_client.py
@@ -13,7 +13,6 @@ from khoj.database.models import KhojApiUser, KhojUser
 from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
 from khoj.search_type import text_search
 from khoj.utils import state
-from khoj.utils.rawconfig import ContentConfig, SearchConfig
 
 
 # Test
@@ -296,7 +295,7 @@ def test_get_configured_types_with_no_content_config(fastapi_app: FastAPI):
 
 # ----------------------------------------------------------------------------------------------------
 @pytest.mark.django_db(transaction=True)
-def test_notes_search(client, search_config: SearchConfig, sample_org_data, default_user: KhojUser):
+def test_notes_search(client, search_config, sample_org_data, default_user: KhojUser):
     # Arrange
     headers = {"Authorization": "Bearer kk-secret"}
     text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
@@ -315,7 +314,7 @@ def test_notes_search(client, search_config: SearchConfig, sample_org_data, defa
 
 # ----------------------------------------------------------------------------------------------------
 @pytest.mark.django_db(transaction=True)
-def test_notes_search_no_results(client, search_config: SearchConfig, sample_org_data, default_user: KhojUser):
+def test_notes_search_no_results(client, search_config, sample_org_data, default_user: KhojUser):
     # Arrange
     headers = {"Authorization": "Bearer kk-secret"}
     text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
@@ -331,9 +330,7 @@ def test_notes_search_no_results(client, search_config: SearchConfig, sample_org
 
 # ----------------------------------------------------------------------------------------------------
 @pytest.mark.django_db(transaction=True)
-def test_notes_search_with_only_filters(
-    client, content_config: ContentConfig, search_config: SearchConfig, sample_org_data, default_user: KhojUser
-):
+def test_notes_search_with_only_filters(client, sample_org_data, default_user: KhojUser):
     # Arrange
     headers = {"Authorization": "Bearer kk-secret"}
     text_search.setup(
@@ -397,9 +394,7 @@ def test_notes_search_with_exclude_filter(client, sample_org_data, default_user:
 
 # ----------------------------------------------------------------------------------------------------
 @pytest.mark.django_db(transaction=True)
-def test_notes_search_requires_parent_context(
-    client, search_config: SearchConfig, sample_org_data, default_user: KhojUser
-):
+def test_notes_search_requires_parent_context(client, search_config, sample_org_data, default_user: KhojUser):
     # Arrange
     headers = {"Authorization": "Bearer kk-secret"}
     text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
diff --git a/tests/test_file_filter.py b/tests/test_file_filter.py
index 9a36bd57..21f198ea 100644
--- a/tests/test_file_filter.py
+++ b/tests/test_file_filter.py
@@ -1,6 +1,13 @@
 # Application Packages
 from khoj.search_filter.file_filter import FileFilter
-from khoj.utils.rawconfig import Entry
+
+
+# Mock Entry class for testing
+class Entry:
+    def __init__(self, compiled="", raw="", file=""):
+        self.compiled = compiled
+        self.raw = raw
+        self.file = file
 
 
 def test_can_filter_no_file_filter():
diff --git a/tests/test_markdown_to_entries.py b/tests/test_markdown_to_entries.py
index 30813555..b8ab37a7 100644
--- a/tests/test_markdown_to_entries.py
+++ b/tests/test_markdown_to_entries.py
@@ -3,8 +3,6 @@ import re
 from pathlib import Path
 
 from khoj.processor.content.markdown.markdown_to_entries import MarkdownToEntries
-from khoj.utils.fs_syncer import get_markdown_files
-from khoj.utils.rawconfig import TextContentConfig
 
 
 def test_extract_markdown_with_no_headings(tmp_path):
@@ -212,43 +210,6 @@ longer body line 2.1
     ), "Third entry is second entries child heading"
 
 
-def test_get_markdown_files(tmp_path):
-    "Ensure Markdown files specified via input-filter, input-files extracted"
-    # Arrange
-    # Include via input-filter globs
-    group1_file1 = create_file(tmp_path, filename="group1-file1.md")
-    group1_file2 = create_file(tmp_path, filename="group1-file2.md")
-    group2_file1 = create_file(tmp_path, filename="group2-file1.markdown")
-    group2_file2 = create_file(tmp_path, filename="group2-file2.markdown")
-    # Include via input-file field
-    file1 = create_file(tmp_path, filename="notes.md")
-    # Not included by any filter
-    create_file(tmp_path, filename="not-included-markdown.md")
-    create_file(tmp_path, filename="not-included-text.txt")
-
-    expected_files = set(
-        [os.path.join(tmp_path, file.name) for file in [group1_file1, group1_file2, group2_file1, group2_file2, file1]]
-    )
-
-    # Setup input-files, input-filters
-    input_files = [tmp_path / "notes.md"]
-    input_filter = [tmp_path / "group1*.md", tmp_path / "group2*.markdown"]
-
-    markdown_config = TextContentConfig(
-        input_files=input_files,
-        input_filter=[str(filter) for filter in input_filter],
-        compressed_jsonl=tmp_path / "test.jsonl",
-        embeddings_file=tmp_path / "test_embeddings.jsonl",
-    )
-
-    # Act
-    extracted_org_files = get_markdown_files(markdown_config)
-
-    # Assert
-    assert len(extracted_org_files) == 5
-    assert set(extracted_org_files.keys()) == expected_files
-
-
 def test_line_number_tracking_in_recursive_split():
     "Ensure line numbers in URIs are correct after recursive splitting by checking against the actual file."
     # Arrange
diff --git a/tests/test_org_to_entries.py b/tests/test_org_to_entries.py
index d5dcdbd2..0196ef6c 100644
--- a/tests/test_org_to_entries.py
+++ b/tests/test_org_to_entries.py
@@ -4,9 +4,8 @@ import time
 
 from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
 from khoj.processor.content.text_to_entries import TextToEntries
-from khoj.utils.fs_syncer import get_org_files
 from khoj.utils.helpers import is_none_or_empty
-from khoj.utils.rawconfig import Entry, TextContentConfig
+from khoj.utils.rawconfig import Entry
 
 
 def test_configure_indexing_heading_only_entries(tmp_path):
@@ -330,46 +329,6 @@ def test_file_with_no_headings_to_entry(tmp_path):
     assert len(entries[1]) == 1
 
 
-def test_get_org_files(tmp_path):
-    "Ensure Org files specified via input-filter, input-files extracted"
-    # Arrange
-    # Include via input-filter globs
-    group1_file1 = create_file(tmp_path, filename="group1-file1.org")
-    group1_file2 = create_file(tmp_path, filename="group1-file2.org")
-    group2_file1 = create_file(tmp_path, filename="group2-file1.org")
-    group2_file2 = create_file(tmp_path, filename="group2-file2.org")
-    # Include via input-file field
-    orgfile1 = create_file(tmp_path, filename="orgfile1.org")
-    # Not included by any filter
-    create_file(tmp_path, filename="orgfile2.org")
-    create_file(tmp_path, filename="text1.txt")
-
-    expected_files = set(
-        [
-            os.path.join(tmp_path, file.name)
-            for file in [group1_file1, group1_file2, group2_file1, group2_file2, orgfile1]
-        ]
-    )
-
-    # Setup input-files, input-filters
-    input_files = [tmp_path / "orgfile1.org"]
-    input_filter = [tmp_path / "group1*.org", tmp_path / "group2*.org"]
-
-    org_config = TextContentConfig(
-        input_files=input_files,
-        input_filter=[str(filter) for filter in input_filter],
-        compressed_jsonl=tmp_path / "test.jsonl",
-        embeddings_file=tmp_path / "test_embeddings.jsonl",
-    )
-
-    # Act
-    extracted_org_files = get_org_files(org_config)
-
-    # Assert
-    assert len(extracted_org_files) == 5
-    assert set(extracted_org_files.keys()) == expected_files
-
-
 def test_extract_entries_with_different_level_headings(tmp_path):
     "Extract org entries with different level headings."
     # Arrange
diff --git a/tests/test_pdf_to_entries.py b/tests/test_pdf_to_entries.py
index a62eca8b..d7336fdc 100644
--- a/tests/test_pdf_to_entries.py
+++ b/tests/test_pdf_to_entries.py
@@ -4,8 +4,6 @@ import re
 import pytest
 
 from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries
-from khoj.utils.fs_syncer import get_pdf_files
-from khoj.utils.rawconfig import TextContentConfig
 
 
 def test_single_page_pdf_to_jsonl():
@@ -61,43 +59,6 @@ def test_ocr_page_pdf_to_jsonl():
     assert re.search(expected_str_with_variable_spaces, raw_entry) is not None
 
 
-def test_get_pdf_files(tmp_path):
-    "Ensure Pdf files specified via input-filter, input-files extracted"
-    # Arrange
-    # Include via input-filter globs
-    group1_file1 = create_file(tmp_path, filename="group1-file1.pdf")
-    group1_file2 = create_file(tmp_path, filename="group1-file2.pdf")
-    group2_file1 = create_file(tmp_path, filename="group2-file1.pdf")
-    group2_file2 = create_file(tmp_path, filename="group2-file2.pdf")
-    # Include via input-file field
-    file1 = create_file(tmp_path, filename="document.pdf")
-    # Not included by any filter
-    create_file(tmp_path, filename="not-included-document.pdf")
-    create_file(tmp_path, filename="not-included-text.txt")
-
-    expected_files = set(
-        [os.path.join(tmp_path, file.name) for file in [group1_file1, group1_file2, group2_file1, group2_file2, file1]]
-    )
-
-    # Setup input-files, input-filters
-    input_files = [tmp_path / "document.pdf"]
-    input_filter = [tmp_path / "group1*.pdf", tmp_path / "group2*.pdf"]
-
-    pdf_config = TextContentConfig(
-        input_files=input_files,
-        input_filter=[str(path) for path in input_filter],
-        compressed_jsonl=tmp_path / "test.jsonl",
-        embeddings_file=tmp_path / "test_embeddings.jsonl",
-    )
-
-    # Act
-    extracted_pdf_files = get_pdf_files(pdf_config)
-
-    # Assert
-    assert len(extracted_pdf_files) == 5
-    assert set(extracted_pdf_files.keys()) == expected_files
-
-
 # Helper Functions
 def create_file(tmp_path, entry=None, filename="document.pdf"):
     pdf_file = tmp_path / filename
diff --git a/tests/test_plaintext_to_entries.py b/tests/test_plaintext_to_entries.py
index a085b2b5..558832d3 100644
--- a/tests/test_plaintext_to_entries.py
+++ b/tests/test_plaintext_to_entries.py
@@ -1,27 +1,20 @@
-import os
 from pathlib import Path
+from textwrap import dedent
 
-from khoj.database.models import KhojUser, LocalPlaintextConfig
 from khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries
-from khoj.utils.fs_syncer import get_plaintext_files
-from khoj.utils.rawconfig import TextContentConfig
 
 
-def test_plaintext_file(tmp_path):
+def test_plaintext_file():
     "Convert files with no heading to jsonl."
     # Arrange
     raw_entry = f"""
     Hi, I am a plaintext file and I have some plaintext words.
     """
-    plaintextfile = create_file(tmp_path, raw_entry)
+    plaintextfile = "test.txt"
+    data = {plaintextfile: raw_entry}
 
     # Act
     # Extract Entries from specified plaintext files
-
-    data = {
-        f"{plaintextfile}": raw_entry,
-    }
-
     entries = PlaintextToEntries.extract_plaintext_entries(data)
 
     # Convert each entry.file to absolute path to make them JSON serializable
@@ -37,59 +30,20 @@ def test_plaintext_file(tmp_path):
     assert entries[1][0].compiled == f"{plaintextfile}\n{raw_entry}"
 
 
-def test_get_plaintext_files(tmp_path):
-    "Ensure Plaintext files specified via input-filter, input-files extracted"
-    # Arrange
-    # Include via input-filter globs
-    group1_file1 = create_file(tmp_path, filename="group1-file1.md")
-    group1_file2 = create_file(tmp_path, filename="group1-file2.md")
-
-    group2_file1 = create_file(tmp_path, filename="group2-file1.markdown")
-    group2_file2 = create_file(tmp_path, filename="group2-file2.markdown")
-    group2_file4 = create_file(tmp_path, filename="group2-file4.html")
-    # Include via input-file field
-    file1 = create_file(tmp_path, filename="notes.txt")
-    # Include unsupported file types
-    create_file(tmp_path, filename="group2-unincluded.py")
-    create_file(tmp_path, filename="group2-unincluded.csv")
-    create_file(tmp_path, filename="group2-unincluded.csv")
-    create_file(tmp_path, filename="group2-file3.mbox")
-    # Not included by any filter
-    create_file(tmp_path, filename="not-included-markdown.md")
-    create_file(tmp_path, filename="not-included-text.txt")
-
-    expected_files = set(
-        [
-            os.path.join(tmp_path, file.name)
-            for file in [group1_file1, group1_file2, group2_file1, group2_file2, group2_file4, file1]
-        ]
-    )
-
-    # Setup input-files, input-filters
-    input_files = [tmp_path / "notes.txt"]
-    input_filter = [tmp_path / "group1*.md", tmp_path / "group2*.*"]
-
-    plaintext_config = TextContentConfig(
-        input_files=input_files,
-        input_filter=[str(filter) for filter in input_filter],
-        compressed_jsonl=tmp_path / "test.jsonl",
-        embeddings_file=tmp_path / "test_embeddings.jsonl",
-    )
-
-    # Act
-    extracted_plaintext_files = get_plaintext_files(plaintext_config)
-
-    # Assert
-    assert len(extracted_plaintext_files) == len(expected_files)
-    assert set(extracted_plaintext_files.keys()) == set(expected_files)
-
-
-def test_parse_html_plaintext_file(content_config, default_user: KhojUser):
+def test_parse_html_plaintext_file(tmp_path):
     "Ensure HTML files are parsed correctly"
     # Arrange
-    # Setup input-files, input-filters
-    config = LocalPlaintextConfig.objects.filter(user=default_user).first()
-    extracted_plaintext_files = get_plaintext_files(config=config)
+    raw_entry = dedent(
+        f"""
+        <html>
+        <head><title>Test HTML</title></head>
+        <body>
+        <div>Test content</div>
+        </body>
+        </html>
+        """
+    )
+    extracted_plaintext_files = {"test.html": raw_entry}
 
     # Act
     entries = PlaintextToEntries.extract_plaintext_entries(extracted_plaintext_files)
diff --git a/tests/test_text_search.py b/tests/test_text_search.py
index 712f4aba..9e532429 100644
--- a/tests/test_text_search.py
+++ b/tests/test_text_search.py
@@ -2,23 +2,16 @@
 import asyncio
 import logging
 import os
-from pathlib import Path
 
 import pytest
 
 from khoj.database.adapters import EntryAdapters
-from khoj.database.models import Entry, GithubConfig, KhojUser, LocalOrgConfig
-from khoj.processor.content.docx.docx_to_entries import DocxToEntries
+from khoj.database.models import Entry, GithubConfig, KhojUser
 from khoj.processor.content.github.github_to_entries import GithubToEntries
-from khoj.processor.content.images.image_to_entries import ImageToEntries
-from khoj.processor.content.markdown.markdown_to_entries import MarkdownToEntries
 from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
-from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries
-from khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries
 from khoj.processor.content.text_to_entries import TextToEntries
 from khoj.search_type import text_search
-from khoj.utils.fs_syncer import collect_files, get_org_files
-from khoj.utils.rawconfig import ContentConfig, SearchConfig
+from tests.helpers import get_index_files, get_sample_data
 
 logger = logging.getLogger(__name__)
 
@@ -26,53 +19,20 @@ logger = logging.getLogger(__name__)
 # Test
 # ----------------------------------------------------------------------------------------------------
 @pytest.mark.django_db
-def test_text_search_setup_with_missing_file_raises_error(org_config_with_only_new_file: LocalOrgConfig):
-    # Arrange
-    # Ensure file mentioned in org.input-files is missing
-    single_new_file = Path(org_config_with_only_new_file.input_files[0])
-    single_new_file.unlink()
-
-    # Act
-    # Generate notes embeddings during asymmetric setup
-    with pytest.raises(FileNotFoundError):
-        get_org_files(org_config_with_only_new_file)
-
-
-# ----------------------------------------------------------------------------------------------------
-@pytest.mark.django_db
-def test_get_org_files_with_org_suffixed_dir_doesnt_raise_error(tmp_path, default_user: KhojUser):
-    # Arrange
-    orgfile = tmp_path / "directory.org" / "file.org"
-    orgfile.parent.mkdir()
-    with open(orgfile, "w") as f:
-        f.write("* Heading\n- List item\n")
-
-    LocalOrgConfig.objects.create(
-        input_filter=[f"{tmp_path}/**/*"],
-        input_files=None,
-        user=default_user,
-    )
-
-    # Act
-    org_files = collect_files(user=default_user)["org"]
-
-    # Assert
-    # should return orgfile and not raise IsADirectoryError
-    assert org_files == {f"{orgfile}": "* Heading\n- List item\n"}
-
-
-# ----------------------------------------------------------------------------------------------------
-@pytest.mark.django_db
-def test_text_search_setup_with_empty_file_creates_no_entries(
-    org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser
-):
+def test_text_search_setup_with_empty_file_creates_no_entries(search_config, default_user: KhojUser):
     # Arrange
+    initial_data = {
+        "test.org": "* First heading\nFirst content",
+        "test2.org": "* Second heading\nSecond content",
+    }
+    text_search.setup(OrgToEntries, initial_data, regenerate=True, user=default_user)
     existing_entries = Entry.objects.filter(user=default_user).count()
-    data = get_org_files(org_config_with_only_new_file)
+
+    final_data = {"new_file.org": ""}
 
     # Act
     # Generate notes embeddings during asymmetric setup
-    text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
+    text_search.setup(OrgToEntries, final_data, regenerate=True, user=default_user)
 
     # Assert
     updated_entries = Entry.objects.filter(user=default_user).count()
@@ -84,13 +44,14 @@ def test_text_search_setup_with_empty_file_creates_no_entries(
 
 # ----------------------------------------------------------------------------------------------------
 @pytest.mark.django_db
-def test_text_indexer_deletes_embedding_before_regenerate(
-    content_config: ContentConfig, default_user: KhojUser, caplog
-):
+def test_text_indexer_deletes_embedding_before_regenerate(search_config, default_user: KhojUser, caplog):
     # Arrange
+    data = {
+        "test1.org": "* Test heading\nTest content",
+        "test2.org": "* Another heading\nAnother content",
+    }
+    text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
     existing_entries = Entry.objects.filter(user=default_user).count()
-    org_config = LocalOrgConfig.objects.filter(user=default_user).first()
-    data = get_org_files(org_config)
 
     # Act
     # Generate notes embeddings during asymmetric setup
@@ -107,11 +68,10 @@ def test_text_indexer_deletes_embedding_before_regenerate(
 
 # ----------------------------------------------------------------------------------------------------
 @pytest.mark.django_db
-def test_text_index_same_if_content_unchanged(content_config: ContentConfig, default_user: KhojUser, caplog):
+def test_text_index_same_if_content_unchanged(search_config, default_user: KhojUser, caplog):
     # Arrange
     existing_entries = Entry.objects.filter(user=default_user)
-    org_config = LocalOrgConfig.objects.filter(user=default_user).first()
-    data = get_org_files(org_config)
+    data = {"test.org": "* Test heading\nTest content"}
 
     # Act
     # Generate initial notes embeddings during asymmetric setup
@@ -136,20 +96,14 @@ def test_text_index_same_if_content_unchanged(content_config: ContentConfig, def
 
 # ----------------------------------------------------------------------------------------------------
 @pytest.mark.django_db
-@pytest.mark.anyio
-# @pytest.mark.asyncio
-async def test_text_search(search_config: SearchConfig):
+@pytest.mark.asyncio
+async def test_text_search(search_config):
     # Arrange
-    default_user = await KhojUser.objects.acreate(
+    default_user, _ = await KhojUser.objects.aget_or_create(
         username="test_user", password="test_password", email="test@example.com"
     )
-    org_config = await LocalOrgConfig.objects.acreate(
-        input_files=None,
-        input_filter=["tests/data/org/*.org"],
-        index_heading_entries=False,
-        user=default_user,
-    )
-    data = get_org_files(org_config)
+    # Get some sample org data to index
+    data = get_sample_data("org")
 
     loop = asyncio.get_event_loop()
     await loop.run_in_executor(
@@ -175,17 +129,15 @@ async def test_text_search(search_config: SearchConfig):
 
 # ----------------------------------------------------------------------------------------------------
 @pytest.mark.django_db
-def test_entry_chunking_by_max_tokens(org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser, caplog):
+def test_entry_chunking_by_max_tokens(tmp_path, search_config, default_user: KhojUser, caplog):
     # Arrange
     # Insert org-mode entry with size exceeding max token limit to new org file
     max_tokens = 256
-    new_file_to_index = Path(org_config_with_only_new_file.input_files[0])
-    with open(new_file_to_index, "w") as f:
-        f.write(f"* Entry more than {max_tokens} words\n")
-        for index in range(max_tokens + 1):
-            f.write(f"{index} ")
-
-    data = get_org_files(org_config_with_only_new_file)
+    new_file_to_index = tmp_path / "test.org"
+    content = f"* Entry more than {max_tokens} words\n"
+    for index in range(max_tokens + 1):
+        content += f"{index} "
+    data = {str(new_file_to_index): content}
 
     # Act
     # reload embeddings, entries, notes model after adding new org-mode file
@@ -200,9 +152,7 @@ def test_entry_chunking_by_max_tokens(org_config_with_only_new_file: LocalOrgCon
 
 # ----------------------------------------------------------------------------------------------------
 @pytest.mark.django_db
-def test_entry_chunking_by_max_tokens_not_full_corpus(
-    org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser, caplog
-):
+def test_entry_chunking_by_max_tokens_not_full_corpus(tmp_path, search_config, default_user: KhojUser, caplog):
     # Arrange
     # Insert org-mode entry with size exceeding max token limit to new org file
     data = {
@@ -231,13 +181,11 @@ conda activate khoj
     )
 
     max_tokens = 256
-    new_file_to_index = Path(org_config_with_only_new_file.input_files[0])
-    with open(new_file_to_index, "w") as f:
-        f.write(f"* Entry more than {max_tokens} words\n")
-        for index in range(max_tokens + 1):
-            f.write(f"{index} ")
-
-    data = get_org_files(org_config_with_only_new_file)
+    new_file_to_index = tmp_path / "test.org"
+    content = f"* Entry more than {max_tokens} words\n"
+    for index in range(max_tokens + 1):
+        content += f"{index} "
+    data = {str(new_file_to_index): content}
 
     # Act
     # reload embeddings, entries, notes model after adding new org-mode file
@@ -257,34 +205,34 @@ conda activate khoj
 
 # ----------------------------------------------------------------------------------------------------
 @pytest.mark.django_db
-def test_regenerate_index_with_new_entry(content_config: ContentConfig, new_org_file: Path, default_user: KhojUser):
+def test_regenerate_index_with_new_entry(search_config, default_user: KhojUser):
     # Arrange
+    # Initial indexed files
+    text_search.setup(OrgToEntries, get_sample_data("org"), regenerate=True, user=default_user)
     existing_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
-    org_config = LocalOrgConfig.objects.filter(user=default_user).first()
-    initial_data = get_org_files(org_config)
 
-    # append org-mode entry to first org input file in config
-    org_config.input_files = [f"{new_org_file}"]
-    with open(new_org_file, "w") as f:
-        f.write("\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n")
-
-    final_data = get_org_files(org_config)
-
-    # Act
-    text_search.setup(OrgToEntries, initial_data, regenerate=True, user=default_user)
+    # Regenerate index with only files from test data set
+    files_to_index = get_index_files()
+    text_search.setup(OrgToEntries, files_to_index, regenerate=True, user=default_user)
     updated_entries1 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
 
+    # Act
+    # Update index with the new file
+    new_file = "test.org"
+    new_entry = "\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n"
+    files_to_index[new_file] = new_entry
+
     # regenerate notes jsonl, model embeddings and model to include entry from new file
-    text_search.setup(OrgToEntries, final_data, regenerate=True, user=default_user)
+    text_search.setup(OrgToEntries, files_to_index, regenerate=True, user=default_user)
     updated_entries2 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
 
     # Assert
     for entry in updated_entries1:
         assert entry in updated_entries2
 
-    assert not any([new_org_file.name in entry for entry in updated_entries1])
-    assert not any([new_org_file.name in entry for entry in existing_entries])
-    assert any([new_org_file.name in entry for entry in updated_entries2])
+    assert not any([new_file in entry for entry in updated_entries1])
+    assert not any([new_file in entry for entry in existing_entries])
+    assert any([new_file in entry for entry in updated_entries2])
 
     assert any(
         ["Saw a super cute video of a chihuahua doing the Tango on Youtube" in entry for entry in updated_entries2]
@@ -294,28 +242,24 @@ def test_regenerate_index_with_new_entry(content_config: ContentConfig, new_org_
 
 # ----------------------------------------------------------------------------------------------------
 @pytest.mark.django_db
-def test_update_index_with_duplicate_entries_in_stable_order(
-    org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser
-):
+def test_update_index_with_duplicate_entries_in_stable_order(tmp_path, search_config, default_user: KhojUser):
     # Arrange
+    initial_data = get_sample_data("org")
+    text_search.setup(OrgToEntries, initial_data, regenerate=True, user=default_user)
     existing_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
-    new_file_to_index = Path(org_config_with_only_new_file.input_files[0])
 
     # Insert org-mode entries with same compiled form into new org file
+    new_file_to_index = tmp_path / "test.org"
     new_entry = "* TODO A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n"
-    with open(new_file_to_index, "w") as f:
-        f.write(f"{new_entry}{new_entry}")
-
-    data = get_org_files(org_config_with_only_new_file)
+    # Initial data with duplicate entries
+    data = {str(new_file_to_index): f"{new_entry}{new_entry}"}
 
     # Act
     # generate embeddings, entries, notes model from scratch after adding new org-mode file
     text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
     updated_entries1 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
 
-    data = get_org_files(org_config_with_only_new_file)
-
-    # update embeddings, entries, notes model with no new changes
+    # idempotent indexing when data unchanged
     text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
     updated_entries2 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
 
@@ -324,6 +268,7 @@ def test_update_index_with_duplicate_entries_in_stable_order(
     for entry in existing_entries:
         assert entry not in updated_entries1
 
+    # verify the second indexing update has same entries and ordering as first
     for entry in updated_entries1:
         assert entry in updated_entries2
 
@@ -334,22 +279,17 @@ def test_update_index_with_duplicate_entries_in_stable_order(
 
 # ----------------------------------------------------------------------------------------------------
 @pytest.mark.django_db
-def test_update_index_with_deleted_entry(org_config_with_only_new_file: LocalOrgConfig, default_user: KhojUser):
+def test_update_index_with_deleted_entry(tmp_path, search_config, default_user: KhojUser):
     # Arrange
     existing_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
-    new_file_to_index = Path(org_config_with_only_new_file.input_files[0])
 
-    # Insert org-mode entries with same compiled form into new org file
+    new_file_to_index = tmp_path / "test.org"
     new_entry = "* TODO A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n"
-    with open(new_file_to_index, "w") as f:
-        f.write(f"{new_entry}{new_entry} -- Tatooine")
-    initial_data = get_org_files(org_config_with_only_new_file)
 
-    # update embeddings, entries, notes model after removing an entry from the org file
-    with open(new_file_to_index, "w") as f:
-        f.write(f"{new_entry}")
-
-    final_data = get_org_files(org_config_with_only_new_file)
+    # Initial data with two entries
+    initial_data = {str(new_file_to_index): f"{new_entry}{new_entry} -- Tatooine"}
+    # Final data with only first entry, with second entry removed
+    final_data = {str(new_file_to_index): f"{new_entry}"}
 
     # Act
     # load embeddings, entries, notes model after adding new org file with 2 entries
@@ -375,29 +315,29 @@ def test_update_index_with_deleted_entry(org_config_with_only_new_file: LocalOrg
 
 # ----------------------------------------------------------------------------------------------------
 @pytest.mark.django_db
-def test_update_index_with_new_entry(content_config: ContentConfig, new_org_file: Path, default_user: KhojUser):
+def test_update_index_with_new_entry(search_config, default_user: KhojUser):
     # Arrange
-    existing_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
-    org_config = LocalOrgConfig.objects.filter(user=default_user).first()
-    data = get_org_files(org_config)
-    text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
+    # Index the sample org data first; the regenerate below must replace every one of these entries
+    text_search.setup(OrgToEntries, get_sample_data("org"), regenerate=True, user=default_user)
+    old_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
 
-    # append org-mode entry to first org input file in config
-    with open(new_org_file, "w") as f:
-        new_entry = "\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n"
-        f.write(new_entry)
-
-    data = get_org_files(org_config)
+    # Regenerate index with only files from test data set; read the baseline from the index, not setup()'s return
+    text_search.setup(OrgToEntries, get_index_files(), regenerate=True, user=default_user)
+    new_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
 
     # Act
-    # update embeddings, entries with the newly added note
-    text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
-    updated_entries1 = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
+    # Incrementally index one new file (regenerate=False); exactly one entry should be added to the baseline
+    new_file = "test.org"
+    new_entry = "\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n"
+    final_data = {new_file: new_entry}
+
+    text_search.setup(OrgToEntries, final_data, regenerate=False, user=default_user)
+    updated_new_entries = list(Entry.objects.filter(user=default_user).values_list("compiled", flat=True))
 
     # Assert
-    for entry in existing_entries:
-        assert entry not in updated_entries1
-    assert len(updated_entries1) == len(existing_entries) + 1
+    for old_entry in old_entries:
+        assert old_entry not in updated_new_entries
+    assert len(updated_new_entries) == len(new_entries) + 1
     verify_embeddings(3, default_user)
 
 
@@ -409,9 +349,7 @@ def test_update_index_with_new_entry(content_config: ContentConfig, new_org_file
         (OrgToEntries),
     ],
 )
-def test_update_index_with_deleted_file(
-    org_config_with_only_new_file: LocalOrgConfig, text_to_entries: TextToEntries, default_user: KhojUser
-):
+def test_update_index_with_deleted_file(text_to_entries: TextToEntries, search_config, default_user: KhojUser):
     "Delete entries associated with new file when file path with empty content passed."
     # Arrange
     file_to_index = "test"
@@ -446,7 +384,7 @@ def test_update_index_with_deleted_file(
 
 # ----------------------------------------------------------------------------------------------------
 @pytest.mark.skipif(os.getenv("GITHUB_PAT_TOKEN") is None, reason="GITHUB_PAT_TOKEN not set")
-def test_text_search_setup_github(content_config: ContentConfig, default_user: KhojUser):
+def test_text_search_setup_github(search_config, default_user: KhojUser):
     # Arrange
     github_config = GithubConfig.objects.filter(user=default_user).first()
 
diff --git a/tests/test_word_filter.py b/tests/test_word_filter.py
index ebd6cccf..5333e17f 100644
--- a/tests/test_word_filter.py
+++ b/tests/test_word_filter.py
@@ -1,6 +1,12 @@
 # Application Packages
 from khoj.search_filter.word_filter import WordFilter
-from khoj.utils.rawconfig import Entry
+
+
+# Minimal local stand-in for the Entry import removed above; it only stores the `compiled` and `raw` values given
+class Entry:
+    def __init__(self, compiled="", raw=""):  # both fields default to the empty string
+        self.compiled = compiled
+        self.raw = raw
 
 
 # Test