Update test setup to index test data after old indexing code removed

- Delete tests testing deprecated server side indexing flows - Delete `Local(Plaintext|Org|Markdown|Pdf)Config' methods, files and references in tests - Index test data via new helper method, `get_index_files' - It is modelled after the old `get_org_files' variants in main app - It passes the test data in required format to `configure_content'. This allows maintaining the more realistic tests from before while using the new indexing mechanism (rather than the deprecated server side indexing mechanism).
2026-03-06 05:39:12 +00:00 · 2025-07-11 14:35:05 -07:00
parent d9d24dd638
commit 892d57314e
12 changed files with 295 additions and 604 deletions
--- a/tests/helpers.py
+++ b/tests/helpers.py
@@ -1,3 +1,5 @@
+import glob
+import logging
 import os
 from datetime import datetime

@@ -17,6 +19,9 @@ from khoj.database.models import (
    UserConversationConfig,
 )
 from khoj.processor.conversation.utils import message_to_log
+from khoj.utils.helpers import get_absolute_path, is_none_or_empty
+
+logger = logging.getLogger(__name__)


 def get_chat_provider(default: ChatModel.ModelType | None = ChatModel.ModelType.GOOGLE):
@@ -61,6 +66,140 @@ def generate_chat_history(message_list):
    return chat_history


+def get_sample_data(type):
+    """Return sample documents for the given content type.
+
+    The result maps a sample file name to its text. Supported content types
+    are "org", "markdown" and "plaintext"; any other value raises KeyError.
+    """
+    elisp_org = """
+* Emacs Khoj
+  /An Emacs interface for [[https://github.com/khoj-ai/khoj][khoj]]/
+
+** Requirements
+   - Install and Run [[https://github.com/khoj-ai/khoj][khoj]]
+
+** Installation
+*** Direct
+     - Put ~khoj.el~ in your Emacs load path. For e.g. ~/.emacs.d/lisp
+     - Load via ~use-package~ in your ~/.emacs.d/init.el or .emacs file by adding below snippet
+       #+begin_src elisp
+         ;; Khoj Package
+         (use-package khoj
+           :load-path "~/.emacs.d/lisp/khoj.el"
+           :bind ("C-c s" . 'khoj))
+       #+end_src
+
+*** Using [[https://github.com/quelpa/quelpa#installation][Quelpa]]
+     - Ensure [[https://github.com/quelpa/quelpa#installation][Quelpa]], [[https://github.com/quelpa/quelpa-use-package#installation][quelpa-use-package]] are installed
+     - Add below snippet to your ~/.emacs.d/init.el or .emacs config file and execute it.
+       #+begin_src elisp
+         ;; Khoj Package
+         (use-package khoj
+           :quelpa (khoj :fetcher url :url "https://raw.githubusercontent.com/khoj-ai/khoj/master/interface/emacs/khoj.el")
+           :bind ("C-c s" . 'khoj))
+       #+end_src
+
+** Usage
+   1. Call ~khoj~ using keybinding ~C-c s~ or ~M-x khoj~
+   2. Enter Query in Natural Language
+      e.g. "What is the meaning of life?" "What are my life goals?"
+   3. Wait for results
+      *Note: It takes about 15s on a Mac M1 and a ~100K lines corpus of org-mode files*
+   4. (Optional) Narrow down results further
+      Include/Exclude specific words from results by adding to query
+      e.g. "What is the meaning of life? -god +none"
+
+"""
+
+    readme_org = """
+* Khoj
+  /Allow natural language search on user content like notes, images using transformer based models/
+
+  All data is processed locally. User can interface with khoj app via [[./interface/emacs/khoj.el][Emacs]], API or Commandline
+
+** Dependencies
+   - Python3
+   - [[https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links][Miniconda]]
+
+** Install
+   #+begin_src shell
+   git clone https://github.com/khoj-ai/khoj && cd khoj
+   conda env create -f environment.yml
+   conda activate khoj
+   #+end_src"""
+
+    readme_markdown = """
+# Khoj
+Allow natural language search on user content like notes, images using transformer based models
+
+All data is processed locally. User can interface with khoj app via [Emacs](./interface/emacs/khoj.el), API or Commandline
+
+## Dependencies
+- Python3
+- [Miniconda](https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links)
+
+## Install
+```shell
+git clone
+conda env create -f environment.yml
+conda activate khoj
+```
+"""
+
+    readme_txt = """
+Khoj
+Allow natural language search on user content like notes, images using transformer based models
+
+All data is processed locally. User can interface with khoj app via Emacs, API or Commandline
+
+Dependencies
+- Python3
+- Miniconda
+
+Install
+git clone
+conda env create -f environment.yml
+conda activate khoj
+"""
+
+    # Content type -> {sample file name -> sample file text}
+    samples_by_type = {
+        "org": {"elisp.org": elisp_org, "readme.org": readme_org},
+        "markdown": {"readme.markdown": readme_markdown},
+        "plaintext": {"readme.txt": readme_txt},
+    }
+
+    return samples_by_type[type]
+
+
+def get_index_files(
+    input_files: list[str] | None = None, input_filters: list[str] | None = ["tests/data/org/*.org"]
+) -> dict[str, str]:
+    """Load test data files from disk into a map of file path to file content.
+
+    Args:
+        input_files: Explicit file paths to load. Each is passed through
+            `get_absolute_path` before being opened.
+        input_filters: Glob patterns to expand after being passed through
+            `get_absolute_path`. Expansion is recursive, so `**` is supported.
+            Only matches that are regular files are kept.
+
+    Returns:
+        Dict keyed by file path with the file's text (decoded as UTF-8) as value,
+        filled in sorted path order. Empty when both arguments are none or empty.
+        Files that cannot be opened or read are logged and left out.
+    """
+    # Input Validation
+    if is_none_or_empty(input_files) and is_none_or_empty(input_filters):
+        logger.debug("At least one of input_files or input_filter is required to be specified")
+        return {}
+
+    # Get files to process
+    # NOTE: the list default of input_filters is only read below, never mutated,
+    # so sharing it across calls is safe. None already means "no filters".
+    absolute_files, filtered_files = set(), set()
+    if input_files:
+        absolute_files = {get_absolute_path(input_file) for input_file in input_files}
+    if input_filters:
+        filtered_files = {
+            filtered_file
+            for file_filter in input_filters
+            for filtered_file in glob.glob(get_absolute_path(file_filter), recursive=True)
+            if os.path.isfile(filtered_file)
+        }
+
+    # Sort so the returned map is filled in a deterministic order
+    all_files = sorted(absolute_files | filtered_files)
+
+    filename_to_content_map = {}
+    for file in all_files:
+        # Open inside the try so a missing or unopenable path is skipped
+        # the same way as a failed read, instead of aborting the whole load
+        try:
+            with open(file, "r", encoding="utf8") as f:
+                filename_to_content_map[file] = f.read()
+        except Exception as e:
+            logger.warning(f"Unable to read file: {file}. Skipping file.")
+            logger.warning(e, exc_info=True)
+
+    return filename_to_content_map
+
+
 class UserFactory(factory.django.DjangoModelFactory):
    class Meta:
        model = KhojUser