Resolve merge conflicts

2026-03-07 21:29:13 +00:00 · 2023-10-19 14:39:05 -07:00
parent c125995d94 e3f8a95784
commit 963cd165eb
42 changed files with 941 additions and 590 deletions
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -18,6 +18,7 @@ from khoj.utils.helpers import resolve_absolute_path
 from khoj.utils.rawconfig import (
    ContentConfig,
    ConversationProcessorConfig,
+    OfflineChatProcessorConfig,
    OpenAIProcessorConfig,
    ProcessorConfig,
    TextContentConfig,
@@ -207,8 +208,9 @@ def processor_config_offline_chat(tmp_path_factory):

    # Setup conversation processor
    processor_config = ProcessorConfig()
+    offline_chat = OfflineChatProcessorConfig(enable_offline_chat=True)
    processor_config.conversation = ConversationProcessorConfig(
-        enable_offline_chat=True,
+        offline_chat=offline_chat,
        conversation_logfile=processor_dir.joinpath("conversation_logs.json"),
    )

--- a/tests/test_client.py
+++ b/tests/test_client.py
@@ -6,6 +6,7 @@ from urllib.parse import quote

 # External Packages
 from fastapi.testclient import TestClient
+import pytest

 # Internal Packages
 from app.main import app
@@ -60,13 +61,13 @@ def test_regenerate_with_invalid_content_type(client):


 # ----------------------------------------------------------------------------------------------------
-def test_index_batch(client):
+def test_index_update(client):
    # Arrange
-    request_body = get_sample_files_data()
+    files = get_sample_files_data()
    headers = {"x-api-key": "secret"}

    # Act
-    response = client.post("/v1/indexer/batch", json=request_body, headers=headers)
+    response = client.post("/api/v1/index/update", files=files, headers=headers)

    # Assert
    assert response.status_code == 200
@@ -76,12 +77,11 @@ def test_index_batch(client):
 def test_regenerate_with_valid_content_type(client):
    for content_type in ["all", "org", "markdown", "image", "pdf", "notion", "plugin1"]:
        # Arrange
-        request_body = get_sample_files_data()
-
+        files = get_sample_files_data()
        headers = {"x-api-key": "secret"}

        # Act
-        response = client.post(f"/v1/indexer/batch?search_type={content_type}", json=request_body, headers=headers)
+        response = client.post(f"/api/v1/index/update?t={content_type}", files=files, headers=headers)
        # Assert
        assert response.status_code == 200, f"Returned status: {response.status_code} for content type: {content_type}"

@@ -92,17 +92,17 @@ def test_regenerate_with_github_fails_without_pat(client):
    response = client.get(f"/api/update?force=true&t=github")

    # Arrange
-    request_body = get_sample_files_data()
-
+    files = get_sample_files_data()
    headers = {"x-api-key": "secret"}

    # Act
-    response = client.post(f"/v1/indexer/batch?search_type=github", json=request_body, headers=headers)
+    response = client.post(f"/api/v1/index/update?t=github", files=files, headers=headers)
    # Assert
    assert response.status_code == 200, f"Returned status: {response.status_code} for content type: github"


 # ----------------------------------------------------------------------------------------------------
+@pytest.mark.skip(reason="Flaky test on parallel test runs")
 def test_get_configured_types_via_api(client):
    # Act
    response = client.get(f"/api/config/types")
@@ -288,24 +288,25 @@ def test_notes_search_with_exclude_filter(

 def get_sample_files_data():
-    return {
-        "org": {
-            "path/to/filename.org": "* practicing piano",
-            "path/to/filename1.org": "** top 3 reasons why I moved to SF",
-            "path/to/filename2.org": "* how to build a search engine",
-        },
-        "pdf": {
-            "path/to/filename.pdf": "Moore's law does not apply to consumer hardware",
-            "path/to/filename1.pdf": "The sun is a ball of helium",
-            "path/to/filename2.pdf": "Effect of sunshine on baseline human happiness",
-        },
-        "plaintext": {
-            "path/to/filename.txt": "data,column,value",
-            "path/to/filename1.txt": "<html>my first web page</html>",
-            "path/to/filename2.txt": "2021-02-02 Journal Entry",
-        },
-        "markdown": {
-            "path/to/filename.md": "# Notes from client call",
-            "path/to/filename1.md": "## Studying anthropological records from the Fatimid caliphate",
-            "path/to/filename2.md": "**Understanding science through the lens of art**",
-        },
-    }
+    """Return sample files as multipart form fields for the index update API tests.
+
+    Each entry is a ("files", (filename, content, mime_type)) pair. A list of pairs is
+    used because a dict literal keeps only the last value of a repeated "files" key,
+    which would silently drop every sample file except the final one.
+    """
+    return [
+        ("files", ("path/to/filename.org", "* practicing piano", "text/org")),
+        ("files", ("path/to/filename1.org", "** top 3 reasons why I moved to SF", "text/org")),
+        ("files", ("path/to/filename2.org", "* how to build a search engine", "text/org")),
+        ("files", ("path/to/filename.pdf", "Moore's law does not apply to consumer hardware", "application/pdf")),
+        ("files", ("path/to/filename1.pdf", "The sun is a ball of helium", "application/pdf")),
+        ("files", ("path/to/filename2.pdf", "Effect of sunshine on baseline human happiness", "application/pdf")),
+        ("files", ("path/to/filename.txt", "data,column,value", "text/plain")),
+        ("files", ("path/to/filename1.txt", "<html>my first web page</html>", "text/plain")),
+        ("files", ("path/to/filename2.txt", "2021-02-02 Journal Entry", "text/plain")),
+        ("files", ("path/to/filename.md", "# Notes from client call", "text/markdown")),
+        (
+            "files",
+            ("path/to/filename1.md", "## Studying anthropological records from the Fatimid caliphate", "text/markdown"),
+        ),
+        ("files", ("path/to/filename2.md", "**Understanding science through the lens of art**", "text/markdown")),
+    ]
--- a/tests/test_gpt4all_chat_actors.py
+++ b/tests/test_gpt4all_chat_actors.py
@@ -24,7 +24,7 @@ from khoj.processor.conversation.gpt4all.utils import download_model

 from khoj.processor.conversation.utils import message_to_log

-MODEL_NAME = "llama-2-7b-chat.ggmlv3.q4_K_S.bin"
+MODEL_NAME = "llama-2-7b-chat.ggmlv3.q4_0.bin"


@pytest.fixture(scope="session")
@@ -128,15 +128,15 @@ def test_extract_multiple_explicit_questions_from_message(loaded_model):
@pytest.mark.chatquality
 def test_extract_multiple_implicit_questions_from_message(loaded_model):
    # Act
-    response = extract_questions_offline("Is Morpheus taller than Neo?", loaded_model=loaded_model)
+    response = extract_questions_offline("Is Carl taller than Ross?", loaded_model=loaded_model)

    # Assert
-    expected_responses = ["height", "taller", "shorter", "heights"]
+    expected_responses = ["height", "taller", "shorter", "heights", "who"]
    assert len(response) <= 3

    for question in response:
        assert any([expected_response in question.lower() for expected_response in expected_responses]), (
-            "Expected chat actor to ask follow-up questions about Morpheus and Neo, but got: " + question
+            "Expected chat actor to ask follow-up questions about Carl and Ross, but got: " + question
        )


@@ -145,7 +145,7 @@ def test_extract_multiple_implicit_questions_from_message(loaded_model):
 def test_generate_search_query_using_question_from_chat_history(loaded_model):
    # Arrange
    message_list = [
-        ("What is the name of Mr. Vader's daughter?", "Princess Leia", []),
+        ("What is the name of Mr. Anderson's daughter?", "Miss Barbara", []),
    ]

    # Act
@@ -156,17 +156,22 @@ def test_generate_search_query_using_question_from_chat_history(loaded_model):
        use_history=True,
    )

-    expected_responses = [
-        "Vader",
-        "sons",
+    all_expected_in_response = [
+        "Anderson",
+    ]
+
+    any_expected_in_response = [
        "son",
-        "Darth",
+        "sons",
        "children",
    ]

    # Assert
    assert len(response) >= 1
-    assert any([expected_response in response[0] for expected_response in expected_responses]), (
+    assert all([expected_response in response[0] for expected_response in all_expected_in_response]), (
+        "Expected chat actor to ask for clarification in response, but got: " + response[0]
+    )
+    assert any([expected_response in response[0] for expected_response in any_expected_in_response]), (
        "Expected chat actor to ask for clarification in response, but got: " + response[0]
    )

@@ -176,20 +181,20 @@ def test_generate_search_query_using_question_from_chat_history(loaded_model):
 def test_generate_search_query_using_answer_from_chat_history(loaded_model):
    # Arrange
    message_list = [
-        ("What is the name of Mr. Vader's daughter?", "Princess Leia", []),
+        ("What is the name of Mr. Anderson's daughter?", "Miss Barbara", []),
    ]

    # Act
    response = extract_questions_offline(
-        "Is she a Jedi?",
+        "Is she a Doctor?",
        conversation_log=populate_chat_history(message_list),
        loaded_model=loaded_model,
        use_history=True,
    )

    expected_responses = [
-        "Leia",
-        "Vader",
+        "Barbara",
+        "Robert",
        "daughter",
    ]

--- a/tests/test_pdf_to_jsonl.py
+++ b/tests/test_pdf_to_jsonl.py
@@ -1,7 +1,6 @@
 # Standard Packages
 import json
 import os
-import base64

 # Internal Packages
 from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
@@ -16,7 +15,7 @@ def test_single_page_pdf_to_jsonl():
    # Extract Entries from specified Pdf files
    # Read singlepage.pdf into memory as bytes
    with open("tests/data/pdf/singlepage.pdf", "rb") as f:
-        pdf_bytes = base64.b64encode(f.read()).decode("utf-8")
+        pdf_bytes = f.read()

    data = {"tests/data/pdf/singlepage.pdf": pdf_bytes}
    entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
@@ -36,7 +35,7 @@ def test_multi_page_pdf_to_jsonl():
    # Act
    # Extract Entries from specified Pdf files
    with open("tests/data/pdf/multipage.pdf", "rb") as f:
-        pdf_bytes = base64.b64encode(f.read()).decode("utf-8")
+        pdf_bytes = f.read()

    data = {"tests/data/pdf/multipage.pdf": pdf_bytes}
    entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
--- a/tests/test_text_search.py
+++ b/tests/test_text_search.py
@@ -1,26 +1,25 @@
 # System Packages
 import logging
+import locale
 from pathlib import Path
 import os

 # External Packages
 import pytest
-from khoj.utils.config import SearchModels

 # Internal Packages
 from khoj.utils.state import content_index, search_models
 from khoj.search_type import text_search
-from khoj.utils.rawconfig import ContentConfig, SearchConfig, TextContentConfig
 from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
 from khoj.processor.github.github_to_jsonl import GithubToJsonl
+from khoj.utils.config import SearchModels
 from khoj.utils.fs_syncer import get_org_files
+from khoj.utils.rawconfig import ContentConfig, SearchConfig, TextContentConfig


 # Test
 # ----------------------------------------------------------------------------------------------------
-def test_text_search_setup_with_missing_file_raises_error(
-    org_config_with_only_new_file: TextContentConfig, search_config: SearchConfig
-):
+def test_text_search_setup_with_missing_file_raises_error(org_config_with_only_new_file: TextContentConfig):
    # Arrange
    # Ensure file mentioned in org.input-files is missing
    single_new_file = Path(org_config_with_only_new_file.input_files[0])
@@ -29,7 +28,23 @@ def test_text_search_setup_with_missing_file_raises_error(
    # Act
    # Generate notes embeddings during asymmetric setup
    with pytest.raises(FileNotFoundError):
-        data = get_org_files(org_config_with_only_new_file)
+        get_org_files(org_config_with_only_new_file)
+
+
+# ----------------------------------------------------------------------------------------------------
+def test_get_org_files_with_org_suffixed_dir_doesnt_raise_error(tmp_path: Path):
+    # Arrange
+    # Place an org file inside a directory whose own name also ends in ".org"
+    org_text = "* Heading\n- List item\n"
+    org_dir = tmp_path / "directory.org"
+    org_dir.mkdir()
+    orgfile = org_dir / "file.org"
+    orgfile.write_text(org_text)
+    glob_filter = f"{tmp_path}/**/*"
+    config = TextContentConfig(input_filter=[glob_filter], compressed_jsonl="test.jsonl", embeddings_file="test.pt")
+
+    # Act & Assert
+    # The directory must not cause IsADirectoryError; only the org file should be returned
+    assert get_org_files(config) == {str(orgfile): org_text}


 # ----------------------------------------------------------------------------------------------------
@@ -48,6 +63,7 @@ def test_text_search_setup_with_empty_file_raises_error(
 def test_text_search_setup(content_config: ContentConfig, search_models: SearchModels):
    # Arrange
    data = get_org_files(content_config.org)
+
    # Act
    # Regenerate notes embeddings during asymmetric setup
    notes_model = text_search.setup(