Resolve merge conflicts

This commit is contained in:
sabaimran
2023-10-19 14:39:05 -07:00
42 changed files with 941 additions and 590 deletions

View File

@@ -18,6 +18,7 @@ from khoj.utils.helpers import resolve_absolute_path
from khoj.utils.rawconfig import (
ContentConfig,
ConversationProcessorConfig,
OfflineChatProcessorConfig,
OpenAIProcessorConfig,
ProcessorConfig,
TextContentConfig,
@@ -207,8 +208,9 @@ def processor_config_offline_chat(tmp_path_factory):
# Setup conversation processor
processor_config = ProcessorConfig()
offline_chat = OfflineChatProcessorConfig(enable_offline_chat=True)
processor_config.conversation = ConversationProcessorConfig(
enable_offline_chat=True,
offline_chat=offline_chat,
conversation_logfile=processor_dir.joinpath("conversation_logs.json"),
)

View File

@@ -6,6 +6,7 @@ from urllib.parse import quote
# External Packages
from fastapi.testclient import TestClient
import pytest
# Internal Packages
from app.main import app
@@ -60,13 +61,13 @@ def test_regenerate_with_invalid_content_type(client):
# ----------------------------------------------------------------------------------------------------
def test_index_batch(client):
def test_index_update(client):
# Arrange
request_body = get_sample_files_data()
files = get_sample_files_data()
headers = {"x-api-key": "secret"}
# Act
response = client.post("/v1/indexer/batch", json=request_body, headers=headers)
response = client.post("/api/v1/index/update", files=files, headers=headers)
# Assert
assert response.status_code == 200
@@ -76,12 +77,11 @@ def test_index_batch(client):
def test_regenerate_with_valid_content_type(client):
for content_type in ["all", "org", "markdown", "image", "pdf", "notion", "plugin1"]:
# Arrange
request_body = get_sample_files_data()
files = get_sample_files_data()
headers = {"x-api-key": "secret"}
# Act
response = client.post(f"/v1/indexer/batch?search_type={content_type}", json=request_body, headers=headers)
response = client.post(f"/api/v1/index/update?t={content_type}", files=files, headers=headers)
# Assert
assert response.status_code == 200, f"Returned status: {response.status_code} for content type: {content_type}"
@@ -92,17 +92,17 @@ def test_regenerate_with_github_fails_without_pat(client):
response = client.get(f"/api/update?force=true&t=github")
# Arrange
request_body = get_sample_files_data()
files = get_sample_files_data()
headers = {"x-api-key": "secret"}
# Act
response = client.post(f"/v1/indexer/batch?search_type=github", json=request_body, headers=headers)
response = client.post(f"/api/v1/index/update?t=github", files=files, headers=headers)
# Assert
assert response.status_code == 200, f"Returned status: {response.status_code} for content type: github"
# ----------------------------------------------------------------------------------------------------
@pytest.mark.skip(reason="Flaky test on parallel test runs")
def test_get_configured_types_via_api(client):
# Act
response = client.get(f"/api/config/types")
@@ -288,24 +288,20 @@ def test_notes_search_with_exclude_filter(
def get_sample_files_data():
    """Return sample files as a multipart upload payload for the indexer tests.

    Each item is a ``("files", (filename, content, mime_type))`` pair, suitable
    for passing as the ``files=`` argument of the test client's ``post``.

    A list of pairs is used rather than a dict because every upload shares the
    same form field name: a dict literal that repeats the ``"files"`` key keeps
    only the last entry, so only one of the sample files would be sent.
    """
    sample_files = [
        ("path/to/filename.org", "* practicing piano", "text/org"),
        ("path/to/filename1.org", "** top 3 reasons why I moved to SF", "text/org"),
        ("path/to/filename2.org", "* how to build a search engine", "text/org"),
        ("path/to/filename.pdf", "Moore's law does not apply to consumer hardware", "application/pdf"),
        ("path/to/filename1.pdf", "The sun is a ball of helium", "application/pdf"),
        ("path/to/filename2.pdf", "Effect of sunshine on baseline human happiness", "application/pdf"),
        ("path/to/filename.txt", "data,column,value", "text/plain"),
        ("path/to/filename1.txt", "<html>my first web page</html>", "text/plain"),
        ("path/to/filename2.txt", "2021-02-02 Journal Entry", "text/plain"),
        ("path/to/filename.md", "# Notes from client call", "text/markdown"),
        (
            "path/to/filename1.md",
            "## Studying anthropological records from the Fatimid caliphate",
            "text/markdown",
        ),
        ("path/to/filename2.md", "**Understanding science through the lens of art**", "text/markdown"),
    ]
    # Repeat the shared form field name once per file so none are dropped.
    return [("files", sample_file) for sample_file in sample_files]

View File

@@ -24,7 +24,7 @@ from khoj.processor.conversation.gpt4all.utils import download_model
from khoj.processor.conversation.utils import message_to_log
MODEL_NAME = "llama-2-7b-chat.ggmlv3.q4_K_S.bin"
MODEL_NAME = "llama-2-7b-chat.ggmlv3.q4_0.bin"
@pytest.fixture(scope="session")
@@ -128,15 +128,15 @@ def test_extract_multiple_explicit_questions_from_message(loaded_model):
@pytest.mark.chatquality
def test_extract_multiple_implicit_questions_from_message(loaded_model):
# Act
response = extract_questions_offline("Is Morpheus taller than Neo?", loaded_model=loaded_model)
response = extract_questions_offline("Is Carl taller than Ross?", loaded_model=loaded_model)
# Assert
expected_responses = ["height", "taller", "shorter", "heights"]
expected_responses = ["height", "taller", "shorter", "heights", "who"]
assert len(response) <= 3
for question in response:
assert any([expected_response in question.lower() for expected_response in expected_responses]), (
"Expected chat actor to ask follow-up questions about Morpheus and Neo, but got: " + question
"Expected chat actor to ask follow-up questions about Carl and Ross, but got: " + question
)
@@ -145,7 +145,7 @@ def test_extract_multiple_implicit_questions_from_message(loaded_model):
def test_generate_search_query_using_question_from_chat_history(loaded_model):
# Arrange
message_list = [
("What is the name of Mr. Vader's daughter?", "Princess Leia", []),
("What is the name of Mr. Anderson's daughter?", "Miss Barbara", []),
]
# Act
@@ -156,17 +156,22 @@ def test_generate_search_query_using_question_from_chat_history(loaded_model):
use_history=True,
)
expected_responses = [
"Vader",
"sons",
all_expected_in_response = [
"Anderson",
]
any_expected_in_response = [
"son",
"Darth",
"sons",
"children",
]
# Assert
assert len(response) >= 1
assert any([expected_response in response[0] for expected_response in expected_responses]), (
assert all([expected_response in response[0] for expected_response in all_expected_in_response]), (
"Expected chat actor to ask for clarification in response, but got: " + response[0]
)
assert any([expected_response in response[0] for expected_response in any_expected_in_response]), (
"Expected chat actor to ask for clarification in response, but got: " + response[0]
)
@@ -176,20 +181,20 @@ def test_generate_search_query_using_question_from_chat_history(loaded_model):
def test_generate_search_query_using_answer_from_chat_history(loaded_model):
# Arrange
message_list = [
("What is the name of Mr. Vader's daughter?", "Princess Leia", []),
("What is the name of Mr. Anderson's daughter?", "Miss Barbara", []),
]
# Act
response = extract_questions_offline(
"Is she a Jedi?",
"Is she a Doctor?",
conversation_log=populate_chat_history(message_list),
loaded_model=loaded_model,
use_history=True,
)
expected_responses = [
"Leia",
"Vader",
"Barbara",
"Robert",
"daughter",
]

View File

@@ -1,7 +1,6 @@
# Standard Packages
import json
import os
import base64
# Internal Packages
from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
@@ -16,7 +15,7 @@ def test_single_page_pdf_to_jsonl():
# Extract Entries from specified Pdf files
# Read singlepage.pdf into memory as bytes
with open("tests/data/pdf/singlepage.pdf", "rb") as f:
pdf_bytes = base64.b64encode(f.read()).decode("utf-8")
pdf_bytes = f.read()
data = {"tests/data/pdf/singlepage.pdf": pdf_bytes}
entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
@@ -36,7 +35,7 @@ def test_multi_page_pdf_to_jsonl():
# Act
# Extract Entries from specified Pdf files
with open("tests/data/pdf/multipage.pdf", "rb") as f:
pdf_bytes = base64.b64encode(f.read()).decode("utf-8")
pdf_bytes = f.read()
data = {"tests/data/pdf/multipage.pdf": pdf_bytes}
entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)

View File

@@ -1,26 +1,25 @@
# System Packages
import logging
import locale
from pathlib import Path
import os
# External Packages
import pytest
from khoj.utils.config import SearchModels
# Internal Packages
from khoj.utils.state import content_index, search_models
from khoj.search_type import text_search
from khoj.utils.rawconfig import ContentConfig, SearchConfig, TextContentConfig
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
from khoj.processor.github.github_to_jsonl import GithubToJsonl
from khoj.utils.config import SearchModels
from khoj.utils.fs_syncer import get_org_files
from khoj.utils.rawconfig import ContentConfig, SearchConfig, TextContentConfig
# Test
# ----------------------------------------------------------------------------------------------------
def test_text_search_setup_with_missing_file_raises_error(
org_config_with_only_new_file: TextContentConfig, search_config: SearchConfig
):
def test_text_search_setup_with_missing_file_raises_error(org_config_with_only_new_file: TextContentConfig):
# Arrange
# Ensure file mentioned in org.input-files is missing
single_new_file = Path(org_config_with_only_new_file.input_files[0])
@@ -29,7 +28,23 @@ def test_text_search_setup_with_missing_file_raises_error(
# Act
# Generate notes embeddings during asymmetric setup
with pytest.raises(FileNotFoundError):
data = get_org_files(org_config_with_only_new_file)
get_org_files(org_config_with_only_new_file)
# ----------------------------------------------------------------------------------------------------
def test_get_org_files_with_org_suffixed_dir_doesnt_raise_error(tmp_path: Path):
    """Check that a directory whose name ends in ``.org`` is not read as an org file.

    The input filter matches both the ``directory.org`` folder and the
    ``file.org`` inside it; only the real file should come back.
    """
    # Arrange
    expected_content = "* Heading\n- List item\n"
    org_dir = tmp_path / "directory.org"
    org_dir.mkdir()
    orgfile = org_dir / "file.org"
    orgfile.write_text(expected_content)

    org_content_config = TextContentConfig(
        input_filter=[f"{tmp_path}/**/*"],
        compressed_jsonl="test.jsonl",
        embeddings_file="test.pt",
    )

    # Act
    # should not raise IsADirectoryError and return orgfile
    org_files = get_org_files(org_content_config)

    # Assert
    assert org_files == {str(orgfile): expected_content}
# ----------------------------------------------------------------------------------------------------
@@ -48,6 +63,7 @@ def test_text_search_setup_with_empty_file_raises_error(
def test_text_search_setup(content_config: ContentConfig, search_models: SearchModels):
# Arrange
data = get_org_files(content_config.org)
# Act
# Regenerate notes embeddings during asymmetric setup
notes_model = text_search.setup(