Address Notion and image tech debt in the indexing code path (#687)

* Add support for using OAuth 2.0 in the Notion integration
* Add notion to the admin page
* Remove unnecessary content_index and image search/setup references
* Trigger background job to start indexing Notion after user configures it
* Add a log line when a new Notion integration is set up
* Fix references to the configure_content methods
This commit is contained in:
sabaimran
2024-04-04 23:40:03 -07:00
committed by GitHub
parent 69dee75c34
commit f57f9f672d
16 changed files with 145 additions and 599 deletions

View File

@@ -25,7 +25,7 @@ from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
from khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries
from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel
from khoj.routers.indexer import configure_content
from khoj.search_type import image_search, text_search
from khoj.search_type import text_search
from khoj.utils import fs_syncer, state
from khoj.utils.config import SearchModels
from khoj.utils.constants import web_directory
@@ -207,7 +207,6 @@ def openai_agent():
@pytest.fixture(scope="session")
def search_models(search_config: SearchConfig):
search_models = SearchModels()
search_models.image_search = image_search.initialize_model(search_config.image)
return search_models
@@ -232,8 +231,6 @@ def content_config(tmp_path_factory, search_models: SearchModels, default_user:
use_xmp_metadata=False,
)
image_search.setup(content_config.image, search_models.image_search.image_encoder, regenerate=False)
LocalOrgConfig.objects.create(
input_files=None,
input_filter=["tests/data/org/*.org"],
@@ -305,9 +302,7 @@ def chat_client_builder(search_config, user, index_content=True, require_auth=Fa
# Index Markdown Content for Search
all_files = fs_syncer.collect_files(user=user)
state.content_index, _ = configure_content(
state.content_index, state.config.content_type, all_files, state.search_models, user=user
)
success = configure_content(all_files, user=user)
# Initialize Processor from Config
if os.getenv("OPENAI_API_KEY"):
@@ -349,16 +344,12 @@ def client(
state.cross_encoder_model["default"] = CrossEncoderModel()
# These lines help us Mock the Search models for these search types
state.search_models.image_search = image_search.initialize_model(search_config.image)
text_search.setup(
OrgToEntries,
get_sample_data("org"),
regenerate=False,
user=api_user.user,
)
state.content_index.image = image_search.setup(
content_config.image, state.search_models.image_search, regenerate=False
)
text_search.setup(
PlaintextToEntries,
get_sample_data("plaintext"),
@@ -388,9 +379,7 @@ def client_offline_chat(search_config: SearchConfig, default_user2: KhojUser):
)
all_files = fs_syncer.collect_files(user=default_user2)
configure_content(
state.content_index, state.config.content_type, all_files, state.search_models, user=default_user2
)
configure_content(all_files, user=default_user2)
# Initialize Processor from Config
OfflineChatProcessorConversationConfigFactory(enabled=True)

View File

@@ -12,10 +12,9 @@ from khoj.configure import configure_routes, configure_search_types
from khoj.database.adapters import EntryAdapters
from khoj.database.models import KhojApiUser, KhojUser
from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
from khoj.search_type import image_search, text_search
from khoj.search_type import text_search
from khoj.utils import state
from khoj.utils.rawconfig import ContentConfig, SearchConfig
from khoj.utils.state import config, content_index, search_models
# Test
@@ -298,34 +297,6 @@ def test_get_configured_types_with_no_content_config(fastapi_app: FastAPI):
assert response.json() == ["all"]
# ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db(transaction=True)
def test_image_search(client, content_config: ContentConfig, search_config: SearchConfig):
    """Query the image search API and compare the top hit with the expected image file."""
    # Arrange
    headers = {"Authorization": "Bearer kk-secret"}
    search_models.image_search = image_search.initialize_model(search_config.image)
    content_index.image = image_search.setup(
        content_config.image,
        search_models.image_search.image_encoder,
        regenerate=False,
    )
    expected_image_name_by_query = {
        "kitten": "kitten_park.jpg",
        "a horse and dog on a leash": "horse_dog.jpg",
        "A guinea pig eating grass": "guineapig_grass.jpg",
    }

    for query, expected_image_name in expected_image_name_by_query.items():
        # Act
        response = client.get(f"/api/search?q={query}&n=1&t=image", headers=headers)

        # Assert
        assert response.status_code == 200

        # The "entry" of the first hit is fetched through the client to obtain the image bytes
        top_hit_entry = response.json()[0]["entry"]
        image_bytes = client.get(top_hit_entry).content
        actual_image = Image.open(BytesIO(image_bytes))
        expected_image_path = content_config.image.input_directories[0].joinpath(expected_image_name)
        expected_image = Image.open(expected_image_path)

        # Assert
        assert expected_image == actual_image
# ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db(transaction=True)
def test_notes_search(client, search_config: SearchConfig, sample_org_data, default_user: KhojUser):

View File

@@ -1,162 +0,0 @@
# Standard Modules
import logging
from pathlib import Path
import pytest
from PIL import Image
from khoj.search_type import image_search
from khoj.utils.config import SearchModels
from khoj.utils.constants import web_directory
from khoj.utils.helpers import resolve_absolute_path
from khoj.utils.rawconfig import ContentConfig, SearchConfig
from khoj.utils.state import content_index, search_models
# Test
# ----------------------------------------------------------------------------------------------------
def test_image_search_setup(content_config: ContentConfig, search_models: SearchModels):
    """Image setup with regeneration yields three image names and three embeddings."""
    # Act
    # Regenerate image search embeddings during image setup
    image_config = content_config.image
    image_encoder = search_models.image_search.image_encoder
    indexed_images = image_search.setup(image_config, image_encoder, regenerate=True)

    # Assert
    name_count = len(indexed_images.image_names)
    assert name_count == 3
    embedding_count = len(indexed_images.image_embeddings)
    assert embedding_count == 3
# ----------------------------------------------------------------------------------------------------
def test_image_metadata(content_config: ContentConfig):
    """Verify XMP Description and Subjects Extracted from Image"""
    # Arrange
    image_directory = content_config.image.input_directories[0]
    # Maps each test image to the snippets its extracted metadata must contain
    expected_snippets_by_image_name = {
        "kitten_park.jpg": ["Billi Ka Bacha.", "Cat", "Grass"],
        "horse_dog.jpg": ["Pasture.", "Horse", "Dog"],
        "guineapig_grass.jpg": ["Guinea Pig Eating Celery.", "Rodent", "Whiskers"],
    }

    for image_name, expected_snippets in expected_snippets_by_image_name.items():
        # Act
        image_path = Path(image_directory / image_name)
        actual_metadata = image_search.extract_metadata(image_path)

        # Assert
        for snippet in expected_snippets:
            assert snippet in actual_metadata
# ----------------------------------------------------------------------------------------------------
@pytest.mark.anyio
async def test_image_search(content_config: ContentConfig, search_config: SearchConfig):
    """Query the image index with text and verify the top hit for each query.

    The image file copied to the results directory is deleted after every
    comparison, even when the comparison fails, so a failing run does not
    leave stray files behind.
    """
    # Arrange
    search_models.image_search = image_search.initialize_model(search_config.image)
    content_index.image = image_search.setup(
        content_config.image, search_models.image_search.image_encoder, regenerate=False
    )
    output_directory = resolve_absolute_path(web_directory)
    query_expected_image_pairs = [
        ("kitten", "kitten_park.jpg"),
        ("horse and dog in a farm", "horse_dog.jpg"),
        ("A guinea pig eating grass", "guineapig_grass.jpg"),
    ]

    for query, expected_image_name in query_expected_image_pairs:
        # Act
        hits = await image_search.query(
            query, count=1, search_model=search_models.image_search, content=content_index.image
        )
        results = image_search.collate_results(
            hits,
            content_index.image.image_names,
            output_directory=output_directory,
            image_files_url="/static/images",
            count=1,
        )
        actual_image_path = output_directory.joinpath(Path(results[0].entry).name)
        expected_image_path = content_config.image.input_directories[0].joinpath(expected_image_name)

        try:
            # Open both images as context managers so their file handles are
            # released before the result file is deleted
            with Image.open(actual_image_path) as actual_image, Image.open(expected_image_path) as expected_image:
                # Assert
                assert expected_image == actual_image
        finally:
            # Cleanup
            # Delete the image files copied to results directory.
            # missing_ok avoids masking the original error if the file was never created.
            actual_image_path.unlink(missing_ok=True)
# ----------------------------------------------------------------------------------------------------
@pytest.mark.anyio
async def test_image_search_query_truncated(content_config: ContentConfig, search_config: SearchConfig, caplog):
    """Verify an over-long text query is truncated before it reaches the image model.

    A 100-word query is issued; the test fails if the model raises the tensor
    size mismatch error, and otherwise expects the logged query to contain
    only the first ``max_words_supported`` words.
    """
    # Arrange
    search_models.image_search = image_search.initialize_model(search_config.image)
    content_index.image = image_search.setup(
        content_config.image, search_models.image_search.image_encoder, regenerate=False
    )
    max_words_supported = 10
    query = " ".join(["hello"] * 100)
    truncated_query = " ".join(["hello"] * max_words_supported)

    # Act
    try:
        with caplog.at_level(logging.INFO, logger="khoj.search_type.image_search"):
            await image_search.query(
                query, count=1, search_model=search_models.image_search, content=content_index.image
            )
    # Assert
    except RuntimeError as e:
        if "The size of tensor a (102) must match the size of tensor b (77)" in str(e):
            # Explicit failure instead of `assert False`, which is stripped under `python -O`
            pytest.fail("Query length exceeds max tokens supported by model\n")
        # Any other RuntimeError is deliberately left to the log assertion below,
        # matching the original best-effort behavior

    assert f"Find Images by Text: {truncated_query}" in caplog.text, "Query not truncated"
# ----------------------------------------------------------------------------------------------------
@pytest.mark.anyio
async def test_image_search_by_filepath(content_config: ContentConfig, search_config: SearchConfig, caplog):
    """Verify a ``file:<path>`` query triggers search by image and returns that image.

    The image file copied to the results directory is deleted at the end,
    even when an assertion fails.
    """
    # Arrange
    search_models.image_search = image_search.initialize_model(search_config.image)
    content_index.image = image_search.setup(
        content_config.image, search_models.image_search.image_encoder, regenerate=False
    )
    output_directory = resolve_absolute_path(web_directory)
    image_directory = content_config.image.input_directories[0]
    # Kept as a string: it is both embedded in the query and passed to resolve_absolute_path
    expected_image_path = f"{image_directory.joinpath('kitten_park.jpg')}"
    query = f"file:{expected_image_path}"

    # Act
    with caplog.at_level(logging.INFO, logger="khoj.search_type.image_search"):
        hits = await image_search.query(
            query, count=1, search_model=search_models.image_search, content=content_index.image
        )
    results = image_search.collate_results(
        hits,
        content_index.image.image_names,
        output_directory=output_directory,
        image_files_url="/static/images",
        count=1,
    )
    actual_image_path = output_directory.joinpath(Path(results[0].entry).name)

    try:
        # Open both images as context managers so their file handles are
        # released before the result file is deleted
        with Image.open(actual_image_path) as actual_image, Image.open(expected_image_path) as expected_image:
            # Assert
            # Ensure file search triggered instead of query with file path as string
            assert (
                f"Find Images by Image: {resolve_absolute_path(expected_image_path)}" in caplog.text
            ), "File search not triggered"
            # Ensure the correct image is returned
            assert expected_image == actual_image, "Incorrect image returned by file search"
    finally:
        # Cleanup
        # Delete the image files copied to results directory.
        # missing_ok avoids masking the original error if the file was never created.
        actual_image_path.unlink(missing_ok=True)