mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 21:19:12 +00:00
Address Notion, Image tech debt in indexing code path (#687)
* Add support for using OAuth 2.0 in the Notion integration
* Add Notion to the admin page
* Remove unnecessary content_index and image search/setup references
* Trigger a background job to start indexing Notion after the user configures it
* Add a log line when a new Notion integration is set up
* Fix references to the configure_content methods
This commit is contained in:
@@ -25,7 +25,7 @@ from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
|
||||
from khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries
|
||||
from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel
|
||||
from khoj.routers.indexer import configure_content
|
||||
from khoj.search_type import image_search, text_search
|
||||
from khoj.search_type import text_search
|
||||
from khoj.utils import fs_syncer, state
|
||||
from khoj.utils.config import SearchModels
|
||||
from khoj.utils.constants import web_directory
|
||||
@@ -207,7 +207,6 @@ def openai_agent():
|
||||
@pytest.fixture(scope="session")
|
||||
def search_models(search_config: SearchConfig):
|
||||
search_models = SearchModels()
|
||||
search_models.image_search = image_search.initialize_model(search_config.image)
|
||||
|
||||
return search_models
|
||||
|
||||
@@ -232,8 +231,6 @@ def content_config(tmp_path_factory, search_models: SearchModels, default_user:
|
||||
use_xmp_metadata=False,
|
||||
)
|
||||
|
||||
image_search.setup(content_config.image, search_models.image_search.image_encoder, regenerate=False)
|
||||
|
||||
LocalOrgConfig.objects.create(
|
||||
input_files=None,
|
||||
input_filter=["tests/data/org/*.org"],
|
||||
@@ -305,9 +302,7 @@ def chat_client_builder(search_config, user, index_content=True, require_auth=Fa
|
||||
|
||||
# Index Markdown Content for Search
|
||||
all_files = fs_syncer.collect_files(user=user)
|
||||
state.content_index, _ = configure_content(
|
||||
state.content_index, state.config.content_type, all_files, state.search_models, user=user
|
||||
)
|
||||
success = configure_content(all_files, user=user)
|
||||
|
||||
# Initialize Processor from Config
|
||||
if os.getenv("OPENAI_API_KEY"):
|
||||
@@ -349,16 +344,12 @@ def client(
|
||||
state.cross_encoder_model["default"] = CrossEncoderModel()
|
||||
|
||||
# These lines help us Mock the Search models for these search types
|
||||
state.search_models.image_search = image_search.initialize_model(search_config.image)
|
||||
text_search.setup(
|
||||
OrgToEntries,
|
||||
get_sample_data("org"),
|
||||
regenerate=False,
|
||||
user=api_user.user,
|
||||
)
|
||||
state.content_index.image = image_search.setup(
|
||||
content_config.image, state.search_models.image_search, regenerate=False
|
||||
)
|
||||
text_search.setup(
|
||||
PlaintextToEntries,
|
||||
get_sample_data("plaintext"),
|
||||
@@ -388,9 +379,7 @@ def client_offline_chat(search_config: SearchConfig, default_user2: KhojUser):
|
||||
)
|
||||
|
||||
all_files = fs_syncer.collect_files(user=default_user2)
|
||||
configure_content(
|
||||
state.content_index, state.config.content_type, all_files, state.search_models, user=default_user2
|
||||
)
|
||||
configure_content(all_files, user=default_user2)
|
||||
|
||||
# Initialize Processor from Config
|
||||
OfflineChatProcessorConversationConfigFactory(enabled=True)
|
||||
|
||||
@@ -12,10 +12,9 @@ from khoj.configure import configure_routes, configure_search_types
|
||||
from khoj.database.adapters import EntryAdapters
|
||||
from khoj.database.models import KhojApiUser, KhojUser
|
||||
from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
|
||||
from khoj.search_type import image_search, text_search
|
||||
from khoj.search_type import text_search
|
||||
from khoj.utils import state
|
||||
from khoj.utils.rawconfig import ContentConfig, SearchConfig
|
||||
from khoj.utils.state import config, content_index, search_models
|
||||
|
||||
|
||||
# Test
|
||||
@@ -298,34 +297,6 @@ def test_get_configured_types_with_no_content_config(fastapi_app: FastAPI):
|
||||
assert response.json() == ["all"]
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.django_db(transaction=True)
|
||||
def test_image_search(client, content_config: ContentConfig, search_config: SearchConfig):
|
||||
# Arrange
|
||||
headers = {"Authorization": "Bearer kk-secret"}
|
||||
search_models.image_search = image_search.initialize_model(search_config.image)
|
||||
content_index.image = image_search.setup(
|
||||
content_config.image, search_models.image_search.image_encoder, regenerate=False
|
||||
)
|
||||
query_expected_image_pairs = [
|
||||
("kitten", "kitten_park.jpg"),
|
||||
("a horse and dog on a leash", "horse_dog.jpg"),
|
||||
("A guinea pig eating grass", "guineapig_grass.jpg"),
|
||||
]
|
||||
|
||||
for query, expected_image_name in query_expected_image_pairs:
|
||||
# Act
|
||||
response = client.get(f"/api/search?q={query}&n=1&t=image", headers=headers)
|
||||
|
||||
# Assert
|
||||
assert response.status_code == 200
|
||||
actual_image = Image.open(BytesIO(client.get(response.json()[0]["entry"]).content))
|
||||
expected_image = Image.open(content_config.image.input_directories[0].joinpath(expected_image_name))
|
||||
|
||||
# Assert
|
||||
assert expected_image == actual_image
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.django_db(transaction=True)
|
||||
def test_notes_search(client, search_config: SearchConfig, sample_org_data, default_user: KhojUser):
|
||||
|
||||
@@ -1,162 +0,0 @@
|
||||
# Standard Modules
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from PIL import Image
|
||||
|
||||
from khoj.search_type import image_search
|
||||
from khoj.utils.config import SearchModels
|
||||
from khoj.utils.constants import web_directory
|
||||
from khoj.utils.helpers import resolve_absolute_path
|
||||
from khoj.utils.rawconfig import ContentConfig, SearchConfig
|
||||
from khoj.utils.state import content_index, search_models
|
||||
|
||||
|
||||
# Test
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_image_search_setup(content_config: ContentConfig, search_models: SearchModels):
|
||||
# Act
|
||||
# Regenerate image search embeddings during image setup
|
||||
image_search_model = image_search.setup(
|
||||
content_config.image, search_models.image_search.image_encoder, regenerate=True
|
||||
)
|
||||
|
||||
# Assert
|
||||
assert len(image_search_model.image_names) == 3
|
||||
assert len(image_search_model.image_embeddings) == 3
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_image_metadata(content_config: ContentConfig):
|
||||
"Verify XMP Description and Subjects Extracted from Image"
|
||||
# Arrange
|
||||
expected_metadata_image_name_pairs = [
|
||||
(["Billi Ka Bacha.", "Cat", "Grass"], "kitten_park.jpg"),
|
||||
(["Pasture.", "Horse", "Dog"], "horse_dog.jpg"),
|
||||
(["Guinea Pig Eating Celery.", "Rodent", "Whiskers"], "guineapig_grass.jpg"),
|
||||
]
|
||||
|
||||
test_image_paths = [
|
||||
Path(content_config.image.input_directories[0] / image_name[1])
|
||||
for image_name in expected_metadata_image_name_pairs
|
||||
]
|
||||
|
||||
for expected_metadata, test_image_path in zip(expected_metadata_image_name_pairs, test_image_paths):
|
||||
# Act
|
||||
actual_metadata = image_search.extract_metadata(test_image_path)
|
||||
|
||||
# Assert
|
||||
for expected_snippet in expected_metadata[0]:
|
||||
assert expected_snippet in actual_metadata
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.anyio
|
||||
async def test_image_search(content_config: ContentConfig, search_config: SearchConfig):
|
||||
# Arrange
|
||||
search_models.image_search = image_search.initialize_model(search_config.image)
|
||||
content_index.image = image_search.setup(
|
||||
content_config.image, search_models.image_search.image_encoder, regenerate=False
|
||||
)
|
||||
output_directory = resolve_absolute_path(web_directory)
|
||||
query_expected_image_pairs = [
|
||||
("kitten", "kitten_park.jpg"),
|
||||
("horse and dog in a farm", "horse_dog.jpg"),
|
||||
("A guinea pig eating grass", "guineapig_grass.jpg"),
|
||||
]
|
||||
|
||||
# Act
|
||||
for query, expected_image_name in query_expected_image_pairs:
|
||||
hits = await image_search.query(
|
||||
query, count=1, search_model=search_models.image_search, content=content_index.image
|
||||
)
|
||||
|
||||
results = image_search.collate_results(
|
||||
hits,
|
||||
content_index.image.image_names,
|
||||
output_directory=output_directory,
|
||||
image_files_url="/static/images",
|
||||
count=1,
|
||||
)
|
||||
|
||||
actual_image_path = output_directory.joinpath(Path(results[0].entry).name)
|
||||
actual_image = Image.open(actual_image_path)
|
||||
expected_image = Image.open(content_config.image.input_directories[0].joinpath(expected_image_name))
|
||||
|
||||
# Assert
|
||||
assert expected_image == actual_image
|
||||
|
||||
# Cleanup
|
||||
# Delete the image files copied to results directory
|
||||
actual_image_path.unlink()
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.anyio
|
||||
async def test_image_search_query_truncated(content_config: ContentConfig, search_config: SearchConfig, caplog):
|
||||
# Arrange
|
||||
search_models.image_search = image_search.initialize_model(search_config.image)
|
||||
content_index.image = image_search.setup(
|
||||
content_config.image, search_models.image_search.image_encoder, regenerate=False
|
||||
)
|
||||
max_words_supported = 10
|
||||
query = " ".join(["hello"] * 100)
|
||||
truncated_query = " ".join(["hello"] * max_words_supported)
|
||||
|
||||
# Act
|
||||
try:
|
||||
with caplog.at_level(logging.INFO, logger="khoj.search_type.image_search"):
|
||||
await image_search.query(
|
||||
query, count=1, search_model=search_models.image_search, content=content_index.image
|
||||
)
|
||||
# Assert
|
||||
except RuntimeError as e:
|
||||
if "The size of tensor a (102) must match the size of tensor b (77)" in str(e):
|
||||
assert False, f"Query length exceeds max tokens supported by model\n"
|
||||
assert f"Find Images by Text: {truncated_query}" in caplog.text, "Query not truncated"
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.anyio
|
||||
async def test_image_search_by_filepath(content_config: ContentConfig, search_config: SearchConfig, caplog):
|
||||
# Arrange
|
||||
search_models.image_search = image_search.initialize_model(search_config.image)
|
||||
content_index.image = image_search.setup(
|
||||
content_config.image, search_models.image_search.image_encoder, regenerate=False
|
||||
)
|
||||
output_directory = resolve_absolute_path(web_directory)
|
||||
image_directory = content_config.image.input_directories[0]
|
||||
|
||||
query = f"file:{image_directory.joinpath('kitten_park.jpg')}"
|
||||
expected_image_path = f"{image_directory.joinpath('kitten_park.jpg')}"
|
||||
|
||||
# Act
|
||||
with caplog.at_level(logging.INFO, logger="khoj.search_type.image_search"):
|
||||
hits = await image_search.query(
|
||||
query, count=1, search_model=search_models.image_search, content=content_index.image
|
||||
)
|
||||
|
||||
results = image_search.collate_results(
|
||||
hits,
|
||||
content_index.image.image_names,
|
||||
output_directory=output_directory,
|
||||
image_files_url="/static/images",
|
||||
count=1,
|
||||
)
|
||||
|
||||
actual_image_path = output_directory.joinpath(Path(results[0].entry).name)
|
||||
actual_image = Image.open(actual_image_path)
|
||||
expected_image = Image.open(expected_image_path)
|
||||
|
||||
# Assert
|
||||
# Ensure file search triggered instead of query with file path as string
|
||||
assert (
|
||||
f"Find Images by Image: {resolve_absolute_path(expected_image_path)}" in caplog.text
|
||||
), "File search not triggered"
|
||||
# Ensure the correct image is returned
|
||||
assert expected_image == actual_image, "Incorrect image returned by file search"
|
||||
|
||||
# Cleanup
|
||||
# Delete the image files copied to results directory
|
||||
actual_image_path.unlink()
|
||||
Reference in New Issue
Block a user