mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-07 21:29:13 +00:00
Merge Symmetric, Asymmetric Search Types into a single Text Search Type
- The code for both text search types was mostly the same. It was earlier done this way for expedience while experimenting.
- The minor differences were reconciled and merged into a single text_search type.
- This simplifies the app and makes it easier to process other text types.
This commit is contained in:
@@ -3,8 +3,9 @@ import pytest
|
||||
import torch
|
||||
|
||||
# Internal Packages
|
||||
from src.search_type import asymmetric, image_search
|
||||
from src.utils.rawconfig import ContentConfig, TextContentConfig, ImageContentConfig, SearchConfig, SymmetricSearchConfig, AsymmetricSearchConfig, ImageSearchConfig
|
||||
from src.search_type import image_search, text_search
|
||||
from src.utils.rawconfig import ContentConfig, TextContentConfig, ImageContentConfig, SearchConfig, TextSearchConfig, ImageSearchConfig
|
||||
from src.processor.org_mode.org_to_jsonl import org_to_jsonl
|
||||
|
||||
|
||||
@pytest.fixture(scope='session')
|
||||
@@ -13,13 +14,13 @@ def search_config(tmp_path_factory):
|
||||
|
||||
search_config = SearchConfig()
|
||||
|
||||
search_config.asymmetric = SymmetricSearchConfig(
|
||||
search_config.symmetric = TextSearchConfig(
|
||||
encoder = "sentence-transformers/all-MiniLM-L6-v2",
|
||||
cross_encoder = "cross-encoder/ms-marco-MiniLM-L-6-v2",
|
||||
model_directory = model_dir
|
||||
)
|
||||
|
||||
search_config.asymmetric = AsymmetricSearchConfig(
|
||||
search_config.asymmetric = TextSearchConfig(
|
||||
encoder = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
|
||||
cross_encoder = "cross-encoder/ms-marco-MiniLM-L-6-v2",
|
||||
model_directory = model_dir
|
||||
@@ -55,7 +56,7 @@ def model_dir(search_config):
|
||||
compressed_jsonl = model_dir.joinpath('notes.jsonl.gz'),
|
||||
embeddings_file = model_dir.joinpath('note_embeddings.pt'))
|
||||
|
||||
asymmetric.setup(content_config.org, search_config.asymmetric, regenerate=False, device=device, verbose=True)
|
||||
text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, device=device, verbose=True)
|
||||
|
||||
return model_dir
|
||||
|
||||
|
||||
@@ -3,8 +3,9 @@ from pathlib import Path
|
||||
|
||||
# Internal Packages
|
||||
from src.main import model
|
||||
from src.search_type import asymmetric
|
||||
from src.search_type import text_search
|
||||
from src.utils.rawconfig import ContentConfig, SearchConfig
|
||||
from src.processor.org_mode.org_to_jsonl import org_to_jsonl
|
||||
|
||||
|
||||
# Test
|
||||
@@ -12,7 +13,7 @@ from src.utils.rawconfig import ContentConfig, SearchConfig
|
||||
def test_asymmetric_setup(content_config: ContentConfig, search_config: SearchConfig):
|
||||
# Act
|
||||
# Regenerate notes embeddings during asymmetric setup
|
||||
notes_model = asymmetric.setup(content_config.org, search_config.asymmetric, regenerate=True)
|
||||
notes_model = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=True)
|
||||
|
||||
# Assert
|
||||
assert len(notes_model.entries) == 10
|
||||
@@ -22,15 +23,15 @@ def test_asymmetric_setup(content_config: ContentConfig, search_config: SearchCo
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_asymmetric_search(content_config: ContentConfig, search_config: SearchConfig):
|
||||
# Arrange
|
||||
model.notes_search = asymmetric.setup(content_config.org, search_config.asymmetric, regenerate=False)
|
||||
model.notes_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False)
|
||||
query = "How to git install application?"
|
||||
|
||||
# Act
|
||||
hits, entries = asymmetric.query(
|
||||
hits, entries = text_search.query(
|
||||
query,
|
||||
model = model.notes_search)
|
||||
|
||||
results = asymmetric.collate_results(
|
||||
results = text_search.collate_results(
|
||||
hits,
|
||||
entries,
|
||||
count=1)
|
||||
@@ -44,7 +45,7 @@ def test_asymmetric_search(content_config: ContentConfig, search_config: SearchC
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_asymmetric_reload(content_config: ContentConfig, search_config: SearchConfig):
|
||||
# Arrange
|
||||
initial_notes_model= asymmetric.setup(content_config.org, search_config.asymmetric, regenerate=False)
|
||||
initial_notes_model= text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False)
|
||||
|
||||
assert len(initial_notes_model.entries) == 10
|
||||
assert len(initial_notes_model.corpus_embeddings) == 10
|
||||
@@ -57,11 +58,11 @@ def test_asymmetric_reload(content_config: ContentConfig, search_config: SearchC
|
||||
f.write("\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n")
|
||||
|
||||
# regenerate notes jsonl, model embeddings and model to include entry from new file
|
||||
regenerated_notes_model = asymmetric.setup(content_config.org, search_config.asymmetric, regenerate=True)
|
||||
regenerated_notes_model = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=True)
|
||||
|
||||
# Act
|
||||
# reload embeddings, entries, notes model from previously generated notes jsonl and model embeddings files
|
||||
initial_notes_model = asymmetric.setup(content_config.org, search_config.asymmetric, regenerate=False)
|
||||
initial_notes_model = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False)
|
||||
|
||||
# Assert
|
||||
assert len(regenerated_notes_model.entries) == 11
|
||||
|
||||
@@ -8,9 +8,9 @@ import pytest
|
||||
|
||||
# Internal Packages
|
||||
from src.main import app, model, config
|
||||
from src.search_type import asymmetric, image_search
|
||||
from src.utils.helpers import resolve_absolute_path
|
||||
from src.search_type import text_search, image_search
|
||||
from src.utils.rawconfig import ContentConfig, SearchConfig
|
||||
from src.processor.org_mode import org_to_jsonl
|
||||
|
||||
|
||||
# Arrange
|
||||
@@ -115,7 +115,7 @@ def test_image_search(content_config: ContentConfig, search_config: SearchConfig
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_notes_search(content_config: ContentConfig, search_config: SearchConfig):
|
||||
# Arrange
|
||||
model.notes_search = asymmetric.setup(content_config.org, search_config.asymmetric, regenerate=False)
|
||||
model.notes_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False)
|
||||
user_query = "How to git install application?"
|
||||
|
||||
# Act
|
||||
@@ -131,7 +131,7 @@ def test_notes_search(content_config: ContentConfig, search_config: SearchConfig
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_notes_search_with_include_filter(content_config: ContentConfig, search_config: SearchConfig):
|
||||
# Arrange
|
||||
model.notes_search = asymmetric.setup(content_config.org, search_config.asymmetric, regenerate=False)
|
||||
model.notes_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False)
|
||||
user_query = "How to git install application? +Emacs"
|
||||
|
||||
# Act
|
||||
@@ -147,7 +147,7 @@ def test_notes_search_with_include_filter(content_config: ContentConfig, search_
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_notes_search_with_exclude_filter(content_config: ContentConfig, search_config: SearchConfig):
|
||||
# Arrange
|
||||
model.notes_search = asymmetric.setup(content_config.org, search_config.asymmetric, regenerate=False)
|
||||
model.notes_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False)
|
||||
user_query = "How to git install application? -clone"
|
||||
|
||||
# Act
|
||||
|
||||
Reference in New Issue
Block a user