mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 21:19:12 +00:00
Move tests out to project root. Use absolute import in project
A tests/ directory in the project root is more standard. Internal module imports had to be switched to absolute paths (prefixed with `src.`) to get this to work.
This commit is contained in:
@@ -8,10 +8,10 @@ import uvicorn
|
||||
from fastapi import FastAPI
|
||||
|
||||
# Internal Packages
|
||||
from search_type import asymmetric, symmetric_ledger, image_search
|
||||
from utils.helpers import get_from_dict
|
||||
from utils.cli import cli
|
||||
from utils.config import SearchType, SearchModels, TextSearchConfig, ImageSearchConfig, SearchConfig
|
||||
from src.search_type import asymmetric, symmetric_ledger, image_search
|
||||
from src.utils.helpers import get_from_dict
|
||||
from src.utils.cli import cli
|
||||
from src.utils.config import SearchType, SearchModels, TextSearchConfig, ImageSearchConfig, SearchConfig
|
||||
|
||||
|
||||
# Application Global State
|
||||
|
||||
@@ -8,8 +8,8 @@ import glob
|
||||
import gzip
|
||||
|
||||
# Internal Packages
|
||||
from processor.org_mode import orgnode
|
||||
from utils.helpers import get_absolute_path, is_none_or_empty
|
||||
from src.processor.org_mode import orgnode
|
||||
from src.utils.helpers import get_absolute_path, is_none_or_empty
|
||||
|
||||
|
||||
# Define Functions
|
||||
|
||||
@@ -8,8 +8,8 @@ import glob
|
||||
import gzip
|
||||
|
||||
# Internal Packages
|
||||
from processor.org_mode import orgnode
|
||||
from utils.helpers import get_absolute_path, is_none_or_empty
|
||||
from src.processor.org_mode import orgnode
|
||||
from src.utils.helpers import get_absolute_path, is_none_or_empty
|
||||
|
||||
|
||||
# Define Functions
|
||||
|
||||
@@ -15,9 +15,9 @@ import torch
|
||||
from sentence_transformers import SentenceTransformer, CrossEncoder, util
|
||||
|
||||
# Internal Packages
|
||||
from utils.helpers import get_absolute_path, resolve_absolute_path
|
||||
from processor.org_mode.org_to_jsonl import org_to_jsonl
|
||||
from utils.config import TextSearchModel, TextSearchConfig
|
||||
from src.utils.helpers import get_absolute_path, resolve_absolute_path
|
||||
from src.processor.org_mode.org_to_jsonl import org_to_jsonl
|
||||
from src.utils.config import TextSearchModel, TextSearchConfig
|
||||
|
||||
|
||||
def initialize_model():
|
||||
@@ -106,7 +106,7 @@ def explicit_filter(hits, entries, required_words, blocked_words):
|
||||
hits_by_word_set = [(set(word.lower()
|
||||
for word
|
||||
in re.split(
|
||||
',|\.| |\]|\[\(|\)|\{|\}',
|
||||
r',|\.| |\]|\[\(|\)|\{|\}',
|
||||
entries[hit['corpus_id']])
|
||||
if word != ""),
|
||||
hit)
|
||||
|
||||
@@ -10,9 +10,9 @@ from tqdm import trange
|
||||
import torch
|
||||
|
||||
# Internal Packages
|
||||
from utils.helpers import get_absolute_path, resolve_absolute_path
|
||||
import utils.exiftool as exiftool
|
||||
from utils.config import ImageSearchModel, ImageSearchConfig
|
||||
from src.utils.helpers import get_absolute_path, resolve_absolute_path
|
||||
import src.utils.exiftool as exiftool
|
||||
from src.utils.config import ImageSearchModel, ImageSearchConfig
|
||||
|
||||
|
||||
def initialize_model():
|
||||
|
||||
@@ -13,9 +13,9 @@ import torch
|
||||
from sentence_transformers import SentenceTransformer, CrossEncoder, util
|
||||
|
||||
# Internal Packages
|
||||
from utils.helpers import get_absolute_path, resolve_absolute_path
|
||||
from processor.ledger.beancount_to_jsonl import beancount_to_jsonl
|
||||
from utils.config import TextSearchModel, TextSearchConfig
|
||||
from src.utils.helpers import get_absolute_path, resolve_absolute_path
|
||||
from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl
|
||||
from src.utils.config import TextSearchModel, TextSearchConfig
|
||||
|
||||
|
||||
def initialize_model():
|
||||
@@ -98,7 +98,7 @@ def explicit_filter(hits, entries, required_words, blocked_words):
|
||||
hits_by_word_set = [(set(word.lower()
|
||||
for word
|
||||
in re.split(
|
||||
',|\.| |\]|\[\(|\)|\{|\}',
|
||||
r',|\.| |\]|\[\(|\)|\{|\}',
|
||||
entries[hit['corpus_id']])
|
||||
if word != ""),
|
||||
hit)
|
||||
|
||||
@@ -1,11 +0,0 @@
|
||||
content-type:
|
||||
org:
|
||||
input-files: [ "~/first_from_config.org", "~/second_from_config.org" ]
|
||||
input-filter: "*.org"
|
||||
compressed-jsonl: ".notes.json.gz"
|
||||
embeddings-file: ".note_embeddings.pt"
|
||||
|
||||
search-type:
|
||||
asymmetric:
|
||||
encoder: "sentence-transformers/msmarco-MiniLM-L-6-v3"
|
||||
cross-encoder: "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 170 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 330 KiB |
@@ -1,44 +0,0 @@
|
||||
* Emacs Semantic Search
|
||||
/An Emacs interface for [[https://github.com/debanjum/semantic-search][semantic-search]]/
|
||||
|
||||
** Requirements
|
||||
- Install and Run [[https://github.com/debanjum/semantic-search][semantic-search]]
|
||||
|
||||
** Installation
|
||||
- Direct Install
|
||||
- Put ~semantic-search.el~ in your Emacs load path, e.g. ~/.emacs.d/lisp
|
||||
|
||||
- Load via ~use-package~ in your ~/.emacs.d/init.el or .emacs file by adding below snippet
|
||||
#+begin_src elisp
|
||||
;; Org-Semantic Search Library
|
||||
(use-package semantic-search
|
||||
:load-path "~/.emacs.d/lisp/semantic-search.el"
|
||||
:bind ("C-c s" . 'semantic-search))
|
||||
#+end_src
|
||||
|
||||
- Use [[https://github.com/quelpa/quelpa#installation][Quelpa]]
|
||||
- Ensure [[https://github.com/quelpa/quelpa#installation][Quelpa]], [[https://github.com/quelpa/quelpa-use-package#installation][quelpa-use-package]] are installed
|
||||
- Add below snippet to your ~/.emacs.d/init.el or .emacs config file and execute it.
|
||||
#+begin_src elisp
|
||||
;; Org-Semantic Search Library
|
||||
(use-package semantic-search
|
||||
:quelpa (semantic-search :fetcher url :url "https://raw.githubusercontent.com/debanjum/semantic-search/master/interface/emacs/semantic-search.el")
|
||||
:bind ("C-c s" . 'semantic-search))
|
||||
#+end_src
|
||||
|
||||
** Usage
|
||||
1. Call ~semantic-search~ using keybinding ~C-c s~ or ~M-x semantic-search~
|
||||
|
||||
2. Enter Query in Natural Language
|
||||
|
||||
e.g. "What is the meaning of life?" or "What are my life goals?"
|
||||
|
||||
3. Wait for results
|
||||
|
||||
*Note: It takes about 15s on a Mac M1 with a ~100K-line corpus of org-mode files*
|
||||
|
||||
4. (Optional) Narrow down results further
|
||||
|
||||
Include/Exclude specific words from results by adding to query
|
||||
|
||||
e.g. "What is the meaning of life? -god +none"
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 268 KiB |
@@ -1,47 +0,0 @@
|
||||
* Semantic Search
|
||||
/Allow natural language search on user content like notes, images using transformer based models/
|
||||
|
||||
All data is processed locally. User can interface with semantic-search app via [[./interface/emacs/semantic-search.el][Emacs]], API or Commandline
|
||||
|
||||
** Dependencies
|
||||
- Python3
|
||||
- [[https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links][Miniconda]]
|
||||
|
||||
** Install
|
||||
#+begin_src shell
|
||||
git clone https://github.com/debanjum/semantic-search && cd semantic-search
|
||||
conda env create -f environment.yml
|
||||
conda activate semantic-search
|
||||
#+end_src
|
||||
|
||||
** Run
|
||||
Load ML model, generate embeddings and expose API to query specified org-mode files
|
||||
|
||||
#+begin_src shell
|
||||
python3 main.py --input-files ~/Notes/Schedule.org ~/Notes/Incoming.org --verbose
|
||||
#+end_src
|
||||
|
||||
** Use
|
||||
- *Semantic Search via Emacs*
|
||||
- [[https://github.com/debanjum/semantic-search/tree/master/interface/emacs#installation][Install]] [[./interface/emacs/semantic-search.el][semantic-search.el]]
|
||||
- Run ~M-x semantic-search <user-query>~ or call ~C-c s~
|
||||
|
||||
- *Semantic Search via API*
|
||||
- Query: ~GET~ [[http://localhost:8000/search?q=%22what%20is%20the%20meaning%20of%20life%22][http://localhost:8000/search?q="What is the meaning of life"]]
|
||||
- Regenerate Embeddings: ~GET~ [[http://localhost:8000/regenerate][http://localhost:8000/regenerate]]
|
||||
- [[http://localhost:8000/docs][Semantic Search API Docs]]
|
||||
|
||||
- *Call Semantic Search via Python Script Directly*
|
||||
#+begin_src shell
|
||||
python3 search_types/asymmetric.py \
|
||||
--compressed-jsonl .notes.jsonl.gz \
|
||||
--embeddings .notes_embeddings.pt \
|
||||
--results-count 5 \
|
||||
--verbose \
|
||||
--interactive
|
||||
#+end_src
|
||||
|
||||
** Acknowledgments
|
||||
- [[https://huggingface.co/sentence-transformers/msmarco-MiniLM-L-6-v3][MiniLM Model]] for Asymmetric Text Search. See [[https://www.sbert.net/examples/applications/retrieve_rerank/README.html][SBert Documentation]]
|
||||
- [[https://github.com/openai/CLIP][OpenAI CLIP Model]] for Image Search. See [[https://www.sbert.net/examples/applications/image-search/README.html][SBert Documentation]]
|
||||
- Charles Cave for [[http://members.optusnet.com.au/~charles57/GTD/orgnode.html][OrgNode Parser]]
|
||||
@@ -1,70 +0,0 @@
|
||||
# Standard Modules
|
||||
from pathlib import Path
|
||||
|
||||
# Internal Packages
|
||||
from utils.cli import cli
|
||||
|
||||
|
||||
# Test
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_cli_minimal_default():
|
||||
# Act
|
||||
actual_args = cli(['--config-file=tests/data/config.yml'])
|
||||
|
||||
# Assert
|
||||
assert actual_args.config_file == Path('tests/data/config.yml')
|
||||
assert actual_args.regenerate == False
|
||||
assert actual_args.verbose == 0
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_cli_flags():
|
||||
# Act
|
||||
actual_args = cli(['--config-file=tests/data/config.yml',
|
||||
'--regenerate',
|
||||
'-vvv'])
|
||||
|
||||
# Assert
|
||||
assert actual_args.config_file == Path('tests/data/config.yml')
|
||||
assert actual_args.regenerate == True
|
||||
assert actual_args.verbose == 3
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_cli_config_from_file():
|
||||
# Act
|
||||
actual_args = cli(['--config-file=tests/data/config.yml',
|
||||
'--regenerate',
|
||||
'-vvv'])
|
||||
|
||||
# Assert
|
||||
assert actual_args.config_file == Path('tests/data/config.yml')
|
||||
assert actual_args.regenerate == True
|
||||
assert actual_args.config is not None
|
||||
assert actual_args.config['content-type']['org']['input-files'] == ['~/first_from_config.org', '~/second_from_config.org']
|
||||
assert actual_args.verbose == 3
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_cli_config_from_cmd_args():
|
||||
""
|
||||
# Act
|
||||
actual_args = cli(['--org-files=first.org'])
|
||||
|
||||
# Assert
|
||||
assert actual_args.org_files == ['first.org']
|
||||
assert actual_args.config_file is None
|
||||
assert actual_args.config is not None
|
||||
assert actual_args.config['content-type']['org']['input-files'] == ['first.org']
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_cli_config_from_cmd_args_override_config_file():
|
||||
# Act
|
||||
actual_args = cli(['--config-file=tests/data/config.yml',
|
||||
'--org-files=first.org'])
|
||||
|
||||
# Assert
|
||||
assert actual_args.org_files == ['first.org']
|
||||
assert actual_args.config_file == Path('tests/data/config.yml')
|
||||
assert actual_args.config is not None
|
||||
assert actual_args.config['content-type']['org']['input-files'] == ['first.org']
|
||||
@@ -1,30 +0,0 @@
|
||||
from utils import helpers
|
||||
|
||||
def test_get_from_null_dict():
|
||||
# null handling
|
||||
assert helpers.get_from_dict(dict()) == dict()
|
||||
assert helpers.get_from_dict(dict(), None) == None
|
||||
|
||||
# key present in nested dictionary
|
||||
# 1-level dictionary
|
||||
assert helpers.get_from_dict({'a': 1, 'b': 2}, 'a') == 1
|
||||
assert helpers.get_from_dict({'a': 1, 'b': 2}, 'c') == None
|
||||
|
||||
# 2-level dictionary
|
||||
assert helpers.get_from_dict({'a': {'a_a': 1}, 'b': 2}, 'a') == {'a_a': 1}
|
||||
assert helpers.get_from_dict({'a': {'a_a': 1}, 'b': 2}, 'a', 'a_a') == 1
|
||||
|
||||
# key not present in nested dictionary
|
||||
# 2-level_dictionary
|
||||
assert helpers.get_from_dict({'a': {'a_a': 1}, 'b': 2}, 'b', 'b_a') == None
|
||||
|
||||
|
||||
def test_merge_dicts():
|
||||
# basic merge of dicts with non-overlapping keys
|
||||
assert helpers.merge_dicts(priority_dict={'a': 1}, default_dict={'b': 2}) == {'a': 1, 'b': 2}
|
||||
|
||||
# use default dict items when not present in priority dict
|
||||
assert helpers.merge_dicts(priority_dict={}, default_dict={'b': 2}) == {'b': 2}
|
||||
|
||||
# do not override existing key in priority_dict with default dict
|
||||
assert helpers.merge_dicts(priority_dict={'a': 1}, default_dict={'a': 2}) == {'a': 1}
|
||||
@@ -1,162 +0,0 @@
|
||||
# Standard Modules
|
||||
from pathlib import Path
|
||||
|
||||
# External Packages
|
||||
import pytest
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
# Internal Packages
|
||||
from main import app, search_config, model
|
||||
from search_type import asymmetric, image_search
|
||||
from utils.config import SearchConfig, TextSearchConfig, ImageSearchConfig
|
||||
from utils.helpers import resolve_absolute_path
|
||||
|
||||
|
||||
# Arrange
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
client = TestClient(app)
|
||||
|
||||
|
||||
# Test
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_search_with_invalid_search_type():
|
||||
# Arrange
|
||||
user_query = "How to call semantic search from Emacs?"
|
||||
|
||||
# Act
|
||||
response = client.get(f"/search?q={user_query}&t=invalid_search_type")
|
||||
|
||||
# Assert
|
||||
assert response.status_code == 422
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_search_with_valid_search_type():
|
||||
# Arrange
|
||||
for search_type in ["notes", "ledger", "music", "image"]:
|
||||
# Act
|
||||
response = client.get(f"/search?q=random&t={search_type}")
|
||||
# Assert
|
||||
assert response.status_code == 200
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_regenerate_with_invalid_search_type():
|
||||
# Act
|
||||
response = client.get(f"/regenerate?t=invalid_search_type")
|
||||
|
||||
# Assert
|
||||
assert response.status_code == 422
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_regenerate_with_valid_search_type():
|
||||
# Arrange
|
||||
for search_type in ["notes", "ledger", "music", "image"]:
|
||||
# Act
|
||||
response = client.get(f"/regenerate?t={search_type}")
|
||||
# Assert
|
||||
assert response.status_code == 200
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_notes_search():
|
||||
# Arrange
|
||||
search_config = SearchConfig()
|
||||
search_config.notes = TextSearchConfig(
|
||||
input_files = [Path('tests/data/main_readme.org'), Path('tests/data/interface_emacs_readme.org')],
|
||||
input_filter = None,
|
||||
compressed_jsonl = Path('tests/data/.test.jsonl.gz'),
|
||||
embeddings_file = Path('tests/data/.test_embeddings.pt'),
|
||||
verbose = 0)
|
||||
|
||||
# Act
|
||||
# Regenerate embeddings during asymmetric setup
|
||||
notes_model = asymmetric.setup(search_config.notes, regenerate=True)
|
||||
|
||||
# Assert
|
||||
assert len(notes_model.entries) == 10
|
||||
assert len(notes_model.corpus_embeddings) == 10
|
||||
|
||||
# Arrange
|
||||
model.notes_search = notes_model
|
||||
user_query = "How to call semantic search from Emacs?"
|
||||
|
||||
# Act
|
||||
response = client.get(f"/search?q={user_query}&n=1&t=notes")
|
||||
|
||||
# Assert
|
||||
assert response.status_code == 200
|
||||
# assert actual_data contains "Semantic Search via Emacs"
|
||||
search_result = response.json()[0]["Entry"]
|
||||
assert "Semantic Search via Emacs" in search_result
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_image_search():
|
||||
# Arrange
|
||||
search_config = SearchConfig()
|
||||
search_config.image = ImageSearchConfig(
|
||||
input_directory = Path('tests/data'),
|
||||
embeddings_file = Path('tests/data/.image_embeddings.pt'),
|
||||
batch_size = 10,
|
||||
use_xmp_metadata = False,
|
||||
verbose = 2)
|
||||
|
||||
# Act
|
||||
model.image_search = image_search.setup(search_config.image, regenerate=True)
|
||||
|
||||
# Assert
|
||||
assert len(model.image_search.image_names) == 3
|
||||
assert len(model.image_search.image_embeddings) == 3
|
||||
|
||||
# Arrange
|
||||
for query, expected_image_name in [("kitten in a park", "kitten_park.jpg"),
|
||||
("horse and dog in a farm", "horse_dog.jpg"),
|
||||
("A guinea pig eating grass", "guineapig_grass.jpg")]:
|
||||
# Act
|
||||
hits = image_search.query(
|
||||
query,
|
||||
count = 1,
|
||||
model = model.image_search)
|
||||
|
||||
results = image_search.collate_results(
|
||||
hits,
|
||||
model.image_search.image_names,
|
||||
search_config.image.input_directory,
|
||||
count=1)
|
||||
|
||||
actual_image = results[0]["Entry"]
|
||||
expected_image = resolve_absolute_path(search_config.image.input_directory.joinpath(expected_image_name))
|
||||
|
||||
# Assert
|
||||
assert expected_image == actual_image
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_notes_regenerate():
|
||||
# Arrange
|
||||
search_config = SearchConfig()
|
||||
search_config.notes = TextSearchConfig(
|
||||
input_files = [Path('tests/data/main_readme.org'), Path('tests/data/interface_emacs_readme.org')],
|
||||
input_filter = None,
|
||||
compressed_jsonl = Path('tests/data/.test.jsonl.gz'),
|
||||
embeddings_file = Path('tests/data/.test_embeddings.pt'),
|
||||
verbose = 0)
|
||||
|
||||
# Act
|
||||
# Regenerate embeddings during asymmetric setup
|
||||
notes_model = asymmetric.setup(search_config.notes, regenerate=True)
|
||||
|
||||
# Assert
|
||||
assert len(notes_model.entries) == 10
|
||||
assert len(notes_model.corpus_embeddings) == 10
|
||||
|
||||
# Arrange
|
||||
model.notes_search = notes_model
|
||||
|
||||
# Act
|
||||
response = client.get(f"/regenerate?t=notes")
|
||||
|
||||
# Assert
|
||||
assert response.status_code == 200
|
||||
@@ -6,7 +6,7 @@ import pathlib
|
||||
import yaml
|
||||
|
||||
# Internal Packages
|
||||
from utils.helpers import is_none_or_empty, get_absolute_path, resolve_absolute_path, get_from_dict, merge_dicts
|
||||
from src.utils.helpers import is_none_or_empty, get_absolute_path, resolve_absolute_path, get_from_dict, merge_dicts
|
||||
|
||||
def cli(args=None):
|
||||
if is_none_or_empty(args):
|
||||
|
||||
@@ -4,7 +4,7 @@ from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
# Internal Packages
|
||||
from utils.helpers import get_from_dict
|
||||
from src.utils.helpers import get_from_dict
|
||||
|
||||
|
||||
class SearchType(str, Enum):
|
||||
|
||||
Reference in New Issue
Block a user