Do not store the word filter index to file. It is not necessary for now

- Keeping a stored word filter index from going stale on entry
  updates is more of a hassle than it is worth
- Generating the index on 120K lines of notes takes 1s. Loading it from file
  takes 0.2s. For less content, the load time difference will be even smaller
- Give up the startup time improvement in favor of simplicity for now
This commit is contained in:
Debanjum Singh Solanky
2022-09-07 02:43:58 +03:00
parent 91d11ccb49
commit c17a0fd05b
5 changed files with 33 additions and 62 deletions

View File

@@ -58,7 +58,7 @@ def model_dir(search_config: SearchConfig):
compressed_jsonl = model_dir.joinpath('notes.jsonl.gz'),
embeddings_file = model_dir.joinpath('note_embeddings.pt'))
filters = [DateFilter(), WordFilter(model_dir, search_type=SearchType.Org), FileFilter()]
filters = [DateFilter(), WordFilter(), FileFilter()]
text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
return model_dir

View File

@@ -132,7 +132,7 @@ def test_notes_search(content_config: ContentConfig, search_config: SearchConfig
# ----------------------------------------------------------------------------------------------------
def test_notes_search_with_include_filter(content_config: ContentConfig, search_config: SearchConfig):
# Arrange
filters = [WordFilter(content_config.org.compressed_jsonl.parent, search_type=SearchType.Org)]
filters = [WordFilter()]
model.orgmode_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
user_query = 'How to git install application? +"Emacs"'
@@ -149,7 +149,7 @@ def test_notes_search_with_include_filter(content_config: ContentConfig, search_
# ----------------------------------------------------------------------------------------------------
def test_notes_search_with_exclude_filter(content_config: ContentConfig, search_config: SearchConfig):
# Arrange
filters = [WordFilter(content_config.org.compressed_jsonl.parent, search_type=SearchType.Org)]
filters = [WordFilter()]
model.orgmode_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
user_query = 'How to git install application? -"clone"'

View File

@@ -1,15 +1,12 @@
# External Packages
import torch
# Application Packages
from src.search_filter.word_filter import WordFilter
from src.utils.config import SearchType
def test_no_word_filter(tmp_path):
def test_no_word_filter():
# Arrange
word_filter = WordFilter(tmp_path, SearchType.Org)
embeddings, entries = arrange_content()
word_filter = WordFilter()
entries = arrange_content()
q_with_no_filter = 'head tail'
# Act
@@ -22,10 +19,10 @@ def test_no_word_filter(tmp_path):
assert entry_indices == {0, 1, 2, 3}
def test_word_exclude_filter(tmp_path):
def test_word_exclude_filter():
# Arrange
word_filter = WordFilter(tmp_path, SearchType.Org)
embeddings, entries = arrange_content()
word_filter = WordFilter()
entries = arrange_content()
q_with_exclude_filter = 'head -"exclude_word" tail'
# Act
@@ -38,10 +35,10 @@ def test_word_exclude_filter(tmp_path):
assert entry_indices == {0, 2}
def test_word_include_filter(tmp_path):
def test_word_include_filter():
# Arrange
word_filter = WordFilter(tmp_path, SearchType.Org)
embeddings, entries = arrange_content()
word_filter = WordFilter()
entries = arrange_content()
query_with_include_filter = 'head +"include_word" tail'
# Act
@@ -54,10 +51,10 @@ def test_word_include_filter(tmp_path):
assert entry_indices == {2, 3}
def test_word_include_and_exclude_filter(tmp_path):
def test_word_include_and_exclude_filter():
# Arrange
word_filter = WordFilter(tmp_path, SearchType.Org)
embeddings, entries = arrange_content()
word_filter = WordFilter()
entries = arrange_content()
query_with_include_and_exclude_filter = 'head +"include_word" -"exclude_word" tail'
# Act
@@ -71,11 +68,10 @@ def test_word_include_and_exclude_filter(tmp_path):
def arrange_content():
embeddings = torch.randn(4, 10)
entries = [
{'compiled': '', 'raw': 'Minimal Entry'},
{'compiled': '', 'raw': 'Entry with exclude_word'},
{'compiled': '', 'raw': 'Entry with include_word'},
{'compiled': '', 'raw': 'Entry with include_word and exclude_word'}]
return embeddings, entries
return entries