Deep copy entries, embeddings in filters. Defer till actual filtering

- Only the filter knows when entries and embeddings are about to be
  manipulated. So move the responsibility for deep copying entries and
  embeddings, before manipulating them, into the filters

- Create the deep copy inside the filters. This avoids deep copying entries
  and embeddings when filter results are being loaded from cache, etc.
This commit is contained in:
Debanjum Singh Solanky
2022-09-04 02:22:42 +03:00
parent 3308e68edf
commit 28d3dc1434
3 changed files with 18 additions and 17 deletions

View File

@@ -3,7 +3,6 @@ import argparse
import pathlib
import logging
import time
from copy import deepcopy
# External Packages
import torch
@@ -77,22 +76,11 @@ def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False):
def query(raw_query: str, model: TextSearchModel, rank_results=False):
"Search for entries that answer the query"
query = raw_query
# Use deep copy of original embeddings, entries to filter if query contains filters
start = time.time()
filters_in_query = [filter for filter in model.filters if filter.can_filter(query)]
if filters_in_query:
corpus_embeddings = deepcopy(model.corpus_embeddings)
entries = deepcopy(model.entries)
else:
corpus_embeddings = model.corpus_embeddings
entries = model.entries
end = time.time()
logger.debug(f"Copy Time: {end - start:.3f} seconds")
query, entries, corpus_embeddings = raw_query, model.entries, model.corpus_embeddings
# Filter query, entries and embeddings before semantic search
start = time.time()
filters_in_query = [filter for filter in model.filters if filter.can_filter(query)]
for filter in filters_in_query:
query, entries, corpus_embeddings = filter.apply(query, entries, corpus_embeddings)
end = time.time()