Extract Entries in a standardized format across text search types

Issue:
 - Extracted entries had a different schema for symmetric_ledger vs asymmetric search

 - Entry extraction for asymmetric search was messy, relying on cryptic
   positional indices (e.g. entry[1]) to tell the raw entry apart from the
   cleaned entry meant to be passed to the model to get embeddings

 - This was pushing the load of figuring out what property to extract
   from each entry to downstream processes like the filters

 - This limited the filters to only work for asymmetric search, not for
   symmetric_ledger

 - Fix
   - Use consistent format for extracted entries
     {
       'embed': entry_string_meant_to_be_passed_to_model_and_get_embeddings,
       'raw'  : raw_entry_string_meant_to_be_passed_to_user
     }

 - Result
   - Now filters can be applied across search types, and the specific
     field they should be applied on can be configured by each search
     type
This commit is contained in:
Debanjum Singh Solanky
2022-07-19 20:52:25 +04:00
parent e66cd5bf59
commit b673d26a12
5 changed files with 18 additions and 18 deletions

View File

@@ -17,7 +17,7 @@ import dateparser as dtparse
date_regex = r"dt([:><=]{1,2})\"(.*?)\""
def date_filter(query, entries, embeddings):
def date_filter(query, entries, embeddings, entry_key='raw'):
"Find entries containing any dates that fall within date range specified in query"
# extract date range specified in date filter of query
query_daterange = extract_date_range(query)
@@ -34,7 +34,7 @@ def date_filter(query, entries, embeddings):
entries_to_include = set()
for id, entry in enumerate(entries):
# Extract dates from entry
for date_in_entry_string in re.findall(r'\d{4}-\d{2}-\d{2}', entry[1]):
for date_in_entry_string in re.findall(r'\d{4}-\d{2}-\d{2}', entry[entry_key]):
# Convert date string in entry to unix timestamp
try:
date_in_entry = datetime.strptime(date_in_entry_string, '%Y-%m-%d').timestamp()

View File

@@ -5,7 +5,7 @@ import re
import torch
def explicit_filter(raw_query, entries, embeddings):
def explicit_filter(raw_query, entries, embeddings, entry_key='raw'):
# Separate natural query from explicit required, blocked words filters
query = " ".join([word for word in raw_query.split() if not word.startswith("+") and not word.startswith("-")])
required_words = set([word[1:].lower() for word in raw_query.split() if word.startswith("+")])
@@ -19,7 +19,7 @@ def explicit_filter(raw_query, entries, embeddings):
entry_splitter = r',|\.| |\]|\[\(|\)|\{|\}|\t|\n|\:'
entries_by_word_set = [set(word.lower()
for word
in re.split(entry_splitter, entry[1])
in re.split(entry_splitter, entry[entry_key])
if word != "")
for entry in entries]