Extract Entries in a standardized format across text search types

Issue:
- Extracted entries had a different schema for symmetric_ledger vs asymmetric search
- Entry extraction for asymmetric search was dirty: it relied on cryptic indices to tell the raw entry apart from the cleaned entry meant to be passed to the embeddings model
- This pushed the burden of figuring out which property to extract from each entry onto downstream processes like the filters
- This limited the filters to working only for asymmetric search, not for symmetric_ledger

Fix:
- Use a consistent format for extracted entries: { 'embed': entry_string_meant_to_be_passed_to_model_and_get_embeddings, 'raw': raw_entry_string_meant_to_be_passed_to_user }

Result:
- Filters can now be applied across search types, and the specific field they should be applied on can be configured by each search type
2026-03-06 05:39:12 +00:00 · 2022-07-19 20:52:25 +04:00
parent e66cd5bf59
commit b673d26a12
5 changed files with 18 additions and 18 deletions
--- a/src/search_filter/date_filter.py
+++ b/src/search_filter/date_filter.py
@@ -17,7 +17,7 @@ import dateparser as dtparse
 date_regex = r"dt([:><=]{1,2})\"(.*?)\""


-def date_filter(query, entries, embeddings):
+def date_filter(query, entries, embeddings, entry_key='raw'):
    "Find entries containing any dates that fall within date range specified in query"
    # extract date range specified in date filter of query
    query_daterange = extract_date_range(query)
@@ -34,7 +34,7 @@ def date_filter(query, entries, embeddings):
    entries_to_include = set()
    for id, entry in enumerate(entries):
        # Extract dates from entry
-        for date_in_entry_string in re.findall(r'\d{4}-\d{2}-\d{2}', entry[1]):
+        for date_in_entry_string in re.findall(r'\d{4}-\d{2}-\d{2}', entry[entry_key]):
            # Convert date string in entry to unix timestamp
            try:
                date_in_entry = datetime.strptime(date_in_entry_string, '%Y-%m-%d').timestamp()
--- a/src/search_filter/explicit_filter.py
+++ b/src/search_filter/explicit_filter.py
@@ -5,7 +5,7 @@ import re
 import torch


-def explicit_filter(raw_query, entries, embeddings):
+def explicit_filter(raw_query, entries, embeddings, entry_key='raw'):
    # Separate natural query from explicit required, blocked words filters
    query = " ".join([word for word in raw_query.split() if not word.startswith("+") and not word.startswith("-")])
    required_words = set([word[1:].lower() for word in raw_query.split() if word.startswith("+")])
@@ -19,7 +19,7 @@ def explicit_filter(raw_query, entries, embeddings):
    entry_splitter = r',|\.| |\]|\[\(|\)|\{|\}|\t|\n|\:'
    entries_by_word_set = [set(word.lower()
                             for word
-                             in re.split(entry_splitter, entry[1])
+                             in re.split(entry_splitter, entry[entry_key])
                             if word != "")
                        for entry in entries]