mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-05 05:39:11 +00:00
Extract explicit pre-search filter function into a separate module
Details: (1) Move the explicit_filter function into a separate module under search_filter. (2) Update the signature of the explicit filter to take and return query, entries, embeddings. (3) Use this explicit_filter function from the search_filter module in query. Reason: this abstraction will simplify adding other pre-search filters, e.g. a datetime filter.
This commit is contained in:
46
src/search_filter/explicit_filter.py
Normal file
46
src/search_filter/explicit_filter.py
Normal file
@@ -0,0 +1,46 @@
|
||||
# Standard Packages
|
||||
import re
|
||||
|
||||
# External Packages
|
||||
import torch
|
||||
|
||||
|
||||
def explicit_filter(raw_query, entries, embeddings):
    """Apply explicit "+word" / "-word" filters from the query before search.

    Whitespace-separated query tokens starting with "+" name words an entry
    must contain; tokens starting with "-" name words an entry must not
    contain. Matching is case-insensitive and done on whole words, where the
    words of an entry come from splitting ``entry[0]`` on commas, full stops,
    spaces and any round, square or curly bracket.

    Args:
        raw_query: Query string, possibly containing "+word" / "-word" tokens.
        entries: Mutable sequence of entries; ``entry[0]`` is the text that is
            matched. Excluded entries are deleted from this sequence in place.
        embeddings: Tensor indexed along its first dimension in step with
            ``entries``.

    Returns:
        A ``(query, entries, embeddings)`` tuple: the query with every token
        starting with "+" or "-" removed, the filtered ``entries`` (same
        object as passed in) and the embeddings of the surviving entries.
        When nothing is excluded, ``embeddings`` is returned unchanged.
    """
    query_words = raw_query.split()

    # Separate natural query from explicit required, blocked words filters
    query = " ".join(word for word in query_words if not word.startswith(("+", "-")))

    # A bare "+" or "-" carries no filter term. Skip it so that a lone "+"
    # (e.g. in "1 + 2") does not turn into an unsatisfiable empty required
    # word that would drop every entry.
    required_words = {
        word[1:].lower() for word in query_words if word.startswith("+") and len(word) > 1
    }
    blocked_words = {
        word[1:].lower() for word in query_words if word.startswith("-") and len(word) > 1
    }

    if not required_words and not blocked_words:
        return query, entries, embeddings

    # Split on comma, full stop, space or any single bracket character.
    # A character class keeps each bracket an independent separator.
    word_separator = re.compile(r"[,. \[\](){}]")

    # Collect, in ascending order, the positions of entries that lack a
    # required word or contain a blocked word.
    entries_to_exclude = []
    for entry_id, entry in enumerate(entries):
        words_in_entry = {word.lower() for word in word_separator.split(entry[0]) if word}
        missing_required = not required_words.issubset(words_in_entry)
        has_blocked = not blocked_words.isdisjoint(words_in_entry)
        if missing_required or has_blocked:
            entries_to_exclude.append(entry_id)

    if not entries_to_exclude:
        return query, entries, embeddings

    excluded = set(entries_to_exclude)
    entries_to_keep = [entry_id for entry_id in range(len(entries)) if entry_id not in excluded]

    # Delete from the end so the positions still to be deleted stay valid.
    for entry_id in reversed(entries_to_exclude):
        del entries[entry_id]

    # Select all surviving rows in one indexing operation instead of
    # re-concatenating the whole tensor once per excluded entry.
    keep_index = torch.tensor(entries_to_keep, dtype=torch.long, device=embeddings.device)
    embeddings = embeddings[keep_index]

    return query, entries, embeddings
|
||||
Reference in New Issue
Block a user