Support exclusion file filters (#826)

### Overview
Support exclude file filters in user search queries

### Details
- All of the exclude file filter terms need to be satisfied
- Any one of the include file filter terms should be satisfied

### Example
- **Search Query**: *what happened yesterday? -file:"tasks.org" -file:"work.md" file:"diary.org" file:"journal.org"*
- **Behavior**: Query will try to find relevant notes in `journal.org` or `diary.org`, but not in `tasks.org` or `work.md`

### Details
* Add support for exclusion file filters
* Translate file filter to valid Django DB entry filter regex
* Exclude all matching files when a query contains multiple exclude file filters

Previously we were applying an "Or" filter, which would exclude any
file mentioned in a query with multiple exclude file filters.

This is not what we naturally mean when we ask to exclude a file in a query.

* Rename, rearrange, deduplicate and add file filter tests

Closes #728
---------

Co-authored-by: Debanjum Singh Solanky <debanjum@gmail.com>
This commit is contained in:
srikary12
2024-08-12 18:11:54 +05:30
committed by GitHub
parent 7815e02dd4
commit 05c0aa3882
3 changed files with 77 additions and 29 deletions

View File

@@ -1048,7 +1048,7 @@ class FileObjectAdapters:
class EntryAdapters:
word_filer = WordFilter()
word_filter = WordFilter()
file_filter = FileFilter()
date_filter = DateFilter()
@@ -1150,14 +1150,14 @@ class EntryAdapters:
def apply_filters(user: KhojUser, query: str, file_type_filter: str = None):
q_filter_terms = Q()
explicit_word_terms = EntryAdapters.word_filer.get_filter_terms(query)
word_filters = EntryAdapters.word_filter.get_filter_terms(query)
file_filters = EntryAdapters.file_filter.get_filter_terms(query)
date_filters = EntryAdapters.date_filter.get_query_date_range(query)
if len(explicit_word_terms) == 0 and len(file_filters) == 0 and len(date_filters) == 0:
if len(word_filters) == 0 and len(file_filters) == 0 and len(date_filters) == 0:
return Entry.objects.filter(user=user)
for term in explicit_word_terms:
for term in word_filters:
if term.startswith("+"):
q_filter_terms &= Q(raw__icontains=term[1:])
elif term.startswith("-"):
@@ -1167,7 +1167,16 @@ class EntryAdapters:
if len(file_filters) > 0:
for term in file_filters:
q_file_filter_terms |= Q(file_path__regex=term)
if term.startswith("-"):
# Convert the glob term to a regex pattern
regex_term = re.escape(term[1:]).replace(r"\*", ".*").replace(r"\?", ".")
# Exclude all files that match the regex term
q_file_filter_terms &= ~Q(file_path__regex=regex_term)
else:
# Convert the glob term to a regex pattern
regex_term = re.escape(term).replace(r"\*", ".*").replace(r"\?", ".")
# Include any files that match the regex term
q_file_filter_terms |= Q(file_path__regex=regex_term)
q_filter_terms &= q_file_filter_terms
@@ -1182,9 +1191,7 @@ class EntryAdapters:
formatted_max_date = date.fromtimestamp(max_date).strftime("%Y-%m-%d")
q_filter_terms &= Q(embeddings_dates__date__lte=formatted_max_date)
relevant_entries = Entry.objects.filter(user=user).filter(
q_filter_terms,
)
relevant_entries = Entry.objects.filter(user=user).filter(q_filter_terms)
if file_type_filter:
relevant_entries = relevant_entries.filter(file_type=file_type_filter)
return relevant_entries

View File

@@ -11,7 +11,8 @@ logger = logging.getLogger(__name__)
class FileFilter(BaseFilter):
file_filter_regex = r'file:"(.+?)" ?'
file_filter_regex = r'(?<!-)file:"(.+?)" ?'
excluded_file_filter_regex = r'-file:"(.+?)" ?'
def __init__(self, entry_key="file"):
self.entry_key = entry_key
@@ -20,7 +21,9 @@ class FileFilter(BaseFilter):
def get_filter_terms(self, query: str) -> List[str]:
"Get all filter terms in query"
return [f"{self.convert_to_regex(term)}" for term in re.findall(self.file_filter_regex, query)]
required_files = [f"{required_file}" for required_file in re.findall(self.file_filter_regex, query)]
excluded_files = [f"-{excluded_file}" for excluded_file in re.findall(self.excluded_file_filter_regex, query)]
return required_files + excluded_files
def convert_to_regex(self, file_filter: str) -> str:
"Convert file filter to regex"