Support exclusion file filters (#826)

### Overview
Support exclude file filters in user search queries

### Details
- All of the exclude file filter terms need to be satisfied
- Any one of the include file filter terms should be satisfied

### Example
- **Search Query**: *what happened yesterday? -file:"tasks.org" -file:"work.md" file:"diary.org" file:"journal.org"*
- **Behavior**: Query will try to find relevant notes in any of `journal.org` or `diary.org` and not in `tasks.org` and not in `work.md`

### Details
* Add support for exclusion file filters
* Translate file filter to valid Django DB entry filter regex
* Exclude all specified files when a query has multiple exclude file filters

Previously we were applying an "Or" filter, which would exclude any
file mentioned in a query with multiple exclude file filters.

This is not what we naturally mean when we ask to exclude a file in a query.

* Rename, rearrange, deduplicate and add file filter tests

Closes #728
---------

Co-authored-by: Debanjum Singh Solanky <debanjum@gmail.com>
This commit is contained in:
srikary12
2024-08-12 18:11:54 +05:30
committed by GitHub
parent 7815e02dd4
commit 05c0aa3882
3 changed files with 77 additions and 29 deletions

View File

@@ -3,7 +3,7 @@ from khoj.search_filter.file_filter import FileFilter
from khoj.utils.rawconfig import Entry
def test_no_file_filter():
def test_can_filter_no_file_filter():
# Arrange
file_filter = FileFilter()
q_with_no_filter = "head tail"
@@ -15,76 +15,114 @@ def test_no_file_filter():
assert can_filter == False
def test_file_filter_with_non_existent_file():
def test_can_filter_non_existent_file():
# Arrange
file_filter = FileFilter()
q_with_no_filter = 'head file:"nonexistent.org" tail'
q_with_filter = 'head file:"nonexistent.org" tail'
# Act
can_filter = file_filter.can_filter(q_with_no_filter)
can_filter = file_filter.can_filter(q_with_filter)
# Assert
assert can_filter == True
def test_single_file_filter():
def test_can_filter_single_file_include():
# Arrange
file_filter = FileFilter()
q_with_no_filter = 'head file:"file 1.org" tail'
q_with_filter = 'head file:"file 1.org" tail'
# Act
can_filter = file_filter.can_filter(q_with_no_filter)
can_filter = file_filter.can_filter(q_with_filter)
# Assert
assert can_filter == True
def test_file_filter_with_partial_match():
def test_can_filter_single_file_exclude():
# Arrange
file_filter = FileFilter()
q_with_no_filter = 'head file:"1.org" tail'
q_with_filter = 'head -file:"1.org" tail'
# Act
can_filter = file_filter.can_filter(q_with_no_filter)
can_filter = file_filter.can_filter(q_with_filter)
# Assert
assert can_filter == True
def test_file_filter_with_regex_match():
def test_can_filter_file_with_regex_match():
# Arrange
file_filter = FileFilter()
q_with_no_filter = 'head file:"*.org" tail'
q_with_filter = 'head file:"*.org" tail'
# Act
can_filter = file_filter.can_filter(q_with_no_filter)
can_filter = file_filter.can_filter(q_with_filter)
# Assert
assert can_filter == True
def test_multiple_file_filter():
def test_can_filter_multiple_file_includes():
# Arrange
file_filter = FileFilter()
q_with_no_filter = 'head tail file:"file 1.org" file:"file2.org"'
q_with_filter = 'head tail file:"file 1.org" file:"file2.org"'
# Act
can_filter = file_filter.can_filter(q_with_no_filter)
can_filter = file_filter.can_filter(q_with_filter)
# Assert
assert can_filter == True
def test_get_file_filter_terms():
def test_get_single_include_file_filter_terms():
# Arrange
file_filter = FileFilter()
q_with_filter_terms = 'head tail file:"file 1.org" file:"/path/to/dir/*.org"'
q_with_filter_terms = 'head tail file:"/path/to/dir/*.org"'
# Act
filter_terms = file_filter.get_filter_terms(q_with_filter_terms)
# Assert
assert filter_terms == ["file 1\\.org", "/path/to/dir/.*\\.org"]
assert filter_terms == ["/path/to/dir/*.org"]
def test_get_single_exclude_file_filter_terms():
# Arrange
file_filter = FileFilter()
q_with_filter_terms = 'head tail -file:"file 1.org"'
# Act
filter_terms = file_filter.get_filter_terms(q_with_filter_terms)
# Assert
assert filter_terms == ["-file 1.org"]
def test_get_single_include_exclude_file_filter_terms():
# Arrange
file_filter = FileFilter()
q_with_filter_terms = 'head tail -file:"file 1.org" file:"/path/to/dir/*.org"'
# Act
filter_terms = file_filter.get_filter_terms(q_with_filter_terms)
# Assert
assert filter_terms == ["/path/to/dir/*.org", "-file 1.org"]
def test_get_multiple_include_exclude_file_filter_terms():
# Arrange
file_filter = FileFilter()
q_with_filter_terms = (
'head -file:"file 1.org" file:"file 1.org" file:"/path/to/dir/.*.org" -file:"/path/to/dir/*.org" tail'
)
# Act
filter_terms = file_filter.get_filter_terms(q_with_filter_terms)
# Assert
assert filter_terms == ["file 1.org", "/path/to/dir/.*.org", "-file 1.org", "-/path/to/dir/*.org"]
def arrange_content():