mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 13:18:18 +00:00
Extract More Dates from entries to improve Date Filter (#683)
- Overview - Extract more structured date variants (e.g. with dot (.) and slash (/) separators, or a 2-digit year) - Extract some natural, partial dates from entries as well - Capability - Add the ability to extract the following additional date forms: - Natural Dates: 21st April 2000, February 29 2024 - Partial Natural Dates: March 24, Mar 2024 - Structured Dates: 20/12/24, 20.12.2024, 2024/12/20 Note: Previously, only the YYYY-MM-DD (ISO 8601) structured date form was extracted for date filters - Performance - Using regexes is MUCH faster than using the `dateparser` Python library. It's a little crude but gives acceptable performance for large datasets
This commit is contained in:
@@ -116,7 +116,7 @@ def test_date_filter_regex():
|
||||
assert dtrange_match == []
|
||||
|
||||
|
||||
def test_get_file_filter_terms():
|
||||
def test_get_date_filter_terms():
|
||||
dtrange_match = DateFilter().get_filter_terms('multi word head dt>"today" dt:"1984-01-01"')
|
||||
assert dtrange_match == ["dt>'today'", "dt:'1984-01-01'"]
|
||||
|
||||
@@ -134,3 +134,83 @@ def test_get_file_filter_terms():
|
||||
|
||||
dtrange_match = DateFilter().get_filter_terms("head tail")
|
||||
assert dtrange_match == []
|
||||
|
||||
|
||||
def test_date_extraction():
    """Check which structured dates DateFilter.extract_dates pulls from entry text.

    Each case feeds one entry string to a fresh DateFilter and compares the
    returned list against the exact expected list of datetimes. A final case
    only requires that both dates of a two-day logbook entry are present.
    """
    april_first = datetime(1984, 4, 1, 0, 0, 0)

    # (entry text, exact expected result, assertion message)
    exact_cases = [
        ("", [], "Expected to handle empty string"),
        ("head tail", [], "Expected to handle no dates"),
        ("head CREATED: today tail", [], "Expected relative date to be ignored"),
        (
            "head CREATED: today SCHEDULED: 1984-04-01 tail",
            [april_first],
            "Expected only Y-m-d structured date to be extracted",
        ),
        (
            "head SCHEDULED: 01-04-1984 tail",
            [april_first],
            "Expected d-m-Y structured date to be extracted",
        ),
        (
            "head CREATED: today SCHEDULED: 1984/04/01 tail",
            [april_first],
            "Expected only Y/m/d structured date to be extracted",
        ),
        (
            "head SCHEDULED: 01/04/1984 tail",
            [april_first],
            "Expected d/m/Y structured date to be extracted",
        ),
        (
            "head DEADLINE: 01.04.1984 tail",
            [april_first],
            "Expected d.m.Y structured date to be extracted",
        ),
        (
            "CLOCK: [1984-04-01 Sun 09:50]--[1984-04-01 Sun 10:10] => 24:20",
            [april_first],
            "Expected single deduplicated date extracted from logbook entry",
        ),
    ]
    for entry, expected, message in exact_cases:
        found = DateFilter().extract_dates(entry)
        assert found == expected, message

    # Order of the extracted dates is not asserted here, only their presence.
    found = DateFilter().extract_dates("CLOCK: [1984/03/31 mer 09:50]--[1984/04/01 mer 10:10] => 24:20")
    wanted = [april_first, datetime(1984, 3, 31, 0, 0, 0)]
    assert all(dt in found for dt in wanted), "Expected multiple different dates extracted from logbook entry"
|
||||
|
||||
|
||||
def test_natual_date_extraction():
    """Check that DateFilter.extract_dates handles natural-language dates.

    Full natural dates (day, month name, year) only need to appear somewhere
    in the extracted list. Partial natural dates (month name and year) must
    yield exactly one datetime, on the first day of that month.
    """
    # (entry text, datetime that must be among the results, assertion message)
    membership_cases = [
        ("head 1 April 1984 tail", datetime(1984, 4, 1, 0, 0, 0), "Expected natural date to be extracted"),
        ("head 1st April 1984 tail", datetime(1984, 4, 1, 0, 0, 0), "Expected natural date to be extracted"),
        (
            "head 2nd Apr 1984 tail",
            datetime(1984, 4, 2, 0, 0, 0),
            "Expected natural date with short month to be extracted",
        ),
        ("head 4th Apr 1984 tail", datetime(1984, 4, 4, 0, 0, 0), "Expected natural date to be extracted"),
        (
            "head 11th april 1984 tail",
            datetime(1984, 4, 11, 0, 0, 0),
            "Expected natural date with lowercase month to be extracted",
        ),
        (
            "head 23rd april 84 tail",
            datetime(1984, 4, 23, 0, 0, 0),
            "Expected natural date with 2-digit year to be extracted",
        ),
        (
            "head 31st march 84 tail",
            datetime(1984, 3, 31, 0, 0, 0),
            "Expected natural date with 2-digit year to be extracted",
        ),
    ]
    for entry, wanted, message in membership_cases:
        found = DateFilter().extract_dates(entry)
        assert wanted in found, message

    # (entry text, assertion message); every case expects exactly [1 April 1984]
    first_of_april = [datetime(1984, 4, 1, 0, 0, 0)]
    partial_cases = [
        ("head April 1984 tail", "Expected partial natural date to be extracted"),
        ("head Apr 1984 tail", "Expected partial natural date with short month to be extracted"),
        ("head apr 1984 tail", "Expected partial natural date with lowercase month to be extracted"),
        ("head apr 84 tail", "Expected partial natural date with 2-digit year to be extracted"),
    ]
    for entry, message in partial_cases:
        found = DateFilter().extract_dates(entry)
        assert found == first_of_april, message
|
||||
|
||||
Reference in New Issue
Block a user