mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 13:18:18 +00:00
Improve date filter regexes to extract structured, natural, partial dates
- Much faster than using dateparser
- The improved regex took 2x-4x the time to extract 1-15% more dates
- Whereas dateparser took 33x-100x the time to extract 65%-400% more dates
- Improve date extractor tests to cover deduplication of dates and
extraction of natural and structured dates from content
- Extract some natural, partial dates and more structured dates
Using regex is much faster than using dateparser. It's a little
crude but should pay off in performance.
Supports dates of form:
- (Day-of-Month) Month|AbbreviatedMonth Year|2DigitYear
- Month|AbbreviatedMonth (Day-of-Month) Year|2DigitYear
This commit is contained in:
@@ -147,16 +147,70 @@ def test_date_extraction():
|
||||
assert extracted_dates == [], "Expected relative date to be ignored"
|
||||
|
||||
extracted_dates = DateFilter().extract_dates("head CREATED: today SCHEDULED: 1984-04-01 tail")
|
||||
assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected only YMD structured date to be extracted"
|
||||
assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected only Y-m-d structured date to be extracted"
|
||||
|
||||
extracted_dates = DateFilter().extract_dates("head CREATED: today SCHEDULED: 01-04-1984 tail")
|
||||
assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected DMY structured date to be extracted"
|
||||
extracted_dates = DateFilter().extract_dates("head SCHEDULED: 01-04-1984 tail")
|
||||
assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected d-m-Y structured date to be extracted"
|
||||
|
||||
extracted_dates = DateFilter().extract_dates("head Updates from April 1984 tail")
|
||||
assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected natural date to be extracted"
|
||||
extracted_dates = DateFilter().extract_dates("head CREATED: today SCHEDULED: 1984/04/01 tail")
|
||||
assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected only Y/m/d structured date to be extracted"
|
||||
|
||||
extracted_dates = DateFilter().extract_dates("CLOCK: [1984-04-01 mer 09:50]--[1984-04-01 mer 10:10] => 24:20")
|
||||
expected_dates = [datetime(1984, 4, 1, 9, 50, 0), datetime(1984, 4, 1, 10, 10, 0)]
|
||||
extracted_dates = DateFilter().extract_dates("head SCHEDULED: 01/04/1984 tail")
|
||||
assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected d/m/Y structured date to be extracted"
|
||||
|
||||
extracted_dates = DateFilter().extract_dates("head DEADLINE: 01.04.1984 tail")
|
||||
assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected d.m.Y structured date to be extracted"
|
||||
|
||||
extracted_dates = DateFilter().extract_dates("CLOCK: [1984-04-01 Sun 09:50]--[1984-04-01 Sun 10:10] => 24:20")
|
||||
assert extracted_dates == [
|
||||
datetime(1984, 4, 1, 0, 0, 0)
|
||||
], "Expected single deduplicated date extracted from logbook entry"
|
||||
|
||||
extracted_dates = DateFilter().extract_dates("CLOCK: [1984/03/31 mer 09:50]--[1984/04/01 mer 10:10] => 24:20")
|
||||
expected_dates = [datetime(1984, 4, 1, 0, 0, 0), datetime(1984, 3, 31, 0, 0, 0)]
|
||||
assert all(
|
||||
[dt in extracted_dates for dt in expected_dates]
|
||||
), "Expected multiple non-english dates extracted from logbook entry"
|
||||
), "Expected multiple different dates extracted from logbook entry"
|
||||
|
||||
|
||||
def test_natual_date_extraction():
|
||||
extracted_dates = DateFilter().extract_dates("head 1 April 1984 tail")
|
||||
assert datetime(1984, 4, 1, 0, 0, 0) in extracted_dates, "Expected natural date to be extracted"
|
||||
|
||||
extracted_dates = DateFilter().extract_dates("head 1st April 1984 tail")
|
||||
assert datetime(1984, 4, 1, 0, 0, 0) in extracted_dates, "Expected natural date to be extracted"
|
||||
|
||||
extracted_dates = DateFilter().extract_dates("head 2nd Apr 1984 tail")
|
||||
assert datetime(1984, 4, 2, 0, 0, 0) in extracted_dates, "Expected natural date with short month to be extracted"
|
||||
|
||||
extracted_dates = DateFilter().extract_dates("head 4th Apr 1984 tail")
|
||||
assert datetime(1984, 4, 4, 0, 0, 0) in extracted_dates, "Expected natural date to be extracted"
|
||||
|
||||
extracted_dates = DateFilter().extract_dates("head 11th april 1984 tail")
|
||||
assert (
|
||||
datetime(1984, 4, 11, 0, 0, 0) in extracted_dates
|
||||
), "Expected natural date with lowercase month to be extracted"
|
||||
|
||||
extracted_dates = DateFilter().extract_dates("head 23rd april 84 tail")
|
||||
assert datetime(1984, 4, 23, 0, 0, 0) in extracted_dates, "Expected natural date with 2-digit year to be extracted"
|
||||
|
||||
extracted_dates = DateFilter().extract_dates("head 31st march 84 tail")
|
||||
assert datetime(1984, 3, 31, 0, 0, 0) in extracted_dates, "Expected natural date with 2-digit year to be extracted"
|
||||
|
||||
extracted_dates = DateFilter().extract_dates("head April 1984 tail")
|
||||
assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected partial natural date to be extracted"
|
||||
|
||||
extracted_dates = DateFilter().extract_dates("head Apr 1984 tail")
|
||||
assert extracted_dates == [
|
||||
datetime(1984, 4, 1, 0, 0, 0)
|
||||
], "Expected partial natural date with short month to be extracted"
|
||||
|
||||
extracted_dates = DateFilter().extract_dates("head apr 1984 tail")
|
||||
assert extracted_dates == [
|
||||
datetime(1984, 4, 1, 0, 0, 0)
|
||||
], "Expected partial natural date with lowercase month to be extracted"
|
||||
|
||||
extracted_dates = DateFilter().extract_dates("head apr 84 tail")
|
||||
assert extracted_dates == [
|
||||
datetime(1984, 4, 1, 0, 0, 0)
|
||||
], "Expected partial natural date with 2-digit year to be extracted"
|
||||
|
||||
Reference in New Issue
Block a user