Improve date filter regexes to extract structured, natural, partial dates

- Much faster than using dateparser
  - It took 2x-4x as long for the improved regex to extract 1-15% more dates
  - Whereas it took 33x-100x as long for dateparser to extract 65%-400% more dates
  - Improve date extractor tests to test deduping dates, natural,
    structured date extraction from content

- Extract some natural, partial dates and more structured dates
  Using regex is much faster than using dateparser. It's a little
  crude but should pay off in performance.

  Supports dates of form:
  - (Day-of-Month) Month|AbbreviatedMonth Year|2DigitYear
  - Month|AbbreviatedMonth (Day-of-Month) Year|2DigitYear
This commit is contained in:
Debanjum Singh Solanky
2024-03-29 18:22:41 +05:30
parent 104eeea274
commit 7923903d21
2 changed files with 128 additions and 39 deletions

View File

@@ -147,16 +147,70 @@ def test_date_extraction():
assert extracted_dates == [], "Expected relative date to be ignored"
extracted_dates = DateFilter().extract_dates("head CREATED: today SCHEDULED: 1984-04-01 tail")
assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected only YMD structured date to be extracted"
assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected only Y-m-d structured date to be extracted"
extracted_dates = DateFilter().extract_dates("head CREATED: today SCHEDULED: 01-04-1984 tail")
assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected DMY structured date to be extracted"
extracted_dates = DateFilter().extract_dates("head SCHEDULED: 01-04-1984 tail")
assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected d-m-Y structured date to be extracted"
extracted_dates = DateFilter().extract_dates("head Updates from April 1984 tail")
assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected natural date to be extracted"
extracted_dates = DateFilter().extract_dates("head CREATED: today SCHEDULED: 1984/04/01 tail")
assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected only Y/m/d structured date to be extracted"
extracted_dates = DateFilter().extract_dates("CLOCK: [1984-04-01 mer 09:50]--[1984-04-01 mer 10:10] => 24:20")
expected_dates = [datetime(1984, 4, 1, 9, 50, 0), datetime(1984, 4, 1, 10, 10, 0)]
extracted_dates = DateFilter().extract_dates("head SCHEDULED: 01/04/1984 tail")
assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected d/m/Y structured date to be extracted"
extracted_dates = DateFilter().extract_dates("head DEADLINE: 01.04.1984 tail")
assert extracted_dates == [datetime(1984, 4, 1, 0, 0, 0)], "Expected d.m.Y structured date to be extracted"
extracted_dates = DateFilter().extract_dates("CLOCK: [1984-04-01 Sun 09:50]--[1984-04-01 Sun 10:10] => 24:20")
assert extracted_dates == [
datetime(1984, 4, 1, 0, 0, 0)
], "Expected single deduplicated date extracted from logbook entry"
extracted_dates = DateFilter().extract_dates("CLOCK: [1984/03/31 mer 09:50]--[1984/04/01 mer 10:10] => 24:20")
expected_dates = [datetime(1984, 4, 1, 0, 0, 0), datetime(1984, 3, 31, 0, 0, 0)]
assert all(
[dt in extracted_dates for dt in expected_dates]
), "Expected multiple non-english dates extracted from logbook entry"
), "Expected multiple different dates extracted from logbook entry"
def test_natual_date_extraction():
    """Check that natural-language and partial dates are extracted from text.

    Full natural dates (day, month name, year) only need to appear among the
    extracted dates. Partial dates (month name and year only) must be the sole
    extracted date, and the expected value uses the first day of the month.
    """
    # (content, date expected among the results, assertion failure message)
    membership_cases = [
        ("head 1 April 1984 tail", datetime(1984, 4, 1, 0, 0, 0), "Expected natural date to be extracted"),
        ("head 1st April 1984 tail", datetime(1984, 4, 1, 0, 0, 0), "Expected natural date to be extracted"),
        (
            "head 2nd Apr 1984 tail",
            datetime(1984, 4, 2, 0, 0, 0),
            "Expected natural date with short month to be extracted",
        ),
        ("head 4th Apr 1984 tail", datetime(1984, 4, 4, 0, 0, 0), "Expected natural date to be extracted"),
        (
            "head 11th april 1984 tail",
            datetime(1984, 4, 11, 0, 0, 0),
            "Expected natural date with lowercase month to be extracted",
        ),
        (
            "head 23rd april 84 tail",
            datetime(1984, 4, 23, 0, 0, 0),
            "Expected natural date with 2-digit year to be extracted",
        ),
        (
            "head 31st march 84 tail",
            datetime(1984, 3, 31, 0, 0, 0),
            "Expected natural date with 2-digit year to be extracted",
        ),
    ]
    for content, wanted_date, failure_message in membership_cases:
        found_dates = DateFilter().extract_dates(content)
        assert wanted_date in found_dates, failure_message

    # (content, the only date expected in the results, assertion failure message)
    exact_match_cases = [
        ("head April 1984 tail", datetime(1984, 4, 1, 0, 0, 0), "Expected partial natural date to be extracted"),
        (
            "head Apr 1984 tail",
            datetime(1984, 4, 1, 0, 0, 0),
            "Expected partial natural date with short month to be extracted",
        ),
        (
            "head apr 1984 tail",
            datetime(1984, 4, 1, 0, 0, 0),
            "Expected partial natural date with lowercase month to be extracted",
        ),
        (
            "head apr 84 tail",
            datetime(1984, 4, 1, 0, 0, 0),
            "Expected partial natural date with 2-digit year to be extracted",
        ),
    ]
    for content, wanted_date, failure_message in exact_match_cases:
        found_dates = DateFilter().extract_dates(content)
        assert found_dates == [wanted_date], failure_message