mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-06 13:22:12 +00:00
Merge with origin/master
This commit is contained in:
@@ -0,0 +1,32 @@
|
|||||||
|
# Generated by Django 4.2.10 on 2024-04-02 10:54
|
||||||
|
|
||||||
|
from django.db import migrations, models
|
||||||
|
|
||||||
|
|
||||||
|
def update_chat_model(apps, schema_editor):
    """Forward data migration: move rows off the old Mistral chat model.

    Each ChatModelOptions row whose chat_model equals
    "mistral-7b-instruct-v0.1.Q4_0.gguf" is rewritten to
    "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF" and saved individually.

    Args:
        apps: App registry passed in by the migration runner; used to look up
            the ChatModelOptions model of the "database" app.
        schema_editor: Not used by this function.
    """
    model_cls = apps.get_model("database", "ChatModelOptions")
    outdated_rows = model_cls.objects.filter(chat_model="mistral-7b-instruct-v0.1.Q4_0.gguf")
    for row in outdated_rows:
        row.chat_model = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF"
        row.save()
|
||||||
|
|
||||||
|
|
||||||
|
def reverse_update_chat_model(apps, schema_editor):
    """Reverse data migration: restore the old Mistral chat model name.

    Each ChatModelOptions row whose chat_model equals
    "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF" is set back to
    "mistral-7b-instruct-v0.1.Q4_0.gguf" and saved individually.

    Args:
        apps: App registry passed in by the migration runner; used to look up
            the ChatModelOptions model of the "database" app.
        schema_editor: Not used by this function.
    """
    model_cls = apps.get_model("database", "ChatModelOptions")
    migrated_rows = model_cls.objects.filter(chat_model="NousResearch/Hermes-2-Pro-Mistral-7B-GGUF")
    for row in migrated_rows:
        row.chat_model = "mistral-7b-instruct-v0.1.Q4_0.gguf"
        row.save()
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
    """Change the default of ChatModelOptions.chat_model and migrate existing rows.

    The schema operation sets the new field default; the data operation then
    rewrites rows still holding the previous model name (and can be reversed).
    """

    # Applies on top of migration 0033 of the "database" app.
    dependencies = [("database", "0033_rename_tuning_agent_personality")]

    operations = [
        # Schema: new default value for the chat_model column.
        migrations.AlterField(
            model_name="chatmodeloptions",
            name="chat_model",
            field=models.CharField(max_length=200, default="NousResearch/Hermes-2-Pro-Mistral-7B-GGUF"),
        ),
        # Data: rename the stored model on existing rows; reverse_code undoes it.
        migrations.RunPython(code=update_chat_model, reverse_code=reverse_update_chat_model),
    ]
|
||||||
@@ -1,15 +1,18 @@
|
|||||||
|
import calendar
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
from math import inf
|
from math import inf
|
||||||
from typing import List
|
from typing import List, Tuple
|
||||||
|
|
||||||
import dateparser as dtparse
|
import dateparser as dtparse
|
||||||
|
from dateparser.search import search_dates
|
||||||
|
from dateparser_data.settings import default_parsers
|
||||||
from dateutil.relativedelta import relativedelta
|
from dateutil.relativedelta import relativedelta
|
||||||
|
|
||||||
from khoj.search_filter.base_filter import BaseFilter
|
from khoj.search_filter.base_filter import BaseFilter
|
||||||
from khoj.utils.helpers import LRU, timer
|
from khoj.utils.helpers import LRU, merge_dicts, timer
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -21,25 +24,82 @@ class DateFilter(BaseFilter):
|
|||||||
# - dt>="last week"
|
# - dt>="last week"
|
||||||
# - dt:"2 years ago"
|
# - dt:"2 years ago"
|
||||||
date_regex = r"dt([:><=]{1,2})[\"'](.*?)[\"']"
|
date_regex = r"dt([:><=]{1,2})[\"'](.*?)[\"']"
|
||||||
raw_date_regex = r"\d{4}-\d{2}-\d{2}"
|
|
||||||
|
|
||||||
def __init__(self, entry_key="compiled"):
    """Set up the date filter state and the date parsing helpers.

    Args:
        entry_key: Stored on the instance as ``entry_key`` (defaults to "compiled").
    """
    self.entry_key = entry_key
    self.date_to_entry_ids = defaultdict(set)
    self.cache = LRU()
    # (strptime format, compiled regex) pairs used to find dates in content
    self.dtparser_regexes = self.compile_date_regexes()
    # Strip st/nd/rd/th only when it directly follows a digit. An unanchored
    # alternation also removes the "st" inside "August", which turns e.g.
    # "1st August 1984" into "1 Augu 1984" and makes strptime reject the date.
    # Case-insensitive to agree with the case-insensitive date regexes.
    self.dtparser_ordinal_suffixes = re.compile(r"(?<=\d)(st|nd|rd|th)\b", re.IGNORECASE)
    self.dtparser_settings = {
        "PREFER_DAY_OF_MONTH": "first",
        "DATE_ORDER": "YMD",  # Prefer YMD and DMY over MDY when parsing ambiguous dates
    }
|
||||||
|
|
||||||
|
def compile_date_regexes(self):
    """Build the (strptime format, compiled regex) pairs used to find dates in text.

    Each regex locates candidate date strings; the paired format is the
    ``datetime.strptime`` format intended to parse them. All regexes are
    case-insensitive and use only non-capturing groups, so ``findall``
    returns whole matches. Month names are taken from the ``calendar`` module.

    Returns:
        List of (format, pattern) tuples: structured dates first, then natural
        dates with a day, then partial month-year dates.
    """
    months = "|".join(calendar.month_name[1:])
    abbr_months = "|".join(calendar.month_abbr[1:])
    day = r"\d{1,2}(?:st|nd|rd|th)?"
    year4 = r"\d{4}"
    year2 = r"\d{2}"
    # A bare "Month YY" must not be followed by a year: otherwise the day of
    # "April 19 1984" or "April 19, 1984" is read as the year 2019.
    not_followed_by_year = r"(?!,? (?:\d{2}|\d{4})\b)"

    def compiled(pattern):
        # Every date pattern is matched case-insensitively
        return re.compile(pattern, re.IGNORECASE)

    def day_month_year(month_names, year):
        # e.g. 1st April 1984, 13 Apr 84
        return compiled(r"\b" + day + r" (?:" + month_names + r") " + year + r"\b")

    def month_day_year(month_names, year):
        # e.g. April 1st 1984, Apr 4th 84
        return compiled(r"\b(?:" + month_names + r") " + day + r" " + year + r"\b")

    def month_year(month_names, year, suffix=""):
        # e.g. January 2021, Jan 21
        return compiled(r"\b(?:" + month_names + r") " + year + r"\b" + suffix)

    # Structured dates like 1984-04-01, 1984/04/01, 01-04-1984, 01/04/1984, 01.04.1984, 01-04-84, 01/04/84
    Ymd_date_regex = compiled(r"\b\d{4}[-\/]\d{2}[-\/]\d{2}\b")
    dmY_date_regex = compiled(r"\b\d{2}[-\/]\d{2}[-\/]\d{4}\b")
    dmy_date_regex = compiled(r"\b\d{2}[-\/]\d{2}[-\/]\d{2}\b")
    dmY_dot_date_regex = compiled(r"\b\d{2}[\.]\d{2}[\.]\d{4}\b")

    # Combine date formatter and date identifier regex pairs.
    # One regex may back several formats; strptime rejects the ones that do not fit.
    dtparser_regexes: List[Tuple[str, re.Pattern[str]]] = [
        # Structured dates
        ("%Y-%m-%d", Ymd_date_regex),
        ("%Y/%m/%d", Ymd_date_regex),
        ("%d-%m-%Y", dmY_date_regex),
        ("%d/%m/%Y", dmY_date_regex),
        ("%d.%m.%Y", dmY_dot_date_regex),
        ("%d-%m-%y", dmy_date_regex),
        ("%d/%m/%y", dmy_date_regex),
        # Natural dates
        ("%d %B %Y", day_month_year(months, year4)),
        ("%d %B %y", day_month_year(months, year2)),
        ("%B %d %Y", month_day_year(months, year4)),
        ("%B %d %y", month_day_year(months, year2)),
        ("%d %b %Y", day_month_year(abbr_months, year4)),
        ("%d %b %y", day_month_year(abbr_months, year2)),
        ("%b %d %Y", month_day_year(abbr_months, year4)),
        ("%b %d %y", month_day_year(abbr_months, year2)),
        # Partial natural dates
        ("%B %Y", month_year(months, year4)),
        ("%B %y", month_year(months, year2, not_followed_by_year)),
        ("%b %Y", month_year(abbr_months, year4)),
        ("%b %y", month_year(abbr_months, year2, not_followed_by_year)),
    ]
    return dtparser_regexes
|
||||||
|
|
||||||
def extract_dates(self, content):
    """Extract natural and structured dates from content.

    Runs every (format, regex) pair in ``self.dtparser_regexes`` over the
    content, strips ordinal suffixes from each match and parses it with
    ``datetime.strptime``. Matches that do not parse are skipped.

    Args:
        content: Text to scan for dates.

    Returns:
        Deduplicated datetimes found in content, in ascending order.
    """
    valid_dates = set()
    for date_format, date_regex in self.dtparser_regexes:
        for date_str in date_regex.findall(content):
            # Remove ordinal suffixes to parse date
            date_str = self.dtparser_ordinal_suffixes.sub("", date_str)
            try:
                valid_dates.add(datetime.strptime(date_str, date_format))
            except ValueError:
                # Regex matched, but the text is not a real date in this format
                continue

    # Sort so the result does not depend on set iteration order
    return sorted(valid_dates)
|
|
||||||
|
|
||||||
def get_filter_terms(self, query: str) -> List[str]:
|
def get_filter_terms(self, query: str) -> List[str]:
|
||||||
"Get all filter terms in query"
|
"Get all filter terms in query"
|
||||||
@@ -120,18 +180,13 @@ class DateFilter(BaseFilter):
|
|||||||
# clean date string to handle future date parsing by date parser
|
# clean date string to handle future date parsing by date parser
|
||||||
future_strings = ["later", "from now", "from today"]
|
future_strings = ["later", "from now", "from today"]
|
||||||
prefer_dates_from = {True: "future", False: "past"}[any([True for fstr in future_strings if fstr in date_str])]
|
prefer_dates_from = {True: "future", False: "past"}[any([True for fstr in future_strings if fstr in date_str])]
|
||||||
clean_date_str = re.sub("|".join(future_strings), "", date_str)
|
dtquery_settings = {"RELATIVE_BASE": relative_base or datetime.now(), "PREFER_DATES_FROM": prefer_dates_from}
|
||||||
|
dtparser_settings = merge_dicts(dtquery_settings, self.dtparser_settings)
|
||||||
|
|
||||||
# parse date passed in query date filter
|
# parse date passed in query date filter
|
||||||
|
clean_date_str = re.sub("|".join(future_strings), "", date_str)
|
||||||
try:
|
try:
|
||||||
parsed_date = dtparse.parse(
|
parsed_date = dtparse.parse(clean_date_str, settings=dtparser_settings)
|
||||||
clean_date_str,
|
|
||||||
settings={
|
|
||||||
"RELATIVE_BASE": relative_base or datetime.now(),
|
|
||||||
"PREFER_DAY_OF_MONTH": "first",
|
|
||||||
"PREFER_DATES_FROM": prefer_dates_from,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to parse date string: {date_str} with error: {e}")
|
logger.error(f"Failed to parse date string: {date_str} with error: {e}")
|
||||||
return None
|
return None
|
||||||
|
|||||||
@@ -116,7 +116,7 @@ def test_date_filter_regex():
|
|||||||
assert dtrange_match == []
|
assert dtrange_match == []
|
||||||
|
|
||||||
|
|
||||||
def test_get_file_filter_terms():
|
def test_get_date_filter_terms():
|
||||||
dtrange_match = DateFilter().get_filter_terms('multi word head dt>"today" dt:"1984-01-01"')
|
dtrange_match = DateFilter().get_filter_terms('multi word head dt>"today" dt:"1984-01-01"')
|
||||||
assert dtrange_match == ["dt>'today'", "dt:'1984-01-01'"]
|
assert dtrange_match == ["dt>'today'", "dt:'1984-01-01'"]
|
||||||
|
|
||||||
@@ -134,3 +134,83 @@ def test_get_file_filter_terms():
|
|||||||
|
|
||||||
dtrange_match = DateFilter().get_filter_terms("head tail")
|
dtrange_match = DateFilter().get_filter_terms("head tail")
|
||||||
assert dtrange_match == []
|
assert dtrange_match == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_date_extraction():
    """Structured dates are extracted from content; relative dates and plain text are not."""
    april_first = datetime(1984, 4, 1, 0, 0, 0)

    # (content, exact expected result, assertion message)
    exact_cases = [
        ("", [], "Expected to handle empty string"),
        ("head tail", [], "Expected to handle no dates"),
        ("head CREATED: today tail", [], "Expected relative date to be ignored"),
        (
            "head CREATED: today SCHEDULED: 1984-04-01 tail",
            [april_first],
            "Expected only Y-m-d structured date to be extracted",
        ),
        ("head SCHEDULED: 01-04-1984 tail", [april_first], "Expected d-m-Y structured date to be extracted"),
        (
            "head CREATED: today SCHEDULED: 1984/04/01 tail",
            [april_first],
            "Expected only Y/m/d structured date to be extracted",
        ),
        ("head SCHEDULED: 01/04/1984 tail", [april_first], "Expected d/m/Y structured date to be extracted"),
        ("head DEADLINE: 01.04.1984 tail", [april_first], "Expected d.m.Y structured date to be extracted"),
        (
            "CLOCK: [1984-04-01 Sun 09:50]--[1984-04-01 Sun 10:10] => 24:20",
            [april_first],
            "Expected single deduplicated date extracted from logbook entry",
        ),
    ]
    for content, expected, failure_message in exact_cases:
        assert DateFilter().extract_dates(content) == expected, failure_message

    # Result order is not asserted here, only that both dates are present
    logbook_dates = DateFilter().extract_dates("CLOCK: [1984/03/31 mer 09:50]--[1984/04/01 mer 10:10] => 24:20")
    for wanted in (datetime(1984, 4, 1, 0, 0, 0), datetime(1984, 3, 31, 0, 0, 0)):
        assert wanted in logbook_dates, "Expected multiple different dates extracted from logbook entry"
|
||||||
|
|
||||||
|
|
||||||
|
def test_natual_date_extraction():
    """Natural-language dates (day + month name + year, or month + year) are extracted."""
    # Full natural dates: the expected date must be among the extracted dates
    membership_cases = [
        ("head 1 April 1984 tail", datetime(1984, 4, 1, 0, 0, 0), "Expected natural date to be extracted"),
        ("head 1st April 1984 tail", datetime(1984, 4, 1, 0, 0, 0), "Expected natural date to be extracted"),
        (
            "head 2nd Apr 1984 tail",
            datetime(1984, 4, 2, 0, 0, 0),
            "Expected natural date with short month to be extracted",
        ),
        ("head 4th Apr 1984 tail", datetime(1984, 4, 4, 0, 0, 0), "Expected natural date to be extracted"),
        (
            "head 11th april 1984 tail",
            datetime(1984, 4, 11, 0, 0, 0),
            "Expected natural date with lowercase month to be extracted",
        ),
        (
            "head 23rd april 84 tail",
            datetime(1984, 4, 23, 0, 0, 0),
            "Expected natural date with 2-digit year to be extracted",
        ),
        (
            "head 31st march 84 tail",
            datetime(1984, 3, 31, 0, 0, 0),
            "Expected natural date with 2-digit year to be extracted",
        ),
    ]
    for content, wanted, failure_message in membership_cases:
        assert wanted in DateFilter().extract_dates(content), failure_message

    # Partial natural dates: exactly the first day of the month is extracted
    first_of_april = [datetime(1984, 4, 1, 0, 0, 0)]
    partial_cases = [
        ("head April 1984 tail", "Expected partial natural date to be extracted"),
        ("head Apr 1984 tail", "Expected partial natural date with short month to be extracted"),
        ("head apr 1984 tail", "Expected partial natural date with lowercase month to be extracted"),
        ("head apr 84 tail", "Expected partial natural date with 2-digit year to be extracted"),
    ]
    for content, failure_message in partial_cases:
        assert DateFilter().extract_dates(content) == first_of_april, failure_message
|
||||||
|
|||||||
Reference in New Issue
Block a user