From f094c862042276d939e1f37318c87b299a6447a2 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 26 Jul 2022 21:03:53 +0400 Subject: [PATCH 01/15] Trace query response performance and display timings in verbose mode --- src/main.py | 49 +++++++++++++++++++------- src/processor/org_mode/org_to_jsonl.py | 2 +- src/search_type/text_search.py | 27 +++++++++++++- 3 files changed, 64 insertions(+), 14 deletions(-) diff --git a/src/main.py b/src/main.py index 3c3ce5b5..5adbdd3e 100644 --- a/src/main.py +++ b/src/main.py @@ -1,5 +1,6 @@ # Standard Packages import sys, json, yaml, os +import time from typing import Optional # External Packages @@ -66,50 +67,74 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None): device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") user_query = q results_count = n + results = {} if (t == SearchType.Org or t == None) and model.orgmode_search: # query org-mode notes - hits, entries = text_search.query(user_query, model.orgmode_search, device=device, filters=[explicit_filter, date_filter]) + query_start = time.time() + hits, entries = text_search.query(user_query, model.orgmode_search, device=device, filters=[explicit_filter, date_filter], verbose=verbose) + query_end = time.time() # collate and return results - return text_search.collate_results(hits, entries, results_count) + collate_start = time.time() + results = text_search.collate_results(hits, entries, results_count) + collate_end = time.time() if (t == SearchType.Music or t == None) and model.music_search: # query music library - hits, entries = text_search.query(user_query, model.music_search, device=device, filters=[explicit_filter, date_filter]) + query_start = time.time() + hits, entries = text_search.query(user_query, model.music_search, device=device, filters=[explicit_filter, date_filter], verbose=verbose) + query_end = time.time() # collate and return results - return text_search.collate_results(hits, entries, results_count) + collate_start = time.time() + results = text_search.collate_results(hits, entries, results_count) + collate_end = time.time() if (t == SearchType.Markdown or t == None) and model.orgmode_search: # query markdown files - hits, entries = text_search.query(user_query, model.markdown_search, device=device, filters=[explicit_filter, date_filter]) + query_start = time.time() + hits, entries = text_search.query(user_query, model.markdown_search, device=device, filters=[explicit_filter, date_filter], verbose=verbose) + query_end = time.time() # collate and return results - return text_search.collate_results(hits, entries, results_count) + collate_start = time.time() + results = text_search.collate_results(hits, entries, results_count) + collate_end = time.time() if (t == SearchType.Ledger or t == None) and model.ledger_search: # query transactions - hits, entries = text_search.query(user_query, model.ledger_search, filters=[explicit_filter, date_filter]) + query_start = time.time() + hits, entries = text_search.query(user_query, model.ledger_search, filters=[explicit_filter, date_filter], verbose=verbose) + query_end = time.time() # collate and return results - return text_search.collate_results(hits, entries, results_count) + collate_start = time.time() + results = text_search.collate_results(hits, entries, results_count) + collate_end = time.time() if (t == SearchType.Image or t == None) and model.image_search: # query images - hits = image_search.query(user_query, results_count, model.image_search) + query_start = time.time() + hits = image_search.query(user_query, results_count, model.image_search, verbose=verbose) output_directory = f'{os.getcwd()}/{web_directory}' + query_end = time.time() # collate and return results - return image_search.collate_results( + collate_start = time.time() + results = image_search.collate_results( hits, image_names=model.image_search.image_names, output_directory=output_directory, static_files_url='/static', count=results_count) + collate_end = time.time() - else: - return {} + if verbose > 1: + print(f"Query took {query_end - query_start:.3f} seconds") + print(f"Collating results took {collate_end - collate_start:.3f} seconds") + + return results @app.get('/reload') diff --git a/src/processor/org_mode/org_to_jsonl.py b/src/processor/org_mode/org_to_jsonl.py index bf147faa..caad7715 100644 --- a/src/processor/org_mode/org_to_jsonl.py +++ b/src/processor/org_mode/org_to_jsonl.py @@ -82,7 +82,7 @@ def convert_org_entries_to_jsonl(entries, verbose=0): continue entry_dict["compiled"] = f'{entry.Heading()}.' - if verbose > 1: + if verbose > 2: print(f"Title: {entry.Heading()}") if entry.Tags(): diff --git a/src/search_type/text_search.py b/src/search_type/text_search.py index 39ae19b8..93c8c344 100644 --- a/src/search_type/text_search.py +++ b/src/search_type/text_search.py @@ -2,6 +2,7 @@ import argparse import pathlib from copy import deepcopy +import time # External Packages import torch @@ -62,38 +63,62 @@ def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, d return corpus_embeddings -def query(raw_query: str, model: TextSearchModel, device='cpu', filters: list = []): +def query(raw_query: str, model: TextSearchModel, device='cpu', filters: list = [], verbose=0): "Search for entries that answer the query" # Copy original embeddings, entries to filter them for query + start = time.time() query = raw_query corpus_embeddings = deepcopy(model.corpus_embeddings) entries = deepcopy(model.entries) + end = time.time() + if verbose > 1: + print(f"Copy Time: {end - start:.3f} seconds") # Filter query, entries and embeddings before semantic search + start = time.time() for filter in filters: query, entries, corpus_embeddings = filter(query, entries, corpus_embeddings) if entries is None or len(entries) == 0: return [], [] + end = time.time() + if verbose > 1: + print(f"Filter Time: {end - start:.3f} seconds") # Encode the query using the bi-encoder + start = time.time() question_embedding = model.bi_encoder.encode([query], convert_to_tensor=True) question_embedding.to(device) question_embedding = util.normalize_embeddings(question_embedding) + end = time.time() + if verbose > 1: + print(f"Query Encode Time: {end - start:.3f} seconds") # Find relevant entries for the query + start = time.time() hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=model.top_k, score_function=util.dot_score)[0] + end = time.time() + if verbose > 1: + print(f"Search Time: {end - start:.3f} seconds") # Score all retrieved entries using the cross-encoder + start = time.time() cross_inp = [[query, entries[hit['corpus_id']]['compiled']] for hit in hits] cross_scores = model.cross_encoder.predict(cross_inp) + end = time.time() + if verbose > 1: + print(f"Cross-Encoder Predict Time: {end - start:.3f} seconds") # Store cross-encoder scores in results dictionary for ranking for idx in range(len(cross_scores)): hits[idx]['cross-score'] = cross_scores[idx] # Order results by cross-encoder score followed by bi-encoder score + start = time.time() hits.sort(key=lambda x: x['score'], reverse=True) # sort by bi-encoder score hits.sort(key=lambda x: x['cross-score'], reverse=True) # sort by cross-encoder score + end = time.time() + if verbose > 1: + print(f"Rank Time: {end - start:.3f} seconds") return hits, entries From b1e64fd4a88c2079cccb3efc0b61ee5101ecbc34 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 26 Jul 2022 22:47:26 +0400 Subject: [PATCH 02/15] Improve search speed. Only apply filter if filter keywords in query - Formalize filters into class with can_filter() and filter() methods - Use can_filter() method to decide whether to apply filter and create deep copies of entries and embeddings for it - Improve search speed for queries with no filters as deep copying entries, embeddings takes the most time after cross-encodes scoring when calling the /search API Earlier we would create deep copies of entries, embeddings even if the query did not contain any filter keywords --- src/main.py | 12 +- src/search_filter/date_filter.py | 239 ++++++++++++++------------- src/search_filter/explicit_filter.py | 85 +++++----- src/search_type/text_search.py | 23 ++- tests/test_date_filter.py | 64 +++---- 5 files changed, 223 insertions(+), 200 deletions(-) diff --git a/src/main.py b/src/main.py index 5adbdd3e..cb975322 100644 --- a/src/main.py +++ b/src/main.py @@ -21,8 +21,8 @@ from src.utils.cli import cli from src.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel from src.utils.rawconfig import FullConfig from src.processor.conversation.gpt import converse, extract_search_type, message_to_log, message_to_prompt, understand, summarize -from src.search_filter.explicit_filter import explicit_filter -from src.search_filter.date_filter import date_filter +from src.search_filter.explicit_filter import ExplicitFilter +from src.search_filter.date_filter import DateFilter # Application Global State config = FullConfig() @@ -72,7 +72,7 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None): if (t == SearchType.Org or t == None) and model.orgmode_search: # query org-mode notes query_start = time.time() - hits, entries = text_search.query(user_query, model.orgmode_search, device=device, filters=[explicit_filter, date_filter], verbose=verbose) + hits, entries = text_search.query(user_query, model.orgmode_search, device=device, filters=[DateFilter(), ExplicitFilter()], verbose=verbose) query_end = time.time() # collate and return results @@ -83,7 +83,7 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None): if (t == SearchType.Music or t == None) and model.music_search: # query music library query_start = time.time() - hits, entries = text_search.query(user_query, model.music_search, device=device, filters=[explicit_filter, date_filter], verbose=verbose) + hits, entries = text_search.query(user_query, model.music_search, device=device, filters=[DateFilter(), ExplicitFilter()], verbose=verbose) query_end = time.time() # collate and return results @@ -94,7 +94,7 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None): if (t == SearchType.Markdown or t == None) and model.orgmode_search: # query markdown files query_start = time.time() - hits, entries = text_search.query(user_query, model.markdown_search, device=device, filters=[explicit_filter, date_filter], verbose=verbose) + hits, entries = text_search.query(user_query, model.markdown_search, device=device, filters=[ExplicitFilter(), DateFilter()], verbose=verbose) query_end = time.time() # collate and return results @@ -105,7 +105,7 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None): if (t == SearchType.Ledger or t == None) and model.ledger_search: # query transactions query_start = time.time() - hits, entries = text_search.query(user_query, model.ledger_search, filters=[explicit_filter, date_filter], verbose=verbose) + hits, entries = text_search.query(user_query, model.ledger_search, filters=[ExplicitFilter(), DateFilter()], verbose=verbose) query_end = time.time() # collate and return results diff --git a/src/search_filter/date_filter.py b/src/search_filter/date_filter.py index 97ed2a8a..68e75274 100644 --- a/src/search_filter/date_filter.py +++ b/src/search_filter/date_filter.py @@ -9,138 +9,143 @@ import torch import dateparser as dtparse -# Date Range Filter Regexes -# Example filter queries: -# - dt>="yesterday" dt<"tomorrow" -# - dt>="last week" -# - dt:"2 years ago" -date_regex = r"dt([:><=]{1,2})\"(.*?)\"" +class DateFilter: + # Date Range Filter Regexes + # Example filter queries: + # - dt>="yesterday" dt<"tomorrow" + # - dt>="last week" + # - dt:"2 years ago" + date_regex = r"dt([:><=]{1,2})\"(.*?)\"" + + def can_filter(self, raw_query): + "Check if query contains date filters" + return self.extract_date_range(raw_query) is not None -def date_filter(query, entries, embeddings, entry_key='raw'): - "Find entries containing any dates that fall within date range specified in query" - # extract date range specified in date filter of query - query_daterange = extract_date_range(query) + def filter(self, query, entries, embeddings, entry_key='raw'): + "Find entries containing any dates that fall within date range specified in query" + # extract date range specified in date filter of query + query_daterange = self.extract_date_range(query) + + # if no date in query, return all entries + if query_daterange is None: + return query, entries, embeddings + + # remove date range filter from query + query = re.sub(f'\s+{self.date_regex}', ' ', query) + query = re.sub(r'\s{2,}', ' ', query).strip() # remove multiple spaces + + # find entries containing any dates that fall with date range specified in query + entries_to_include = set() + for id, entry in enumerate(entries): + # Extract dates from entry + for date_in_entry_string in re.findall(r'\d{4}-\d{2}-\d{2}', entry[entry_key]): + # Convert date string in entry to unix timestamp + try: + date_in_entry = datetime.strptime(date_in_entry_string, '%Y-%m-%d').timestamp() + except ValueError: + continue + # Check if date in entry is within date range specified in query + if query_daterange[0] <= date_in_entry < query_daterange[1]: + entries_to_include.add(id) + break + + # delete entries (and their embeddings) marked for exclusion + entries_to_exclude = set(range(len(entries))) - entries_to_include + for id in sorted(list(entries_to_exclude), reverse=True): + del entries[id] + embeddings = torch.cat((embeddings[:id], embeddings[id+1:])) - # if no date in query, return all entries - if query_daterange is None: return query, entries, embeddings - # remove date range filter from query - query = re.sub(f'\s+{date_regex}', ' ', query) - query = re.sub(r'\s{2,}', ' ', query).strip() # remove multiple spaces - # find entries containing any dates that fall with date range specified in query - entries_to_include = set() - for id, entry in enumerate(entries): - # Extract dates from entry - for date_in_entry_string in re.findall(r'\d{4}-\d{2}-\d{2}', entry[entry_key]): - # Convert date string in entry to unix timestamp - try: - date_in_entry = datetime.strptime(date_in_entry_string, '%Y-%m-%d').timestamp() - except ValueError: - continue - # Check if date in entry is within date range specified in query - if query_daterange[0] <= date_in_entry < query_daterange[1]: - entries_to_include.add(id) - break + def extract_date_range(self, query): + # find date range filter in query + date_range_matches = re.findall(self.date_regex, query) - # delete entries (and their embeddings) marked for exclusion - entries_to_exclude = set(range(len(entries))) - entries_to_include - for id in sorted(list(entries_to_exclude), reverse=True): - del entries[id] - embeddings = torch.cat((embeddings[:id], embeddings[id+1:])) + if len(date_range_matches) == 0: + return None - return query, entries, embeddings + # extract, parse natural dates ranges from date range filter passed in query + # e.g today maps to (start_of_day, start_of_tomorrow) + date_ranges_from_filter = [] + for (cmp, date_str) in date_range_matches: + if self.parse(date_str): + dt_start, dt_end = self.parse(date_str) + date_ranges_from_filter += [[cmp, (dt_start.timestamp(), dt_end.timestamp())]] + + # Combine dates with their comparators to form date range intervals + # For e.g + # >=yesterday maps to [start_of_yesterday, inf) + # ': + date_range_considering_comparator += [[dtrange_end, inf]] + elif cmp == '>=': + date_range_considering_comparator += [[dtrange_start, inf]] + elif cmp == '<': + date_range_considering_comparator += [[0, dtrange_start]] + elif cmp == '<=': + date_range_considering_comparator += [[0, dtrange_end]] + elif cmp == '=' or cmp == ':' or cmp == '==': + date_range_considering_comparator += [[dtrange_start, dtrange_end]] + + # Combine above intervals (via AND/intersect) + # In the above example, this gives us [start_of_yesterday, start_of_tomorrow) + # This is the effective date range to filter entries by + # --- + for date_range in date_range_considering_comparator: + effective_date_range = [ + max(effective_date_range[0], date_range[0]), + min(effective_date_range[1], date_range[1])] + + if effective_date_range == [0, inf] or effective_date_range[0] > effective_date_range[1]: + return None + else: + return effective_date_range -def extract_date_range(query): - # find date range filter in query - date_range_matches = re.findall(date_regex, query) + def parse(self, date_str, relative_base=None): + "Parse date string passed in date filter of query to datetime object" + # clean date string to handle future date parsing by date parser + future_strings = ['later', 'from now', 'from today'] + prefer_dates_from = {True: 'future', False: 'past'}[any([True for fstr in future_strings if fstr in date_str])] + clean_date_str = re.sub('|'.join(future_strings), '', date_str) - if len(date_range_matches) == 0: - return None + # parse date passed in query date filter + parsed_date = dtparse.parse( + clean_date_str, + settings= { + 'RELATIVE_BASE': relative_base or datetime.now(), + 'PREFER_DAY_OF_MONTH': 'first', + 'PREFER_DATES_FROM': prefer_dates_from + }) - # extract, parse natural dates ranges from date range filter passed in query - # e.g today maps to (start_of_day, start_of_tomorrow) - date_ranges_from_filter = [] - for (cmp, date_str) in date_range_matches: - if parse(date_str): - dt_start, dt_end = parse(date_str) - date_ranges_from_filter += [[cmp, (dt_start.timestamp(), dt_end.timestamp())]] + if parsed_date is None: + return None - # Combine dates with their comparators to form date range intervals - # For e.g - # >=yesterday maps to [start_of_yesterday, inf) - # ': - date_range_considering_comparator += [[dtrange_end, inf]] - elif cmp == '>=': - date_range_considering_comparator += [[dtrange_start, inf]] - elif cmp == '<': - date_range_considering_comparator += [[0, dtrange_start]] - elif cmp == '<=': - date_range_considering_comparator += [[0, dtrange_end]] - elif cmp == '=' or cmp == ':' or cmp == '==': - date_range_considering_comparator += [[dtrange_start, dtrange_end]] - - # Combine above intervals (via AND/intersect) - # In the above example, this gives us [start_of_yesterday, start_of_tomorrow) - # This is the effective date range to filter entries by - # --- - for date_range in date_range_considering_comparator: - effective_date_range = [ - max(effective_date_range[0], date_range[0]), - min(effective_date_range[1], date_range[1])] - - if effective_date_range == [0, inf] or effective_date_range[0] > effective_date_range[1]: - return None - else: - return effective_date_range + return self.date_to_daterange(parsed_date, date_str) -def parse(date_str, relative_base=None): - "Parse date string passed in date filter of query to datetime object" - # clean date string to handle future date parsing by date parser - future_strings = ['later', 'from now', 'from today'] - prefer_dates_from = {True: 'future', False: 'past'}[any([True for fstr in future_strings if fstr in date_str])] - clean_date_str = re.sub('|'.join(future_strings), '', date_str) + def date_to_daterange(self, parsed_date, date_str): + "Convert parsed date to date ranges at natural granularity (day, week, month or year)" - # parse date passed in query date filter - parsed_date = dtparse.parse( - clean_date_str, - settings= { - 'RELATIVE_BASE': relative_base or datetime.now(), - 'PREFER_DAY_OF_MONTH': 'first', - 'PREFER_DATES_FROM': prefer_dates_from - }) + start_of_day = parsed_date.replace(hour=0, minute=0, second=0, microsecond=0) - if parsed_date is None: - return None - - return date_to_daterange(parsed_date, date_str) - - -def date_to_daterange(parsed_date, date_str): - "Convert parsed date to date ranges at natural granularity (day, week, month or year)" - - start_of_day = parsed_date.replace(hour=0, minute=0, second=0, microsecond=0) - - if 'year' in date_str: - return (datetime(parsed_date.year, 1, 1, 0, 0, 0), datetime(parsed_date.year+1, 1, 1, 0, 0, 0)) - if 'month' in date_str: - start_of_month = datetime(parsed_date.year, parsed_date.month, 1, 0, 0, 0) - next_month = start_of_month + relativedelta(months=1) - return (start_of_month, next_month) - if 'week' in date_str: - # if week in date string, dateparser parses it to next week start - # so today = end of this week - start_of_week = start_of_day - timedelta(days=7) - return (start_of_week, start_of_day) - else: - next_day = start_of_day + relativedelta(days=1) + if 'year' in date_str: + return (datetime(parsed_date.year, 1, 1, 0, 0, 0), datetime(parsed_date.year+1, 1, 1, 0, 0, 0)) + if 'month' in date_str: + start_of_month = datetime(parsed_date.year, parsed_date.month, 1, 0, 0, 0) + next_month = start_of_month + relativedelta(months=1) + return (start_of_month, next_month) + if 'week' in date_str: + # if week in date string, dateparser parses it to next week start + # so today = end of this week + start_of_week = start_of_day - timedelta(days=7) + return (start_of_week, start_of_day) + else: + next_day = start_of_day + relativedelta(days=1) return (start_of_day, next_day) diff --git a/src/search_filter/explicit_filter.py b/src/search_filter/explicit_filter.py index 61576bdf..b7bb6754 100644 --- a/src/search_filter/explicit_filter.py +++ b/src/search_filter/explicit_filter.py @@ -5,42 +5,53 @@ import re import torch -def explicit_filter(raw_query, entries, embeddings, entry_key='raw'): - # Separate natural query from explicit required, blocked words filters - query = " ".join([word for word in raw_query.split() if not word.startswith("+") and not word.startswith("-")]) - required_words = set([word[1:].lower() for word in raw_query.split() if word.startswith("+")]) - blocked_words = set([word[1:].lower() for word in raw_query.split() if word.startswith("-")]) +class ExplicitFilter: + def can_filter(self, raw_query): + "Check if query contains explicit filters" + # Extract explicit query portion with required, blocked words to filter from natural query + required_words = set([word[1:].lower() for word in raw_query.split() if word.startswith("+")]) + blocked_words = set([word[1:].lower() for word in raw_query.split() if word.startswith("-")]) + + return len(required_words) != 0 or len(blocked_words) != 0 + + + def filter(self, raw_query, entries, embeddings, entry_key='raw'): + "Find entries containing required and not blocked words specified in query" + # Separate natural query from explicit required, blocked words filters + query = " ".join([word for word in raw_query.split() if not word.startswith("+") and not word.startswith("-")]) + required_words = set([word[1:].lower() for word in raw_query.split() if word.startswith("+")]) + blocked_words = set([word[1:].lower() for word in raw_query.split() if word.startswith("-")]) + + if len(required_words) == 0 and len(blocked_words) == 0: + return query, entries, embeddings + + # convert each entry to a set of words + # split on fullstop, comma, colon, tab, newline or any brackets + entry_splitter = r',|\.| |\]|\[\(|\)|\{|\}|\t|\n|\:' + entries_by_word_set = [set(word.lower() + for word + in re.split(entry_splitter, entry[entry_key]) + if word != "") + for entry in entries] + + # track id of entries to exclude + entries_to_exclude = set() + + # mark entries that do not contain all required_words for exclusion + if len(required_words) > 0: + for id, words_in_entry in enumerate(entries_by_word_set): + if not required_words.issubset(words_in_entry): + entries_to_exclude.add(id) + + # mark entries that contain any blocked_words for exclusion + if len(blocked_words) > 0: + for id, words_in_entry in enumerate(entries_by_word_set): + if words_in_entry.intersection(blocked_words): + entries_to_exclude.add(id) + + # delete entries (and their embeddings) marked for exclusion + for id in sorted(list(entries_to_exclude), reverse=True): + del entries[id] + embeddings = torch.cat((embeddings[:id], embeddings[id+1:])) - if len(required_words) == 0 and len(blocked_words) == 0: return query, entries, embeddings - - # convert each entry to a set of words - # split on fullstop, comma, colon, tab, newline or any brackets - entry_splitter = r',|\.| |\]|\[\(|\)|\{|\}|\t|\n|\:' - entries_by_word_set = [set(word.lower() - for word - in re.split(entry_splitter, entry[entry_key]) - if word != "") - for entry in entries] - - # track id of entries to exclude - entries_to_exclude = set() - - # mark entries that do not contain all required_words for exclusion - if len(required_words) > 0: - for id, words_in_entry in enumerate(entries_by_word_set): - if not required_words.issubset(words_in_entry): - entries_to_exclude.add(id) - - # mark entries that contain any blocked_words for exclusion - if len(blocked_words) > 0: - for id, words_in_entry in enumerate(entries_by_word_set): - if words_in_entry.intersection(blocked_words): - entries_to_exclude.add(id) - - # delete entries (and their embeddings) marked for exclusion - for id in sorted(list(entries_to_exclude), reverse=True): - del entries[id] - embeddings = torch.cat((embeddings[:id], embeddings[id+1:])) - - return query, entries, embeddings \ No newline at end of file diff --git a/src/search_type/text_search.py b/src/search_type/text_search.py index 93c8c344..739aba40 100644 --- a/src/search_type/text_search.py +++ b/src/search_type/text_search.py @@ -65,25 +65,32 @@ def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, d def query(raw_query: str, model: TextSearchModel, device='cpu', filters: list = [], verbose=0): "Search for entries that answer the query" - # Copy original embeddings, entries to filter them for query - start = time.time() query = raw_query - corpus_embeddings = deepcopy(model.corpus_embeddings) - entries = deepcopy(model.entries) + + # Use deep copy of original embeddings, entries to filter if query contains filters + start = time.time() + filters_in_query = [filter for filter in filters if filter.can_filter(query)] + if filters_in_query: + corpus_embeddings = deepcopy(model.corpus_embeddings) + entries = deepcopy(model.entries) + else: + corpus_embeddings = model.corpus_embeddings + entries = model.entries end = time.time() if verbose > 1: print(f"Copy Time: {end - start:.3f} seconds") # Filter query, entries and embeddings before semantic search start = time.time() - for filter in filters: - query, entries, corpus_embeddings = filter(query, entries, corpus_embeddings) - if entries is None or len(entries) == 0: - return [], [] + for filter in filters_in_query: + query, entries, corpus_embeddings = filter.filter(query, entries, corpus_embeddings) end = time.time() if verbose > 1: print(f"Filter Time: {end - start:.3f} seconds") + if entries is None or len(entries) == 0: + return [], [] + # Encode the query using the bi-encoder start = time.time() question_embedding = model.bi_encoder.encode([query], convert_to_tensor=True) diff --git a/tests/test_date_filter.py b/tests/test_date_filter.py index a1f63a05..88e31c86 100644 --- a/tests/test_date_filter.py +++ b/tests/test_date_filter.py @@ -7,7 +7,7 @@ from math import inf import torch # Application Packages -from src.search_filter import date_filter +from src.search_filter.date_filter import DateFilter def test_date_filter(): @@ -18,99 +18,99 @@ def test_date_filter(): {'compiled': '', 'raw': 'Entry with date:1984-04-02'}] q_with_no_date_filter = 'head tail' - ret_query, ret_entries, ret_emb = date_filter.date_filter(q_with_no_date_filter, entries.copy(), embeddings) + ret_query, ret_entries, ret_emb = DateFilter().filter(q_with_no_date_filter, entries.copy(), embeddings) assert ret_query == 'head tail' assert len(ret_emb) == 3 assert ret_entries == entries q_with_dtrange_non_overlapping_at_boundary = 'head dt>"1984-04-01" dt<"1984-04-02" tail' - ret_query, ret_entries, ret_emb = date_filter.date_filter(q_with_dtrange_non_overlapping_at_boundary, entries.copy(), embeddings) + ret_query, ret_entries, ret_emb = DateFilter().filter(q_with_dtrange_non_overlapping_at_boundary, entries.copy(), embeddings) assert ret_query == 'head tail' assert len(ret_emb) == 0 assert ret_entries == [] query_with_overlapping_dtrange = 'head dt>"1984-04-01" dt<"1984-04-03" tail' - ret_query, ret_entries, ret_emb = date_filter.date_filter(query_with_overlapping_dtrange, entries.copy(), embeddings) + ret_query, ret_entries, ret_emb = DateFilter().filter(query_with_overlapping_dtrange, entries.copy(), embeddings) assert ret_query == 'head tail' assert ret_entries == [entries[2]] assert len(ret_emb) == 1 query_with_overlapping_dtrange = 'head dt>="1984-04-01" dt<"1984-04-02" tail' - ret_query, ret_entries, ret_emb = date_filter.date_filter(query_with_overlapping_dtrange, entries.copy(), embeddings) + ret_query, ret_entries, ret_emb = DateFilter().filter(query_with_overlapping_dtrange, entries.copy(), embeddings) assert ret_query == 'head tail' assert ret_entries == [entries[1]] assert len(ret_emb) == 1 query_with_overlapping_dtrange = 'head dt>"1984-04-01" dt<="1984-04-02" tail' - ret_query, ret_entries, ret_emb = date_filter.date_filter(query_with_overlapping_dtrange, entries.copy(), embeddings) + ret_query, ret_entries, ret_emb = DateFilter().filter(query_with_overlapping_dtrange, entries.copy(), embeddings) assert ret_query == 'head tail' assert ret_entries == [entries[2]] assert len(ret_emb) == 1 query_with_overlapping_dtrange = 'head dt>="1984-04-01" dt<="1984-04-02" tail' - ret_query, ret_entries, ret_emb = date_filter.date_filter(query_with_overlapping_dtrange, entries.copy(), embeddings) + ret_query, ret_entries, ret_emb = DateFilter().filter(query_with_overlapping_dtrange, entries.copy(), embeddings) assert ret_query == 'head tail' assert ret_entries == [entries[1], entries[2]] assert len(ret_emb) == 2 def test_extract_date_range(): - assert date_filter.extract_date_range('head dt>"1984-01-04" dt<"1984-01-07" tail') == [datetime(1984, 1, 5, 0, 0, 0).timestamp(), datetime(1984, 1, 7, 0, 0, 0).timestamp()] - assert date_filter.extract_date_range('head dt<="1984-01-01"') == [0, datetime(1984, 1, 2, 0, 0, 0).timestamp()] - assert date_filter.extract_date_range('head dt>="1984-01-01"') == [datetime(1984, 1, 1, 0, 0, 0).timestamp(), inf] - assert date_filter.extract_date_range('head dt:"1984-01-01"') == [datetime(1984, 1, 1, 0, 0, 0).timestamp(), datetime(1984, 1, 2, 0, 0, 0).timestamp()] + assert DateFilter().extract_date_range('head dt>"1984-01-04" dt<"1984-01-07" tail') == [datetime(1984, 1, 5, 0, 0, 0).timestamp(), datetime(1984, 1, 7, 0, 0, 0).timestamp()] + assert DateFilter().extract_date_range('head dt<="1984-01-01"') == [0, datetime(1984, 1, 2, 0, 0, 0).timestamp()] + assert DateFilter().extract_date_range('head dt>="1984-01-01"') == [datetime(1984, 1, 1, 0, 0, 0).timestamp(), inf] + assert DateFilter().extract_date_range('head dt:"1984-01-01"') == [datetime(1984, 1, 1, 0, 0, 0).timestamp(), datetime(1984, 1, 2, 0, 0, 0).timestamp()] # Unparseable date filter specified in query - assert date_filter.extract_date_range('head dt:"Summer of 69" tail') == None + assert DateFilter().extract_date_range('head dt:"Summer of 69" tail') == None # No date filter specified in query - assert date_filter.extract_date_range('head tail') == None + assert DateFilter().extract_date_range('head tail') == None # Non intersecting date ranges - assert date_filter.extract_date_range('head dt>"1984-01-01" dt<"1984-01-01" tail') == None + assert DateFilter().extract_date_range('head dt>"1984-01-01" dt<"1984-01-01" tail') == None def test_parse(): test_now = datetime(1984, 4, 1, 21, 21, 21) # day variations - assert date_filter.parse('today', relative_base=test_now) == (datetime(1984, 4, 1, 0, 0, 0), datetime(1984, 4, 2, 0, 0, 0)) - assert date_filter.parse('tomorrow', relative_base=test_now) == (datetime(1984, 4, 2, 0, 0, 0), datetime(1984, 4, 3, 0, 0, 0)) - assert date_filter.parse('yesterday', relative_base=test_now) == (datetime(1984, 3, 31, 0, 0, 0), datetime(1984, 4, 1, 0, 0, 0)) - assert date_filter.parse('5 days ago', relative_base=test_now) == (datetime(1984, 3, 27, 0, 0, 0), datetime(1984, 3, 28, 0, 0, 0)) + assert DateFilter().parse('today', relative_base=test_now) == (datetime(1984, 4, 1, 0, 0, 0), datetime(1984, 4, 2, 0, 0, 0)) + assert DateFilter().parse('tomorrow', relative_base=test_now) == (datetime(1984, 4, 2, 0, 0, 0), datetime(1984, 4, 3, 0, 0, 0)) + assert DateFilter().parse('yesterday', relative_base=test_now) == (datetime(1984, 3, 31, 0, 0, 0), datetime(1984, 4, 1, 0, 0, 0)) + assert DateFilter().parse('5 days ago', relative_base=test_now) == (datetime(1984, 3, 27, 0, 0, 0), datetime(1984, 3, 28, 0, 0, 0)) # week variations - assert date_filter.parse('last week', relative_base=test_now) == (datetime(1984, 3, 18, 0, 0, 0), datetime(1984, 3, 25, 0, 0, 0)) - assert date_filter.parse('2 weeks ago', relative_base=test_now) == (datetime(1984, 3, 11, 0, 0, 0), datetime(1984, 3, 18, 0, 0, 0)) + assert DateFilter().parse('last week', relative_base=test_now) == (datetime(1984, 3, 18, 0, 0, 0), datetime(1984, 3, 25, 0, 0, 0)) + assert DateFilter().parse('2 weeks ago', relative_base=test_now) == (datetime(1984, 3, 11, 0, 0, 0), datetime(1984, 3, 18, 0, 0, 0)) # month variations - assert date_filter.parse('next month', relative_base=test_now) == (datetime(1984, 5, 1, 0, 0, 0), datetime(1984, 6, 1, 0, 0, 0)) - assert date_filter.parse('2 months ago', relative_base=test_now) == (datetime(1984, 2, 1, 0, 0, 0), datetime(1984, 3, 1, 0, 0, 0)) + assert DateFilter().parse('next month', relative_base=test_now) == (datetime(1984, 5, 1, 0, 0, 0), datetime(1984, 6, 1, 0, 0, 0)) + assert DateFilter().parse('2 months ago', relative_base=test_now) == (datetime(1984, 2, 1, 0, 0, 0), datetime(1984, 3, 1, 0, 0, 0)) # year variations - assert date_filter.parse('this year', relative_base=test_now) == (datetime(1984, 1, 1, 0, 0, 0), datetime(1985, 1, 1, 0, 0, 0)) - assert date_filter.parse('20 years later', relative_base=test_now) == (datetime(2004, 1, 1, 0, 0, 0), datetime(2005, 1, 1, 0, 0, 0)) + assert DateFilter().parse('this year', relative_base=test_now) == (datetime(1984, 1, 1, 0, 0, 0), datetime(1985, 1, 1, 0, 0, 0)) + assert DateFilter().parse('20 years later', relative_base=test_now) == (datetime(2004, 1, 1, 0, 0, 0), datetime(2005, 1, 1, 0, 0, 0)) # specific month/date variation - assert date_filter.parse('in august', relative_base=test_now) == (datetime(1983, 8, 1, 0, 0, 0), datetime(1983, 8, 2, 0, 0, 0)) - assert date_filter.parse('on 1983-08-01', relative_base=test_now) == (datetime(1983, 8, 1, 0, 0, 0), datetime(1983, 8, 2, 0, 0, 0)) + assert DateFilter().parse('in august', relative_base=test_now) == (datetime(1983, 8, 1, 0, 0, 0), datetime(1983, 8, 2, 0, 0, 0)) + assert DateFilter().parse('on 1983-08-01', relative_base=test_now) == (datetime(1983, 8, 1, 0, 0, 0), datetime(1983, 8, 2, 0, 0, 0)) def test_date_filter_regex(): - dtrange_match = re.findall(date_filter.date_regex, 'multi word head dt>"today" dt:"1984-01-01"') + dtrange_match = re.findall(DateFilter().date_regex, 'multi word head dt>"today" dt:"1984-01-01"') assert dtrange_match == [('>', 'today'), (':', '1984-01-01')] - dtrange_match = re.findall(date_filter.date_regex, 'head dt>"today" dt:"1984-01-01" multi word tail') + dtrange_match = re.findall(DateFilter().date_regex, 'head dt>"today" dt:"1984-01-01" multi word tail') assert dtrange_match == [('>', 'today'), (':', '1984-01-01')] - dtrange_match = re.findall(date_filter.date_regex, 'multi word head dt>="today" dt="1984-01-01"') + dtrange_match = re.findall(DateFilter().date_regex, 'multi word head dt>="today" dt="1984-01-01"') assert dtrange_match == [('>=', 'today'), ('=', '1984-01-01')] - dtrange_match = re.findall(date_filter.date_regex, 'dt<"multi word date" multi word tail') + dtrange_match = re.findall(DateFilter().date_regex, 'dt<"multi word date" multi word tail') assert dtrange_match == [('<', 'multi word date')] - dtrange_match = re.findall(date_filter.date_regex, 'head dt<="multi word date"') + dtrange_match = re.findall(DateFilter().date_regex, 'head dt<="multi word date"') assert dtrange_match == [('<=', 'multi word date')] - dtrange_match = re.findall(date_filter.date_regex, 'head tail') + dtrange_match = re.findall(DateFilter().date_regex, 'head tail') assert dtrange_match == [] \ No newline at end of file From 1168244c923e7bdd578aba956e38f20fe18aea9f Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 26 Jul 2022 22:56:36 +0400 Subject: [PATCH 03/15] Make cross-encoder re-rank results if query param set on /search API - Improve search speed by ~10x Tested on corpus of 125K lines, 12.5K entries - Allow cross-encoder to re-rank results by settings &?r=true when querying /search API - It's an optional param that default to False - Earlier all results were re-ranked by cross-encoder - Making this configurable allows for much faster results, if desired but for lower accuracy --- src/main.py | 10 +++++----- src/search_type/text_search.py | 26 ++++++++++++++------------ tests/test_asymmetric_search.py | 3 ++- tests/test_client.py | 2 +- 4 files changed, 22 insertions(+), 19 deletions(-) diff --git a/src/main.py b/src/main.py index cb975322..de8e287f 100644 --- a/src/main.py +++ b/src/main.py @@ -59,7 +59,7 @@ async def config_data(updated_config: FullConfig): return config @app.get('/search') -def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None): +def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None, r: Optional[bool] = False): if q is None or q == '': print(f'No query param (q) passed in API call to initiate search') return {} @@ -72,7 +72,7 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None): if (t == SearchType.Org or t == None) and model.orgmode_search: # query org-mode notes query_start = time.time() - hits, entries = text_search.query(user_query, model.orgmode_search, device=device, filters=[DateFilter(), ExplicitFilter()], verbose=verbose) + hits, entries = text_search.query(user_query, model.orgmode_search, rank_results=r, device=device, filters=[DateFilter(), ExplicitFilter()], verbose=verbose) query_end = time.time() # collate and return results @@ -83,7 +83,7 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None): if (t == SearchType.Music or t == None) and model.music_search: # query music library query_start = time.time() - hits, entries = text_search.query(user_query, model.music_search, device=device, filters=[DateFilter(), ExplicitFilter()], verbose=verbose) + hits, entries = text_search.query(user_query, model.music_search, rank_results=r, device=device, filters=[DateFilter(), ExplicitFilter()], verbose=verbose) query_end = time.time() # collate and return results @@ -94,7 +94,7 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None): if (t == SearchType.Markdown or t == None) and model.orgmode_search: # query markdown files query_start = time.time() - hits, entries = text_search.query(user_query, model.markdown_search, device=device, filters=[ExplicitFilter(), DateFilter()], verbose=verbose) + hits, entries = text_search.query(user_query, model.markdown_search, rank_results=r, device=device, filters=[ExplicitFilter(), DateFilter()], verbose=verbose) query_end = time.time() # collate and return results @@ -105,7 +105,7 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None): if (t == SearchType.Ledger or t == None) and model.ledger_search: # query transactions query_start = time.time() - hits, entries = text_search.query(user_query, model.ledger_search, filters=[ExplicitFilter(), DateFilter()], verbose=verbose) + hits, entries = text_search.query(user_query, model.ledger_search, rank_results=r, device=device, filters=[ExplicitFilter(), DateFilter()], verbose=verbose) query_end = time.time() # collate and return results diff --git a/src/search_type/text_search.py b/src/search_type/text_search.py index 739aba40..81c92605 100644 --- a/src/search_type/text_search.py +++ b/src/search_type/text_search.py @@ -63,7 +63,7 @@ def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, d return corpus_embeddings -def query(raw_query: str, model: TextSearchModel, device='cpu', filters: list = [], verbose=0): +def query(raw_query: str, model: TextSearchModel, rank_results=False, device='cpu', filters: list = [], verbose=0): "Search for entries that answer the query" query = raw_query @@ -108,21 +108,23 @@ def query(raw_query: str, model: TextSearchModel, device='cpu', filters: list = print(f"Search Time: {end - start:.3f} seconds") # Score all retrieved entries using the cross-encoder - start = time.time() - cross_inp = [[query, entries[hit['corpus_id']]['compiled']] for hit in hits] - cross_scores = model.cross_encoder.predict(cross_inp) - end = time.time() - if verbose > 1: - print(f"Cross-Encoder Predict Time: {end - start:.3f} seconds") + if rank_results: + start = time.time() + cross_inp = [[query, entries[hit['corpus_id']]['compiled']] for hit in hits] + cross_scores = model.cross_encoder.predict(cross_inp) + end = time.time() + if verbose > 1: + print(f"Cross-Encoder Predict Time: {end - start:.3f} seconds") - # Store cross-encoder scores in results dictionary for ranking - for idx in range(len(cross_scores)): - hits[idx]['cross-score'] = cross_scores[idx] + # Store cross-encoder scores in results dictionary for ranking + for idx in range(len(cross_scores)): + hits[idx]['cross-score'] = cross_scores[idx] # Order results by cross-encoder score followed by bi-encoder score start = time.time() hits.sort(key=lambda x: x['score'], reverse=True) # sort by bi-encoder score - hits.sort(key=lambda x: x['cross-score'], reverse=True) # sort by cross-encoder score + if rank_results: + hits.sort(key=lambda x: x['cross-score'], reverse=True) # sort by cross-encoder score end = time.time() if verbose > 1: print(f"Rank Time: {end - start:.3f} seconds") @@ -152,7 +154,7 @@ def collate_results(hits, entries, count=5): return [ { "entry": entries[hit['corpus_id']]['raw'], - "score": f"{hit['cross-score']:.3f}" + "score": f"{hit['cross-score'] if 'cross-score' in hit else hit['score']:.3f}" } for hit in hits[0:count]] diff --git a/tests/test_asymmetric_search.py b/tests/test_asymmetric_search.py index b14cc10d..135f9680 100644 --- a/tests/test_asymmetric_search.py +++ b/tests/test_asymmetric_search.py @@ -29,7 +29,8 @@ def test_asymmetric_search(content_config: ContentConfig, search_config: SearchC # Act hits, entries = text_search.query( query, - model = model.notes_search) + model = model.notes_search, + rank_results=True) results = text_search.collate_results( hits, diff --git a/tests/test_client.py b/tests/test_client.py index 3efce1b8..04d26a80 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -119,7 +119,7 @@ def test_notes_search(content_config: ContentConfig, search_config: SearchConfig user_query = "How to git install application?" # Act - response = client.get(f"/search?q={user_query}&n=1&t=org") + response = client.get(f"/search?q={user_query}&n=1&t=org&r=true") # Assert assert response.status_code == 200 From 3fa7d8f03ae8be2b4d2327efb7fd670efa975223 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 27 Jul 2022 02:48:27 +0400 Subject: [PATCH 04/15] Skeleton to allow incremental search on Khoj via Emacs --- src/interface/emacs/khoj.el | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index 33f6edaf..2c8c2691 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -122,6 +122,19 @@ (let ((encoded-query (url-hexify-string query))) (format "%s/search?q=%s&t=%s" khoj--server-url encoded-query search-type))) +(defun query-khoj (beg end len) + (let ((query (minibuffer-contents))) + (message "t"))) + +(defun remove-khoj () + (remove-hook 'after-change-functions #'query-khoj)) + +(minibuffer-with-setup-hook + (lambda () + (add-hook 'after-change-functions #'query-khoj) + (add-hook 'minibuffer-exit-hook #'remove-khoj)) + (read-string "Query: ")) + ;;;###autoload (defun khoj (query) "Search your content naturally using the Khoj API" From fd1963d78197dd510369b731ef442557e2217071 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 27 Jul 2022 03:05:00 +0400 Subject: [PATCH 05/15] Implement Basic Incremental Search Interface in Emacs for Org Mode Notes --- src/interface/emacs/khoj.el | 51 +++++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 8 deletions(-) diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index 2c8c2691..53e57f0f 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -122,18 +122,53 @@ (let ((encoded-query (url-hexify-string query))) (format "%s/search?q=%s&t=%s" khoj--server-url encoded-query search-type))) -(defun query-khoj (beg end len) - (let ((query (minibuffer-contents))) - (message "t"))) +;; Incremental Search on Khoj (defun remove-khoj () (remove-hook 'after-change-functions #'query-khoj)) -(minibuffer-with-setup-hook - (lambda () - (add-hook 'after-change-functions #'query-khoj) - (add-hook 'minibuffer-exit-hook #'remove-khoj)) - (read-string "Query: ")) +(defun khoj-incremental () + (interactive) + (let* ((default-type (khoj--buffer-name-to-search-type (buffer-name))) + (search-type (completing-read "Type: " '("org" "markdown" "ledger" "music" "image") nil t default-type)) + (buff (get-buffer-create (format "*Khoj (t:%s)*" search-type)))) + (switch-to-buffer buff) + (minibuffer-with-setup-hook + (lambda () + (add-hook 'after-change-functions #'query-khoj) + (add-hook 'minibuffer-exit-hook #'remove-khoj)) + (read-string "Query: ")))) + +(defun query-khoj (beg end len) + (let* ((query (minibuffer-contents-no-properties)) + (search-type "org") + (buff (get-buffer-create (format "*Khoj (t:%s)*" search-type)))) + ;; get json response from api + (with-current-buffer buff + (let ((url (khoj--construct-api-query query search-type)) + (inhibit-read-only t)) + (erase-buffer) + (url-insert-file-contents url))) + ;; render json response into formatted entries + (with-current-buffer buff + (let ((inhibit-read-only t) + (json-response (json-parse-buffer :object-type 'alist))) + (erase-buffer) + (insert + (cond ((or (equal search-type "org") (equal search-type "music")) (khoj--extract-entries-as-org json-response query)) + ((equal search-type "markdown") (khoj--extract-entries-as-markdown json-response query)) + ((equal search-type "ledger") (khoj--extract-entries-as-ledger json-response query)) + ((equal search-type "image") (khoj--extract-entries-as-images json-response query)) + (t (format "%s" json-response)))) + (cond ((equal search-type "org") (org-mode)) + ((equal search-type "markdown") (markdown-mode)) + ((equal search-type "ledger") (beancount-mode)) + ((equal search-type "music") (progn (org-mode) + (org-music-mode))) + ((equal search-type "image") (progn (shr-render-region (point-min) (point-max)) + (goto-char (point-min)))) + (t (fundamental-mode)))) + (read-only-mode t)))) ;;;###autoload (defun khoj (query) From 0d49398954e1848d686c9dbe2a49cbacd6bed0e4 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 27 Jul 2022 04:14:14 +0400 Subject: [PATCH 06/15] Reuse code to query api, render results. Formalize method, arg names --- src/interface/emacs/khoj.el | 127 +++++++++++++++++------------------- 1 file changed, 60 insertions(+), 67 deletions(-) diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index 53e57f0f..a7485422 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -46,6 +46,9 @@ :group 'khoj :type 'integer) +(defconst khoj--query-prompt "Khoj: " + "Query prompt shown to user in the minibuffer.") + (defun khoj--extract-entries-as-markdown (json-response query) "Convert json response from API to markdown entries" ;; remove leading (, ) or SPC from extracted entries string @@ -122,44 +125,23 @@ (let ((encoded-query (url-hexify-string query))) (format "%s/search?q=%s&t=%s" khoj--server-url encoded-query search-type))) - -;; Incremental Search on Khoj -(defun remove-khoj () - (remove-hook 'after-change-functions #'query-khoj)) - -(defun khoj-incremental () - (interactive) - (let* ((default-type (khoj--buffer-name-to-search-type (buffer-name))) - (search-type (completing-read "Type: " '("org" "markdown" "ledger" "music" "image") nil t default-type)) - (buff (get-buffer-create (format "*Khoj (t:%s)*" search-type)))) - (switch-to-buffer buff) - (minibuffer-with-setup-hook - (lambda () - (add-hook 'after-change-functions #'query-khoj) - (add-hook 'minibuffer-exit-hook #'remove-khoj)) - (read-string "Query: ")))) - -(defun query-khoj (beg end len) - (let* ((query (minibuffer-contents-no-properties)) - (search-type "org") - (buff (get-buffer-create (format "*Khoj (t:%s)*" search-type)))) - ;; get json response from api - (with-current-buffer buff - (let ((url (khoj--construct-api-query query search-type)) - (inhibit-read-only t)) - (erase-buffer) - (url-insert-file-contents url))) - ;; render json response into formatted entries - (with-current-buffer buff - (let ((inhibit-read-only t) - (json-response (json-parse-buffer :object-type 'alist))) - (erase-buffer) - (insert - (cond ((or (equal search-type "org") (equal search-type "music")) (khoj--extract-entries-as-org json-response query)) - ((equal search-type "markdown") (khoj--extract-entries-as-markdown json-response query)) - ((equal search-type "ledger") (khoj--extract-entries-as-ledger json-response query)) - ((equal search-type "image") (khoj--extract-entries-as-images json-response query)) - (t (format "%s" json-response)))) +(defun khoj--query-api-and-render-results (query search-type query-url buffer-name) + ;; get json response from api + (with-current-buffer buffer-name + (let ((inhibit-read-only t)) + (erase-buffer) + (url-insert-file-contents query-url))) + ;; render json response into formatted entries + (with-current-buffer buffer-name + (let ((inhibit-read-only t) + (json-response (json-parse-buffer :object-type 'alist))) + (erase-buffer) + (insert + (cond ((or (equal search-type "org") (equal search-type "music")) (khoj--extract-entries-as-org json-response query)) + ((equal search-type "markdown") (khoj--extract-entries-as-markdown json-response query)) + ((equal search-type "ledger") (khoj--extract-entries-as-ledger json-response query)) + ((equal search-type "image") (khoj--extract-entries-as-images json-response query)) + (t (format "%s" json-response)))) (cond ((equal search-type "org") (org-mode)) ((equal search-type "markdown") (markdown-mode)) ((equal search-type "ledger") (beancount-mode)) @@ -168,7 +150,38 @@ ((equal search-type "image") (progn (shr-render-region (point-min) (point-max)) (goto-char (point-min)))) (t (fundamental-mode)))) - (read-only-mode t)))) + (read-only-mode t))) + +;; Incremental Search on Khoj +(defun khoj--incremental-query (beg end len) + (let* ((in-khoj-prompt (equal (minibuffer-prompt) khoj--query-prompt)) + (search-type "org") + (buffer-name (get-buffer-create (format "*Khoj (t:%s)*" search-type))) + (query (minibuffer-contents-no-properties)) + (query-url (khoj--construct-api-query query search-type))) + (khoj--query-api-and-render-results + query + search-type + query-url + buffer-name))) + +(defun khoj--remove-incremental-query () + (remove-hook 'after-change-functions #'khoj--incremental-query) + (remove-hook 'minibuffer-exit-hook #'khoj--remove-incremental-query)) + +;;;###autoload +(defun khoj-incremental () + "Natural, Incremental Search for your personal notes, transactions and music using Khoj" + (interactive) + (let* ((default-type (khoj--buffer-name-to-search-type (buffer-name))) + (search-type (completing-read "Type: " '("org" "markdown" "ledger" "music") nil t default-type)) + (buffer-name (get-buffer-create (format "*Khoj (t:%s)*" search-type)))) + (switch-to-buffer buffer-name) + (minibuffer-with-setup-hook + (lambda () + (add-hook 'after-change-functions #'khoj--incremental-query) + (add-hook 'minibuffer-exit-hook #'khoj--remove-incremental-query)) + (read-string khoj--query-prompt)))) ;;;###autoload (defun khoj (query) @@ -176,34 +189,14 @@ (interactive "sQuery: ") (let* ((default-type (khoj--buffer-name-to-search-type (buffer-name))) (search-type (completing-read "Type: " '("org" "markdown" "ledger" "music" "image") nil t default-type)) - (url (khoj--construct-api-query query search-type)) - (buff (get-buffer-create (format "*Khoj (q:%s t:%s)*" query search-type)))) - ;; get json response from api - (with-current-buffer buff - (let ((inhibit-read-only t)) - (erase-buffer) - (url-insert-file-contents url))) - ;; render json response into formatted entries - (with-current-buffer buff - (let ((inhibit-read-only t) - (json-response (json-parse-buffer :object-type 'alist))) - (erase-buffer) - (insert - (cond ((or (equal search-type "org") (equal search-type "music")) (khoj--extract-entries-as-org json-response query)) - ((equal search-type "markdown") (khoj--extract-entries-as-markdown json-response query)) - ((equal search-type "ledger") (khoj--extract-entries-as-ledger json-response query)) - ((equal search-type "image") (khoj--extract-entries-as-images json-response query)) - (t (format "%s" json-response)))) - (cond ((equal search-type "org") (org-mode)) - ((equal search-type "markdown") (markdown-mode)) - ((equal search-type "ledger") (beancount-mode)) - ((equal search-type "music") (progn (org-mode) - (org-music-mode))) - ((equal search-type "image") (progn (shr-render-region (point-min) (point-max)) - (goto-char (point-min)))) - (t (fundamental-mode)))) - (read-only-mode t)) - (switch-to-buffer buff))) + (query-url (khoj--construct-api-query query search-type)) + (buffer-name (get-buffer-create (format "*Khoj (q:%s t:%s)*" query search-type)))) + (khoj--query-api-and-render-results + query + search-type + query-url + buffer-name) + (switch-to-buffer buffer-name))) (provide 'khoj) From bfcb962cbe113cc6f52dc9be4fbed1f94e61f312 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 27 Jul 2022 05:41:22 +0400 Subject: [PATCH 07/15] Use post-command-hook to only query on user input - Hooking into after-change-functions results in system logs triggering query --- src/interface/emacs/khoj.el | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index a7485422..d3ca9d67 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -153,20 +153,19 @@ (read-only-mode t))) ;; Incremental Search on Khoj -(defun khoj--incremental-query (beg end len) - (let* ((in-khoj-prompt (equal (minibuffer-prompt) khoj--query-prompt)) - (search-type "org") +(defun khoj--incremental-query () + (let* ((search-type "org") (buffer-name (get-buffer-create (format "*Khoj (t:%s)*" search-type))) (query (minibuffer-contents-no-properties)) (query-url (khoj--construct-api-query query search-type))) (khoj--query-api-and-render-results - query - search-type - query-url - buffer-name))) + query + search-type + query-url + buffer-name))) (defun khoj--remove-incremental-query () - (remove-hook 'after-change-functions #'khoj--incremental-query) + (remove-hook 'post-command-hook #'khoj--incremental-query) (remove-hook 'minibuffer-exit-hook #'khoj--remove-incremental-query)) ;;;###autoload @@ -179,8 +178,8 @@ (switch-to-buffer buffer-name) (minibuffer-with-setup-hook (lambda () - (add-hook 'after-change-functions #'khoj--incremental-query) - (add-hook 'minibuffer-exit-hook #'khoj--remove-incremental-query)) + (add-hook 'post-command-hook #'khoj--incremental-query nil 'local) + (add-hook 'minibuffer-exit-hook #'khoj--remove-incremental-query nil 'local)) (read-string khoj--query-prompt)))) ;;;###autoload From ad242cafa738d9b1c0cefa5f34d84cb96afa30a6 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 27 Jul 2022 06:13:04 +0400 Subject: [PATCH 08/15] Support querying all text search types in incremental search - Before incremental search was hard-coded to only query org --- src/interface/emacs/khoj.el | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index d3ca9d67..e149c162 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -49,6 +49,9 @@ (defconst khoj--query-prompt "Khoj: " "Query prompt shown to user in the minibuffer.") +(defvar khoj--search-type "org" + "The type of content to perform search on.") + (defun khoj--extract-entries-as-markdown (json-response query) "Convert json response from API to markdown entries" ;; remove leading (, ) or SPC from extracted entries string @@ -154,7 +157,7 @@ ;; Incremental Search on Khoj (defun khoj--incremental-query () - (let* ((search-type "org") + (let* ((search-type khoj--search-type) (buffer-name (get-buffer-create (format "*Khoj (t:%s)*" search-type))) (query (minibuffer-contents-no-properties)) (query-url (khoj--construct-api-query query search-type))) @@ -175,6 +178,7 @@ (let* ((default-type (khoj--buffer-name-to-search-type (buffer-name))) (search-type (completing-read "Type: " '("org" "markdown" "ledger" "music") nil t default-type)) (buffer-name (get-buffer-create (format "*Khoj (t:%s)*" search-type)))) + (setq khoj--search-type search-type) (switch-to-buffer buffer-name) (minibuffer-with-setup-hook (lambda () From 9ab3edf6d60f337b9ea626106fde9c4922358c7a Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 27 Jul 2022 06:58:36 +0400 Subject: [PATCH 09/15] Re-rank incremental search results using cross-encoder if user idle This provides a relatively smooth mechanism - to improve relevance of results on idle - while providing the rapid, incremental results while typing --- src/interface/emacs/khoj.el | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index e149c162..b774136b 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -46,6 +46,14 @@ :group 'khoj :type 'integer) +(defcustom khoj--rerank-after-idle-time 1.0 + "Idle time (in seconds) to trigger cross-encoder to rerank incremental search results" + :group 'khoj + :type 'float) + +(defvar khoj--rerank-timer nil + "Idle timer to make cross-encoder re-rank incremental search results if user idle.") + (defconst khoj--query-prompt "Khoj: " "Query prompt shown to user in the minibuffer.") @@ -124,9 +132,10 @@ ((or (equal file-extension "markdown") (equal file-extension "md")) "markdown") (t "org")))) -(defun khoj--construct-api-query (query search-type) - (let ((encoded-query (url-hexify-string query))) - (format "%s/search?q=%s&t=%s" khoj--server-url encoded-query search-type))) +(defun khoj--construct-api-query (query search-type &optional rerank) + (let ((rerank (or rerank "false")) + (encoded-query (url-hexify-string query))) + (format "%s/search?q=%s&t=%s&r=%s" khoj--server-url encoded-query search-type rerank))) (defun khoj--query-api-and-render-results (query search-type query-url buffer-name) ;; get json response from api @@ -156,11 +165,12 @@ (read-only-mode t))) ;; Incremental Search on Khoj -(defun khoj--incremental-query () - (let* ((search-type khoj--search-type) +(defun khoj--incremental-query (&optional rerank) + (let* ((rerank (cond (rerank "true") (t "false"))) + (search-type khoj--search-type) (buffer-name (get-buffer-create (format "*Khoj (t:%s)*" search-type))) (query (minibuffer-contents-no-properties)) - (query-url (khoj--construct-api-query query search-type))) + (query-url (khoj--construct-api-query query search-type rerank))) (khoj--query-api-and-render-results query search-type @@ -168,6 +178,8 @@ buffer-name))) (defun khoj--remove-incremental-query () + (khoj--incremental-query t) + (cancel-timer khoj--rerank-timer) (remove-hook 'post-command-hook #'khoj--incremental-query) (remove-hook 'minibuffer-exit-hook #'khoj--remove-incremental-query)) @@ -179,6 +191,7 @@ (search-type (completing-read "Type: " '("org" "markdown" "ledger" "music") nil t default-type)) (buffer-name (get-buffer-create (format "*Khoj (t:%s)*" search-type)))) (setq khoj--search-type search-type) + (setq khoj--rerank-timer (run-with-idle-timer khoj--rerank-after-idle-time t 'khoj--incremental-query t)) (switch-to-buffer buffer-name) (minibuffer-with-setup-hook (lambda () From 09727ac3be5bd264d6d2df2a5b6fbfda53f974a6 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 27 Jul 2022 07:26:02 +0400 Subject: [PATCH 10/15] Make bi-encoder return fewer results to reduce cross-encoder latency --- src/search_type/text_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/search_type/text_search.py b/src/search_type/text_search.py index 81c92605..73f48930 100644 --- a/src/search_type/text_search.py +++ b/src/search_type/text_search.py @@ -20,7 +20,7 @@ def initialize_model(search_config: TextSearchConfig): torch.set_num_threads(4) # Number of entries we want to retrieve with the bi-encoder - top_k = 30 + top_k = 15 # The bi-encoder encodes all entries to use for semantic search bi_encoder = load_model( From 9302b45fe0b7674ee38376e33e0aa6b681dd8fb5 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 27 Jul 2022 18:18:17 +0400 Subject: [PATCH 11/15] Use khoj-incremental as the main khoj func. Rename khoj to khoj-simple - Update khoj-simple to work cross-encoder re-ranked results like before - Increment major version as incremental search considered a breaking change and a major update to search capability --- src/interface/emacs/khoj.el | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index b774136b..43b6ec60 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -1,9 +1,9 @@ -;;; khoj.el --- Natural Search via Emacs +;;; khoj.el --- Natural, Incremental Search via Emacs ;; Copyright (C) 2021-2022 Debanjum Singh Solanky ;; Author: Debanjum Singh Solanky -;; Version: 1.0 +;; Version: 2.0 ;; Keywords: search, org-mode, outlines, markdown, image ;; URL: http://github.com/debanjum/khoj/interface/emacs @@ -26,10 +26,10 @@ ;;; Commentary: -;; This package provides natural language search on org-mode notes, -;; markdown files, beancount transactions and images. +;; This package provides a natural, incremental search interface to your +;; org-mode notes, markdown files, beancount transactions and images. ;; It is a wrapper that interfaces with transformer based ML models. -;; The models search capabilities are exposed via the Khoj HTTP API +;; The models search capabilities are exposed via the Khoj HTTP API. ;;; Code: @@ -184,7 +184,7 @@ (remove-hook 'minibuffer-exit-hook #'khoj--remove-incremental-query)) ;;;###autoload -(defun khoj-incremental () +(defun khoj () "Natural, Incremental Search for your personal notes, transactions and music using Khoj" (interactive) (let* ((default-type (khoj--buffer-name-to-search-type (buffer-name))) @@ -200,12 +200,13 @@ (read-string khoj--query-prompt)))) ;;;###autoload -(defun khoj (query) - "Search your content naturally using the Khoj API" +(defun khoj-simple (query) + "Natural Search for your personal notes, transactions, music and images using Khoj" (interactive "sQuery: ") - (let* ((default-type (khoj--buffer-name-to-search-type (buffer-name))) + (let* ((rerank "true") + (default-type (khoj--buffer-name-to-search-type (buffer-name))) (search-type (completing-read "Type: " '("org" "markdown" "ledger" "music" "image") nil t default-type)) - (query-url (khoj--construct-api-query query search-type)) + (query-url (khoj--construct-api-query query search-type rerank)) (buffer-name (get-buffer-create (format "*Khoj (q:%s t:%s)*" query search-type)))) (khoj--query-api-and-render-results query From 9a6eee31be00185abfc432ff16f42c39dcba9d7f Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 27 Jul 2022 18:55:18 +0400 Subject: [PATCH 12/15] Make number of results to get from Khoj API customizable in khoj.el --- src/interface/emacs/khoj.el | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index 43b6ec60..90562bf7 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -42,15 +42,20 @@ :type 'string) (defcustom khoj--image-width 156 - "Width of rendered images returned by Khoj" + "Width of rendered images returned by Khoj." :group 'khoj :type 'integer) (defcustom khoj--rerank-after-idle-time 1.0 - "Idle time (in seconds) to trigger cross-encoder to rerank incremental search results" + "Idle time (in seconds) to trigger cross-encoder to rerank incremental search results." :group 'khoj :type 'float) +(defcustom khoj--results-count 5 + "Number of results to get from Khoj API for each query." + :group 'khoj + :type 'integer) + (defvar khoj--rerank-timer nil "Idle timer to make cross-encoder re-rank incremental search results if user idle.") @@ -134,8 +139,9 @@ (defun khoj--construct-api-query (query search-type &optional rerank) (let ((rerank (or rerank "false")) + (results-count (or khoj--results-count 5)) (encoded-query (url-hexify-string query))) - (format "%s/search?q=%s&t=%s&r=%s" khoj--server-url encoded-query search-type rerank))) + (format "%s/search?q=%s&t=%s&r=%s&n=%s" khoj--server-url encoded-query search-type rerank results-count))) (defun khoj--query-api-and-render-results (query search-type query-url buffer-name) ;; get json response from api From 1b759597dfc1fcfec27e9fef1e2e4a992ddf4abb Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 27 Jul 2022 20:08:37 +0400 Subject: [PATCH 13/15] Make incremental search more robust. Follow standard user expectations - Rename functions to more standard, descriptive names - Keep known, required code for incremental search - E.g Do not set buffer local flag in hooks on minibuffer setup - Only query when user in khoj minibuffer - Use active-minibuffer-window and track khoj minibuffer - (minibuffer-prompt) is not useful for our use-case here - (For now) Run re-rank only if user idle while querying - Do not run rerank on teardown/completion - The reranking lag (~2s) is annoying; hit enter, wait to see results - Also triggered when user exits abnormally, so C-g also results in rerank which is even more annoying - Emacs will still hang if re-ranking gets triggered on idle but that's better than always getting triggered. And better than not having mechanism to get results re-ranked via cross-encoder at all --- src/interface/emacs/khoj.el | 54 +++++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 17 deletions(-) diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index 90562bf7..a31cb3bc 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -36,6 +36,7 @@ (require 'url) (require 'json) + (defcustom khoj--server-url "http://localhost:8000" "Location of Khoj API server." :group 'khoj @@ -59,12 +60,16 @@ (defvar khoj--rerank-timer nil "Idle timer to make cross-encoder re-rank incremental search results if user idle.") +(defvar khoj--minibuffer-window nil + "Minibuffer window being used by user to enter query.") + (defconst khoj--query-prompt "Khoj: " "Query prompt shown to user in the minibuffer.") (defvar khoj--search-type "org" "The type of content to perform search on.") + (defun khoj--extract-entries-as-markdown (json-response query) "Convert json response from API to markdown entries" ;; remove leading (, ) or SPC from extracted entries string @@ -170,24 +175,33 @@ (t (fundamental-mode)))) (read-only-mode t))) + ;; Incremental Search on Khoj -(defun khoj--incremental-query (&optional rerank) - (let* ((rerank (cond (rerank "true") (t "false"))) +(defun khoj--incremental-search (&optional rerank) + (let* ((rerank-str (cond (rerank "true") (t "false"))) (search-type khoj--search-type) (buffer-name (get-buffer-create (format "*Khoj (t:%s)*" search-type))) (query (minibuffer-contents-no-properties)) - (query-url (khoj--construct-api-query query search-type rerank))) - (khoj--query-api-and-render-results - query - search-type - query-url - buffer-name))) + (query-url (khoj--construct-api-query query search-type rerank-str))) + ;; Query khoj API only when user in khoj minibuffer. + ;; Prevents querying during recursive edits or with contents of other buffers user may jump to + (when (and (active-minibuffer-window) (equal (current-buffer) khoj--minibuffer-window)) + (khoj--query-api-and-render-results + query + search-type + query-url + buffer-name)))) + +(defun khoj--teardown-incremental-search () + ;; unset khoj minibuffer window + (setq khoj--minibuffer-window nil) + ;; cancel rerank timer + (when (timerp khoj--rerank-timer) + (cancel-timer khoj--rerank-timer)) + ;; remove hooks for khoj incremental query and self + (remove-hook 'post-command-hook #'khoj--incremental-search) + (remove-hook 'minibuffer-exit-hook #'khoj--teardown-incremental-search)) -(defun khoj--remove-incremental-query () - (khoj--incremental-query t) - (cancel-timer khoj--rerank-timer) - (remove-hook 'post-command-hook #'khoj--incremental-query) - (remove-hook 'minibuffer-exit-hook #'khoj--remove-incremental-query)) ;;;###autoload (defun khoj () @@ -197,17 +211,23 @@ (search-type (completing-read "Type: " '("org" "markdown" "ledger" "music") nil t default-type)) (buffer-name (get-buffer-create (format "*Khoj (t:%s)*" search-type)))) (setq khoj--search-type search-type) - (setq khoj--rerank-timer (run-with-idle-timer khoj--rerank-after-idle-time t 'khoj--incremental-query t)) + ;; setup rerank to improve results once user idle for KHOJ--RERANK-AFTER-IDLE-TIME seconds + (setq khoj--rerank-timer (run-with-idle-timer khoj--rerank-after-idle-time t 'khoj--incremental-search t)) + ;; switch to khoj results buffer (switch-to-buffer buffer-name) + ;; open and setup minibuffer for incremental search (minibuffer-with-setup-hook (lambda () - (add-hook 'post-command-hook #'khoj--incremental-query nil 'local) - (add-hook 'minibuffer-exit-hook #'khoj--remove-incremental-query nil 'local)) + ;; set current (mini-)buffer entered as khoj minibuffer + ;; used to query khoj API only when user in khoj minibuffer + (setq khoj--minibuffer-window (current-buffer)) + (add-hook 'post-command-hook #'khoj--incremental-search) ; do khoj incremental search after every user action + (add-hook 'minibuffer-exit-hook #'khoj--teardown-incremental-search)) ; teardown khoj incremental search on minibuffer exit (read-string khoj--query-prompt)))) ;;;###autoload (defun khoj-simple (query) - "Natural Search for your personal notes, transactions, music and images using Khoj" + "Natural Search for QUERY in your personal notes, transactions, music and images using Khoj" (interactive "sQuery: ") (let* ((rerank "true") (default-type (khoj--buffer-name-to-search-type (buffer-name))) From 80ac10835c6302c99e20cc549dd3ecccad5fa6e9 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 28 Jul 2022 03:37:16 +0400 Subject: [PATCH 14/15] Rerank results on normal minibuffer exit In current state: - Rerank results: - If user idles while entering query OR - exits normally - Do not rerank results: - If user exits abnormally, e.g via C-g from query --- src/interface/emacs/khoj.el | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index a31cb3bc..9491d28e 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -186,13 +186,18 @@ ;; Query khoj API only when user in khoj minibuffer. ;; Prevents querying during recursive edits or with contents of other buffers user may jump to (when (and (active-minibuffer-window) (equal (current-buffer) khoj--minibuffer-window)) - (khoj--query-api-and-render-results - query - search-type - query-url - buffer-name)))) + (progn + (when rerank + (message "[Khoj]: Rerank Results")) + (khoj--query-api-and-render-results + query + search-type + query-url + buffer-name))))) (defun khoj--teardown-incremental-search () + ;; remove advice to rerank results on normal exit from minibuffer + (advice-remove 'exit-minibuffer #'khoj--minibuffer-exit-advice) ;; unset khoj minibuffer window (setq khoj--minibuffer-window nil) ;; cancel rerank timer @@ -202,6 +207,8 @@ (remove-hook 'post-command-hook #'khoj--incremental-search) (remove-hook 'minibuffer-exit-hook #'khoj--teardown-incremental-search)) +(defun khoj--minibuffer-exit-advice (&rest _args) + (khoj--incremental-search t)) ;;;###autoload (defun khoj () @@ -221,6 +228,8 @@ ;; set current (mini-)buffer entered as khoj minibuffer ;; used to query khoj API only when user in khoj minibuffer (setq khoj--minibuffer-window (current-buffer)) + ;; rerank results on normal exit from minibuffer + (advice-add 'exit-minibuffer :before #'khoj--minibuffer-exit-advice) (add-hook 'post-command-hook #'khoj--incremental-search) ; do khoj incremental search after every user action (add-hook 'minibuffer-exit-hook #'khoj--teardown-incremental-search)) ; teardown khoj incremental search on minibuffer exit (read-string khoj--query-prompt)))) From af1dd31401faac6597f15026ea7b70f15d2eccd3 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 28 Jul 2022 04:32:34 +0400 Subject: [PATCH 15/15] Do not pass verbose argument to image_search.query() as not supported --- src/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.py b/src/main.py index de8e287f..ed1f6a1d 100644 --- a/src/main.py +++ b/src/main.py @@ -116,7 +116,7 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None, r: Opti if (t == SearchType.Image or t == None) and model.image_search: # query images query_start = time.time() - hits = image_search.query(user_query, results_count, model.image_search, verbose=verbose) + hits = image_search.query(user_query, results_count, model.image_search) output_directory = f'{os.getcwd()}/{web_directory}' query_end = time.time()