mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-03 13:19:16 +00:00
Only get text search results above confidence threshold via API
- During the migration, the confidence score stopped being used. It was being passed down from the API to some point and then went unused.
- Remove score thresholding for images, as the image search confidence score is different from the text search model's distance score.
- The default score threshold of 0.15 was determined experimentally, by manually comparing search results against distance for a few queries.
- Use distance instead of confidence as the metric for search result quality. Previously we'd moved text search from a confidence score to a distance metric. Now convert even the cross-encoder and image search scores to a distance metric, for consistent sorting of results.
This commit is contained in:
@@ -1,3 +1,4 @@
|
||||
import math
|
||||
from typing import Optional, Type, TypeVar, List
|
||||
from datetime import date, datetime, timedelta
|
||||
import secrets
|
||||
@@ -437,12 +438,19 @@ class EntryAdapters:
|
||||
|
||||
@staticmethod
|
||||
def search_with_embeddings(
|
||||
user: KhojUser, embeddings: Tensor, max_results: int = 10, file_type_filter: str = None, raw_query: str = None
|
||||
user: KhojUser,
|
||||
embeddings: Tensor,
|
||||
max_results: int = 10,
|
||||
file_type_filter: str = None,
|
||||
raw_query: str = None,
|
||||
max_distance: float = math.inf,
|
||||
):
|
||||
relevant_entries = EntryAdapters.apply_filters(user, raw_query, file_type_filter)
|
||||
relevant_entries = relevant_entries.filter(user=user).annotate(
|
||||
distance=CosineDistance("embeddings", embeddings)
|
||||
)
|
||||
relevant_entries = relevant_entries.filter(distance__lte=max_distance)
|
||||
|
||||
if file_type_filter:
|
||||
relevant_entries = relevant_entries.filter(file_type=file_type_filter)
|
||||
relevant_entries = relevant_entries.order_by("distance")
|
||||
|
||||
Reference in New Issue
Block a user