mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-07 05:40:17 +00:00
Deduplicate results for user query by raw text before returning results
- Required because entries are now split by the max_word count supported by the ML models
- This could now result in duplicate hits and entries being returned to the user
- Do deduplication after ranking to get the top-ranked deduplicated results
This commit is contained in:
@@ -150,6 +150,17 @@ def query(raw_query: str, model: TextSearchModel, rank_results=False):
|
|||||||
end = time.time()
|
end = time.time()
|
||||||
logger.debug(f"Rank Time: {end - start:.3f} seconds on device: {state.device}")
|
logger.debug(f"Rank Time: {end - start:.3f} seconds on device: {state.device}")
|
||||||
|
|
||||||
|
# Deduplicate entries by raw entry text
|
||||||
|
# Required because entries are split by max_word count supported by ML model. This results in duplicate hits, entries
|
||||||
|
start = time.time()
|
||||||
|
seen, original_hits_count = set(), len(hits)
|
||||||
|
hits = [hit for hit in hits
|
||||||
|
if entries[hit['corpus_id']].raw not in seen and not seen.add(entries[hit['corpus_id']].raw)]
|
||||||
|
duplicate_hits = original_hits_count - len(hits)
|
||||||
|
end = time.time()
|
||||||
|
logger.debug(f"Removed {duplicate_hits} Duplicate Hits")
|
||||||
|
logger.debug(f"Deduplication Time: {end - start:.3f} seconds")
|
||||||
|
|
||||||
return hits, entries
|
return hits, entries
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user