Fix comments, use minimal test case, regenerate test index, merge debug logs

- Remove the property drawer from the test entry for the max_words splitting test
- The property drawer is not required for the test
- Keep the test case minimal to reduce the chance of confusion
2026-03-05 21:29:11 +00:00 · 2022-12-25 21:45:40 -03:00
parent b283650991
commit 24676f95d8
3 changed files with 7 additions and 10 deletions
--- a/src/search_type/text_search.py
+++ b/src/search_type/text_search.py
@@ -150,16 +150,16 @@ def query(raw_query: str, model: TextSearchModel, rank_results=False):
    end = time.time()
    logger.debug(f"Rank Time: {end - start:.3f} seconds on device: {state.device}")

-    # Deduplicate entries by raw entry text
-    # Required because entries are split by max_word count supported by ML model. This results in duplicate hits, entries
+    # Deduplicate entries by raw entry text before showing to users
+    # Compiled entries are split by max tokens supported by ML models.
+    # This can result in duplicate hits and duplicate entries being shown to the user.
    start = time.time()
    seen, original_hits_count = set(), len(hits)
    hits = [hit for hit in hits
            if entries[hit['corpus_id']].raw not in seen and not seen.add(entries[hit['corpus_id']].raw)]
    duplicate_hits = original_hits_count - len(hits)
    end = time.time()
-    logger.debug(f"Removed {duplicate_hits} Duplicate Hits")
-    logger.debug(f"Deduplication Time: {end - start:.3f} seconds")
+    logger.debug(f"Deduplication Time: {end - start:.3f} seconds. Removed {duplicate_hits} duplicates")

    return hits, entries