From 170a8036fe2b22e661eb6d56b3d0cf5f112d46b7 Mon Sep 17 00:00:00 2001 From: Debanjum Date: Sun, 22 Jun 2025 18:10:18 -0700 Subject: [PATCH] Fix 2 document retrieval bugs to not drop valid search results 1. Due to the interaction of two changes: - dedupe by corpus_id, where corpus_id tracks logical content blocks like files, org/md headings. - return compiled, not logical blocks, where compiled tracks smaller content chunks that fit within search model, llm context windows. When combined they showed only 1 matching compiled chunk per logical block, even if multiple chunks matched within a logical content block. Fix is to either dedupe by compiled text or to return deduped logical content blocks (by corpus_id) corresponding to matched compiled chunks. This commit fixes it by the first method. 2. Due to zipping inferred queries with search results, only a single search result was returned per query. This silently cut down matching search results and went undetected. --- src/khoj/routers/helpers.py | 33 +++++++++++++++++------------ src/khoj/search_type/text_search.py | 5 +++-- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/src/khoj/routers/helpers.py b/src/khoj/routers/helpers.py index 11e729cf..baeb9141 100644 --- a/src/khoj/routers/helpers.py +++ b/src/khoj/routers/helpers.py @@ -1242,22 +1242,29 @@ async def search_documents( async for event in send_status_func(f"**Searching Documents for:** {inferred_queries_str}"): yield {ChatEvent.STATUS: event} for query in inferred_queries: - search_results.extend( - await execute_search( - user if not should_limit_to_agent_knowledge else None, - f"{query} {filters_in_query}", - n=n, - t=SearchType.All, - r=True, - max_distance=d, - dedupe=False, - agent=agent, - ) + results = await execute_search( + user if not should_limit_to_agent_knowledge else None, + f"{query} {filters_in_query}", + n=n, + t=SearchType.All, + r=True, + max_distance=d, + dedupe=False, + agent=agent, ) + # Attach associated query to each 
search result + for item in results: + item.additional["query"] = query + search_results.append(item) + search_results = text_search.deduplicated_search_responses(search_results) compiled_references = [ - {"query": q, "compiled": item.additional["compiled"], "file": item.additional["file"]} - for q, item in zip(inferred_queries, search_results) + { + "query": item.additional["query"], + "compiled": item.additional["compiled"], + "file": item.additional["file"], + } + for item in search_results ] yield compiled_references, inferred_queries, defiltered_query diff --git a/src/khoj/search_type/text_search.py b/src/khoj/search_type/text_search.py index 6d7667e5..3fafa44b 100644 --- a/src/khoj/search_type/text_search.py +++ b/src/khoj/search_type/text_search.py @@ -167,11 +167,11 @@ def collate_results(hits, dedupe=True): def deduplicated_search_responses(hits: List[SearchResponse]): hit_ids = set() for hit in hits: - if hit.corpus_id in hit_ids: + if hit.additional["compiled"] in hit_ids: continue else: - hit_ids.add(hit.corpus_id) + hit_ids.add(hit.additional["compiled"]) yield SearchResponse.model_validate( { "entry": hit.entry, @@ -180,6 +180,7 @@ def deduplicated_search_responses(hits: List[SearchResponse]): "additional": { "source": hit.additional["source"], "file": hit.additional["file"], + "query": hit.additional["query"], "compiled": hit.additional["compiled"], "heading": hit.additional["heading"], },