mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 13:18:18 +00:00
Fix 2 document retrieval bugs that dropped valid search results
1. Due to the interaction of two changes: - dedupe by corpus_id, where corpus_id tracks logical content blocks like files and org/md headings. - return compiled blocks, not logical blocks, where compiled blocks track smaller content chunks that fit within search model and LLM context windows. When combined, these changes returned only 1 compiled chunk per logical block, even if multiple chunks matched within that logical content block. The fix is to either dedupe by compiled text or to return the deduped logical content blocks (by corpus_id) corresponding to the matched compiled chunks. This commit fixes it by the first method. 2. Due to zipping inferred queries with search results, only a single search result was returned per inferred query! This silently cut down the matching search results and went undetected.
This commit is contained in:
@@ -1242,22 +1242,29 @@ async def search_documents(
|
||||
async for event in send_status_func(f"**Searching Documents for:** {inferred_queries_str}"):
|
||||
yield {ChatEvent.STATUS: event}
|
||||
for query in inferred_queries:
|
||||
search_results.extend(
|
||||
await execute_search(
|
||||
user if not should_limit_to_agent_knowledge else None,
|
||||
f"{query} {filters_in_query}",
|
||||
n=n,
|
||||
t=SearchType.All,
|
||||
r=True,
|
||||
max_distance=d,
|
||||
dedupe=False,
|
||||
agent=agent,
|
||||
)
|
||||
results = await execute_search(
|
||||
user if not should_limit_to_agent_knowledge else None,
|
||||
f"{query} {filters_in_query}",
|
||||
n=n,
|
||||
t=SearchType.All,
|
||||
r=True,
|
||||
max_distance=d,
|
||||
dedupe=False,
|
||||
agent=agent,
|
||||
)
|
||||
# Attach associated query to each search result
|
||||
for item in results:
|
||||
item.additional["query"] = query
|
||||
search_results.append(item)
|
||||
|
||||
search_results = text_search.deduplicated_search_responses(search_results)
|
||||
compiled_references = [
|
||||
{"query": q, "compiled": item.additional["compiled"], "file": item.additional["file"]}
|
||||
for q, item in zip(inferred_queries, search_results)
|
||||
{
|
||||
"query": item.additional["query"],
|
||||
"compiled": item.additional["compiled"],
|
||||
"file": item.additional["file"],
|
||||
}
|
||||
for item in search_results
|
||||
]
|
||||
|
||||
yield compiled_references, inferred_queries, defiltered_query
|
||||
|
||||
@@ -167,11 +167,11 @@ def collate_results(hits, dedupe=True):
|
||||
def deduplicated_search_responses(hits: List[SearchResponse]):
|
||||
hit_ids = set()
|
||||
for hit in hits:
|
||||
if hit.corpus_id in hit_ids:
|
||||
if hit.additional["compiled"] in hit_ids:
|
||||
continue
|
||||
|
||||
else:
|
||||
hit_ids.add(hit.corpus_id)
|
||||
hit_ids.add(hit.additional["compiled"])
|
||||
yield SearchResponse.model_validate(
|
||||
{
|
||||
"entry": hit.entry,
|
||||
@@ -180,6 +180,7 @@ def deduplicated_search_responses(hits: List[SearchResponse]):
|
||||
"additional": {
|
||||
"source": hit.additional["source"],
|
||||
"file": hit.additional["file"],
|
||||
"query": hit.additional["query"],
|
||||
"compiled": hit.additional["compiled"],
|
||||
"heading": hit.additional["heading"],
|
||||
},
|
||||
|
||||
Reference in New Issue
Block a user