Fix 2 document retrieval bugs to not drop valid search results

1. Due to the interaction of two changes:
  - dedupe by corpus_id, where corpus_id tracks logical content blocks
  like files, org/md headings.
  - return compiled chunks, not logical blocks, where compiled chunks
  track smaller pieces of content that fit within the search model and
  LLM context windows.

  When combined, they showed only 1 matching compiled chunk per logical
  block, even if multiple chunks matched within that logical content
  block.

  Fix is to either dedupe by compiled text or to return deduped
  logical content blocks (by corpus_id) corresponding to matched
  compiled chunks. This commit fixes it by the first method.

2. Due to zipping the inferred queries with the search results, only a
   single search result was returned per query!
   This silently cut down matching search results and went undetected.
This commit is contained in:
Debanjum
2025-06-22 18:10:18 -07:00
parent 73c384b052
commit 170a8036fe
2 changed files with 23 additions and 15 deletions

View File

@@ -1242,22 +1242,29 @@ async def search_documents(
async for event in send_status_func(f"**Searching Documents for:** {inferred_queries_str}"):
yield {ChatEvent.STATUS: event}
for query in inferred_queries:
search_results.extend(
await execute_search(
user if not should_limit_to_agent_knowledge else None,
f"{query} {filters_in_query}",
n=n,
t=SearchType.All,
r=True,
max_distance=d,
dedupe=False,
agent=agent,
)
results = await execute_search(
user if not should_limit_to_agent_knowledge else None,
f"{query} {filters_in_query}",
n=n,
t=SearchType.All,
r=True,
max_distance=d,
dedupe=False,
agent=agent,
)
# Attach associated query to each search result
for item in results:
item.additional["query"] = query
search_results.append(item)
search_results = text_search.deduplicated_search_responses(search_results)
compiled_references = [
{"query": q, "compiled": item.additional["compiled"], "file": item.additional["file"]}
for q, item in zip(inferred_queries, search_results)
{
"query": item.additional["query"],
"compiled": item.additional["compiled"],
"file": item.additional["file"],
}
for item in search_results
]
yield compiled_references, inferred_queries, defiltered_query

View File

@@ -167,11 +167,11 @@ def collate_results(hits, dedupe=True):
def deduplicated_search_responses(hits: List[SearchResponse]):
hit_ids = set()
for hit in hits:
if hit.corpus_id in hit_ids:
if hit.additional["compiled"] in hit_ids:
continue
else:
hit_ids.add(hit.corpus_id)
hit_ids.add(hit.additional["compiled"])
yield SearchResponse.model_validate(
{
"entry": hit.entry,
@@ -180,6 +180,7 @@ def deduplicated_search_responses(hits: List[SearchResponse]):
"additional": {
"source": hit.additional["source"],
"file": hit.additional["file"],
"query": hit.additional["query"],
"compiled": hit.additional["compiled"],
"heading": hit.additional["heading"],
},