mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 13:18:18 +00:00
Fix 2 document retrieval bugs that dropped valid search results
1. Due to the interaction of two changes: - dedupe by corpus_id, where corpus_id tracks logical content blocks like files and org/md headings. - return compiled blocks, not logical blocks, where compiled blocks track smaller content chunks that fit within search model and LLM context windows. When combined, these changes returned only 1 compiled chunk per logical block, even if multiple chunks matched within that logical content block. The fix is to either dedupe by compiled text or to return the deduped logical content blocks (by corpus_id) corresponding to the matched compiled chunks. This commit fixes it by the first method. 2. Due to zipping inferred queries with search results, only a single search result was returned per inferred query! This silently cut down the matching search results and went undetected.
This commit is contained in:
@@ -1242,22 +1242,29 @@ async def search_documents(
|
||||
async for event in send_status_func(f"**Searching Documents for:** {inferred_queries_str}"):
|
||||
yield {ChatEvent.STATUS: event}
|
||||
for query in inferred_queries:
|
||||
search_results.extend(
|
||||
await execute_search(
|
||||
user if not should_limit_to_agent_knowledge else None,
|
||||
f"{query} {filters_in_query}",
|
||||
n=n,
|
||||
t=SearchType.All,
|
||||
r=True,
|
||||
max_distance=d,
|
||||
dedupe=False,
|
||||
agent=agent,
|
||||
)
|
||||
results = await execute_search(
|
||||
user if not should_limit_to_agent_knowledge else None,
|
||||
f"{query} {filters_in_query}",
|
||||
n=n,
|
||||
t=SearchType.All,
|
||||
r=True,
|
||||
max_distance=d,
|
||||
dedupe=False,
|
||||
agent=agent,
|
||||
)
|
||||
# Attach associated query to each search result
|
||||
for item in results:
|
||||
item.additional["query"] = query
|
||||
search_results.append(item)
|
||||
|
||||
search_results = text_search.deduplicated_search_responses(search_results)
|
||||
compiled_references = [
|
||||
{"query": q, "compiled": item.additional["compiled"], "file": item.additional["file"]}
|
||||
for q, item in zip(inferred_queries, search_results)
|
||||
{
|
||||
"query": item.additional["query"],
|
||||
"compiled": item.additional["compiled"],
|
||||
"file": item.additional["file"],
|
||||
}
|
||||
for item in search_results
|
||||
]
|
||||
|
||||
yield compiled_references, inferred_queries, defiltered_query
|
||||
|
||||
@@ -167,11 +167,11 @@ def collate_results(hits, dedupe=True):
|
||||
def deduplicated_search_responses(hits: List[SearchResponse]):
|
||||
hit_ids = set()
|
||||
for hit in hits:
|
||||
if hit.corpus_id in hit_ids:
|
||||
if hit.additional["compiled"] in hit_ids:
|
||||
continue
|
||||
|
||||
else:
|
||||
hit_ids.add(hit.corpus_id)
|
||||
hit_ids.add(hit.additional["compiled"])
|
||||
yield SearchResponse.model_validate(
|
||||
{
|
||||
"entry": hit.entry,
|
||||
@@ -180,6 +180,7 @@ def deduplicated_search_responses(hits: List[SearchResponse]):
|
||||
"additional": {
|
||||
"source": hit.additional["source"],
|
||||
"file": hit.additional["file"],
|
||||
"query": hit.additional["query"],
|
||||
"compiled": hit.additional["compiled"],
|
||||
"heading": hit.additional["heading"],
|
||||
},
|
||||
|
||||
Reference in New Issue
Block a user