mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-06 13:22:12 +00:00
Fix 2 document retrieval bugs to not drop valid search results
1. Due to the interaction of two changes: - dedupe by corpus_id, where corpus_id tracks logical content blocks like files and org/md headings. - return compiled, not logical, blocks, where compiled tracks smaller content chunks that fit within search model and LLM context windows. When combined, they returned only 1 compiled chunk per logical block, even if multiple chunks matched within that logical content block. The fix is to either dedupe by compiled text or to return deduped logical content blocks (by corpus_id) corresponding to the matched compiled chunks. This commit fixes it by the first method. 2. Due to zipping inferred queries with search results, only a single search result was returned per query. This silently cut down matching search results and went undetected.
This commit is contained in:
@@ -1242,22 +1242,29 @@ async def search_documents(
|
|||||||
async for event in send_status_func(f"**Searching Documents for:** {inferred_queries_str}"):
|
async for event in send_status_func(f"**Searching Documents for:** {inferred_queries_str}"):
|
||||||
yield {ChatEvent.STATUS: event}
|
yield {ChatEvent.STATUS: event}
|
||||||
for query in inferred_queries:
|
for query in inferred_queries:
|
||||||
search_results.extend(
|
results = await execute_search(
|
||||||
await execute_search(
|
user if not should_limit_to_agent_knowledge else None,
|
||||||
user if not should_limit_to_agent_knowledge else None,
|
f"{query} {filters_in_query}",
|
||||||
f"{query} {filters_in_query}",
|
n=n,
|
||||||
n=n,
|
t=SearchType.All,
|
||||||
t=SearchType.All,
|
r=True,
|
||||||
r=True,
|
max_distance=d,
|
||||||
max_distance=d,
|
dedupe=False,
|
||||||
dedupe=False,
|
agent=agent,
|
||||||
agent=agent,
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
|
# Attach associated query to each search result
|
||||||
|
for item in results:
|
||||||
|
item.additional["query"] = query
|
||||||
|
search_results.append(item)
|
||||||
|
|
||||||
search_results = text_search.deduplicated_search_responses(search_results)
|
search_results = text_search.deduplicated_search_responses(search_results)
|
||||||
compiled_references = [
|
compiled_references = [
|
||||||
{"query": q, "compiled": item.additional["compiled"], "file": item.additional["file"]}
|
{
|
||||||
for q, item in zip(inferred_queries, search_results)
|
"query": item.additional["query"],
|
||||||
|
"compiled": item.additional["compiled"],
|
||||||
|
"file": item.additional["file"],
|
||||||
|
}
|
||||||
|
for item in search_results
|
||||||
]
|
]
|
||||||
|
|
||||||
yield compiled_references, inferred_queries, defiltered_query
|
yield compiled_references, inferred_queries, defiltered_query
|
||||||
|
|||||||
@@ -167,11 +167,11 @@ def collate_results(hits, dedupe=True):
|
|||||||
def deduplicated_search_responses(hits: List[SearchResponse]):
|
def deduplicated_search_responses(hits: List[SearchResponse]):
|
||||||
hit_ids = set()
|
hit_ids = set()
|
||||||
for hit in hits:
|
for hit in hits:
|
||||||
if hit.corpus_id in hit_ids:
|
if hit.additional["compiled"] in hit_ids:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
else:
|
else:
|
||||||
hit_ids.add(hit.corpus_id)
|
hit_ids.add(hit.additional["compiled"])
|
||||||
yield SearchResponse.model_validate(
|
yield SearchResponse.model_validate(
|
||||||
{
|
{
|
||||||
"entry": hit.entry,
|
"entry": hit.entry,
|
||||||
@@ -180,6 +180,7 @@ def deduplicated_search_responses(hits: List[SearchResponse]):
|
|||||||
"additional": {
|
"additional": {
|
||||||
"source": hit.additional["source"],
|
"source": hit.additional["source"],
|
||||||
"file": hit.additional["file"],
|
"file": hit.additional["file"],
|
||||||
|
"query": hit.additional["query"],
|
||||||
"compiled": hit.additional["compiled"],
|
"compiled": hit.additional["compiled"],
|
||||||
"heading": hit.additional["heading"],
|
"heading": hit.additional["heading"],
|
||||||
},
|
},
|
||||||
|
|||||||
Reference in New Issue
Block a user