From eb492f3025d05ace145f22aeb372d84cdecf4b2f Mon Sep 17 00:00:00 2001 From: Debanjum Date: Sun, 10 Nov 2024 13:19:24 -0800 Subject: [PATCH] Only keep webpage content requested, even if Jina API gets more data Jina search API returns content of all webpages in search results. Previously code wouldn't remove content beyond max_webpages_to_read limit set. Now, webpage content in organic results are explicitly removed beyond the requested max_webpages_to_read limit. This should align behavior of online results from Jina with other online search providers. And restrict llm context to a reasonable size when using Jina for online search. --- src/khoj/processor/tools/online_search.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py index c6fc7c20..34c4911a 100644 --- a/src/khoj/processor/tools/online_search.py +++ b/src/khoj/processor/tools/online_search.py @@ -95,17 +95,21 @@ async def search_online( response_dict = {subquery: search_result for subquery, search_result in search_results} # Gather distinct web pages from organic results for subqueries without an instant answer. - # Content of web pages is directly available when Jina is used for search. webpages: Dict[str, Dict] = {} for subquery in response_dict: if "answerBox" in response_dict[subquery]: continue - for organic in response_dict[subquery].get("organic", [])[:max_webpages_to_read]: + for idx, organic in enumerate(response_dict[subquery].get("organic", [])): link = organic.get("link") - if link in webpages: + if link in webpages and idx < max_webpages_to_read: webpages[link]["queries"].add(subquery) - else: + # Content of web pages is directly available when Jina is used for search. + elif idx < max_webpages_to_read: webpages[link] = {"queries": {subquery}, "content": organic.get("content")} + # Only keep webpage content for up to max_webpages_to_read organic results. 
+ if idx >= max_webpages_to_read and not is_none_or_empty(organic.get("content")): + organic["content"] = None + response_dict[subquery]["organic"][idx] = organic # Read, extract relevant info from the retrieved web pages if webpages: