From eb492f3025d05ace145f22aeb372d84cdecf4b2f Mon Sep 17 00:00:00 2001
From: Debanjum <debanjum@gmail.com>
Date: Sun, 10 Nov 2024 13:19:24 -0800
Subject: [PATCH] Only keep webpage content requested, even if Jina API gets
 more data

Jina search API returns content of all webpages in search results.
Previously code wouldn't remove content beyond max_webpages_to_read
limit set. Now, webpage content in organic results aree explicitly
removed beyond the requested max_webpage_to_read limit.

This should align behavior of online results from Jina with other
online search providers. And restrict llm context to a reasonable size
when using Jina for online search.
---
 src/khoj/processor/tools/online_search.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py
index c6fc7c20..34c4911a 100644
--- a/src/khoj/processor/tools/online_search.py
+++ b/src/khoj/processor/tools/online_search.py
@@ -95,17 +95,21 @@ async def search_online(
         response_dict = {subquery: search_result for subquery, search_result in search_results}
 
     # Gather distinct web pages from organic results for subqueries without an instant answer.
-    # Content of web pages is directly available when Jina is used for search.
     webpages: Dict[str, Dict] = {}
     for subquery in response_dict:
         if "answerBox" in response_dict[subquery]:
             continue
-        for organic in response_dict[subquery].get("organic", [])[:max_webpages_to_read]:
+        for idx, organic in enumerate(response_dict[subquery].get("organic", [])):
             link = organic.get("link")
-            if link in webpages:
+            if link in webpages and idx < max_webpages_to_read:
                 webpages[link]["queries"].add(subquery)
-            else:
+            # Content of web pages is directly available when Jina is used for search.
+            elif idx < max_webpages_to_read:
                 webpages[link] = {"queries": {subquery}, "content": organic.get("content")}
+            # Only keep webpage content for up to max_webpages_to_read organic results.
+            if idx >= max_webpages_to_read and not is_none_or_empty(organic.get("content")):
+                organic["content"] = None
+                response_dict[subquery]["organic"][idx] = organic
 
     # Read, extract relevant info from the retrieved web pages
     if webpages: