From 45f42531202a0c96b3fdfe7c21cb07097112e41e Mon Sep 17 00:00:00 2001
From: Debanjum <debanjum@gmail.com>
Date: Wed, 12 Nov 2025 13:40:54 -0800
Subject: [PATCH] Move Olostep scraping config into its webpage reader for
 cleaner code

---
 src/khoj/processor/tools/online_search.py | 41 ++++++++++++-----------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py
index 92913fe9..9147ab5c 100644
--- a/src/khoj/processor/tools/online_search.py
+++ b/src/khoj/processor/tools/online_search.py
@@ -33,36 +33,24 @@ from khoj.utils.rawconfig import LocationData
 
 logger = logging.getLogger(__name__)
 
+# Google Search API configurations
 GOOGLE_SEARCH_API_KEY = os.getenv("GOOGLE_SEARCH_API_KEY")
 GOOGLE_SEARCH_ENGINE_ID = os.getenv("GOOGLE_SEARCH_ENGINE_ID")
+# Serper Dev API configurations
 SERPER_DEV_API_KEY = os.getenv("SERPER_DEV_API_KEY")
-AUTO_READ_WEBPAGE = is_env_var_true("KHOJ_AUTO_READ_WEBPAGE")
 SERPER_DEV_URL = "https://google.serper.dev/search"
-
+# Firecrawl API configurations
 FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")
-
+# SearXNG API configurations
 SEARXNG_URL = os.getenv("KHOJ_SEARXNG_URL")
-
+# Exa API configurations
 EXA_API_KEY = os.getenv("EXA_API_KEY")
 
+# Whether to automatically read web pages from search results
+AUTO_READ_WEBPAGE = is_env_var_true("KHOJ_AUTO_READ_WEBPAGE")
 # Timeout for web search and webpage read HTTP requests
 WEBPAGE_REQUEST_TIMEOUT = 60  # seconds
 
-OLOSTEP_QUERY_PARAMS = {
-    "timeout": 35,  # seconds
-    "waitBeforeScraping": 0,  # seconds
-    "saveHtml": "False",
-    "saveMarkdown": "True",
-    "removeCSSselectors": "default",
-    "htmlTransformer": "none",
-    "removeImages": "True",
-    "fastLane": "True",
-    # Similar to Stripe's API, the expand parameters avoid the need to make a second API call
-    # to retrieve the dataset (from the dataset API) if you only need the markdown or html.
-    "expandMarkdown": "True",
-    "expandHtml": "False",
-}
-
 
 async def search_online(
     query: str,
@@ -567,7 +555,20 @@ async def read_webpage_at_url(web_url: str) -> str:
 
 async def read_webpage_with_olostep(web_url: str, api_key: str, api_url: str) -> str:
     headers = {"Authorization": f"Bearer {api_key}"}
-    web_scraping_params: Dict[str, Union[str, int, bool]] = OLOSTEP_QUERY_PARAMS.copy()  # type: ignore
+    web_scraping_params: Dict[str, Union[str, int, bool]] = {
+        "timeout": 35,  # seconds
+        "waitBeforeScraping": 0,  # seconds
+        "saveHtml": "False",
+        "saveMarkdown": "True",
+        "removeCSSselectors": "default",
+        "htmlTransformer": "none",
+        "removeImages": "True",
+        "fastLane": "True",
+        # Similar to Stripe's API, the expand parameters avoid the need to make a second API call
+        # to retrieve the dataset (from the dataset API) if you only need the markdown or html.
+        "expandMarkdown": "True",
+        "expandHtml": "False",
+    }
     web_scraping_params["url"] = web_url
 
     async with aiohttp.ClientSession() as session: