From db7eba56f617af92117549aa2379cf771e6d17a1 Mon Sep 17 00:00:00 2001 From: Debanjum Date: Sun, 30 Mar 2025 20:17:42 +0530 Subject: [PATCH] Fix webpage read and improve web search with Jina - Improve webpage read to include image alt text - Improve Jina webpage search to not include each page content - Use POST instead of GET for web search, webpage read with Jina --- src/khoj/processor/tools/online_search.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py index 4234b32c..b3d8c015 100644 --- a/src/khoj/processor/tools/online_search.py +++ b/src/khoj/processor/tools/online_search.py @@ -398,16 +398,16 @@ async def read_webpage_with_olostep(web_url: str, api_key: str, api_url: str) -> async def read_webpage_with_jina(web_url: str, api_key: str, api_url: str) -> str: - jina_reader_api_url = f"{api_url}/{web_url}" - headers = {"Accept": "application/json", "X-Timeout": "30"} + headers = {"Accept": "application/json", "X-Timeout": "30", "X-With-Generated-Alt": "true"} + data = {"url": web_url} if api_key: headers["Authorization"] = f"Bearer {api_key}" async with aiohttp.ClientSession() as session: - async with session.get(jina_reader_api_url, headers=headers) as response: + async with session.post(api_url, json=data, headers=headers) as response: response.raise_for_status() - response_json = await response.json() - return response_json["data"]["content"] + content = await response.text() + return content async def read_webpage_with_firecrawl(web_url: str, api_key: str, api_url: str) -> str: @@ -459,10 +459,6 @@ Collate only relevant information from the website to answer the target query an async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dict[str, List[Dict]]]: - encoded_query = urllib.parse.quote(query) - jina_search_api_url = f"{JINA_SEARCH_API_URL}/{encoded_query}" - headers = {"Accept": "application/json"} - # First check for jina scraper configuration in database default_jina_scraper = ( await ServerChatSettings.objects.filter() @@ -477,13 +473,15 @@ async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dic jina_scraper = await WebScraper.objects.filter(type=WebScraper.WebScraperType.JINA).afirst() # Get API key from DB scraper config or environment variable + data = {"q": query} + headers = {"Accept": "application/json", "X-Respond-With": "no-content"} api_key = jina_scraper.api_key if jina_scraper and jina_scraper.api_key else JINA_API_KEY if api_key: headers["Authorization"] = f"Bearer {api_key}" async with aiohttp.ClientSession() as session: - async with session.get(jina_search_api_url, headers=headers) as response: + async with session.post(JINA_SEARCH_API_URL, json=data, headers=headers) as response: if response.status != 200: error_text = await response.text() logger.error(f"Jina search failed: {error_text}")