From 9affeb9e855b96bd510abc0f00da75317e026219 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 17 Oct 2024 15:24:43 -0700 Subject: [PATCH 01/13] Fix to log the client app calling the chat API - Remove unused subscribed variable from the chat API - Unexpectedly dropped client app logging when migrated API chat to do advanced streaming in july --- src/khoj/routers/api_chat.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/khoj/routers/api_chat.py b/src/khoj/routers/api_chat.py index 228d081c..9022a7dc 100644 --- a/src/khoj/routers/api_chat.py +++ b/src/khoj/routers/api_chat.py @@ -574,7 +574,6 @@ async def chat( chat_metadata: dict = {} connection_alive = True user: KhojUser = request.user.object - subscribed: bool = has_required_scope(request, ["premium"]) event_delimiter = "␃🔚␗" q = unquote(q) nonlocal conversation_id @@ -641,7 +640,7 @@ async def chat( request=request, telemetry_type="api", api="chat", - client=request.user.client_app, + client=common.client, user_agent=request.headers.get("user-agent"), host=request.headers.get("host"), metadata=chat_metadata, From 1b04b801c6e6e62a09a16d4a5eddff7dbafe9590 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Mon, 14 Oct 2024 17:39:44 -0700 Subject: [PATCH 02/13] Try respond even if document search via inference endpoint fails The huggingface endpoint can be flaky. Khoj shouldn't refuse to respond to user if document search fails. It should transparently mention that document lookup failed. 
But it should try to respond as best as it can without the document references. This change provides graceful failover when inference endpoint requests fail either when encoding query or reranking retrieved docs --- src/khoj/processor/embeddings.py | 1 + src/khoj/routers/api_chat.py | 47 +++++++++++++++++------------ src/khoj/search_type/text_search.py | 9 ++++-- 3 files changed, 35 insertions(+), 22 deletions(-) diff --git a/src/khoj/processor/embeddings.py b/src/khoj/processor/embeddings.py index 71af5b7d..a19d85fa 100644 --- a/src/khoj/processor/embeddings.py +++ b/src/khoj/processor/embeddings.py @@ -114,6 +114,7 @@ class CrossEncoderModel: payload = {"inputs": {"query": query, "passages": [hit.additional[key] for hit in hits]}} headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"} response = requests.post(target_url, json=payload, headers=headers) + response.raise_for_status() return response.json()["scores"] cross_inp = [[query, hit.additional[key]] for hit in hits] diff --git a/src/khoj/routers/api_chat.py b/src/khoj/routers/api_chat.py index 9022a7dc..93c905b6 100644 --- a/src/khoj/routers/api_chat.py +++ b/src/khoj/routers/api_chat.py @@ -3,7 +3,6 @@ import base64 import json import logging import time -import warnings from datetime import datetime from functools import partial from typing import Dict, Optional @@ -839,25 +838,33 @@ async def chat( # Gather Context ## Extract Document References compiled_references, inferred_queries, defiltered_query = [], [], None - async for result in extract_references_and_questions( - request, - meta_log, - q, - (n or 7), - d, - conversation_id, - conversation_commands, - location, - partial(send_event, ChatEvent.STATUS), - uploaded_image_url=uploaded_image_url, - agent=agent, - ): - if isinstance(result, dict) and ChatEvent.STATUS in result: - yield result[ChatEvent.STATUS] - else: - compiled_references.extend(result[0]) - inferred_queries.extend(result[1]) - defiltered_query = result[2] + try: 
+ async for result in extract_references_and_questions( + request, + meta_log, + q, + (n or 7), + d, + conversation_id, + conversation_commands, + location, + partial(send_event, ChatEvent.STATUS), + uploaded_image_url=uploaded_image_url, + agent=agent, + ): + if isinstance(result, dict) and ChatEvent.STATUS in result: + yield result[ChatEvent.STATUS] + else: + compiled_references.extend(result[0]) + inferred_queries.extend(result[1]) + defiltered_query = result[2] + except Exception as e: + error_message = f"Error searching knowledge base: {e}. Attempting to respond without document references." + logger.warning(error_message) + async for result in send_event( + ChatEvent.STATUS, "Document search failed. I'll try respond without document references" + ): + yield result if not is_none_or_empty(compiled_references): headings = "\n- " + "\n- ".join(set([c.get("compiled", c).split("\n")[0] for c in compiled_references])) diff --git a/src/khoj/search_type/text_search.py b/src/khoj/search_type/text_search.py index 52e23f29..b67132e4 100644 --- a/src/khoj/search_type/text_search.py +++ b/src/khoj/search_type/text_search.py @@ -3,6 +3,7 @@ import math from pathlib import Path from typing import List, Optional, Tuple, Type, Union +import requests import torch from asgiref.sync import sync_to_async from sentence_transformers import util @@ -231,8 +232,12 @@ def setup( def cross_encoder_score(query: str, hits: List[SearchResponse], search_model_name: str) -> List[SearchResponse]: """Score all retrieved entries using the cross-encoder""" - with timer("Cross-Encoder Predict Time", logger, state.device): - cross_scores = state.cross_encoder_model[search_model_name].predict(query, hits) + try: + with timer("Cross-Encoder Predict Time", logger, state.device): + cross_scores = state.cross_encoder_model[search_model_name].predict(query, hits) + except requests.exceptions.HTTPError as e: + logger.error(f"Failed to rerank documents using the inference endpoint. 
Error: {e}.", exc_info=True) + cross_scores = [0.0] * len(hits) # Convert cross-encoder scores to distances and pass in hits for reranking for idx in range(len(cross_scores)): From a9325641696dc2a718f9b2e8b85f5cdcfdd9aea6 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Mon, 14 Oct 2024 17:44:46 -0700 Subject: [PATCH 03/13] Try respond even if web search, webpage read fails during chat Khoj shouldn't refuse to respond to user if web lookups fail. It should transparently mention that online search etc. failed. But try respond as best as it can without those references This change ensures a response to the users query is attempted even when web info retrieval fails. --- src/khoj/routers/api_chat.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/khoj/routers/api_chat.py b/src/khoj/routers/api_chat.py index 93c905b6..0d367029 100644 --- a/src/khoj/routers/api_chat.py +++ b/src/khoj/routers/api_chat.py @@ -900,12 +900,13 @@ async def chat( yield result[ChatEvent.STATUS] else: online_results = result - except ValueError as e: + except Exception as e: error_message = f"Error searching online: {e}. Attempting to respond without online results" logger.warning(error_message) - async for result in send_llm_response(error_message): + async for result in send_event( + ChatEvent.STATUS, "Online search failed. I'll try respond without online references" + ): yield result - return ## Gather Webpage References if ConversationCommand.Webpage in conversation_commands: @@ -934,11 +935,15 @@ async def chat( webpages.append(webpage["link"]) async for result in send_event(ChatEvent.STATUS, f"**Read web pages**: {webpages}"): yield result - except ValueError as e: + except Exception as e: logger.warning( - f"Error directly reading webpages: {e}. Attempting to respond without online results", + f"Error reading webpages: {e}. 
Attempting to respond without webpage results", exc_info=True, ) + async for result in send_event( + ChatEvent.STATUS, "Webpage read failed. I'll try respond without webpage references" + ): + yield result ## Send Gathered References async for result in send_event( From 731ea3779eca33d6ada46b3f652b75e38b27121c Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Mon, 14 Oct 2024 18:14:40 -0700 Subject: [PATCH 04/13] Return data sources to use if exception in data source chat actor Previously no value was returned if an exception got triggered when collecting information sources to search. --- src/khoj/routers/helpers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/khoj/routers/helpers.py b/src/khoj/routers/helpers.py index 245fdf09..a80864ba 100644 --- a/src/khoj/routers/helpers.py +++ b/src/khoj/routers/helpers.py @@ -353,13 +353,13 @@ async def aget_relevant_information_sources( final_response = [ConversationCommand.Default] else: final_response = [ConversationCommand.General] - return final_response - except Exception as e: + except Exception: logger.error(f"Invalid response for determining relevant tools: {response}") if len(agent_tools) == 0: final_response = [ConversationCommand.Default] else: final_response = agent_tools + return final_response async def aget_relevant_output_modes( From 993fd7cd2b4e6e95b308be5617f109c4e62748bd Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 15 Oct 2024 10:52:19 -0700 Subject: [PATCH 05/13] Support using Firecrawl to read webpages Firecrawl is open-source, self-hostable with a default hosted service provided, similar to Jina.ai. So it can be 1. Self-hosted as part of a private Khoj cloud deployment 2. Used directly by getting an API key from the Firecrawl.dev service This is as an alternative to Olostep and Jina.ai for reading webpages. 
--- src/khoj/processor/tools/online_search.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py index f5cb3c12..a9dd2476 100644 --- a/src/khoj/processor/tools/online_search.py +++ b/src/khoj/processor/tools/online_search.py @@ -29,6 +29,9 @@ JINA_READER_API_URL = "https://r.jina.ai/" JINA_SEARCH_API_URL = "https://s.jina.ai/" JINA_API_KEY = os.getenv("JINA_API_KEY") +FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY") +FIRECRAWL_API_URL = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev") + OLOSTEP_API_KEY = os.getenv("OLOSTEP_API_KEY") OLOSTEP_API_URL = "https://agent.olostep.com/olostep-p2p-incomingAPI" OLOSTEP_QUERY_PARAMS = { @@ -172,7 +175,12 @@ async def read_webpage_and_extract_content( try: if is_none_or_empty(content): with timer(f"Reading web page at '{url}' took", logger): - content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage_with_jina(url) + if FIRECRAWL_API_KEY: + content = await read_webpage_with_firecrawl(url) + elif OLOSTEP_API_KEY: + content = await read_webpage_with_olostep(url) + else: + content = await read_webpage_with_jina(url) with timer(f"Extracting relevant information from web page at '{url}' took", logger): extracted_info = await extract_relevant_info(subquery, content, user=user, agent=agent) return subquery, extracted_info, url @@ -220,6 +228,18 @@ async def read_webpage_with_jina(web_url: str) -> str: return response_json["data"]["content"] +async def read_webpage_with_firecrawl(web_url: str) -> str: + firecrawl_api_url = f"{FIRECRAWL_API_URL}/v1/scrape" + headers = {"Content-Type": "application/json", "Authorization": f"Bearer {FIRECRAWL_API_KEY}"} + params = {"url": web_url, "formats": ["markdown"], "excludeTags": ["script", ".ad"]} + + async with aiohttp.ClientSession() as session: + async with session.post(firecrawl_api_url, json=params, headers=headers) as 
response: + response.raise_for_status() + response_json = await response.json() + return response_json["data"]["markdown"] + + async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dict[str, List[Dict]]]: encoded_query = urllib.parse.quote(query) jina_search_api_url = f"{JINA_SEARCH_API_URL}/{encoded_query}" From 98f99fa6f86c613f2ec6974e5684b1f8f5f3ddc1 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 15 Oct 2024 12:54:18 -0700 Subject: [PATCH 06/13] Allow using Firecrawl to extract web page content Set the FIRECRAWL_TO_EXTRACT environment variable to true to have Firecrawl scrape and extract content from webpage using their LLM This could be faster, not sure about quality as LLM used is obfuscated --- src/khoj/processor/tools/online_search.py | 51 ++++++++++++++++++++--- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py index a9dd2476..2b4cac65 100644 --- a/src/khoj/processor/tools/online_search.py +++ b/src/khoj/processor/tools/online_search.py @@ -11,6 +11,7 @@ from bs4 import BeautifulSoup from markdownify import markdownify from khoj.database.models import Agent, KhojUser +from khoj.processor.conversation import prompts from khoj.routers.helpers import ( ChatEvent, extract_relevant_info, @@ -31,6 +32,7 @@ JINA_API_KEY = os.getenv("JINA_API_KEY") FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY") FIRECRAWL_API_URL = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev") +FIRECRAWL_TO_EXTRACT = os.getenv("FIRECRAWL_TO_EXTRACT", "False").lower() == "true" OLOSTEP_API_KEY = os.getenv("OLOSTEP_API_KEY") OLOSTEP_API_URL = "https://agent.olostep.com/olostep-p2p-incomingAPI" @@ -172,21 +174,26 @@ async def read_webpages( async def read_webpage_and_extract_content( subquery: str, url: str, content: str = None, user: KhojUser = None, agent: Agent = None ) -> Tuple[str, Union[None, str], str]: + extracted_info = None try: if 
is_none_or_empty(content): with timer(f"Reading web page at '{url}' took", logger): if FIRECRAWL_API_KEY: - content = await read_webpage_with_firecrawl(url) + if FIRECRAWL_TO_EXTRACT: + extracted_info = await read_webpage_and_extract_content_with_firecrawl(url, subquery, agent) + else: + content = await read_webpage_with_firecrawl(url) elif OLOSTEP_API_KEY: content = await read_webpage_with_olostep(url) else: content = await read_webpage_with_jina(url) - with timer(f"Extracting relevant information from web page at '{url}' took", logger): - extracted_info = await extract_relevant_info(subquery, content, user=user, agent=agent) - return subquery, extracted_info, url + if is_none_or_empty(extracted_info): + with timer(f"Extracting relevant information from web page at '{url}' took", logger): + extracted_info = await extract_relevant_info(subquery, content, user=user, agent=agent) except Exception as e: logger.error(f"Failed to read web page at '{url}' with {e}") - return subquery, None, url + + return subquery, extracted_info, url async def read_webpage_at_url(web_url: str) -> str: @@ -240,6 +247,40 @@ async def read_webpage_with_firecrawl(web_url: str) -> str: return response_json["data"]["markdown"] +async def read_webpage_and_extract_content_with_firecrawl(web_url: str, query: str, agent: Agent = None) -> str: + firecrawl_api_url = f"{FIRECRAWL_API_URL}/v1/scrape" + headers = {"Content-Type": "application/json", "Authorization": f"Bearer {FIRECRAWL_API_KEY}"} + schema = { + "type": "object", + "properties": { + "relevant_extract": {"type": "string"}, + }, + "required": [ + "relevant_extract", + ], + } + + personality_context = ( + prompts.personality_context.format(personality=agent.personality) if agent and agent.personality else "" + ) + system_prompt = f""" +{prompts.system_prompt_extract_relevant_information} + +{personality_context} +User Query: {query} + +Collate only relevant information from the website to answer the target query and in the provided JSON 
schema. +""".strip() + + params = {"url": web_url, "formats": ["extract"], "extract": {"systemPrompt": system_prompt, "schema": schema}} + + async with aiohttp.ClientSession() as session: + async with session.post(firecrawl_api_url, json=params, headers=headers) as response: + response.raise_for_status() + response_json = await response.json() + return response_json["data"]["extract"]["relevant_extract"] + + async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dict[str, List[Dict]]]: encoded_query = urllib.parse.quote(query) jina_search_api_url = f"{JINA_SEARCH_API_URL}/{encoded_query}" From e47922e53a3f91b5909a44670256f1049327ef0c Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 15 Oct 2024 16:07:31 -0700 Subject: [PATCH 07/13] Aggregate webpage extract queries to run once for each distinct webpage This should reduce webpage read and response generation time. Previously, we'd run separate webpage read and extract relevant content pipes for each distinct (query, url) pair. Now we aggregate all queries for each url to extract information from and run the webpage read and extract relevant content pipes once for each distinct url. Even though the webpage content extraction pipes were previously being run in parallel, they increased response time by 1. adding more context for the response generation chat actor to respond from 2. and by being more susceptible to page read and extract latencies of the parallel jobs. The aggregated retrieval of context for all queries for a given webpage could result in some hit to context quality. But it should improve and reduce variability in response time, quality and costs. 
--- src/khoj/processor/tools/online_search.py | 49 ++++++++++++----------- src/khoj/routers/helpers.py | 8 ++-- 2 files changed, 31 insertions(+), 26 deletions(-) diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py index 2b4cac65..ea45846b 100644 --- a/src/khoj/processor/tools/online_search.py +++ b/src/khoj/processor/tools/online_search.py @@ -88,33 +88,36 @@ async def search_online( search_results = await asyncio.gather(*search_tasks) response_dict = {subquery: search_result for subquery, search_result in search_results} - # Gather distinct web page data from organic results of each subquery without an instant answer. + # Gather distinct web pages from organic results for subqueries without an instant answer. # Content of web pages is directly available when Jina is used for search. - webpages = { - (organic.get("link"), subquery, organic.get("content")) - for subquery in response_dict - for organic in response_dict[subquery].get("organic", [])[:MAX_WEBPAGES_TO_READ] - if "answerBox" not in response_dict[subquery] - } + webpages: Dict[str, Dict] = {} + for subquery in response_dict: + if "answerBox" in response_dict[subquery]: + continue + for organic in response_dict[subquery].get("organic", [])[:MAX_WEBPAGES_TO_READ]: + link = organic.get("link") + if link in webpages: + webpages[link]["queries"].add(subquery) + else: + webpages[link] = {"queries": {subquery}, "content": organic.get("content")} # Read, extract relevant info from the retrieved web pages if webpages: - webpage_links = set([link for link, _, _ in webpages]) - logger.info(f"Reading web pages at: {list(webpage_links)}") + logger.info(f"Reading web pages at: {webpages.keys()}") if send_status_func: - webpage_links_str = "\n- " + "\n- ".join(list(webpage_links)) + webpage_links_str = "\n- " + "\n- ".join(webpages.keys()) async for event in send_status_func(f"**Reading web pages**: {webpage_links_str}"): yield {ChatEvent.STATUS: event} tasks = [ - 
read_webpage_and_extract_content(subquery, link, content, user=user, agent=agent) - for link, subquery, content in webpages + read_webpage_and_extract_content(data["queries"], link, data["content"], user=user, agent=agent) + for link, data in webpages.items() ] results = await asyncio.gather(*tasks) # Collect extracted info from the retrieved web pages - for subquery, webpage_extract, url in results: + for subqueries, url, webpage_extract in results: if webpage_extract is not None: - response_dict[subquery]["webpages"] = {"link": url, "snippet": webpage_extract} + response_dict[subqueries.pop()]["webpages"] = {"link": url, "snippet": webpage_extract} yield response_dict @@ -161,26 +164,26 @@ async def read_webpages( webpage_links_str = "\n- " + "\n- ".join(list(urls)) async for event in send_status_func(f"**Reading web pages**: {webpage_links_str}"): yield {ChatEvent.STATUS: event} - tasks = [read_webpage_and_extract_content(query, url, user=user, agent=agent) for url in urls] + tasks = [read_webpage_and_extract_content({query}, url, user=user, agent=agent) for url in urls] results = await asyncio.gather(*tasks) response: Dict[str, Dict] = defaultdict(dict) response[query]["webpages"] = [ - {"query": q, "link": url, "snippet": web_extract} for q, web_extract, url in results if web_extract is not None + {"query": qs.pop(), "link": url, "snippet": extract} for qs, url, extract in results if extract is not None ] yield response async def read_webpage_and_extract_content( - subquery: str, url: str, content: str = None, user: KhojUser = None, agent: Agent = None -) -> Tuple[str, Union[None, str], str]: + subqueries: set[str], url: str, content: str = None, user: KhojUser = None, agent: Agent = None +) -> Tuple[set[str], str, Union[None, str]]: extracted_info = None try: if is_none_or_empty(content): with timer(f"Reading web page at '{url}' took", logger): if FIRECRAWL_API_KEY: if FIRECRAWL_TO_EXTRACT: - extracted_info = await 
read_webpage_and_extract_content_with_firecrawl(url, subquery, agent) + extracted_info = await read_webpage_and_extract_content_with_firecrawl(url, subqueries, agent) else: content = await read_webpage_with_firecrawl(url) elif OLOSTEP_API_KEY: @@ -189,11 +192,11 @@ async def read_webpage_and_extract_content( content = await read_webpage_with_jina(url) if is_none_or_empty(extracted_info): with timer(f"Extracting relevant information from web page at '{url}' took", logger): - extracted_info = await extract_relevant_info(subquery, content, user=user, agent=agent) + extracted_info = await extract_relevant_info(subqueries, content, user=user, agent=agent) except Exception as e: logger.error(f"Failed to read web page at '{url}' with {e}") - return subquery, extracted_info, url + return subqueries, url, extracted_info async def read_webpage_at_url(web_url: str) -> str: @@ -247,7 +250,7 @@ async def read_webpage_with_firecrawl(web_url: str) -> str: return response_json["data"]["markdown"] -async def read_webpage_and_extract_content_with_firecrawl(web_url: str, query: str, agent: Agent = None) -> str: +async def read_webpage_and_extract_content_with_firecrawl(web_url: str, queries: set[str], agent: Agent = None) -> str: firecrawl_api_url = f"{FIRECRAWL_API_URL}/v1/scrape" headers = {"Content-Type": "application/json", "Authorization": f"Bearer {FIRECRAWL_API_KEY}"} schema = { @@ -267,7 +270,7 @@ async def read_webpage_and_extract_content_with_firecrawl(web_url: str, query: s {prompts.system_prompt_extract_relevant_information} {personality_context} -User Query: {query} +User Query: {", ".join(queries)} Collate only relevant information from the website to answer the target query and in the provided JSON schema. 
""".strip() diff --git a/src/khoj/routers/helpers.py b/src/khoj/routers/helpers.py index a80864ba..4edef61d 100644 --- a/src/khoj/routers/helpers.py +++ b/src/khoj/routers/helpers.py @@ -551,12 +551,14 @@ async def schedule_query( raise AssertionError(f"Invalid response for scheduling query: {raw_response}") -async def extract_relevant_info(q: str, corpus: str, user: KhojUser = None, agent: Agent = None) -> Union[str, None]: +async def extract_relevant_info( + qs: set[str], corpus: str, user: KhojUser = None, agent: Agent = None +) -> Union[str, None]: """ Extract relevant information for a given query from the target corpus """ - if is_none_or_empty(corpus) or is_none_or_empty(q): + if is_none_or_empty(corpus) or is_none_or_empty(qs): return None personality_context = ( @@ -564,7 +566,7 @@ async def extract_relevant_info(q: str, corpus: str, user: KhojUser = None, agen ) extract_relevant_information = prompts.extract_relevant_information.format( - query=q, + query=", ".join(qs), corpus=corpus.strip(), personality_context=personality_context, ) From c841abe13f3cde06690e5c818ccd05dc40b4e74f Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 15 Oct 2024 17:17:36 -0700 Subject: [PATCH 08/13] Change webpage scraper to use via server admin panel --- src/khoj/database/adapters/__init__.py | 13 ++++++++++++ src/khoj/database/admin.py | 1 + .../0068_serverchatsettings_web_scraper.py | 21 +++++++++++++++++++ src/khoj/database/models/__init__.py | 7 +++++++ src/khoj/processor/tools/online_search.py | 13 +++++++----- 5 files changed, 50 insertions(+), 5 deletions(-) create mode 100644 src/khoj/database/migrations/0068_serverchatsettings_web_scraper.py diff --git a/src/khoj/database/adapters/__init__.py b/src/khoj/database/adapters/__init__.py index 182ce701..51b8afe6 100644 --- a/src/khoj/database/adapters/__init__.py +++ b/src/khoj/database/adapters/__init__.py @@ -1031,6 +1031,19 @@ class ConversationAdapters: return server_chat_settings.chat_advanced return 
await ConversationAdapters.aget_default_conversation_config(user) + @staticmethod + async def aget_webscraper(FIRECRAWL_API_KEY: str = None, OLOSTEP_API_KEY: str = None): + server_chat_settings: ServerChatSettings = await ServerChatSettings.objects.filter().afirst() + if server_chat_settings is not None and server_chat_settings.web_scraper is not None: + web_scraper = ServerChatSettings.WebScraper(server_chat_settings.web_scraper) + if (web_scraper == ServerChatSettings.WebScraper.FIRECRAWL and FIRECRAWL_API_KEY) or ( + web_scraper == ServerChatSettings.WebScraper.OLOSTEP and OLOSTEP_API_KEY + ): + return web_scraper + # Fallback to JinaAI if the API keys for the other providers are not set + # JinaAI is the default web scraper as it does not require an API key + return ServerChatSettings.WebScraper.JINAAI + @staticmethod def create_conversation_from_public_conversation( user: KhojUser, public_conversation: PublicConversation, client_app: ClientApplication diff --git a/src/khoj/database/admin.py b/src/khoj/database/admin.py index 3e192952..51988752 100644 --- a/src/khoj/database/admin.py +++ b/src/khoj/database/admin.py @@ -198,6 +198,7 @@ class ServerChatSettingsAdmin(admin.ModelAdmin): list_display = ( "chat_default", "chat_advanced", + "web_scraper", ) diff --git a/src/khoj/database/migrations/0068_serverchatsettings_web_scraper.py b/src/khoj/database/migrations/0068_serverchatsettings_web_scraper.py new file mode 100644 index 00000000..89482dbd --- /dev/null +++ b/src/khoj/database/migrations/0068_serverchatsettings_web_scraper.py @@ -0,0 +1,21 @@ +# Generated by Django 5.0.8 on 2024-10-16 00:06 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("database", "0067_alter_agent_style_icon"), + ] + + operations = [ + migrations.AddField( + model_name="serverchatsettings", + name="web_scraper", + field=models.CharField( + choices=[("firecrawl", "Firecrawl"), ("olostep", "Olostep"), ("jinaai", "JinaAI")], + 
default="jinaai", + max_length=20, + ), + ), + ] diff --git a/src/khoj/database/models/__init__.py b/src/khoj/database/models/__init__.py index ec4b61d1..7c4a16fa 100644 --- a/src/khoj/database/models/__init__.py +++ b/src/khoj/database/models/__init__.py @@ -8,6 +8,7 @@ from django.core.exceptions import ValidationError from django.db import models from django.db.models.signals import pre_save from django.dispatch import receiver +from django.utils.translation import gettext_lazy from pgvector.django import VectorField from phonenumber_field.modelfields import PhoneNumberField @@ -245,12 +246,18 @@ class GithubRepoConfig(BaseModel): class ServerChatSettings(BaseModel): + class WebScraper(models.TextChoices): + FIRECRAWL = "firecrawl", gettext_lazy("Firecrawl") + OLOSTEP = "olostep", gettext_lazy("Olostep") + JINAAI = "jinaai", gettext_lazy("JinaAI") + chat_default = models.ForeignKey( ChatModelOptions, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name="chat_default" ) chat_advanced = models.ForeignKey( ChatModelOptions, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name="chat_advanced" ) + web_scraper = models.CharField(max_length=20, choices=WebScraper.choices, default=WebScraper.JINAAI) class LocalOrgConfig(BaseModel): diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py index ea45846b..df9b180f 100644 --- a/src/khoj/processor/tools/online_search.py +++ b/src/khoj/processor/tools/online_search.py @@ -10,7 +10,8 @@ import aiohttp from bs4 import BeautifulSoup from markdownify import markdownify -from khoj.database.models import Agent, KhojUser +from khoj.database.adapters import ConversationAdapters +from khoj.database.models import Agent, KhojUser, ServerChatSettings from khoj.processor.conversation import prompts from khoj.routers.helpers import ( ChatEvent, @@ -177,16 +178,18 @@ async def read_webpages( async def read_webpage_and_extract_content( subqueries: 
set[str], url: str, content: str = None, user: KhojUser = None, agent: Agent = None ) -> Tuple[set[str], str, Union[None, str]]: + # Select the web scraper to use for reading the web page + web_scraper = await ConversationAdapters.aget_webscraper(FIRECRAWL_API_KEY, OLOSTEP_API_KEY) extracted_info = None try: if is_none_or_empty(content): - with timer(f"Reading web page at '{url}' took", logger): - if FIRECRAWL_API_KEY: + with timer(f"Reading web page with {web_scraper.value} at '{url}' took", logger): + if web_scraper == ServerChatSettings.WebScraper.FIRECRAWL: if FIRECRAWL_TO_EXTRACT: extracted_info = await read_webpage_and_extract_content_with_firecrawl(url, subqueries, agent) else: content = await read_webpage_with_firecrawl(url) - elif OLOSTEP_API_KEY: + elif web_scraper == ServerChatSettings.WebScraper.OLOSTEP: content = await read_webpage_with_olostep(url) else: content = await read_webpage_with_jina(url) @@ -194,7 +197,7 @@ async def read_webpage_and_extract_content( with timer(f"Extracting relevant information from web page at '{url}' took", logger): extracted_info = await extract_relevant_info(subqueries, content, user=user, agent=agent) except Exception as e: - logger.error(f"Failed to read web page at '{url}' with {e}") + logger.error(f"Failed to read web page with {web_scraper.value} at '{url}' with {e}") return subqueries, url, extracted_info From 11c64791aa53a8715b5423fae82fe256d34dff9d Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 15 Oct 2024 17:56:21 -0700 Subject: [PATCH 09/13] Allow changing perf timer log level. 
Info log time for webpage read --- src/khoj/processor/tools/online_search.py | 2 +- src/khoj/routers/helpers.py | 11 +++++------ src/khoj/utils/helpers.py | 8 ++++---- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py index df9b180f..2fbe8cf3 100644 --- a/src/khoj/processor/tools/online_search.py +++ b/src/khoj/processor/tools/online_search.py @@ -183,7 +183,7 @@ async def read_webpage_and_extract_content( extracted_info = None try: if is_none_or_empty(content): - with timer(f"Reading web page with {web_scraper.value} at '{url}' took", logger): + with timer(f"Reading web page with {web_scraper.value} at '{url}' took", logger, log_level=logging.INFO): if web_scraper == ServerChatSettings.WebScraper.FIRECRAWL: if FIRECRAWL_TO_EXTRACT: extracted_info = await read_webpage_and_extract_content_with_firecrawl(url, subqueries, agent) diff --git a/src/khoj/routers/helpers.py b/src/khoj/routers/helpers.py index 4edef61d..c3d997e9 100644 --- a/src/khoj/routers/helpers.py +++ b/src/khoj/routers/helpers.py @@ -571,12 +571,11 @@ async def extract_relevant_info( personality_context=personality_context, ) - with timer("Chat actor: Extract relevant information from data", logger): - response = await send_message_to_model_wrapper( - extract_relevant_information, - prompts.system_prompt_extract_relevant_information, - user=user, - ) + response = await send_message_to_model_wrapper( + extract_relevant_information, + prompts.system_prompt_extract_relevant_information, + user=user, + ) return response.strip() diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py index e0908e51..f16f922c 100644 --- a/src/khoj/utils/helpers.py +++ b/src/khoj/utils/helpers.py @@ -164,9 +164,9 @@ def get_class_by_name(name: str) -> object: class timer: """Context manager to log time taken for a block of code to run""" - def __init__(self, message: str, logger: logging.Logger, device: torch.device 
= None): + def __init__(self, message: str, logger: logging.Logger, device: torch.device = None, log_level=logging.DEBUG): self.message = message - self.logger = logger + self.logger = logger.debug if log_level == logging.DEBUG else logger.info self.device = device def __enter__(self): @@ -176,9 +176,9 @@ class timer: def __exit__(self, *_): elapsed = perf_counter() - self.start if self.device is None: - self.logger.debug(f"{self.message}: {elapsed:.3f} seconds") + self.logger(f"{self.message}: {elapsed:.3f} seconds") else: - self.logger.debug(f"{self.message}: {elapsed:.3f} seconds on device: {self.device}") + self.logger(f"{self.message}: {elapsed:.3f} seconds on device: {self.device}") class LRU(OrderedDict): From d94abba2dc8ed6e8f31a86b8792c9e57413abc5e Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 16 Oct 2024 00:37:46 -0700 Subject: [PATCH 10/13] Fallback through enabled scrapers to reduce web page read failures - Set up scrapers via API keys, explicitly adding them via admin panel or enabling only a single scraper to use via server chat settings. - Use validation to ensure only valid scrapers are added via admin panel. Example: API key is present for scrapers that require it, etc. - Modularize the read webpage functions to take api key, url as args. Removes dependence on constants loaded in online_search. Functions are now mostly self-contained - Improve ability to read webpages by using the speed, success rate of different scrapers. 
Optimal configuration needs to be discovered --- src/khoj/database/adapters/__init__.py | 49 +++++++--- src/khoj/database/admin.py | 14 +++ .../0068_serverchatsettings_web_scraper.py | 21 ----- ...bscraper_serverchatsettings_web_scraper.py | 47 ++++++++++ src/khoj/database/models/__init__.py | 50 ++++++++-- src/khoj/processor/tools/online_search.py | 91 +++++++++++-------- 6 files changed, 196 insertions(+), 76 deletions(-) delete mode 100644 src/khoj/database/migrations/0068_serverchatsettings_web_scraper.py create mode 100644 src/khoj/database/migrations/0068_webscraper_serverchatsettings_web_scraper.py diff --git a/src/khoj/database/adapters/__init__.py b/src/khoj/database/adapters/__init__.py index 51b8afe6..8c6aa5e4 100644 --- a/src/khoj/database/adapters/__init__.py +++ b/src/khoj/database/adapters/__init__.py @@ -1,6 +1,7 @@ import json import logging import math +import os import random import re import secrets @@ -10,7 +11,6 @@ from enum import Enum from typing import Callable, Iterable, List, Optional, Type import cron_descriptor -import django from apscheduler.job import Job from asgiref.sync import sync_to_async from django.contrib.sessions.backends.db import SessionStore @@ -52,6 +52,7 @@ from khoj.database.models import ( UserTextToImageModelConfig, UserVoiceModelConfig, VoiceModelOption, + WebScraper, ) from khoj.processor.conversation import prompts from khoj.search_filter.date_filter import DateFilter @@ -1032,17 +1033,43 @@ class ConversationAdapters: return await ConversationAdapters.aget_default_conversation_config(user) @staticmethod - async def aget_webscraper(FIRECRAWL_API_KEY: str = None, OLOSTEP_API_KEY: str = None): - server_chat_settings: ServerChatSettings = await ServerChatSettings.objects.filter().afirst() + async def aget_server_webscraper(): + server_chat_settings = await ServerChatSettings.objects.filter().prefetch_related("web_scraper").afirst() if server_chat_settings is not None and server_chat_settings.web_scraper is not None: - 
web_scraper = ServerChatSettings.WebScraper(server_chat_settings.web_scraper) - if (web_scraper == ServerChatSettings.WebScraper.FIRECRAWL and FIRECRAWL_API_KEY) or ( - web_scraper == ServerChatSettings.WebScraper.OLOSTEP and OLOSTEP_API_KEY - ): - return web_scraper - # Fallback to JinaAI if the API keys for the other providers are not set - # JinaAI is the default web scraper as it does not require an API key - return ServerChatSettings.WebScraper.JINAAI + return server_chat_settings.web_scraper + return None + + @staticmethod + async def aget_enabled_webscrapers(): + enabled_scrapers = [] + server_webscraper = await ConversationAdapters.aget_server_webscraper() + if server_webscraper: + # Only use the webscraper set in the server chat settings + enabled_scrapers = [ + (server_webscraper.type, server_webscraper.api_key, server_webscraper.api_url, server_webscraper.name) + ] + if not enabled_scrapers: + # Use the enabled web scrapers, using the newest created scraper first, until get web page content + enabled_scrapers = [ + (scraper.type, scraper.api_key, scraper.api_url, scraper.name) + async for scraper in WebScraper.objects.all().order_by("-created_at").aiterator() + ] + if not enabled_scrapers: + # Use scrapers enabled via environment variables + if os.getenv("FIRECRAWL_API_KEY"): + api_url = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev") + enabled_scrapers.append( + (WebScraper.WebScraperType.FIRECRAWL, os.getenv("FIRECRAWL_API_KEY"), api_url, "Firecrawl") + ) + if os.getenv("OLOSTEP_API_KEY"): + api_url = os.getenv("OLOSTEP_API_URL", "https://agent.olostep.com/olostep-p2p-incomingAPI") + enabled_scrapers.append( + (WebScraper.WebScraperType.OLOSTEP, os.getenv("OLOSTEP_API_KEY"), api_url, "Olostep") + ) + # Jina is the default fallback scraper to use as it does not require an API key + api_url = os.getenv("JINA_READER_API_URL", "https://r.jina.ai/") + enabled_scrapers.append((WebScraper.WebScraperType.JINA, os.getenv("JINA_API_KEY"), api_url, 
"Jina")) + return enabled_scrapers @staticmethod def create_conversation_from_public_conversation( diff --git a/src/khoj/database/admin.py b/src/khoj/database/admin.py index 51988752..8e650922 100644 --- a/src/khoj/database/admin.py +++ b/src/khoj/database/admin.py @@ -31,6 +31,7 @@ from khoj.database.models import ( UserSearchModelConfig, UserVoiceModelConfig, VoiceModelOption, + WebScraper, ) from khoj.utils.helpers import ImageIntentType @@ -202,6 +203,19 @@ class ServerChatSettingsAdmin(admin.ModelAdmin): ) +@admin.register(WebScraper) +class WebScraperAdmin(admin.ModelAdmin): + list_display = ( + "name", + "type", + "api_key", + "api_url", + "created_at", + ) + search_fields = ("name", "api_key", "api_url", "type") + ordering = ("-created_at",) + + @admin.register(Conversation) class ConversationAdmin(admin.ModelAdmin): list_display = ( diff --git a/src/khoj/database/migrations/0068_serverchatsettings_web_scraper.py b/src/khoj/database/migrations/0068_serverchatsettings_web_scraper.py deleted file mode 100644 index 89482dbd..00000000 --- a/src/khoj/database/migrations/0068_serverchatsettings_web_scraper.py +++ /dev/null @@ -1,21 +0,0 @@ -# Generated by Django 5.0.8 on 2024-10-16 00:06 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - dependencies = [ - ("database", "0067_alter_agent_style_icon"), - ] - - operations = [ - migrations.AddField( - model_name="serverchatsettings", - name="web_scraper", - field=models.CharField( - choices=[("firecrawl", "Firecrawl"), ("olostep", "Olostep"), ("jinaai", "JinaAI")], - default="jinaai", - max_length=20, - ), - ), - ] diff --git a/src/khoj/database/migrations/0068_webscraper_serverchatsettings_web_scraper.py b/src/khoj/database/migrations/0068_webscraper_serverchatsettings_web_scraper.py new file mode 100644 index 00000000..41d9c80b --- /dev/null +++ b/src/khoj/database/migrations/0068_webscraper_serverchatsettings_web_scraper.py @@ -0,0 +1,47 @@ +# Generated by Django 5.0.8 on 
2024-10-16 06:51 + +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("database", "0067_alter_agent_style_icon"), + ] + + operations = [ + migrations.CreateModel( + name="WebScraper", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ("name", models.CharField(blank=True, default=None, max_length=200, null=True, unique=True)), + ( + "type", + models.CharField( + choices=[("firecrawl", "Firecrawl"), ("olostep", "Olostep"), ("jina", "Jina")], + default="jina", + max_length=20, + ), + ), + ("api_key", models.CharField(blank=True, default=None, max_length=200, null=True)), + ("api_url", models.URLField(blank=True, default=None, null=True)), + ], + options={ + "abstract": False, + }, + ), + migrations.AddField( + model_name="serverchatsettings", + name="web_scraper", + field=models.ForeignKey( + blank=True, + default=None, + null=True, + on_delete=django.db.models.deletion.CASCADE, + related_name="web_scraper", + to="database.webscraper", + ), + ), + ] diff --git a/src/khoj/database/models/__init__.py b/src/khoj/database/models/__init__.py index 7c4a16fa..ec36c6f3 100644 --- a/src/khoj/database/models/__init__.py +++ b/src/khoj/database/models/__init__.py @@ -1,3 +1,4 @@ +import os import re import uuid from random import choice @@ -12,8 +13,6 @@ from django.utils.translation import gettext_lazy from pgvector.django import VectorField from phonenumber_field.modelfields import PhoneNumberField -from khoj.utils.helpers import ConversationCommand - class BaseModel(models.Model): created_at = models.DateTimeField(auto_now_add=True) @@ -245,19 +244,58 @@ class GithubRepoConfig(BaseModel): github_config = models.ForeignKey(GithubConfig, on_delete=models.CASCADE, related_name="githubrepoconfig") -class 
ServerChatSettings(BaseModel): - class WebScraper(models.TextChoices): +class WebScraper(BaseModel): + class WebScraperType(models.TextChoices): FIRECRAWL = "firecrawl", gettext_lazy("Firecrawl") OLOSTEP = "olostep", gettext_lazy("Olostep") - JINAAI = "jinaai", gettext_lazy("JinaAI") + JINA = "jina", gettext_lazy("Jina") + name = models.CharField(max_length=200, default=None, null=True, blank=True, unique=True) + type = models.CharField(max_length=20, choices=WebScraperType.choices, default=WebScraperType.JINA) + api_key = models.CharField(max_length=200, default=None, null=True, blank=True) + api_url = models.URLField(max_length=200, default=None, null=True, blank=True) + + def clean(self): + error = {} + if self.name is None: + self.name = self.type.capitalize() + if self.api_url is None: + if self.type == self.WebScraperType.FIRECRAWL: + self.api_url = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev") + elif self.type == self.WebScraperType.OLOSTEP: + self.api_url = os.getenv("OLOSTEP_API_URL", "https://agent.olostep.com/olostep-p2p-incomingAPI") + elif self.type == self.WebScraperType.JINA: + self.api_url = os.getenv("JINA_READER_API_URL", "https://r.jina.ai/") + if self.api_key is None: + if self.type == self.WebScraperType.FIRECRAWL: + self.api_key = os.getenv("FIRECRAWL_API_KEY") + if not self.api_key and self.api_url == "https://api.firecrawl.dev": + error["api_key"] = "Set API key to use default Firecrawl. Get API key from https://firecrawl.dev." + elif self.type == self.WebScraperType.OLOSTEP: + self.api_key = os.getenv("OLOSTEP_API_KEY") + if self.api_key is None: + error["api_key"] = "Set API key to use Olostep. Get API key from https://olostep.com/." 
+ elif self.type == self.WebScraperType.JINA: + self.api_key = os.getenv("JINA_API_KEY") + + if error: + raise ValidationError(error) + + def save(self, *args, **kwargs): + self.clean() + super().save(*args, **kwargs) + + +class ServerChatSettings(BaseModel): chat_default = models.ForeignKey( ChatModelOptions, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name="chat_default" ) chat_advanced = models.ForeignKey( ChatModelOptions, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name="chat_advanced" ) - web_scraper = models.CharField(max_length=20, choices=WebScraper.choices, default=WebScraper.JINAAI) + web_scraper = models.ForeignKey( + WebScraper, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name="web_scraper" + ) class LocalOrgConfig(BaseModel): diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py index 2fbe8cf3..c111415b 100644 --- a/src/khoj/processor/tools/online_search.py +++ b/src/khoj/processor/tools/online_search.py @@ -11,7 +11,7 @@ from bs4 import BeautifulSoup from markdownify import markdownify from khoj.database.adapters import ConversationAdapters -from khoj.database.models import Agent, KhojUser, ServerChatSettings +from khoj.database.models import Agent, KhojUser, WebScraper from khoj.processor.conversation import prompts from khoj.routers.helpers import ( ChatEvent, @@ -27,16 +27,11 @@ logger = logging.getLogger(__name__) SERPER_DEV_API_KEY = os.getenv("SERPER_DEV_API_KEY") SERPER_DEV_URL = "https://google.serper.dev/search" -JINA_READER_API_URL = "https://r.jina.ai/" JINA_SEARCH_API_URL = "https://s.jina.ai/" JINA_API_KEY = os.getenv("JINA_API_KEY") -FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY") -FIRECRAWL_API_URL = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev") FIRECRAWL_TO_EXTRACT = os.getenv("FIRECRAWL_TO_EXTRACT", "False").lower() == "true" -OLOSTEP_API_KEY = os.getenv("OLOSTEP_API_KEY") -OLOSTEP_API_URL = 
"https://agent.olostep.com/olostep-p2p-incomingAPI" OLOSTEP_QUERY_PARAMS = { "timeout": 35, # seconds "waitBeforeScraping": 1, # seconds @@ -175,29 +170,47 @@ async def read_webpages( yield response +async def read_webpage( + url, scraper_type=None, api_key=None, api_url=None, subqueries=None, agent=None +) -> Tuple[str | None, str | None]: + if scraper_type == WebScraper.WebScraperType.FIRECRAWL and FIRECRAWL_TO_EXTRACT: + return None, await query_webpage_with_firecrawl(url, subqueries, api_key, api_url, agent) + elif scraper_type == WebScraper.WebScraperType.FIRECRAWL: + return await read_webpage_with_firecrawl(url, api_key, api_url), None + elif scraper_type == WebScraper.WebScraperType.OLOSTEP: + return await read_webpage_with_olostep(url, api_key, api_url), None + else: + return await read_webpage_with_jina(url, api_key, api_url), None + + async def read_webpage_and_extract_content( subqueries: set[str], url: str, content: str = None, user: KhojUser = None, agent: Agent = None ) -> Tuple[set[str], str, Union[None, str]]: - # Select the web scraper to use for reading the web page - web_scraper = await ConversationAdapters.aget_webscraper(FIRECRAWL_API_KEY, OLOSTEP_API_KEY) + # Select the web scrapers to use for reading the web page + web_scrapers = await ConversationAdapters.aget_enabled_webscrapers() + + # Fallback through enabled web scrapers until we successfully read the web page extracted_info = None - try: - if is_none_or_empty(content): - with timer(f"Reading web page with {web_scraper.value} at '{url}' took", logger, log_level=logging.INFO): - if web_scraper == ServerChatSettings.WebScraper.FIRECRAWL: - if FIRECRAWL_TO_EXTRACT: - extracted_info = await read_webpage_and_extract_content_with_firecrawl(url, subqueries, agent) - else: - content = await read_webpage_with_firecrawl(url) - elif web_scraper == ServerChatSettings.WebScraper.OLOSTEP: - content = await read_webpage_with_olostep(url) - else: - content = await read_webpage_with_jina(url) - if 
is_none_or_empty(extracted_info): - with timer(f"Extracting relevant information from web page at '{url}' took", logger): - extracted_info = await extract_relevant_info(subqueries, content, user=user, agent=agent) - except Exception as e: - logger.error(f"Failed to read web page with {web_scraper.value} at '{url}' with {e}") + for scraper_type, api_key, api_url, api_name in web_scrapers: + try: + # Read the web page + if is_none_or_empty(content): + with timer(f"Reading web page with {scraper_type} at '{url}' took", logger, log_level=logging.INFO): + content, extracted_info = await read_webpage(url, scraper_type, api_key, api_url, subqueries, agent) + + # Extract relevant information from the web page + if is_none_or_empty(extracted_info): + with timer(f"Extracting relevant information from web page at '{url}' took", logger): + extracted_info = await extract_relevant_info(subqueries, content, user=user, agent=agent) + + # If we successfully extracted information, break the loop + if not is_none_or_empty(extracted_info): + break + except Exception as e: + logger.warning(f"Failed to read web page with {scraper_type} at '{url}' with {e}") + # If this is the last web scraper in the list, log an error + if api_name == web_scrapers[-1][-1]: + logger.error(f"All web scrapers failed for '{url}'") return subqueries, url, extracted_info @@ -216,23 +229,23 @@ async def read_webpage_at_url(web_url: str) -> str: return markdownify(body) -async def read_webpage_with_olostep(web_url: str) -> str: - headers = {"Authorization": f"Bearer {OLOSTEP_API_KEY}"} +async def read_webpage_with_olostep(web_url: str, api_key: str, api_url: str) -> str: + headers = {"Authorization": f"Bearer {api_key}"} web_scraping_params: Dict[str, Union[str, int, bool]] = OLOSTEP_QUERY_PARAMS.copy() # type: ignore web_scraping_params["url"] = web_url async with aiohttp.ClientSession() as session: - async with session.get(OLOSTEP_API_URL, params=web_scraping_params, headers=headers) as response: + async with 
session.get(api_url, params=web_scraping_params, headers=headers) as response: response.raise_for_status() response_json = await response.json() return response_json["markdown_content"] -async def read_webpage_with_jina(web_url: str) -> str: - jina_reader_api_url = f"{JINA_READER_API_URL}/{web_url}" +async def read_webpage_with_jina(web_url: str, api_key: str, api_url: str) -> str: + jina_reader_api_url = f"{api_url}/{web_url}" headers = {"Accept": "application/json", "X-Timeout": "30"} - if JINA_API_KEY: - headers["Authorization"] = f"Bearer {JINA_API_KEY}" + if api_key: + headers["Authorization"] = f"Bearer {api_key}" async with aiohttp.ClientSession() as session: async with session.get(jina_reader_api_url, headers=headers) as response: @@ -241,9 +254,9 @@ async def read_webpage_with_jina(web_url: str) -> str: return response_json["data"]["content"] -async def read_webpage_with_firecrawl(web_url: str) -> str: - firecrawl_api_url = f"{FIRECRAWL_API_URL}/v1/scrape" - headers = {"Content-Type": "application/json", "Authorization": f"Bearer {FIRECRAWL_API_KEY}"} +async def read_webpage_with_firecrawl(web_url: str, api_key: str, api_url: str) -> str: + firecrawl_api_url = f"{api_url}/v1/scrape" + headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"} params = {"url": web_url, "formats": ["markdown"], "excludeTags": ["script", ".ad"]} async with aiohttp.ClientSession() as session: @@ -253,9 +266,11 @@ async def read_webpage_with_firecrawl(web_url: str) -> str: return response_json["data"]["markdown"] -async def read_webpage_and_extract_content_with_firecrawl(web_url: str, queries: set[str], agent: Agent = None) -> str: - firecrawl_api_url = f"{FIRECRAWL_API_URL}/v1/scrape" - headers = {"Content-Type": "application/json", "Authorization": f"Bearer {FIRECRAWL_API_KEY}"} +async def query_webpage_with_firecrawl( + web_url: str, queries: set[str], api_key: str, api_url: str, agent: Agent = None +) -> str: + firecrawl_api_url = 
f"{api_url}/v1/scrape" + headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"} schema = { "type": "object", "properties": { From 20b6f0c2f4857157f47b32cf7dd199aac1f40d9b Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 16 Oct 2024 02:57:51 -0700 Subject: [PATCH 11/13] Access internal links directly via a simple get request The other webpage scrapers will not work for internal webpages. Try to access those URLs directly if they are visible to the Khoj server over the network. Only enable this by default for self-hosted, single user setups. Otherwise the ability to scan the internal network would be a liability! For use-cases where it makes sense, the Khoj server admin can explicitly add the direct webpage scraper via the admin panel --- src/khoj/database/adapters/__init__.py | 16 +++++++- src/khoj/database/models/__init__.py | 1 + src/khoj/processor/tools/online_search.py | 17 +++++++-- src/khoj/utils/helpers.py | 46 +++++++++++++++++++++++ 4 files changed, 75 insertions(+), 5 deletions(-) diff --git a/src/khoj/database/adapters/__init__.py b/src/khoj/database/adapters/__init__.py index 8c6aa5e4..7be931c5 100644 --- a/src/khoj/database/adapters/__init__.py +++ b/src/khoj/database/adapters/__init__.py @@ -60,7 +60,12 @@ from khoj.search_filter.file_filter import FileFilter from khoj.search_filter.word_filter import WordFilter from khoj.utils import state from khoj.utils.config import OfflineChatProcessorModel -from khoj.utils.helpers import generate_random_name, is_none_or_empty, timer +from khoj.utils.helpers import ( + generate_random_name, + in_debug_mode, + is_none_or_empty, + timer, +) logger = logging.getLogger(__name__) @@ -1066,9 +1071,16 @@ class ConversationAdapters: enabled_scrapers.append( (WebScraper.WebScraperType.OLOSTEP, os.getenv("OLOSTEP_API_KEY"), api_url, "Olostep") ) - # Jina is the default fallback scraper to use as it does not require an API key + + # Jina is the default fallback scrapers to use as it does 
not require an API key api_url = os.getenv("JINA_READER_API_URL", "https://r.jina.ai/") enabled_scrapers.append((WebScraper.WebScraperType.JINA, os.getenv("JINA_API_KEY"), api_url, "Jina")) + + # Only enable the direct web page scraper by default in self-hosted single user setups. + # Useful for reading webpages on your intranet. + if state.anonymous_mode or in_debug_mode(): + enabled_scrapers.append((WebScraper.WebScraperType.DIRECT, None, None, "Direct")) + return enabled_scrapers @staticmethod diff --git a/src/khoj/database/models/__init__.py b/src/khoj/database/models/__init__.py index ec36c6f3..56f482ae 100644 --- a/src/khoj/database/models/__init__.py +++ b/src/khoj/database/models/__init__.py @@ -249,6 +249,7 @@ class WebScraper(BaseModel): FIRECRAWL = "firecrawl", gettext_lazy("Firecrawl") OLOSTEP = "olostep", gettext_lazy("Olostep") JINA = "jina", gettext_lazy("Jina") + DIRECT = "direct", gettext_lazy("Direct") name = models.CharField(max_length=200, default=None, null=True, blank=True, unique=True) type = models.CharField(max_length=20, choices=WebScraperType.choices, default=WebScraperType.JINA) diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py index c111415b..c00660e3 100644 --- a/src/khoj/processor/tools/online_search.py +++ b/src/khoj/processor/tools/online_search.py @@ -19,7 +19,13 @@ from khoj.routers.helpers import ( generate_online_subqueries, infer_webpage_urls, ) -from khoj.utils.helpers import is_internet_connected, is_none_or_empty, timer +from khoj.utils.helpers import ( + is_env_var_true, + is_internal_url, + is_internet_connected, + is_none_or_empty, + timer, +) from khoj.utils.rawconfig import LocationData logger = logging.getLogger(__name__) @@ -30,7 +36,7 @@ SERPER_DEV_URL = "https://google.serper.dev/search" JINA_SEARCH_API_URL = "https://s.jina.ai/" JINA_API_KEY = os.getenv("JINA_API_KEY") -FIRECRAWL_TO_EXTRACT = os.getenv("FIRECRAWL_TO_EXTRACT", "False").lower() == "true" 
+FIRECRAWL_TO_EXTRACT = is_env_var_true("FIRECRAWL_TO_EXTRACT") OLOSTEP_QUERY_PARAMS = { "timeout": 35, # seconds @@ -179,8 +185,10 @@ async def read_webpage( return await read_webpage_with_firecrawl(url, api_key, api_url), None elif scraper_type == WebScraper.WebScraperType.OLOSTEP: return await read_webpage_with_olostep(url, api_key, api_url), None - else: + elif scraper_type == WebScraper.WebScraperType.JINA: return await read_webpage_with_jina(url, api_key, api_url), None + else: + return await read_webpage_at_url(url), None async def read_webpage_and_extract_content( @@ -188,6 +196,9 @@ async def read_webpage_and_extract_content( ) -> Tuple[set[str], str, Union[None, str]]: # Select the web scrapers to use for reading the web page web_scrapers = await ConversationAdapters.aget_enabled_webscrapers() + # Only use the direct web scraper for internal URLs + if is_internal_url(url): + web_scrapers = [scraper for scraper in web_scrapers if scraper[0] == WebScraper.WebScraperType.DIRECT] # Fallback through enabled web scrapers until we successfully read the web page extracted_info = None diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py index f16f922c..4e5736a2 100644 --- a/src/khoj/utils/helpers.py +++ b/src/khoj/utils/helpers.py @@ -2,10 +2,12 @@ from __future__ import annotations # to avoid quoting type hints import datetime import io +import ipaddress import logging import os import platform import random +import urllib.parse import uuid from collections import OrderedDict from enum import Enum @@ -436,6 +438,50 @@ def is_internet_connected(): return False +def is_internal_url(url: str) -> bool: + """ + Check if a URL is likely to be internal/non-public. + + Args: + url (str): The URL to check. + + Returns: + bool: True if the URL is likely internal, False otherwise. 
+ """ + try: + parsed_url = urllib.parse.urlparse(url) + hostname = parsed_url.hostname + + # Check for localhost + if hostname in ["localhost", "127.0.0.1", "::1"]: + return True + + # Check for IP addresses in private ranges + try: + ip = ipaddress.ip_address(hostname) + return ip.is_private + except ValueError: + pass # Not an IP address, continue with other checks + + # Check for common internal TLDs + internal_tlds = [".local", ".internal", ".private", ".corp", ".home", ".lan"] + if any(hostname.endswith(tld) for tld in internal_tlds): + return True + + # Check for non-standard ports + # if parsed_url.port and parsed_url.port not in [80, 443]: + # return True + + # Check for URLs without a TLD + if "." not in hostname: + return True + + return False + except Exception: + # If we can't parse the URL or something else goes wrong, assume it's not internal + return False + + def convert_image_to_webp(image_bytes): """Convert image bytes to webp format for faster loading""" image_io = io.BytesIO(image_bytes) From 0db52786ed0a353a003a52453f4010286b71eebb Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 17 Oct 2024 16:22:46 -0700 Subject: [PATCH 12/13] Make web scraper priority configurable via admin panel - Simplifies changing the order in which web scrapers are invoked to read a web page by just changing their priority number on the admin panel. Previously you'd have to delete and re-add the scrapers to change their priority. - Add help text for each scraper field to ease admin setup experience - Friendlier env var to use Firecrawl's LLM to extract content - Remove use of separate friendly name for scraper types. 
Reuse actual name and just make actual name better --- src/khoj/database/adapters/__init__.py | 4 +- src/khoj/database/admin.py | 3 +- ...bscraper_serverchatsettings_web_scraper.py | 47 ---------- ...bscraper_serverchatsettings_web_scraper.py | 89 +++++++++++++++++++ src/khoj/database/models/__init__.py | 47 ++++++++-- src/khoj/processor/tools/online_search.py | 4 +- 6 files changed, 133 insertions(+), 61 deletions(-) delete mode 100644 src/khoj/database/migrations/0068_webscraper_serverchatsettings_web_scraper.py create mode 100644 src/khoj/database/migrations/0069_webscraper_serverchatsettings_web_scraper.py diff --git a/src/khoj/database/adapters/__init__.py b/src/khoj/database/adapters/__init__.py index 7be931c5..0f078a00 100644 --- a/src/khoj/database/adapters/__init__.py +++ b/src/khoj/database/adapters/__init__.py @@ -1054,10 +1054,10 @@ class ConversationAdapters: (server_webscraper.type, server_webscraper.api_key, server_webscraper.api_url, server_webscraper.name) ] if not enabled_scrapers: - # Use the enabled web scrapers, using the newest created scraper first, until get web page content + # Use the enabled web scrapers, ordered by priority, until get web page content enabled_scrapers = [ (scraper.type, scraper.api_key, scraper.api_url, scraper.name) - async for scraper in WebScraper.objects.all().order_by("-created_at").aiterator() + async for scraper in WebScraper.objects.all().order_by("priority").aiterator() ] if not enabled_scrapers: # Use scrapers enabled via environment variables diff --git a/src/khoj/database/admin.py b/src/khoj/database/admin.py index 8e650922..5aa9204b 100644 --- a/src/khoj/database/admin.py +++ b/src/khoj/database/admin.py @@ -206,6 +206,7 @@ class ServerChatSettingsAdmin(admin.ModelAdmin): @admin.register(WebScraper) class WebScraperAdmin(admin.ModelAdmin): list_display = ( + "priority", "name", "type", "api_key", @@ -213,7 +214,7 @@ class WebScraperAdmin(admin.ModelAdmin): "created_at", ) search_fields = ("name", "api_key", 
"api_url", "type") - ordering = ("-created_at",) + ordering = ("priority",) @admin.register(Conversation) diff --git a/src/khoj/database/migrations/0068_webscraper_serverchatsettings_web_scraper.py b/src/khoj/database/migrations/0068_webscraper_serverchatsettings_web_scraper.py deleted file mode 100644 index 41d9c80b..00000000 --- a/src/khoj/database/migrations/0068_webscraper_serverchatsettings_web_scraper.py +++ /dev/null @@ -1,47 +0,0 @@ -# Generated by Django 5.0.8 on 2024-10-16 06:51 - -import django.db.models.deletion -from django.db import migrations, models - - -class Migration(migrations.Migration): - dependencies = [ - ("database", "0067_alter_agent_style_icon"), - ] - - operations = [ - migrations.CreateModel( - name="WebScraper", - fields=[ - ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), - ("created_at", models.DateTimeField(auto_now_add=True)), - ("updated_at", models.DateTimeField(auto_now=True)), - ("name", models.CharField(blank=True, default=None, max_length=200, null=True, unique=True)), - ( - "type", - models.CharField( - choices=[("firecrawl", "Firecrawl"), ("olostep", "Olostep"), ("jina", "Jina")], - default="jina", - max_length=20, - ), - ), - ("api_key", models.CharField(blank=True, default=None, max_length=200, null=True)), - ("api_url", models.URLField(blank=True, default=None, null=True)), - ], - options={ - "abstract": False, - }, - ), - migrations.AddField( - model_name="serverchatsettings", - name="web_scraper", - field=models.ForeignKey( - blank=True, - default=None, - null=True, - on_delete=django.db.models.deletion.CASCADE, - related_name="web_scraper", - to="database.webscraper", - ), - ), - ] diff --git a/src/khoj/database/migrations/0069_webscraper_serverchatsettings_web_scraper.py b/src/khoj/database/migrations/0069_webscraper_serverchatsettings_web_scraper.py new file mode 100644 index 00000000..3ea8ebe3 --- /dev/null +++ 
b/src/khoj/database/migrations/0069_webscraper_serverchatsettings_web_scraper.py @@ -0,0 +1,89 @@ +# Generated by Django 5.0.8 on 2024-10-18 00:41 + +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("database", "0068_alter_agent_output_modes"), + ] + + operations = [ + migrations.CreateModel( + name="WebScraper", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ( + "name", + models.CharField( + blank=True, + default=None, + help_text="Friendly name. If not set, it will be set to the type of the scraper.", + max_length=200, + null=True, + unique=True, + ), + ), + ( + "type", + models.CharField( + choices=[ + ("Firecrawl", "Firecrawl"), + ("Olostep", "Olostep"), + ("Jina", "Jina"), + ("Direct", "Direct"), + ], + default="Jina", + max_length=20, + ), + ), + ( + "api_key", + models.CharField( + blank=True, + default=None, + help_text="API key of the web scraper. Only set if scraper service requires an API key. Default is set from env var.", + max_length=200, + null=True, + ), + ), + ( + "api_url", + models.URLField( + blank=True, + default=None, + help_text="API URL of the web scraper. Only set if scraper service on non-default URL.", + null=True, + ), + ), + ( + "priority", + models.IntegerField( + blank=True, + default=None, + help_text="Priority of the web scraper. 
Lower numbers run first.", + null=True, + unique=True, + ), + ), + ], + options={ + "abstract": False, + }, + ), + migrations.AddField( + model_name="serverchatsettings", + name="web_scraper", + field=models.ForeignKey( + blank=True, + default=None, + null=True, + on_delete=django.db.models.deletion.CASCADE, + related_name="web_scraper", + to="database.webscraper", + ), + ), + ] diff --git a/src/khoj/database/models/__init__.py b/src/khoj/database/models/__init__.py index 56f482ae..2b2fde2d 100644 --- a/src/khoj/database/models/__init__.py +++ b/src/khoj/database/models/__init__.py @@ -9,7 +9,6 @@ from django.core.exceptions import ValidationError from django.db import models from django.db.models.signals import pre_save from django.dispatch import receiver -from django.utils.translation import gettext_lazy from pgvector.django import VectorField from phonenumber_field.modelfields import PhoneNumberField @@ -246,15 +245,41 @@ class GithubRepoConfig(BaseModel): class WebScraper(BaseModel): class WebScraperType(models.TextChoices): - FIRECRAWL = "firecrawl", gettext_lazy("Firecrawl") - OLOSTEP = "olostep", gettext_lazy("Olostep") - JINA = "jina", gettext_lazy("Jina") - DIRECT = "direct", gettext_lazy("Direct") + FIRECRAWL = "Firecrawl" + OLOSTEP = "Olostep" + JINA = "Jina" + DIRECT = "Direct" - name = models.CharField(max_length=200, default=None, null=True, blank=True, unique=True) + name = models.CharField( + max_length=200, + default=None, + null=True, + blank=True, + unique=True, + help_text="Friendly name. If not set, it will be set to the type of the scraper.", + ) type = models.CharField(max_length=20, choices=WebScraperType.choices, default=WebScraperType.JINA) - api_key = models.CharField(max_length=200, default=None, null=True, blank=True) - api_url = models.URLField(max_length=200, default=None, null=True, blank=True) + api_key = models.CharField( + max_length=200, + default=None, + null=True, + blank=True, + help_text="API key of the web scraper. 
Only set if scraper service requires an API key. Default is set from env var.", + ) + api_url = models.URLField( + max_length=200, + default=None, + null=True, + blank=True, + help_text="API URL of the web scraper. Only set if scraper service on non-default URL.", + ) + priority = models.IntegerField( + default=None, + null=True, + blank=True, + unique=True, + help_text="Priority of the web scraper. Lower numbers run first.", + ) def clean(self): error = {} @@ -278,12 +303,16 @@ class WebScraper(BaseModel): error["api_key"] = "Set API key to use Olostep. Get API key from https://olostep.com/." elif self.type == self.WebScraperType.JINA: self.api_key = os.getenv("JINA_API_KEY") - if error: raise ValidationError(error) def save(self, *args, **kwargs): self.clean() + + if self.priority is None: + max_priority = WebScraper.objects.aggregate(models.Max("priority"))["priority__max"] + self.priority = max_priority + 1 if max_priority else 1 + super().save(*args, **kwargs) diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py index c00660e3..fee0fa03 100644 --- a/src/khoj/processor/tools/online_search.py +++ b/src/khoj/processor/tools/online_search.py @@ -36,7 +36,7 @@ SERPER_DEV_URL = "https://google.serper.dev/search" JINA_SEARCH_API_URL = "https://s.jina.ai/" JINA_API_KEY = os.getenv("JINA_API_KEY") -FIRECRAWL_TO_EXTRACT = is_env_var_true("FIRECRAWL_TO_EXTRACT") +FIRECRAWL_USE_LLM_EXTRACT = is_env_var_true("FIRECRAWL_USE_LLM_EXTRACT") OLOSTEP_QUERY_PARAMS = { "timeout": 35, # seconds @@ -179,7 +179,7 @@ async def read_webpages( async def read_webpage( url, scraper_type=None, api_key=None, api_url=None, subqueries=None, agent=None ) -> Tuple[str | None, str | None]: - if scraper_type == WebScraper.WebScraperType.FIRECRAWL and FIRECRAWL_TO_EXTRACT: + if scraper_type == WebScraper.WebScraperType.FIRECRAWL and FIRECRAWL_USE_LLM_EXTRACT: return None, await query_webpage_with_firecrawl(url, subqueries, api_key, api_url, agent) elif 
scraper_type == WebScraper.WebScraperType.FIRECRAWL: return await read_webpage_with_firecrawl(url, api_key, api_url), None From 2c20f49bc59c69192b56a97883e7a47ac95287b1 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 17 Oct 2024 17:15:53 -0700 Subject: [PATCH 13/13] Return enabled scrapers as WebScraper objects for more ergonomic code --- src/khoj/database/adapters/__init__.py | 46 ++++++++++++++++------- src/khoj/processor/tools/online_search.py | 14 ++++--- src/khoj/utils/helpers.py | 4 -- 3 files changed, 40 insertions(+), 24 deletions(-) diff --git a/src/khoj/database/adapters/__init__.py b/src/khoj/database/adapters/__init__.py index 0f078a00..28946557 100644 --- a/src/khoj/database/adapters/__init__.py +++ b/src/khoj/database/adapters/__init__.py @@ -1045,41 +1045,59 @@ class ConversationAdapters: return None @staticmethod - async def aget_enabled_webscrapers(): - enabled_scrapers = [] + async def aget_enabled_webscrapers() -> list[WebScraper]: + enabled_scrapers: list[WebScraper] = [] server_webscraper = await ConversationAdapters.aget_server_webscraper() if server_webscraper: # Only use the webscraper set in the server chat settings - enabled_scrapers = [ - (server_webscraper.type, server_webscraper.api_key, server_webscraper.api_url, server_webscraper.name) - ] + enabled_scrapers = [server_webscraper] if not enabled_scrapers: # Use the enabled web scrapers, ordered by priority, until get web page content - enabled_scrapers = [ - (scraper.type, scraper.api_key, scraper.api_url, scraper.name) - async for scraper in WebScraper.objects.all().order_by("priority").aiterator() - ] + enabled_scrapers = [scraper async for scraper in WebScraper.objects.all().order_by("priority").aiterator()] if not enabled_scrapers: # Use scrapers enabled via environment variables if os.getenv("FIRECRAWL_API_KEY"): api_url = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev") enabled_scrapers.append( - (WebScraper.WebScraperType.FIRECRAWL, 
os.getenv("FIRECRAWL_API_KEY"), api_url, "Firecrawl") + WebScraper( + type=WebScraper.WebScraperType.FIRECRAWL, + name=WebScraper.WebScraperType.FIRECRAWL.capitalize(), + api_key=os.getenv("FIRECRAWL_API_KEY"), + api_url=api_url, + ) ) if os.getenv("OLOSTEP_API_KEY"): api_url = os.getenv("OLOSTEP_API_URL", "https://agent.olostep.com/olostep-p2p-incomingAPI") enabled_scrapers.append( - (WebScraper.WebScraperType.OLOSTEP, os.getenv("OLOSTEP_API_KEY"), api_url, "Olostep") + WebScraper( + type=WebScraper.WebScraperType.OLOSTEP, + name=WebScraper.WebScraperType.OLOSTEP.capitalize(), + api_key=os.getenv("OLOSTEP_API_KEY"), + api_url=api_url, + ) ) - # Jina is the default fallback scrapers to use as it does not require an API key api_url = os.getenv("JINA_READER_API_URL", "https://r.jina.ai/") - enabled_scrapers.append((WebScraper.WebScraperType.JINA, os.getenv("JINA_API_KEY"), api_url, "Jina")) + enabled_scrapers.append( + WebScraper( + type=WebScraper.WebScraperType.JINA, + name=WebScraper.WebScraperType.JINA.capitalize(), + api_key=os.getenv("JINA_API_KEY"), + api_url=api_url, + ) + ) # Only enable the direct web page scraper by default in self-hosted single user setups. # Useful for reading webpages on your intranet. 
if state.anonymous_mode or in_debug_mode(): - enabled_scrapers.append((WebScraper.WebScraperType.DIRECT, None, None, "Direct")) + enabled_scrapers.append( + WebScraper( + type=WebScraper.WebScraperType.DIRECT, + name=WebScraper.WebScraperType.DIRECT.capitalize(), + api_key=None, + api_url=None, + ) + ) return enabled_scrapers diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py index fee0fa03..70972eac 100644 --- a/src/khoj/processor/tools/online_search.py +++ b/src/khoj/processor/tools/online_search.py @@ -198,16 +198,18 @@ async def read_webpage_and_extract_content( web_scrapers = await ConversationAdapters.aget_enabled_webscrapers() # Only use the direct web scraper for internal URLs if is_internal_url(url): - web_scrapers = [scraper for scraper in web_scrapers if scraper[0] == WebScraper.WebScraperType.DIRECT] + web_scrapers = [scraper for scraper in web_scrapers if scraper.type == WebScraper.WebScraperType.DIRECT] # Fallback through enabled web scrapers until we successfully read the web page extracted_info = None - for scraper_type, api_key, api_url, api_name in web_scrapers: + for scraper in web_scrapers: try: # Read the web page if is_none_or_empty(content): - with timer(f"Reading web page with {scraper_type} at '{url}' took", logger, log_level=logging.INFO): - content, extracted_info = await read_webpage(url, scraper_type, api_key, api_url, subqueries, agent) + with timer(f"Reading web page with {scraper.type} at '{url}' took", logger, log_level=logging.INFO): + content, extracted_info = await read_webpage( + url, scraper.type, scraper.api_key, scraper.api_url, subqueries, agent + ) # Extract relevant information from the web page if is_none_or_empty(extracted_info): @@ -218,9 +220,9 @@ async def read_webpage_and_extract_content( if not is_none_or_empty(extracted_info): break except Exception as e: - logger.warning(f"Failed to read web page with {scraper_type} at '{url}' with {e}") + logger.warning(f"Failed to 
read web page with {scraper.type} at '{url}' with {e}") # If this is the last web scraper in the list, log an error - if api_name == web_scrapers[-1][-1]: + if scraper.name == web_scrapers[-1].name: logger.error(f"All web scrapers failed for '{url}'") return subqueries, url, extracted_info diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py index 4e5736a2..7006d7d4 100644 --- a/src/khoj/utils/helpers.py +++ b/src/khoj/utils/helpers.py @@ -468,10 +468,6 @@ def is_internal_url(url: str) -> bool: if any(hostname.endswith(tld) for tld in internal_tlds): return True - # Check for non-standard ports - # if parsed_url.port and parsed_url.port not in [80, 443]: - # return True - # Check for URLs without a TLD if "." not in hostname: return True