diff --git a/src/khoj/database/adapters/__init__.py b/src/khoj/database/adapters/__init__.py
index 0f078a00..28946557 100644
--- a/src/khoj/database/adapters/__init__.py
+++ b/src/khoj/database/adapters/__init__.py
@@ -1045,41 +1045,59 @@ class ConversationAdapters:
         return None
 
     @staticmethod
-    async def aget_enabled_webscrapers():
-        enabled_scrapers = []
+    async def aget_enabled_webscrapers() -> list[WebScraper]:
+        enabled_scrapers: list[WebScraper] = []
         server_webscraper = await ConversationAdapters.aget_server_webscraper()
         if server_webscraper:
             # Only use the webscraper set in the server chat settings
-            enabled_scrapers = [
-                (server_webscraper.type, server_webscraper.api_key, server_webscraper.api_url, server_webscraper.name)
-            ]
+            enabled_scrapers = [server_webscraper]
         if not enabled_scrapers:
             # Use the enabled web scrapers, ordered by priority, until get web page content
-            enabled_scrapers = [
-                (scraper.type, scraper.api_key, scraper.api_url, scraper.name)
-                async for scraper in WebScraper.objects.all().order_by("priority").aiterator()
-            ]
+            enabled_scrapers = [scraper async for scraper in WebScraper.objects.all().order_by("priority").aiterator()]
         if not enabled_scrapers:
             # Use scrapers enabled via environment variables
             if os.getenv("FIRECRAWL_API_KEY"):
                 api_url = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev")
                 enabled_scrapers.append(
-                    (WebScraper.WebScraperType.FIRECRAWL, os.getenv("FIRECRAWL_API_KEY"), api_url, "Firecrawl")
+                    WebScraper(
+                        type=WebScraper.WebScraperType.FIRECRAWL,
+                        name=WebScraper.WebScraperType.FIRECRAWL.capitalize(),
+                        api_key=os.getenv("FIRECRAWL_API_KEY"),
+                        api_url=api_url,
+                    )
                 )
             if os.getenv("OLOSTEP_API_KEY"):
                 api_url = os.getenv("OLOSTEP_API_URL", "https://agent.olostep.com/olostep-p2p-incomingAPI")
                 enabled_scrapers.append(
-                    (WebScraper.WebScraperType.OLOSTEP, os.getenv("OLOSTEP_API_KEY"), api_url, "Olostep")
+                    WebScraper(
+                        type=WebScraper.WebScraperType.OLOSTEP,
+                        name=WebScraper.WebScraperType.OLOSTEP.capitalize(),
+                        api_key=os.getenv("OLOSTEP_API_KEY"),
+                        api_url=api_url,
+                    )
                 )
-
             # Jina is the default fallback scrapers to use as it does not require an API key
             api_url = os.getenv("JINA_READER_API_URL", "https://r.jina.ai/")
-            enabled_scrapers.append((WebScraper.WebScraperType.JINA, os.getenv("JINA_API_KEY"), api_url, "Jina"))
+            enabled_scrapers.append(
+                WebScraper(
+                    type=WebScraper.WebScraperType.JINA,
+                    name=WebScraper.WebScraperType.JINA.capitalize(),
+                    api_key=os.getenv("JINA_API_KEY"),
+                    api_url=api_url,
+                )
+            )
 
             # Only enable the direct web page scraper by default in self-hosted single user setups.
             # Useful for reading webpages on your intranet.
             if state.anonymous_mode or in_debug_mode():
-                enabled_scrapers.append((WebScraper.WebScraperType.DIRECT, None, None, "Direct"))
+                enabled_scrapers.append(
+                    WebScraper(
+                        type=WebScraper.WebScraperType.DIRECT,
+                        name=WebScraper.WebScraperType.DIRECT.capitalize(),
+                        api_key=None,
+                        api_url=None,
+                    )
+                )
 
         return enabled_scrapers
 
diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py
index fee0fa03..70972eac 100644
--- a/src/khoj/processor/tools/online_search.py
+++ b/src/khoj/processor/tools/online_search.py
@@ -198,16 +198,18 @@ async def read_webpage_and_extract_content(
     web_scrapers = await ConversationAdapters.aget_enabled_webscrapers()
     # Only use the direct web scraper for internal URLs
     if is_internal_url(url):
-        web_scrapers = [scraper for scraper in web_scrapers if scraper[0] == WebScraper.WebScraperType.DIRECT]
+        web_scrapers = [scraper for scraper in web_scrapers if scraper.type == WebScraper.WebScraperType.DIRECT]
 
     # Fallback through enabled web scrapers until we successfully read the web page
     extracted_info = None
-    for scraper_type, api_key, api_url, api_name in web_scrapers:
+    for scraper in web_scrapers:
         try:
             # Read the web page
             if is_none_or_empty(content):
-                with timer(f"Reading web page with {scraper_type} at '{url}' took", logger, log_level=logging.INFO):
-                    content, extracted_info = await read_webpage(url, scraper_type, api_key, api_url, subqueries, agent)
+                with timer(f"Reading web page with {scraper.type} at '{url}' took", logger, log_level=logging.INFO):
+                    content, extracted_info = await read_webpage(
+                        url, scraper.type, scraper.api_key, scraper.api_url, subqueries, agent
+                    )
 
             # Extract relevant information from the web page
             if is_none_or_empty(extracted_info):
@@ -218,9 +220,9 @@ async def read_webpage_and_extract_content(
             if not is_none_or_empty(extracted_info):
                 break
         except Exception as e:
-            logger.warning(f"Failed to read web page with {scraper_type} at '{url}' with {e}")
+            logger.warning(f"Failed to read web page with {scraper.type} at '{url}' with {e}")
             # If this is the last web scraper in the list, log an error
-            if api_name == web_scrapers[-1][-1]:
+            if scraper.name == web_scrapers[-1].name:
                 logger.error(f"All web scrapers failed for '{url}'")
 
     return subqueries, url, extracted_info
diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py
index 4e5736a2..7006d7d4 100644
--- a/src/khoj/utils/helpers.py
+++ b/src/khoj/utils/helpers.py
@@ -468,10 +468,6 @@ def is_internal_url(url: str) -> bool:
     if any(hostname.endswith(tld) for tld in internal_tlds):
         return True
 
-    # Check for non-standard ports
-    # if parsed_url.port and parsed_url.port not in [80, 443]:
-    #     return True
-
    # Check for URLs without a TLD
     if "." not in hostname:
         return True
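
Reviewer note: the pattern these hunks converge on is priority-ordered fallback over scraper config objects, with attribute access (scraper.type, scraper.api_key, ...) instead of positional tuple unpacking. Below is a minimal, self-contained sketch of that pattern for reference only; ScraperConfig, fetch_with, and read_with_fallback are hypothetical stand-ins for the WebScraper model and read_webpage, not Khoj code.

from dataclasses import dataclass
from typing import Optional


@dataclass
class ScraperConfig:
    # Hypothetical stand-in for the WebScraper Django model used in the diff.
    type: str
    name: str
    api_key: Optional[str] = None
    api_url: Optional[str] = None


def fetch_with(scraper: ScraperConfig, url: str) -> Optional[str]:
    # Hypothetical stand-in for read_webpage(); returns page content or None on failure.
    return None


def read_with_fallback(url: str, scrapers: list[ScraperConfig]) -> Optional[str]:
    # Try each enabled scraper in priority order until one returns content.
    for scraper in scrapers:
        try:
            content = fetch_with(scraper, url)
            if content:
                return content
        except Exception:
            # Only report total failure once the last scraper has been tried,
            # mirroring the check on web_scrapers[-1].name in online_search.py.
            if scraper.name == scrapers[-1].name:
                print(f"All web scrapers failed for '{url}'")
    return None

Compared to the old 4-tuples, passing the config objects around means new fields can be added to the scraper model without touching every unpacking site, which is what the online_search.py hunks rely on.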