diff --git a/src/khoj/database/adapters/__init__.py b/src/khoj/database/adapters/__init__.py
index 8c6aa5e4..7be931c5 100644
--- a/src/khoj/database/adapters/__init__.py
+++ b/src/khoj/database/adapters/__init__.py
@@ -60,7 +60,12 @@ from khoj.search_filter.file_filter import FileFilter
 from khoj.search_filter.word_filter import WordFilter
 from khoj.utils import state
 from khoj.utils.config import OfflineChatProcessorModel
-from khoj.utils.helpers import generate_random_name, is_none_or_empty, timer
+from khoj.utils.helpers import (
+    generate_random_name,
+    in_debug_mode,
+    is_none_or_empty,
+    timer,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -1066,9 +1071,16 @@ class ConversationAdapters:
             enabled_scrapers.append(
                 (WebScraper.WebScraperType.OLOSTEP, os.getenv("OLOSTEP_API_KEY"), api_url, "Olostep")
             )
-        # Jina is the default fallback scraper to use as it does not require an API key
+
+        # Jina is the default fallback scraper to use as it does not require an API key
         api_url = os.getenv("JINA_READER_API_URL", "https://r.jina.ai/")
         enabled_scrapers.append((WebScraper.WebScraperType.JINA, os.getenv("JINA_API_KEY"), api_url, "Jina"))
+
+        # Only enable the direct web page scraper by default in self-hosted single user setups.
+        # Useful for reading webpages on your intranet.
+        if state.anonymous_mode or in_debug_mode():
+            enabled_scrapers.append((WebScraper.WebScraperType.DIRECT, None, None, "Direct"))
+
         return enabled_scrapers
 
     @staticmethod
diff --git a/src/khoj/database/models/__init__.py b/src/khoj/database/models/__init__.py
index ec36c6f3..56f482ae 100644
--- a/src/khoj/database/models/__init__.py
+++ b/src/khoj/database/models/__init__.py
@@ -249,6 +249,7 @@ class WebScraper(BaseModel):
         FIRECRAWL = "firecrawl", gettext_lazy("Firecrawl")
         OLOSTEP = "olostep", gettext_lazy("Olostep")
         JINA = "jina", gettext_lazy("Jina")
+        DIRECT = "direct", gettext_lazy("Direct")
 
     name = models.CharField(max_length=200, default=None, null=True, blank=True, unique=True)
     type = models.CharField(max_length=20, choices=WebScraperType.choices, default=WebScraperType.JINA)
diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py
index c111415b..c00660e3 100644
--- a/src/khoj/processor/tools/online_search.py
+++ b/src/khoj/processor/tools/online_search.py
@@ -19,7 +19,13 @@ from khoj.routers.helpers import (
     generate_online_subqueries,
     infer_webpage_urls,
 )
-from khoj.utils.helpers import is_internet_connected, is_none_or_empty, timer
+from khoj.utils.helpers import (
+    is_env_var_true,
+    is_internal_url,
+    is_internet_connected,
+    is_none_or_empty,
+    timer,
+)
 from khoj.utils.rawconfig import LocationData
 
 logger = logging.getLogger(__name__)
@@ -30,7 +36,7 @@ SERPER_DEV_URL = "https://google.serper.dev/search"
 
 JINA_SEARCH_API_URL = "https://s.jina.ai/"
 JINA_API_KEY = os.getenv("JINA_API_KEY")
-FIRECRAWL_TO_EXTRACT = os.getenv("FIRECRAWL_TO_EXTRACT", "False").lower() == "true"
+FIRECRAWL_TO_EXTRACT = is_env_var_true("FIRECRAWL_TO_EXTRACT")
 
 OLOSTEP_QUERY_PARAMS = {
     "timeout": 35,  # seconds
@@ -179,8 +185,10 @@ async def read_webpage(
         return await read_webpage_with_firecrawl(url, api_key, api_url), None
     elif scraper_type == WebScraper.WebScraperType.OLOSTEP:
         return await read_webpage_with_olostep(url, api_key, api_url), None
-    else:
+    elif scraper_type == WebScraper.WebScraperType.JINA:
         return await read_webpage_with_jina(url, api_key, api_url), None
+    else:
+        return await read_webpage_at_url(url), None
 
 
 async def read_webpage_and_extract_content(
@@ -188,6 +196,9 @@ async def read_webpage_and_extract_content(
 ) -> Tuple[set[str], str, Union[None, str]]:
     # Select the web scrapers to use for reading the web page
     web_scrapers = await ConversationAdapters.aget_enabled_webscrapers()
+    # Only use the direct web scraper for internal URLs
+    if is_internal_url(url):
+        web_scrapers = [scraper for scraper in web_scrapers if scraper[0] == WebScraper.WebScraperType.DIRECT]
 
     # Fallback through enabled web scrapers until we successfully read the web page
     extracted_info = None
diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py
index f16f922c..4e5736a2 100644
--- a/src/khoj/utils/helpers.py
+++ b/src/khoj/utils/helpers.py
@@ -2,10 +2,12 @@ from __future__ import annotations  # to avoid quoting type hints
 
 import datetime
 import io
+import ipaddress
 import logging
 import os
 import platform
 import random
+import urllib.parse
 import uuid
 from collections import OrderedDict
 from enum import Enum
@@ -436,6 +438,50 @@ def is_internet_connected():
         return False
 
 
+def is_internal_url(url: str) -> bool:
+    """
+    Check if a URL is likely to be internal/non-public.
+
+    Args:
+        url (str): The URL to check.
+
+    Returns:
+        bool: True if the URL is likely internal, False otherwise.
+    """
+    try:
+        parsed_url = urllib.parse.urlparse(url)
+        hostname = parsed_url.hostname
+
+        # Check for localhost
+        if hostname in ["localhost", "127.0.0.1", "::1"]:
+            return True
+
+        # Check for IP addresses in private ranges
+        try:
+            ip = ipaddress.ip_address(hostname)
+            return ip.is_private
+        except ValueError:
+            pass  # Not an IP address, continue with other checks
+
+        # Check for common internal TLDs
+        internal_tlds = [".local", ".internal", ".private", ".corp", ".home", ".lan"]
+        if any(hostname.endswith(tld) for tld in internal_tlds):
+            return True
+
+        # Check for non-standard ports
+        # if parsed_url.port and parsed_url.port not in [80, 443]:
+        #     return True
+
+        # Check for URLs without a TLD
+        if "." not in hostname:
+            return True
+
+        return False
+    except Exception:
+        # If we can't parse the URL or something else goes wrong, assume it's not internal
+        return False
+
+
 def convert_image_to_webp(image_bytes):
     """Convert image bytes to webp format for faster loading"""
     image_io = io.BytesIO(image_bytes)
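
Note: `is_env_var_true` is imported above but not defined in this patch. Judging from the inline check it replaces (`os.getenv("FIRECRAWL_TO_EXTRACT", "False").lower() == "true"`), it presumably behaves like the sketch below; the `default` parameter name is an assumption, and the real helper in `khoj.utils.helpers` may differ.

```python
import os


def is_env_var_true(name: str, default: str = "false") -> bool:
    # Interpret an environment variable as a boolean flag.
    # Sketch only: mirrors the inline check removed in online_search.py.
    return os.getenv(name, default).lower() == "true"
```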
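Similarly, `read_webpage_at_url`, called from the new `else` branch of `read_webpage`, is not defined in these hunks. A minimal sketch of such a direct fetcher, assuming an aiohttp-based GET with an illustrative user agent and timeout, could look like this; the actual helper may differ:

```python
import aiohttp


async def read_webpage_at_url(web_url: str) -> str:
    # Fetch the page directly over HTTP instead of routing through a
    # third-party scraping service. Sketch only; headers and timeout are illustrative.
    headers = {"User-Agent": "Mozilla/5.0 (compatible; Khoj)"}
    timeout = aiohttp.ClientTimeout(total=30)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        async with session.get(web_url, headers=headers) as response:
            response.raise_for_status()
            return await response.text()
```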
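Finally, a quick illustration of how the new `is_internal_url` helper classifies URLs. The example URLs are made up; the behaviour follows directly from the function body in the last hunk.

```python
from khoj.utils.helpers import is_internal_url

# Localhost, private-range IPs, bare hostnames and internal-looking TLDs are
# treated as internal, so only the Direct scraper is attempted for them.
assert is_internal_url("http://localhost:8000/docs")
assert is_internal_url("http://192.168.1.50/wiki")
assert is_internal_url("http://intranet/dashboard")  # hostname without a TLD
assert is_internal_url("https://wiki.corp/page")  # ends in an internal TLD
assert not is_internal_url("https://en.wikipedia.org/wiki/Intranet")
```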