From 20b6f0c2f4857157f47b32cf7dd199aac1f40d9b Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 16 Oct 2024 02:57:51 -0700 Subject: [PATCH] Access internal links directly via a simple GET request The other webpage scrapers will not work for internal webpages. Try accessing those URLs directly if they are visible to the Khoj server over the network. Only enable this by default for self-hosted, single user setups. Otherwise the ability to scan the internal network would be a liability! For use-cases where it makes sense, the Khoj server admin can explicitly add the direct webpage scraper via the admin panel --- src/khoj/database/adapters/__init__.py | 16 +++++++- src/khoj/database/models/__init__.py | 1 + src/khoj/processor/tools/online_search.py | 17 +++++++-- src/khoj/utils/helpers.py | 46 +++++++++++++++++++++++ 4 files changed, 75 insertions(+), 5 deletions(-) diff --git a/src/khoj/database/adapters/__init__.py b/src/khoj/database/adapters/__init__.py index 8c6aa5e4..7be931c5 100644 --- a/src/khoj/database/adapters/__init__.py +++ b/src/khoj/database/adapters/__init__.py @@ -60,7 +60,12 @@ from khoj.search_filter.file_filter import FileFilter from khoj.search_filter.word_filter import WordFilter from khoj.utils import state from khoj.utils.config import OfflineChatProcessorModel -from khoj.utils.helpers import generate_random_name, is_none_or_empty, timer +from khoj.utils.helpers import ( + generate_random_name, + in_debug_mode, + is_none_or_empty, + timer, +) logger = logging.getLogger(__name__) @@ -1066,9 +1071,16 @@ class ConversationAdapters: enabled_scrapers.append( (WebScraper.WebScraperType.OLOSTEP, os.getenv("OLOSTEP_API_KEY"), api_url, "Olostep") ) - # Jina is the default fallback scraper to use as it does not require an API key + + # Jina is the default fallback scraper to use as it does not require an API key api_url = os.getenv("JINA_READER_API_URL", "https://r.jina.ai/") enabled_scrapers.append((WebScraper.WebScraperType.JINA, 
os.getenv("JINA_API_KEY"), api_url, "Jina")) + + # Only enable the direct web page scraper by default in self-hosted single user setups. + # Useful for reading webpages on your intranet. + if state.anonymous_mode or in_debug_mode(): + enabled_scrapers.append((WebScraper.WebScraperType.DIRECT, None, None, "Direct")) + return enabled_scrapers @staticmethod diff --git a/src/khoj/database/models/__init__.py b/src/khoj/database/models/__init__.py index ec36c6f3..56f482ae 100644 --- a/src/khoj/database/models/__init__.py +++ b/src/khoj/database/models/__init__.py @@ -249,6 +249,7 @@ class WebScraper(BaseModel): FIRECRAWL = "firecrawl", gettext_lazy("Firecrawl") OLOSTEP = "olostep", gettext_lazy("Olostep") JINA = "jina", gettext_lazy("Jina") + DIRECT = "direct", gettext_lazy("Direct") name = models.CharField(max_length=200, default=None, null=True, blank=True, unique=True) type = models.CharField(max_length=20, choices=WebScraperType.choices, default=WebScraperType.JINA) diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py index c111415b..c00660e3 100644 --- a/src/khoj/processor/tools/online_search.py +++ b/src/khoj/processor/tools/online_search.py @@ -19,7 +19,13 @@ from khoj.routers.helpers import ( generate_online_subqueries, infer_webpage_urls, ) -from khoj.utils.helpers import is_internet_connected, is_none_or_empty, timer +from khoj.utils.helpers import ( + is_env_var_true, + is_internal_url, + is_internet_connected, + is_none_or_empty, + timer, +) from khoj.utils.rawconfig import LocationData logger = logging.getLogger(__name__) @@ -30,7 +36,7 @@ SERPER_DEV_URL = "https://google.serper.dev/search" JINA_SEARCH_API_URL = "https://s.jina.ai/" JINA_API_KEY = os.getenv("JINA_API_KEY") -FIRECRAWL_TO_EXTRACT = os.getenv("FIRECRAWL_TO_EXTRACT", "False").lower() == "true" +FIRECRAWL_TO_EXTRACT = is_env_var_true("FIRECRAWL_TO_EXTRACT") OLOSTEP_QUERY_PARAMS = { "timeout": 35, # seconds @@ -179,8 +185,10 @@ async def read_webpage( 
return await read_webpage_with_firecrawl(url, api_key, api_url), None elif scraper_type == WebScraper.WebScraperType.OLOSTEP: return await read_webpage_with_olostep(url, api_key, api_url), None - else: + elif scraper_type == WebScraper.WebScraperType.JINA: return await read_webpage_with_jina(url, api_key, api_url), None + else: + return await read_webpage_at_url(url), None async def read_webpage_and_extract_content( @@ -188,6 +196,9 @@ async def read_webpage_and_extract_content( ) -> Tuple[set[str], str, Union[None, str]]: # Select the web scrapers to use for reading the web page web_scrapers = await ConversationAdapters.aget_enabled_webscrapers() + # Only use the direct web scraper for internal URLs + if is_internal_url(url): + web_scrapers = [scraper for scraper in web_scrapers if scraper[0] == WebScraper.WebScraperType.DIRECT] # Fallback through enabled web scrapers until we successfully read the web page extracted_info = None diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py index f16f922c..4e5736a2 100644 --- a/src/khoj/utils/helpers.py +++ b/src/khoj/utils/helpers.py @@ -2,10 +2,12 @@ from __future__ import annotations # to avoid quoting type hints import datetime import io +import ipaddress import logging import os import platform import random +import urllib.parse import uuid from collections import OrderedDict from enum import Enum @@ -436,6 +438,50 @@ def is_internet_connected(): return False +def is_internal_url(url: str) -> bool: + """ + Check if a URL is likely to be internal/non-public. + + Args: + url (str): The URL to check. + + Returns: + bool: True if the URL is likely internal, False otherwise. 
+ """ + try: + parsed_url = urllib.parse.urlparse(url) + hostname = parsed_url.hostname + + # Check for localhost + if hostname in ["localhost", "127.0.0.1", "::1"]: + return True + + # Check for IP addresses in private ranges + try: + ip = ipaddress.ip_address(hostname) + return ip.is_private + except ValueError: + pass # Not an IP address, continue with other checks + + # Check for common internal TLDs + internal_tlds = [".local", ".internal", ".private", ".corp", ".home", ".lan"] + if any(hostname.endswith(tld) for tld in internal_tlds): + return True + + # Check for non-standard ports + # if parsed_url.port and parsed_url.port not in [80, 443]: + # return True + + # Check for URLs without a TLD + if "." not in hostname: + return True + + return False + except Exception: + # If we can't parse the URL or something else goes wrong, assume it's not internal + return False + + def convert_image_to_webp(image_bytes): """Convert image bytes to webp format for faster loading""" image_io = io.BytesIO(image_bytes)