Access internal links directly via a simple get request

The other webpage scrapers will not work for internal webpages. Try
accessing those URLs directly if they are visible to the Khoj server
over the network.

Only enable this by default for self-hosted, single user setups.
Otherwise ability to scan internal network would be a liability!

For use-cases where it makes sense, the Khoj server admin can
explicitly add the direct webpage scraper via the admin panel
This commit is contained in:
Debanjum Singh Solanky
2024-10-16 02:57:51 -07:00
parent d94abba2dc
commit 20b6f0c2f4
4 changed files with 75 additions and 5 deletions

View File

@@ -60,7 +60,12 @@ from khoj.search_filter.file_filter import FileFilter
from khoj.search_filter.word_filter import WordFilter
from khoj.utils import state
from khoj.utils.config import OfflineChatProcessorModel
from khoj.utils.helpers import generate_random_name, is_none_or_empty, timer
from khoj.utils.helpers import (
generate_random_name,
in_debug_mode,
is_none_or_empty,
timer,
)
logger = logging.getLogger(__name__)
@@ -1066,9 +1071,16 @@ class ConversationAdapters:
enabled_scrapers.append(
(WebScraper.WebScraperType.OLOSTEP, os.getenv("OLOSTEP_API_KEY"), api_url, "Olostep")
)
# Jina is the default fallback scraper to use as it does not require an API key
# Jina is the default fallback scraper to use as it does not require an API key
api_url = os.getenv("JINA_READER_API_URL", "https://r.jina.ai/")
enabled_scrapers.append((WebScraper.WebScraperType.JINA, os.getenv("JINA_API_KEY"), api_url, "Jina"))
# Only enable the direct web page scraper by default in self-hosted single user setups.
# Useful for reading webpages on your intranet.
if state.anonymous_mode or in_debug_mode():
enabled_scrapers.append((WebScraper.WebScraperType.DIRECT, None, None, "Direct"))
return enabled_scrapers
@staticmethod

View File

@@ -249,6 +249,7 @@ class WebScraper(BaseModel):
FIRECRAWL = "firecrawl", gettext_lazy("Firecrawl")
OLOSTEP = "olostep", gettext_lazy("Olostep")
JINA = "jina", gettext_lazy("Jina")
DIRECT = "direct", gettext_lazy("Direct")
name = models.CharField(max_length=200, default=None, null=True, blank=True, unique=True)
type = models.CharField(max_length=20, choices=WebScraperType.choices, default=WebScraperType.JINA)

View File

@@ -19,7 +19,13 @@ from khoj.routers.helpers import (
generate_online_subqueries,
infer_webpage_urls,
)
from khoj.utils.helpers import is_internet_connected, is_none_or_empty, timer
from khoj.utils.helpers import (
is_env_var_true,
is_internal_url,
is_internet_connected,
is_none_or_empty,
timer,
)
from khoj.utils.rawconfig import LocationData
logger = logging.getLogger(__name__)
@@ -30,7 +36,7 @@ SERPER_DEV_URL = "https://google.serper.dev/search"
JINA_SEARCH_API_URL = "https://s.jina.ai/"
JINA_API_KEY = os.getenv("JINA_API_KEY")
FIRECRAWL_TO_EXTRACT = os.getenv("FIRECRAWL_TO_EXTRACT", "False").lower() == "true"
FIRECRAWL_TO_EXTRACT = is_env_var_true("FIRECRAWL_TO_EXTRACT")
OLOSTEP_QUERY_PARAMS = {
"timeout": 35, # seconds
@@ -179,8 +185,10 @@ async def read_webpage(
return await read_webpage_with_firecrawl(url, api_key, api_url), None
elif scraper_type == WebScraper.WebScraperType.OLOSTEP:
return await read_webpage_with_olostep(url, api_key, api_url), None
else:
elif scraper_type == WebScraper.WebScraperType.JINA:
return await read_webpage_with_jina(url, api_key, api_url), None
else:
return await read_webpage_at_url(url), None
async def read_webpage_and_extract_content(
@@ -188,6 +196,9 @@ async def read_webpage_and_extract_content(
) -> Tuple[set[str], str, Union[None, str]]:
# Select the web scrapers to use for reading the web page
web_scrapers = await ConversationAdapters.aget_enabled_webscrapers()
# Only use the direct web scraper for internal URLs
if is_internal_url(url):
web_scrapers = [scraper for scraper in web_scrapers if scraper[0] == WebScraper.WebScraperType.DIRECT]
# Fallback through enabled web scrapers until we successfully read the web page
extracted_info = None

View File

@@ -2,10 +2,12 @@ from __future__ import annotations # to avoid quoting type hints
import datetime
import io
import ipaddress
import logging
import os
import platform
import random
import urllib.parse
import uuid
from collections import OrderedDict
from enum import Enum
@@ -436,6 +438,50 @@ def is_internet_connected():
return False
def is_internal_url(url: str) -> bool:
    """
    Check if a URL is likely to be internal/non-public.

    Heuristics applied in order:
      1. Hostname is localhost or a loopback literal.
      2. Hostname is an IP literal in a private range.
      3. Hostname ends with a common internal-only TLD (.local, .lan, ...).
      4. Hostname contains no dot (bare intranet name, e.g. "wiki").

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL is likely internal, False otherwise.
    """
    try:
        hostname = urllib.parse.urlparse(url).hostname
        # URLs without a network location (e.g. "mailto:", relative paths)
        # cannot point at an internal host.
        if not hostname:
            return False

        # Check for localhost. urlparse already lowercases the hostname.
        if hostname in ("localhost", "127.0.0.1", "::1"):
            return True

        # Check for IP literals in private ranges (is_private also covers
        # loopback and link-local addresses).
        try:
            return ipaddress.ip_address(hostname).is_private
        except ValueError:
            pass  # Not an IP address; continue with name-based checks

        # Check for common internal-only TLDs.
        internal_tlds = (".local", ".internal", ".private", ".corp", ".home", ".lan")
        if hostname.endswith(internal_tlds):
            return True

        # A hostname without any TLD is a bare intranet name.
        if "." not in hostname:
            return True

        return False
    except Exception:
        # If we can't parse the URL or something else goes wrong, assume it's not internal
        return False
def convert_image_to_webp(image_bytes):
"""Convert image bytes to webp format for faster loading"""
image_io = io.BytesIO(image_bytes)