Access internal links directly via a simple get request

The other webpage scrapers will not work for internal webpages. Try
accessing those URLs directly if they are visible to the Khoj server
over the network.

Only enable this by default for self-hosted, single user setups.
Otherwise ability to scan internal network would be a liability!

For use-cases where it makes sense, the Khoj server admin can
explicitly add the direct webpage scraper via the admin panel
This commit is contained in:
Debanjum Singh Solanky
2024-10-16 02:57:51 -07:00
parent d94abba2dc
commit 20b6f0c2f4
4 changed files with 75 additions and 5 deletions

View File

@@ -60,7 +60,12 @@ from khoj.search_filter.file_filter import FileFilter
from khoj.search_filter.word_filter import WordFilter
from khoj.utils import state
from khoj.utils.config import OfflineChatProcessorModel
from khoj.utils.helpers import generate_random_name, is_none_or_empty, timer
from khoj.utils.helpers import (
generate_random_name,
in_debug_mode,
is_none_or_empty,
timer,
)
logger = logging.getLogger(__name__)
@@ -1066,9 +1071,16 @@ class ConversationAdapters:
enabled_scrapers.append(
(WebScraper.WebScraperType.OLOSTEP, os.getenv("OLOSTEP_API_KEY"), api_url, "Olostep")
)
# Jina is the default fallback scraper to use as it does not require an API key
# Jina is the default fallback scraper to use as it does not require an API key
api_url = os.getenv("JINA_READER_API_URL", "https://r.jina.ai/")
enabled_scrapers.append((WebScraper.WebScraperType.JINA, os.getenv("JINA_API_KEY"), api_url, "Jina"))
# Only enable the direct web page scraper by default in self-hosted single user setups.
# Useful for reading webpages on your intranet.
if state.anonymous_mode or in_debug_mode():
enabled_scrapers.append((WebScraper.WebScraperType.DIRECT, None, None, "Direct"))
return enabled_scrapers
@staticmethod

View File

@@ -249,6 +249,7 @@ class WebScraper(BaseModel):
FIRECRAWL = "firecrawl", gettext_lazy("Firecrawl")
OLOSTEP = "olostep", gettext_lazy("Olostep")
JINA = "jina", gettext_lazy("Jina")
DIRECT = "direct", gettext_lazy("Direct")
name = models.CharField(max_length=200, default=None, null=True, blank=True, unique=True)
type = models.CharField(max_length=20, choices=WebScraperType.choices, default=WebScraperType.JINA)

View File

@@ -19,7 +19,13 @@ from khoj.routers.helpers import (
generate_online_subqueries,
infer_webpage_urls,
)
from khoj.utils.helpers import is_internet_connected, is_none_or_empty, timer
from khoj.utils.helpers import (
is_env_var_true,
is_internal_url,
is_internet_connected,
is_none_or_empty,
timer,
)
from khoj.utils.rawconfig import LocationData
logger = logging.getLogger(__name__)
@@ -30,7 +36,7 @@ SERPER_DEV_URL = "https://google.serper.dev/search"
JINA_SEARCH_API_URL = "https://s.jina.ai/"
JINA_API_KEY = os.getenv("JINA_API_KEY")
FIRECRAWL_TO_EXTRACT = os.getenv("FIRECRAWL_TO_EXTRACT", "False").lower() == "true"
FIRECRAWL_TO_EXTRACT = is_env_var_true("FIRECRAWL_TO_EXTRACT")
OLOSTEP_QUERY_PARAMS = {
"timeout": 35, # seconds
@@ -179,8 +185,10 @@ async def read_webpage(
return await read_webpage_with_firecrawl(url, api_key, api_url), None
elif scraper_type == WebScraper.WebScraperType.OLOSTEP:
return await read_webpage_with_olostep(url, api_key, api_url), None
else:
elif scraper_type == WebScraper.WebScraperType.JINA:
return await read_webpage_with_jina(url, api_key, api_url), None
else:
return await read_webpage_at_url(url), None
async def read_webpage_and_extract_content(
@@ -188,6 +196,9 @@ async def read_webpage_and_extract_content(
) -> Tuple[set[str], str, Union[None, str]]:
# Select the web scrapers to use for reading the web page
web_scrapers = await ConversationAdapters.aget_enabled_webscrapers()
# Only use the direct web scraper for internal URLs
if is_internal_url(url):
web_scrapers = [scraper for scraper in web_scrapers if scraper[0] == WebScraper.WebScraperType.DIRECT]
# Fallback through enabled web scrapers until we successfully read the web page
extracted_info = None

View File

@@ -2,10 +2,12 @@ from __future__ import annotations # to avoid quoting type hints
import datetime
import io
import ipaddress
import logging
import os
import platform
import random
import urllib.parse
import uuid
from collections import OrderedDict
from enum import Enum
@@ -436,6 +438,50 @@ def is_internet_connected():
return False
def is_internal_url(url: str) -> bool:
    """
    Check if a URL is likely to be internal/non-public.

    Heuristics applied in order:
      1. Hostname is localhost or a loopback literal.
      2. Hostname is an IP literal in a private range.
      3. Hostname ends with a common internal-only TLD (.local, .lan, ...).
      4. Hostname contains no dot (bare intranet name, e.g. "wiki").

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL is likely internal, False otherwise.
    """
    try:
        hostname = urllib.parse.urlparse(url).hostname
        # URLs without a network location (e.g. "mailto:", relative paths)
        # cannot point at an internal host.
        if not hostname:
            return False

        # Check for localhost. urlparse already lowercases the hostname.
        if hostname in ("localhost", "127.0.0.1", "::1"):
            return True

        # Check for IP literals in private ranges (is_private also covers
        # loopback and link-local addresses).
        try:
            return ipaddress.ip_address(hostname).is_private
        except ValueError:
            pass  # Not an IP address; continue with name-based checks

        # Check for common internal-only TLDs.
        internal_tlds = (".local", ".internal", ".private", ".corp", ".home", ".lan")
        if hostname.endswith(internal_tlds):
            return True

        # A hostname without any TLD is a bare intranet name.
        if "." not in hostname:
            return True

        return False
    except Exception:
        # If we can't parse the URL or something else goes wrong, assume it's not internal
        return False
def convert_image_to_webp(image_bytes):
"""Convert image bytes to webp format for faster loading"""
image_io = io.BytesIO(image_bytes)