mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-07 13:23:15 +00:00
Return enabled scrapers as WebScraper objects for more ergonomic code
This commit is contained in:
@@ -1045,41 +1045,59 @@ class ConversationAdapters:
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
async def aget_enabled_webscrapers():
|
||||
enabled_scrapers = []
|
||||
async def aget_enabled_webscrapers() -> list[WebScraper]:
|
||||
enabled_scrapers: list[WebScraper] = []
|
||||
server_webscraper = await ConversationAdapters.aget_server_webscraper()
|
||||
if server_webscraper:
|
||||
# Only use the webscraper set in the server chat settings
|
||||
enabled_scrapers = [
|
||||
(server_webscraper.type, server_webscraper.api_key, server_webscraper.api_url, server_webscraper.name)
|
||||
]
|
||||
enabled_scrapers = [server_webscraper]
|
||||
if not enabled_scrapers:
|
||||
# Use the enabled web scrapers, ordered by priority, until we get the web page content
|
||||
enabled_scrapers = [
|
||||
(scraper.type, scraper.api_key, scraper.api_url, scraper.name)
|
||||
async for scraper in WebScraper.objects.all().order_by("priority").aiterator()
|
||||
]
|
||||
enabled_scrapers = [scraper async for scraper in WebScraper.objects.all().order_by("priority").aiterator()]
|
||||
if not enabled_scrapers:
|
||||
# Use scrapers enabled via environment variables
|
||||
if os.getenv("FIRECRAWL_API_KEY"):
|
||||
api_url = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev")
|
||||
enabled_scrapers.append(
|
||||
(WebScraper.WebScraperType.FIRECRAWL, os.getenv("FIRECRAWL_API_KEY"), api_url, "Firecrawl")
|
||||
WebScraper(
|
||||
type=WebScraper.WebScraperType.FIRECRAWL,
|
||||
name=WebScraper.WebScraperType.FIRECRAWL.capitalize(),
|
||||
api_key=os.getenv("FIRECRAWL_API_KEY"),
|
||||
api_url=api_url,
|
||||
)
|
||||
)
|
||||
if os.getenv("OLOSTEP_API_KEY"):
|
||||
api_url = os.getenv("OLOSTEP_API_URL", "https://agent.olostep.com/olostep-p2p-incomingAPI")
|
||||
enabled_scrapers.append(
|
||||
(WebScraper.WebScraperType.OLOSTEP, os.getenv("OLOSTEP_API_KEY"), api_url, "Olostep")
|
||||
WebScraper(
|
||||
type=WebScraper.WebScraperType.OLOSTEP,
|
||||
name=WebScraper.WebScraperType.OLOSTEP.capitalize(),
|
||||
api_key=os.getenv("OLOSTEP_API_KEY"),
|
||||
api_url=api_url,
|
||||
)
|
||||
)
|
||||
|
||||
# Jina is the default fallback scraper to use as it does not require an API key
|
||||
api_url = os.getenv("JINA_READER_API_URL", "https://r.jina.ai/")
|
||||
enabled_scrapers.append((WebScraper.WebScraperType.JINA, os.getenv("JINA_API_KEY"), api_url, "Jina"))
|
||||
enabled_scrapers.append(
|
||||
WebScraper(
|
||||
type=WebScraper.WebScraperType.JINA,
|
||||
name=WebScraper.WebScraperType.JINA.capitalize(),
|
||||
api_key=os.getenv("JINA_API_KEY"),
|
||||
api_url=api_url,
|
||||
)
|
||||
)
|
||||
|
||||
# Only enable the direct web page scraper by default in self-hosted single user setups.
|
||||
# Useful for reading webpages on your intranet.
|
||||
if state.anonymous_mode or in_debug_mode():
|
||||
enabled_scrapers.append((WebScraper.WebScraperType.DIRECT, None, None, "Direct"))
|
||||
enabled_scrapers.append(
|
||||
WebScraper(
|
||||
type=WebScraper.WebScraperType.DIRECT,
|
||||
name=WebScraper.WebScraperType.DIRECT.capitalize(),
|
||||
api_key=None,
|
||||
api_url=None,
|
||||
)
|
||||
)
|
||||
|
||||
return enabled_scrapers
|
||||
|
||||
|
||||
@@ -198,16 +198,18 @@ async def read_webpage_and_extract_content(
|
||||
web_scrapers = await ConversationAdapters.aget_enabled_webscrapers()
|
||||
# Only use the direct web scraper for internal URLs
|
||||
if is_internal_url(url):
|
||||
web_scrapers = [scraper for scraper in web_scrapers if scraper[0] == WebScraper.WebScraperType.DIRECT]
|
||||
web_scrapers = [scraper for scraper in web_scrapers if scraper.type == WebScraper.WebScraperType.DIRECT]
|
||||
|
||||
# Fallback through enabled web scrapers until we successfully read the web page
|
||||
extracted_info = None
|
||||
for scraper_type, api_key, api_url, api_name in web_scrapers:
|
||||
for scraper in web_scrapers:
|
||||
try:
|
||||
# Read the web page
|
||||
if is_none_or_empty(content):
|
||||
with timer(f"Reading web page with {scraper_type} at '{url}' took", logger, log_level=logging.INFO):
|
||||
content, extracted_info = await read_webpage(url, scraper_type, api_key, api_url, subqueries, agent)
|
||||
with timer(f"Reading web page with {scraper.type} at '{url}' took", logger, log_level=logging.INFO):
|
||||
content, extracted_info = await read_webpage(
|
||||
url, scraper.type, scraper.api_key, scraper.api_url, subqueries, agent
|
||||
)
|
||||
|
||||
# Extract relevant information from the web page
|
||||
if is_none_or_empty(extracted_info):
|
||||
@@ -218,9 +220,9 @@ async def read_webpage_and_extract_content(
|
||||
if not is_none_or_empty(extracted_info):
|
||||
break
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to read web page with {scraper_type} at '{url}' with {e}")
|
||||
logger.warning(f"Failed to read web page with {scraper.type} at '{url}' with {e}")
|
||||
# If this is the last web scraper in the list, log an error
|
||||
if api_name == web_scrapers[-1][-1]:
|
||||
if scraper.name == web_scrapers[-1].name:
|
||||
logger.error(f"All web scrapers failed for '{url}'")
|
||||
|
||||
return subqueries, url, extracted_info
|
||||
|
||||
@@ -468,10 +468,6 @@ def is_internal_url(url: str) -> bool:
|
||||
if any(hostname.endswith(tld) for tld in internal_tlds):
|
||||
return True
|
||||
|
||||
# Check for non-standard ports
|
||||
# if parsed_url.port and parsed_url.port not in [80, 443]:
|
||||
# return True
|
||||
|
||||
# Check for URLs without a TLD
|
||||
if "." not in hostname:
|
||||
return True
|
||||
|
||||
Reference in New Issue
Block a user