Enable webpage reading with Exa. Remove Jina web page reader

Support using Exa for webpage reading. It seems much faster than
currently available providers.

Remove Jina as a webpage reader, along with the remaining references to
Jina in code and docs. It was slow anyway, and its API may shut down
soon (as Jina was bought by Elastic).

Update docs to mention Exa for web search and webpage reading.
This commit is contained in:
Debanjum
2025-11-12 12:21:59 -08:00
parent d57c597245
commit 61cb2d5b7e
7 changed files with 78 additions and 30 deletions

View File

@@ -1437,6 +1437,16 @@ class ConversationAdapters:
enabled_scrapers = [scraper async for scraper in WebScraper.objects.all().order_by("priority").aiterator()]
if not enabled_scrapers:
# Use scrapers enabled via environment variables
if os.getenv("EXA_API_KEY"):
api_url = os.getenv("EXA_API_URL", "https://api.exa.ai")
enabled_scrapers.append(
WebScraper(
type=WebScraper.WebScraperType.EXA,
name=WebScraper.WebScraperType.EXA.capitalize(),
api_key=os.getenv("EXA_API_KEY"),
api_url=api_url,
)
)
if os.getenv("OLOSTEP_API_KEY"):
api_url = os.getenv("OLOSTEP_API_URL", "https://agent.olostep.com/olostep-p2p-incomingAPI")
enabled_scrapers.append(
@@ -1457,17 +1467,6 @@ class ConversationAdapters:
api_url=api_url,
)
)
# Jina is the default fallback scrapers to use as it does not require an API key
api_url = os.getenv("JINA_READER_API_URL", "https://r.jina.ai/")
enabled_scrapers.append(
WebScraper(
type=WebScraper.WebScraperType.JINA,
name=WebScraper.WebScraperType.JINA.capitalize(),
api_key=os.getenv("JINA_API_KEY"),
api_url=api_url,
)
)
# Only enable the direct web page scraper by default in self-hosted single user setups.
# Useful for reading webpages on your intranet.
if state.anonymous_mode or in_debug_mode():

View File

@@ -0,0 +1,26 @@
# Generated by Django 5.1.14 on 2025-11-12 19:25
from django.db import migrations, models
class Migration(migrations.Migration):
    """Update WebScraper.type choices: drop "Jina", add "Exa", default "Direct".

    Auto-generated by Django 5.1.14; only documentation has been added.
    """

    dependencies = [
        ("database", "0094_serverchatsettings_think_free_deep_and_more"),
    ]

    operations = [
        migrations.AlterField(
            model_name="webscraper",
            name="type",
            field=models.CharField(
                # NOTE(review): rows whose type is still "Jina" are not
                # converted by this migration — confirm no such rows exist
                # or that they are handled elsewhere before deploying.
                choices=[
                    ("Firecrawl", "Firecrawl"),
                    ("Olostep", "Olostep"),
                    ("Exa", "Exa"),
                    ("Direct", "Direct"),
                ],
                default="Direct",
                max_length=20,
            ),
        ),
    ]

View File

@@ -393,7 +393,7 @@ class WebScraper(DbBaseModel):
class WebScraperType(models.TextChoices):
FIRECRAWL = "Firecrawl"
OLOSTEP = "Olostep"
JINA = "Jina"
EXA = "Exa"
DIRECT = "Direct"
name = models.CharField(
@@ -404,7 +404,7 @@ class WebScraper(DbBaseModel):
unique=True,
help_text="Friendly name. If not set, it will be set to the type of the scraper.",
)
type = models.CharField(max_length=20, choices=WebScraperType.choices, default=WebScraperType.JINA)
type = models.CharField(max_length=20, choices=WebScraperType.choices, default=WebScraperType.DIRECT)
api_key = models.CharField(
max_length=200,
default=None,
@@ -436,8 +436,8 @@ class WebScraper(DbBaseModel):
self.api_url = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev")
elif self.type == self.WebScraperType.OLOSTEP:
self.api_url = os.getenv("OLOSTEP_API_URL", "https://agent.olostep.com/olostep-p2p-incomingAPI")
elif self.type == self.WebScraperType.JINA:
self.api_url = os.getenv("JINA_READER_API_URL", "https://r.jina.ai/")
elif self.type == self.WebScraperType.EXA:
self.api_url = os.getenv("EXA_API_URL", "https://api.exa.ai")
if self.api_key is None:
if self.type == self.WebScraperType.FIRECRAWL:
self.api_key = os.getenv("FIRECRAWL_API_KEY")
@@ -447,8 +447,10 @@ class WebScraper(DbBaseModel):
self.api_key = os.getenv("OLOSTEP_API_KEY")
if self.api_key is None:
error["api_key"] = "Set API key to use Olostep. Get API key from https://olostep.com/."
elif self.type == self.WebScraperType.JINA:
self.api_key = os.getenv("JINA_API_KEY")
elif self.type == self.WebScraperType.EXA:
self.api_key = os.getenv("EXA_API_KEY")
if self.api_key is None:
error["api_key"] = "Set API key to use Exa. Get API key from https://exa.ai/."
if error:
raise ValidationError(error)

View File

@@ -156,7 +156,7 @@ async def search_online(
link = organic.get("link")
if link in webpages and idx < max_webpages_to_read:
webpages[link]["queries"].add(subquery)
# Content of web pages is directly available when Jina is used for search.
# Content of web pages can be directly available when Exa is used for search.
elif idx < max_webpages_to_read:
webpages[link] = {"queries": {subquery}, "content": organic.get("content")}
# Only keep webpage content for up to max_webpages_to_read organic results.
@@ -199,7 +199,8 @@ async def search_with_exa(query: str, location: LocationData) -> Tuple[str, Dict
Tuple containing the original query and a dictionary of search results
"""
# Set up API endpoint and headers
exa_search_api_endpoint = "https://api.exa.ai/search"
exa_api_base = os.getenv("EXA_API_URL", "https://api.exa.ai")
exa_search_api_endpoint = f"{exa_api_base}/search"
headers = {"Content-Type": "application/json", "x-api-key": EXA_API_KEY}
# Prepare request payload
@@ -507,6 +508,8 @@ async def read_webpage(
return await read_webpage_with_firecrawl(url, api_key, api_url), None
elif scraper_type == WebScraper.WebScraperType.OLOSTEP:
return await read_webpage_with_olostep(url, api_key, api_url), None
elif scraper_type == WebScraper.WebScraperType.EXA:
return await read_webpage_with_exa(url, api_key, api_url), None
else:
return await read_webpage_at_url(url), None
@@ -583,6 +586,23 @@ async def read_webpage_with_olostep(web_url: str, api_key: str, api_url: str) ->
return response_json["markdown_content"]
async def read_webpage_with_exa(web_url: str, api_key: str, api_url: str) -> str:
    """Read a web page via the Exa contents API and return its extracted text.

    Args:
        web_url: URL of the page to read.
        api_key: Exa API key, sent in the ``x-api-key`` header.
        api_url: Base URL of the Exa API (e.g. ``https://api.exa.ai``).

    Raises:
        aiohttp.ClientResponseError: on a non-2xx response from the Exa API.
    """
    endpoint = f"{api_url}/contents"
    request_headers = {"Content-Type": "application/json", "x-api-key": api_key}
    request_body = {
        "urls": [web_url],
        "text": True,
        # "fallback" asks Exa to serve its cached copy and only live-crawl
        # when no cached content is available (per Exa contents API docs).
        "livecrawl": "fallback",
        "livecrawlTimeout": 15000,
    }
    async with aiohttp.ClientSession() as client:
        async with client.post(
            endpoint, json=request_body, headers=request_headers, timeout=WEBPAGE_REQUEST_TIMEOUT
        ) as api_response:
            api_response.raise_for_status()
            payload = await api_response.json()
    # First result's extracted text; raises KeyError/IndexError if Exa
    # returns no results, matching the sibling scraper helpers' behavior.
    return payload["results"][0]["text"]
async def read_webpage_with_firecrawl(web_url: str, api_key: str, api_url: str) -> str:
firecrawl_api_url = f"{api_url}/v1/scrape"
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}

View File

@@ -813,7 +813,7 @@ def is_web_search_enabled():
for search_config in [
"GOOGLE_SEARCH_API_KEY",
"SERPER_DEV_API_KEY",
"JINA_API_KEY",
"EXA_API_KEY",
"FIRECRAWL_API_KEY",
"KHOJ_SEARXNG_URL",
]