mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 21:19:12 +00:00
Enable webpage reading with Exa. Remove Jina web page reader
Support using Exa for webpage reading; it appears much faster than the currently available providers. Remove Jina as a webpage reader, along with the remaining references to Jina in the code and docs — it was slow anyway, and its API may shut down soon (Jina was acquired by Elastic). Update the docs to mention Exa for both web search and webpage reading.
This commit is contained in:
@@ -1437,6 +1437,16 @@ class ConversationAdapters:
|
||||
enabled_scrapers = [scraper async for scraper in WebScraper.objects.all().order_by("priority").aiterator()]
|
||||
if not enabled_scrapers:
|
||||
# Use scrapers enabled via environment variables
|
||||
if os.getenv("EXA_API_KEY"):
|
||||
api_url = os.getenv("EXA_API_URL", "https://api.exa.ai")
|
||||
enabled_scrapers.append(
|
||||
WebScraper(
|
||||
type=WebScraper.WebScraperType.EXA,
|
||||
name=WebScraper.WebScraperType.EXA.capitalize(),
|
||||
api_key=os.getenv("EXA_API_KEY"),
|
||||
api_url=api_url,
|
||||
)
|
||||
)
|
||||
if os.getenv("OLOSTEP_API_KEY"):
|
||||
api_url = os.getenv("OLOSTEP_API_URL", "https://agent.olostep.com/olostep-p2p-incomingAPI")
|
||||
enabled_scrapers.append(
|
||||
@@ -1457,17 +1467,6 @@ class ConversationAdapters:
|
||||
api_url=api_url,
|
||||
)
|
||||
)
|
||||
# Jina is the default fallback scrapers to use as it does not require an API key
|
||||
api_url = os.getenv("JINA_READER_API_URL", "https://r.jina.ai/")
|
||||
enabled_scrapers.append(
|
||||
WebScraper(
|
||||
type=WebScraper.WebScraperType.JINA,
|
||||
name=WebScraper.WebScraperType.JINA.capitalize(),
|
||||
api_key=os.getenv("JINA_API_KEY"),
|
||||
api_url=api_url,
|
||||
)
|
||||
)
|
||||
|
||||
# Only enable the direct web page scraper by default in self-hosted single user setups.
|
||||
# Useful for reading webpages on your intranet.
|
||||
if state.anonymous_mode or in_debug_mode():
|
||||
|
||||
26
src/khoj/database/migrations/0095_alter_webscraper_type.py
Normal file
26
src/khoj/database/migrations/0095_alter_webscraper_type.py
Normal file
@@ -0,0 +1,26 @@
|
||||
# Generated by Django 5.1.14 on 2025-11-12 19:25

from django.db import migrations, models


class Migration(migrations.Migration):
    """Restrict WebScraper.type choices after dropping the Jina scraper.

    Removes "Jina" from the valid choices, adds "Exa", and switches the
    column default to "Direct" (the scraper that needs no API key).
    """

    dependencies = [
        ("database", "0094_serverchatsettings_think_free_deep_and_more"),
    ]

    operations = [
        migrations.AlterField(
            model_name="webscraper",
            name="type",
            field=models.CharField(
                default="Direct",
                max_length=20,
                # Must stay in sync with WebScraper.WebScraperType in models.
                choices=[
                    ("Firecrawl", "Firecrawl"),
                    ("Olostep", "Olostep"),
                    ("Exa", "Exa"),
                    ("Direct", "Direct"),
                ],
            ),
        ),
    ]
|
||||
@@ -393,7 +393,7 @@ class WebScraper(DbBaseModel):
|
||||
class WebScraperType(models.TextChoices):
|
||||
FIRECRAWL = "Firecrawl"
|
||||
OLOSTEP = "Olostep"
|
||||
JINA = "Jina"
|
||||
EXA = "Exa"
|
||||
DIRECT = "Direct"
|
||||
|
||||
name = models.CharField(
|
||||
@@ -404,7 +404,7 @@ class WebScraper(DbBaseModel):
|
||||
unique=True,
|
||||
help_text="Friendly name. If not set, it will be set to the type of the scraper.",
|
||||
)
|
||||
type = models.CharField(max_length=20, choices=WebScraperType.choices, default=WebScraperType.JINA)
|
||||
type = models.CharField(max_length=20, choices=WebScraperType.choices, default=WebScraperType.DIRECT)
|
||||
api_key = models.CharField(
|
||||
max_length=200,
|
||||
default=None,
|
||||
@@ -436,8 +436,8 @@ class WebScraper(DbBaseModel):
|
||||
self.api_url = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev")
|
||||
elif self.type == self.WebScraperType.OLOSTEP:
|
||||
self.api_url = os.getenv("OLOSTEP_API_URL", "https://agent.olostep.com/olostep-p2p-incomingAPI")
|
||||
elif self.type == self.WebScraperType.JINA:
|
||||
self.api_url = os.getenv("JINA_READER_API_URL", "https://r.jina.ai/")
|
||||
elif self.type == self.WebScraperType.EXA:
|
||||
self.api_url = os.getenv("EXA_API_URL", "https://api.exa.ai")
|
||||
if self.api_key is None:
|
||||
if self.type == self.WebScraperType.FIRECRAWL:
|
||||
self.api_key = os.getenv("FIRECRAWL_API_KEY")
|
||||
@@ -447,8 +447,10 @@ class WebScraper(DbBaseModel):
|
||||
self.api_key = os.getenv("OLOSTEP_API_KEY")
|
||||
if self.api_key is None:
|
||||
error["api_key"] = "Set API key to use Olostep. Get API key from https://olostep.com/."
|
||||
elif self.type == self.WebScraperType.JINA:
|
||||
self.api_key = os.getenv("JINA_API_KEY")
|
||||
elif self.type == self.WebScraperType.EXA:
|
||||
self.api_key = os.getenv("EXA_API_KEY")
|
||||
if self.api_key is None:
|
||||
error["api_key"] = "Set API key to use Exa. Get API key from https://exa.ai/."
|
||||
if error:
|
||||
raise ValidationError(error)
|
||||
|
||||
|
||||
@@ -156,7 +156,7 @@ async def search_online(
|
||||
link = organic.get("link")
|
||||
if link in webpages and idx < max_webpages_to_read:
|
||||
webpages[link]["queries"].add(subquery)
|
||||
# Content of web pages is directly available when Jina is used for search.
|
||||
# Content of web pages can be directly available when Exa is used for search.
|
||||
elif idx < max_webpages_to_read:
|
||||
webpages[link] = {"queries": {subquery}, "content": organic.get("content")}
|
||||
# Only keep webpage content for up to max_webpages_to_read organic results.
|
||||
@@ -199,7 +199,8 @@ async def search_with_exa(query: str, location: LocationData) -> Tuple[str, Dict
|
||||
Tuple containing the original query and a dictionary of search results
|
||||
"""
|
||||
# Set up API endpoint and headers
|
||||
exa_search_api_endpoint = "https://api.exa.ai/search"
|
||||
exa_api_base = os.getenv("EXA_API_URL", "https://api.exa.ai")
|
||||
exa_search_api_endpoint = f"{exa_api_base}/search"
|
||||
headers = {"Content-Type": "application/json", "x-api-key": EXA_API_KEY}
|
||||
|
||||
# Prepare request payload
|
||||
@@ -507,6 +508,8 @@ async def read_webpage(
|
||||
return await read_webpage_with_firecrawl(url, api_key, api_url), None
|
||||
elif scraper_type == WebScraper.WebScraperType.OLOSTEP:
|
||||
return await read_webpage_with_olostep(url, api_key, api_url), None
|
||||
elif scraper_type == WebScraper.WebScraperType.EXA:
|
||||
return await read_webpage_with_exa(url, api_key, api_url), None
|
||||
else:
|
||||
return await read_webpage_at_url(url), None
|
||||
|
||||
@@ -583,6 +586,23 @@ async def read_webpage_with_olostep(web_url: str, api_key: str, api_url: str) ->
|
||||
return response_json["markdown_content"]
|
||||
|
||||
|
||||
async def read_webpage_with_exa(web_url: str, api_key: str, api_url: str) -> str:
    """Read a web page's text content via the Exa /contents endpoint.

    Args:
        web_url: URL of the page to read.
        api_key: Exa API key, sent in the x-api-key header.
        api_url: Exa API base URL (e.g. https://api.exa.ai).

    Returns:
        The extracted page text for the requested URL.

    Raises:
        aiohttp.ClientResponseError: If the Exa API returns an error status.
    """
    endpoint = f"{api_url}/contents"
    request_headers = {"Content-Type": "application/json", "x-api-key": api_key}
    # livecrawl="fallback" asks Exa to crawl live only when no cached copy exists.
    payload = {
        "urls": [web_url],
        "text": True,
        "livecrawl": "fallback",
        "livecrawlTimeout": 15000,
    }

    async with aiohttp.ClientSession() as client:
        async with client.post(
            endpoint, json=payload, headers=request_headers, timeout=WEBPAGE_REQUEST_TIMEOUT
        ) as reply:
            reply.raise_for_status()
            body = await reply.json()
            return body["results"][0]["text"]
|
||||
|
||||
|
||||
async def read_webpage_with_firecrawl(web_url: str, api_key: str, api_url: str) -> str:
|
||||
firecrawl_api_url = f"{api_url}/v1/scrape"
|
||||
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
|
||||
|
||||
@@ -813,7 +813,7 @@ def is_web_search_enabled():
|
||||
for search_config in [
|
||||
"GOOGLE_SEARCH_API_KEY",
|
||||
"SERPER_DEV_API_KEY",
|
||||
"JINA_API_KEY",
|
||||
"EXA_API_KEY",
|
||||
"FIRECRAWL_API_KEY",
|
||||
"KHOJ_SEARXNG_URL",
|
||||
]
|
||||
|
||||
Reference in New Issue
Block a user