mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 13:18:18 +00:00
Enable webpage reading with Exa. Remove Jina web page reader
Support using Exa for webpage reading. It seems much faster than currently available providers. Remove Jina as a webpage reader and remove remaining references to Jina from code and docs. It was slow anyway, and its API may shut down soon (as Jina was acquired by Elastic). Update docs to mention Exa for web search and webpage reading.
This commit is contained in:
@@ -95,14 +95,14 @@ services:
|
||||
# Uncomment appropriate lines below to enable web results with Khoj
|
||||
# Ensure you set your provider specific API keys.
|
||||
# ---
|
||||
# Free, Slower API. Does both web search and webpage read. Get API key from https://jina.ai/
|
||||
# - JINA_API_KEY=your_jina_api_key
|
||||
# Paid, Fast API. Only does web search. Get API key from https://serper.dev/
|
||||
# - SERPER_DEV_API_KEY=your_serper_dev_api_key
|
||||
# Paid, Fast, Open API. Only does webpage read. Get API key from https://firecrawl.dev/
|
||||
# - FIRECRAWL_API_KEY=your_firecrawl_api_key
|
||||
# Paid, Fast, Higher Read Success API. Only does webpage read. Get API key from https://olostep.com/
|
||||
# Paid, Higher Read Success API. Only does webpage read. Get API key from https://olostep.com/
|
||||
# - OLOSTEP_API_KEY=your_olostep_api_key
|
||||
# Paid, Open API. Does both web search and webpage read. Get API key from https://firecrawl.dev/
|
||||
# - FIRECRAWL_API_KEY=your_firecrawl_api_key
|
||||
# Paid, Fast API. Does both web search and webpage read. Get API key from https://exa.ai/
|
||||
# - EXA_API_KEY=your_exa_api_key
|
||||
#
|
||||
# Uncomment the necessary lines below to make your instance publicly accessible.
|
||||
# Proceed with caution, especially if you are using anonymous mode.
|
||||
|
||||
@@ -19,13 +19,14 @@ Try it out yourself! https://app.khoj.dev
|
||||
Online search can work even with self-hosting! You have a few options:
|
||||
|
||||
- If you're using Docker, online search should work out of the box with [searxng](https://github.com/searxng/searxng) using our standard `docker-compose.yml`.
|
||||
- For a non-local, free solution, you can use [JinaAI's reader API](https://jina.ai/reader/) to search online and read webpages. You can get a free API key via https://jina.ai/reader. Set the `JINA_API_KEY` environment variable to your Jina AI reader API key to enable online search.
|
||||
- To get production-grade, fast online search, set the `SERPER_DEV_API_KEY` environment variable to your [Serper.dev](https://serper.dev/) API key. These search results include additional context like answer box, knowledge graph etc.
|
||||
- To use open, self-hostable search provider, set the `FIRECRAWL_API_KEY` environment variable to your [Firecrawl](https://firecrawl.dev) API key. These search results do not scrape social media results.
|
||||
- To use Exa search provider, set the `EXA_API_KEY` environment variable to your [Exa](https://exa.ai) API key.
|
||||
|
||||
### Webpage Reading
|
||||
|
||||
Out of the box, you **don't have to do anything to enable webpage reading**. Khoj will automatically read webpages by using the `requests` library. To get more distributed and scalable webpage reading, you can use the following options:
|
||||
Out of the box, you **don't have to do anything to enable webpage reading**. Khoj will automatically read webpages by using the `requests` library. To get faster, more readable webpages for Khoj, you can use the following options:
|
||||
|
||||
- If you're using Jina AI's reader API for search, it should work automatically for webpage reading as well.
|
||||
- For scalable webpage scraping, you can use [Firecrawl](https://www.firecrawl.dev/). Create a new [Webscraper](http://localhost:42110/server/admin/database/webscraper/add/). Set your Firecrawl API key to the Api Key field, and set the type to Firecrawl.
|
||||
- For advanced webpage reading, you can use [Olostep](https://www.olostep.com/). This has a higher success rate at reading webpages than the default webpage readers. Create a new [Webscraper](http://localhost:42110/server/admin/database/webscraper/add/). Set your Olostep API key to the Api Key field, and set the type to Olostep.
|
||||
- For open, self-hostable webpage reader, you can use [Firecrawl](https://www.firecrawl.dev/). Create a new [Webscraper](http://localhost:42110/server/admin/database/webscraper/add/). Set your Firecrawl API key to the Api Key field, and set the type to Firecrawl.
|
||||
- For advanced webpage reading, you can use [Olostep](https://www.olostep.com/). This can read a wider variety of webpages. Create a new [Webscraper](http://localhost:42110/server/admin/database/webscraper/add/). Set your Olostep API key to the Api Key field, and set the type to Olostep.
|
||||
- For fast webpage reading, you can use [Exa](https://exa.ai). Create a new [Webscraper](http://localhost:42110/server/admin/database/webscraper/add/). Set your Exa API key to the Api Key field, and set the type to Exa.
|
||||
|
||||
@@ -1437,6 +1437,16 @@ class ConversationAdapters:
|
||||
enabled_scrapers = [scraper async for scraper in WebScraper.objects.all().order_by("priority").aiterator()]
|
||||
if not enabled_scrapers:
|
||||
# Use scrapers enabled via environment variables
|
||||
if os.getenv("EXA_API_KEY"):
|
||||
api_url = os.getenv("EXA_API_URL", "https://api.exa.ai")
|
||||
enabled_scrapers.append(
|
||||
WebScraper(
|
||||
type=WebScraper.WebScraperType.EXA,
|
||||
name=WebScraper.WebScraperType.EXA.capitalize(),
|
||||
api_key=os.getenv("EXA_API_KEY"),
|
||||
api_url=api_url,
|
||||
)
|
||||
)
|
||||
if os.getenv("OLOSTEP_API_KEY"):
|
||||
api_url = os.getenv("OLOSTEP_API_URL", "https://agent.olostep.com/olostep-p2p-incomingAPI")
|
||||
enabled_scrapers.append(
|
||||
@@ -1457,17 +1467,6 @@ class ConversationAdapters:
|
||||
api_url=api_url,
|
||||
)
|
||||
)
|
||||
# Jina is the default fallback scrapers to use as it does not require an API key
|
||||
api_url = os.getenv("JINA_READER_API_URL", "https://r.jina.ai/")
|
||||
enabled_scrapers.append(
|
||||
WebScraper(
|
||||
type=WebScraper.WebScraperType.JINA,
|
||||
name=WebScraper.WebScraperType.JINA.capitalize(),
|
||||
api_key=os.getenv("JINA_API_KEY"),
|
||||
api_url=api_url,
|
||||
)
|
||||
)
|
||||
|
||||
# Only enable the direct web page scraper by default in self-hosted single user setups.
|
||||
# Useful for reading webpages on your intranet.
|
||||
if state.anonymous_mode or in_debug_mode():
|
||||
|
||||
26
src/khoj/database/migrations/0095_alter_webscraper_type.py
Normal file
26
src/khoj/database/migrations/0095_alter_webscraper_type.py
Normal file
@@ -0,0 +1,26 @@
|
||||
# Generated by Django 5.1.14 on 2025-11-12 19:25
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Restrict WebScraper.type choices to Firecrawl, Olostep, Exa and Direct,
    and make "Direct" the default scraper type."""

    dependencies = [
        ("database", "0094_serverchatsettings_think_free_deep_and_more"),
    ]

    operations = [
        # Redefine the `type` field so its choices match the current
        # WebScraper.WebScraperType enum (note: "Exa" is now a choice).
        migrations.AlterField(
            model_name="webscraper",
            name="type",
            field=models.CharField(
                choices=[
                    ("Firecrawl", "Firecrawl"),
                    ("Olostep", "Olostep"),
                    ("Exa", "Exa"),
                    ("Direct", "Direct"),
                ],
                default="Direct",
                max_length=20,
            ),
        ),
    ]
|
||||
@@ -393,7 +393,7 @@ class WebScraper(DbBaseModel):
|
||||
class WebScraperType(models.TextChoices):
|
||||
FIRECRAWL = "Firecrawl"
|
||||
OLOSTEP = "Olostep"
|
||||
JINA = "Jina"
|
||||
EXA = "Exa"
|
||||
DIRECT = "Direct"
|
||||
|
||||
name = models.CharField(
|
||||
@@ -404,7 +404,7 @@ class WebScraper(DbBaseModel):
|
||||
unique=True,
|
||||
help_text="Friendly name. If not set, it will be set to the type of the scraper.",
|
||||
)
|
||||
type = models.CharField(max_length=20, choices=WebScraperType.choices, default=WebScraperType.JINA)
|
||||
type = models.CharField(max_length=20, choices=WebScraperType.choices, default=WebScraperType.DIRECT)
|
||||
api_key = models.CharField(
|
||||
max_length=200,
|
||||
default=None,
|
||||
@@ -436,8 +436,8 @@ class WebScraper(DbBaseModel):
|
||||
self.api_url = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev")
|
||||
elif self.type == self.WebScraperType.OLOSTEP:
|
||||
self.api_url = os.getenv("OLOSTEP_API_URL", "https://agent.olostep.com/olostep-p2p-incomingAPI")
|
||||
elif self.type == self.WebScraperType.JINA:
|
||||
self.api_url = os.getenv("JINA_READER_API_URL", "https://r.jina.ai/")
|
||||
elif self.type == self.WebScraperType.EXA:
|
||||
self.api_url = os.getenv("EXA_API_URL", "https://api.exa.ai")
|
||||
if self.api_key is None:
|
||||
if self.type == self.WebScraperType.FIRECRAWL:
|
||||
self.api_key = os.getenv("FIRECRAWL_API_KEY")
|
||||
@@ -447,8 +447,10 @@ class WebScraper(DbBaseModel):
|
||||
self.api_key = os.getenv("OLOSTEP_API_KEY")
|
||||
if self.api_key is None:
|
||||
error["api_key"] = "Set API key to use Olostep. Get API key from https://olostep.com/."
|
||||
elif self.type == self.WebScraperType.JINA:
|
||||
self.api_key = os.getenv("JINA_API_KEY")
|
||||
elif self.type == self.WebScraperType.EXA:
|
||||
self.api_key = os.getenv("EXA_API_KEY")
|
||||
if self.api_key is None:
|
||||
error["api_key"] = "Set API key to use Exa. Get API key from https://exa.ai/."
|
||||
if error:
|
||||
raise ValidationError(error)
|
||||
|
||||
|
||||
@@ -156,7 +156,7 @@ async def search_online(
|
||||
link = organic.get("link")
|
||||
if link in webpages and idx < max_webpages_to_read:
|
||||
webpages[link]["queries"].add(subquery)
|
||||
# Content of web pages is directly available when Jina is used for search.
|
||||
# Content of web pages can be directly available when Exa is used for search.
|
||||
elif idx < max_webpages_to_read:
|
||||
webpages[link] = {"queries": {subquery}, "content": organic.get("content")}
|
||||
# Only keep webpage content for up to max_webpages_to_read organic results.
|
||||
@@ -199,7 +199,8 @@ async def search_with_exa(query: str, location: LocationData) -> Tuple[str, Dict
|
||||
Tuple containing the original query and a dictionary of search results
|
||||
"""
|
||||
# Set up API endpoint and headers
|
||||
exa_search_api_endpoint = "https://api.exa.ai/search"
|
||||
exa_api_base = os.getenv("EXA_API_URL", "https://api.exa.ai")
|
||||
exa_search_api_endpoint = f"{exa_api_base}/search"
|
||||
headers = {"Content-Type": "application/json", "x-api-key": EXA_API_KEY}
|
||||
|
||||
# Prepare request payload
|
||||
@@ -507,6 +508,8 @@ async def read_webpage(
|
||||
return await read_webpage_with_firecrawl(url, api_key, api_url), None
|
||||
elif scraper_type == WebScraper.WebScraperType.OLOSTEP:
|
||||
return await read_webpage_with_olostep(url, api_key, api_url), None
|
||||
elif scraper_type == WebScraper.WebScraperType.EXA:
|
||||
return await read_webpage_with_exa(url, api_key, api_url), None
|
||||
else:
|
||||
return await read_webpage_at_url(url), None
|
||||
|
||||
@@ -583,6 +586,23 @@ async def read_webpage_with_olostep(web_url: str, api_key: str, api_url: str) ->
|
||||
return response_json["markdown_content"]
|
||||
|
||||
|
||||
async def read_webpage_with_exa(web_url: str, api_key: str, api_url: str) -> str:
    """Read a webpage through the Exa `/contents` API and return its extracted text.

    Args:
        web_url: URL of the page to read.
        api_key: Exa API key, sent via the `x-api-key` header.
        api_url: Base URL of the Exa API (e.g. https://api.exa.ai).

    Raises:
        aiohttp.ClientResponseError: If Exa responds with an error status.
    """
    endpoint = f"{api_url}/contents"
    request_headers = {"Content-Type": "application/json", "x-api-key": api_key}
    # Request the page text; "livecrawl": "fallback" presumably makes Exa live-crawl
    # only when cached content is unavailable — confirm against Exa API docs.
    payload = {
        "urls": [web_url],
        "text": True,
        "livecrawl": "fallback",
        "livecrawlTimeout": 15000,
    }

    async with aiohttp.ClientSession() as session:
        response = await session.post(endpoint, json=payload, headers=request_headers, timeout=WEBPAGE_REQUEST_TIMEOUT)
        async with response:
            response.raise_for_status()
            body = await response.json()
    # Exa returns one result per requested URL; we requested exactly one.
    return body["results"][0]["text"]
|
||||
|
||||
|
||||
async def read_webpage_with_firecrawl(web_url: str, api_key: str, api_url: str) -> str:
|
||||
firecrawl_api_url = f"{api_url}/v1/scrape"
|
||||
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
|
||||
|
||||
@@ -813,7 +813,7 @@ def is_web_search_enabled():
|
||||
for search_config in [
|
||||
"GOOGLE_SEARCH_API_KEY",
|
||||
"SERPER_DEV_API_KEY",
|
||||
"JINA_API_KEY",
|
||||
"EXA_API_KEY",
|
||||
"FIRECRAWL_API_KEY",
|
||||
"KHOJ_SEARXNG_URL",
|
||||
]
|
||||
|
||||
Reference in New Issue
Block a user