From 61cb2d5b7e3349fdd6d06c92aa26b46fb8c9966d Mon Sep 17 00:00:00 2001 From: Debanjum Date: Wed, 12 Nov 2025 12:21:59 -0800 Subject: [PATCH] Enable webpage reading with Exa. Remove Jina web page reader Support using Exa for webpage reading. It seems much faster than currently available providers. Remove Jina as a webpage reader and remaining references to Jina from code, docs. It was anyway slow and API may shut down soon (as it was bought by Elastic). Update docs to mention Exa for web search and webpage reading. --- docker-compose.yml | 10 +++---- documentation/docs/features/online_search.md | 11 ++++---- src/khoj/database/adapters/__init__.py | 21 +++++++-------- .../migrations/0095_alter_webscraper_type.py | 26 +++++++++++++++++++ src/khoj/database/models/__init__.py | 14 +++++----- src/khoj/processor/tools/online_search.py | 24 +++++++++++++++-- src/khoj/utils/helpers.py | 2 +- 7 files changed, 78 insertions(+), 30 deletions(-) create mode 100644 src/khoj/database/migrations/0095_alter_webscraper_type.py diff --git a/docker-compose.yml b/docker-compose.yml index 46451fda..3bdb88a8 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -95,14 +95,14 @@ services: # Uncomment appropriate lines below to enable web results with Khoj # Ensure you set your provider specific API keys. # --- - # Free, Slower API. Does both web search and webpage read. Get API key from https://jina.ai/ - # - JINA_API_KEY=your_jina_api_key # Paid, Fast API. Only does web search. Get API key from https://serper.dev/ # - SERPER_DEV_API_KEY=your_serper_dev_api_key - # Paid, Fast, Open API. Only does webpage read. Get API key from https://firecrawl.dev/ - # - FIRECRAWL_API_KEY=your_firecrawl_api_key - # Paid, Fast, Higher Read Success API. Only does webpage read. Get API key from https://olostep.com/ + # Paid, Higher Read Success API. Only does webpage read. Get API key from https://olostep.com/ # - OLOSTEP_API_KEY=your_olostep_api_key + # Paid, Open API. 
Does both web search and webpage read. Get API key from https://firecrawl.dev/ + # - FIRECRAWL_API_KEY=your_firecrawl_api_key + # Paid, Fast API. Does both web search and webpage read. Get API key from https://exa.ai/ + # - EXA_API_KEY=your_exa_api_key # # Uncomment the necessary lines below to make your instance publicly accessible. # Proceed with caution, especially if you are using anonymous mode. diff --git a/documentation/docs/features/online_search.md b/documentation/docs/features/online_search.md index de9e0926..66bc9725 100644 --- a/documentation/docs/features/online_search.md +++ b/documentation/docs/features/online_search.md @@ -19,13 +19,14 @@ Try it out yourself! https://app.khoj.dev Online search can work even with self-hosting! You have a few options: - If you're using Docker, online search should work out of the box with [searxng](https://github.com/searxng/searxng) using our standard `docker-compose.yml`. -- For a non-local, free solution, you can use [JinaAI's reader API](https://jina.ai/reader/) to search online and read webpages. You can get a free API key via https://jina.ai/reader. Set the `JINA_API_KEY` environment variable to your Jina AI reader API key to enable online search. - To get production-grade, fast online search, set the `SERPER_DEV_API_KEY` environment variable to your [Serper.dev](https://serper.dev/) API key. These search results include additional context like answer box, knowledge graph etc. +- To use an open, self-hostable search provider, set the `FIRECRAWL_API_KEY` environment variable to your [Firecrawl](https://firecrawl.dev) API key. These search results do not scrape social media results. +- To use the Exa search provider, set the `EXA_API_KEY` environment variable to your [Exa](https://exa.ai) API key. ### Webpage Reading -Out of the box, you **don't have to do anything to enable webpage reading**. Khoj will automatically read webpages by using the `requests` library. 
To get more distributed and scalable webpage reading, you can use the following options: +Out of the box, you **don't have to do anything to enable webpage reading**. Khoj will automatically read webpages by using the `requests` library. To get faster, more readable webpages for Khoj, you can use the following options: -- If you're using Jina AI's reader API for search, it should work automatically for webpage reading as well. -- For scalable webpage scraping, you can use [Firecrawl](https://www.firecrawl.dev/). Create a new [Webscraper](http://localhost:42110/server/admin/database/webscraper/add/). Set your Firecrawl API key to the Api Key field, and set the type to Firecrawl. -- For advanced webpage reading, you can use [Olostep](https://www.olostep.com/). This has a higher success rate at reading webpages than the default webpage readers. Create a new [Webscraper](http://localhost:42110/server/admin/database/webscraper/add/). Set your Olostep API key to the Api Key field, and set the type to Olostep. +- For an open, self-hostable webpage reader, you can use [Firecrawl](https://www.firecrawl.dev/). Create a new [Webscraper](http://localhost:42110/server/admin/database/webscraper/add/). Set your Firecrawl API key to the Api Key field, and set the type to Firecrawl. +- For advanced webpage reading, you can use [Olostep](https://www.olostep.com/). This can read a wider variety of webpages. Create a new [Webscraper](http://localhost:42110/server/admin/database/webscraper/add/). Set your Olostep API key to the Api Key field, and set the type to Olostep. +- For fast webpage reading, you can use [Exa](https://exa.ai). Create a new [Webscraper](http://localhost:42110/server/admin/database/webscraper/add/). Set your Exa API key to the Api Key field, and set the type to Exa. 
diff --git a/src/khoj/database/adapters/__init__.py b/src/khoj/database/adapters/__init__.py index ea9c773f..1807cdca 100644 --- a/src/khoj/database/adapters/__init__.py +++ b/src/khoj/database/adapters/__init__.py @@ -1437,6 +1437,16 @@ class ConversationAdapters: enabled_scrapers = [scraper async for scraper in WebScraper.objects.all().order_by("priority").aiterator()] if not enabled_scrapers: # Use scrapers enabled via environment variables + if os.getenv("EXA_API_KEY"): + api_url = os.getenv("EXA_API_URL", "https://api.exa.ai") + enabled_scrapers.append( + WebScraper( + type=WebScraper.WebScraperType.EXA, + name=WebScraper.WebScraperType.EXA.capitalize(), + api_key=os.getenv("EXA_API_KEY"), + api_url=api_url, + ) + ) if os.getenv("OLOSTEP_API_KEY"): api_url = os.getenv("OLOSTEP_API_URL", "https://agent.olostep.com/olostep-p2p-incomingAPI") enabled_scrapers.append( @@ -1457,17 +1467,6 @@ class ConversationAdapters: api_url=api_url, ) ) - # Jina is the default fallback scrapers to use as it does not require an API key - api_url = os.getenv("JINA_READER_API_URL", "https://r.jina.ai/") - enabled_scrapers.append( - WebScraper( - type=WebScraper.WebScraperType.JINA, - name=WebScraper.WebScraperType.JINA.capitalize(), - api_key=os.getenv("JINA_API_KEY"), - api_url=api_url, - ) - ) - # Only enable the direct web page scraper by default in self-hosted single user setups. # Useful for reading webpages on your intranet. 
if state.anonymous_mode or in_debug_mode(): diff --git a/src/khoj/database/migrations/0095_alter_webscraper_type.py b/src/khoj/database/migrations/0095_alter_webscraper_type.py new file mode 100644 index 00000000..de78da85 --- /dev/null +++ b/src/khoj/database/migrations/0095_alter_webscraper_type.py @@ -0,0 +1,26 @@ +# Generated by Django 5.1.14 on 2025-11-12 19:25 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("database", "0094_serverchatsettings_think_free_deep_and_more"), + ] + + operations = [ + migrations.AlterField( + model_name="webscraper", + name="type", + field=models.CharField( + choices=[ + ("Firecrawl", "Firecrawl"), + ("Olostep", "Olostep"), + ("Exa", "Exa"), + ("Direct", "Direct"), + ], + default="Direct", + max_length=20, + ), + ), + ] diff --git a/src/khoj/database/models/__init__.py b/src/khoj/database/models/__init__.py index 90ed67a8..bdeb9480 100644 --- a/src/khoj/database/models/__init__.py +++ b/src/khoj/database/models/__init__.py @@ -393,7 +393,7 @@ class WebScraper(DbBaseModel): class WebScraperType(models.TextChoices): FIRECRAWL = "Firecrawl" OLOSTEP = "Olostep" - JINA = "Jina" + EXA = "Exa" DIRECT = "Direct" name = models.CharField( @@ -404,7 +404,7 @@ class WebScraper(DbBaseModel): unique=True, help_text="Friendly name. 
If not set, it will be set to the type of the scraper.", ) - type = models.CharField(max_length=20, choices=WebScraperType.choices, default=WebScraperType.JINA) + type = models.CharField(max_length=20, choices=WebScraperType.choices, default=WebScraperType.DIRECT) api_key = models.CharField( max_length=200, default=None, @@ -436,8 +436,8 @@ class WebScraper(DbBaseModel): self.api_url = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev") elif self.type == self.WebScraperType.OLOSTEP: self.api_url = os.getenv("OLOSTEP_API_URL", "https://agent.olostep.com/olostep-p2p-incomingAPI") - elif self.type == self.WebScraperType.JINA: - self.api_url = os.getenv("JINA_READER_API_URL", "https://r.jina.ai/") + elif self.type == self.WebScraperType.EXA: + self.api_url = os.getenv("EXA_API_URL", "https://api.exa.ai") if self.api_key is None: if self.type == self.WebScraperType.FIRECRAWL: self.api_key = os.getenv("FIRECRAWL_API_KEY") @@ -447,8 +447,10 @@ class WebScraper(DbBaseModel): self.api_key = os.getenv("OLOSTEP_API_KEY") if self.api_key is None: error["api_key"] = "Set API key to use Olostep. Get API key from https://olostep.com/." - elif self.type == self.WebScraperType.JINA: - self.api_key = os.getenv("JINA_API_KEY") + elif self.type == self.WebScraperType.EXA: + self.api_key = os.getenv("EXA_API_KEY") + if self.api_key is None: + error["api_key"] = "Set API key to use Exa. Get API key from https://exa.ai/." if error: raise ValidationError(error) diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py index ef8e324c..8779df5a 100644 --- a/src/khoj/processor/tools/online_search.py +++ b/src/khoj/processor/tools/online_search.py @@ -156,7 +156,7 @@ async def search_online( link = organic.get("link") if link in webpages and idx < max_webpages_to_read: webpages[link]["queries"].add(subquery) - # Content of web pages is directly available when Jina is used for search. 
+ # Content of web pages can be directly available when Exa is used for search. elif idx < max_webpages_to_read: webpages[link] = {"queries": {subquery}, "content": organic.get("content")} # Only keep webpage content for up to max_webpages_to_read organic results. @@ -199,7 +199,8 @@ async def search_with_exa(query: str, location: LocationData) -> Tuple[str, Dict Tuple containing the original query and a dictionary of search results """ # Set up API endpoint and headers - exa_search_api_endpoint = "https://api.exa.ai/search" + exa_api_base = os.getenv("EXA_API_URL", "https://api.exa.ai") + exa_search_api_endpoint = f"{exa_api_base}/search" headers = {"Content-Type": "application/json", "x-api-key": EXA_API_KEY} # Prepare request payload @@ -507,6 +508,8 @@ async def read_webpage( return await read_webpage_with_firecrawl(url, api_key, api_url), None elif scraper_type == WebScraper.WebScraperType.OLOSTEP: return await read_webpage_with_olostep(url, api_key, api_url), None + elif scraper_type == WebScraper.WebScraperType.EXA: + return await read_webpage_with_exa(url, api_key, api_url), None else: return await read_webpage_at_url(url), None @@ -583,6 +586,23 @@ async def read_webpage_with_olostep(web_url: str, api_key: str, api_url: str) -> return response_json["markdown_content"] +async def read_webpage_with_exa(web_url: str, api_key: str, api_url: str) -> str: + exa_api_url = f"{api_url}/contents" + headers = {"Content-Type": "application/json", "x-api-key": api_key} + params = { + "urls": [web_url], + "text": True, + "livecrawl": "fallback", + "livecrawlTimeout": 15000, + } + + async with aiohttp.ClientSession() as session: + async with session.post(exa_api_url, json=params, headers=headers, timeout=WEBPAGE_REQUEST_TIMEOUT) as response: + response.raise_for_status() + response_json = await response.json() + return response_json["results"][0]["text"] + + async def read_webpage_with_firecrawl(web_url: str, api_key: str, api_url: str) -> str: firecrawl_api_url = 
f"{api_url}/v1/scrape" headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"} diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py index 1afa9675..7b54a1a1 100644 --- a/src/khoj/utils/helpers.py +++ b/src/khoj/utils/helpers.py @@ -813,7 +813,7 @@ def is_web_search_enabled(): for search_config in [ "GOOGLE_SEARCH_API_KEY", "SERPER_DEV_API_KEY", - "JINA_API_KEY", + "EXA_API_KEY", "FIRECRAWL_API_KEY", "KHOJ_SEARXNG_URL", ]