diff --git a/docker-compose.yml b/docker-compose.yml index 46451fda..3bdb88a8 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -95,14 +95,14 @@ services: # Uncomment appropriate lines below to enable web results with Khoj # Ensure you set your provider specific API keys. # --- - # Free, Slower API. Does both web search and webpage read. Get API key from https://jina.ai/ - # - JINA_API_KEY=your_jina_api_key # Paid, Fast API. Only does web search. Get API key from https://serper.dev/ # - SERPER_DEV_API_KEY=your_serper_dev_api_key - # Paid, Fast, Open API. Only does webpage read. Get API key from https://firecrawl.dev/ - # - FIRECRAWL_API_KEY=your_firecrawl_api_key - # Paid, Fast, Higher Read Success API. Only does webpage read. Get API key from https://olostep.com/ + # Paid, Higher Read Success API. Only does webpage read. Get API key from https://olostep.com/ # - OLOSTEP_API_KEY=your_olostep_api_key + # Paid, Open API. Does both web search and webpage read. Get API key from https://firecrawl.dev/ + # - FIRECRAWL_API_KEY=your_firecrawl_api_key + # Paid, Fast API. Does both web search and webpage read. Get API key from https://exa.ai/ + # - EXA_API_KEY=your_exa_api_key # # Uncomment the necessary lines below to make your instance publicly accessible. # Proceed with caution, especially if you are using anonymous mode. diff --git a/documentation/docs/features/online_search.md b/documentation/docs/features/online_search.md index de9e0926..66bc9725 100644 --- a/documentation/docs/features/online_search.md +++ b/documentation/docs/features/online_search.md @@ -19,13 +19,14 @@ Try it out yourself! https://app.khoj.dev Online search can work even with self-hosting! You have a few options: - If you're using Docker, online search should work out of the box with [searxng](https://github.com/searxng/searxng) using our standard `docker-compose.yml`. 
-- For a non-local, free solution, you can use [JinaAI's reader API](https://jina.ai/reader/) to search online and read webpages. You can get a free API key via https://jina.ai/reader. Set the `JINA_API_KEY` environment variable to your Jina AI reader API key to enable online search. - To get production-grade, fast online search, set the `SERPER_DEV_API_KEY` environment variable to your [Serper.dev](https://serper.dev/) API key. These search results include additional context like answer box, knowledge graph etc. +- To use an open, self-hostable search provider, set the `FIRECRAWL_API_KEY` environment variable to your [Firecrawl](https://firecrawl.dev) API key. These search results do not scrape social media results. +- To use the Exa search provider, set the `EXA_API_KEY` environment variable to your [Exa](https://exa.ai) API key. ### Webpage Reading -Out of the box, you **don't have to do anything to enable webpage reading**. Khoj will automatically read webpages by using the `requests` library. To get more distributed and scalable webpage reading, you can use the following options: +Out of the box, you **don't have to do anything to enable webpage reading**. Khoj will automatically read webpages by using the `requests` library. To get faster, more readable webpages for Khoj, you can use the following options: -- If you're using Jina AI's reader API for search, it should work automatically for webpage reading as well. -- For scalable webpage scraping, you can use [Firecrawl](https://www.firecrawl.dev/). Create a new [Webscraper](http://localhost:42110/server/admin/database/webscraper/add/). Set your Firecrawl API key to the Api Key field, and set the type to Firecrawl. -- For advanced webpage reading, you can use [Olostep](https://www.olostep.com/). This has a higher success rate at reading webpages than the default webpage readers. Create a new [Webscraper](http://localhost:42110/server/admin/database/webscraper/add/). 
Set your Olostep API key to the Api Key field, and set the type to Olostep. +- For an open, self-hostable webpage reader, you can use [Firecrawl](https://www.firecrawl.dev/). Create a new [Webscraper](http://localhost:42110/server/admin/database/webscraper/add/). Set your Firecrawl API key to the Api Key field, and set the type to Firecrawl. +- For advanced webpage reading, you can use [Olostep](https://www.olostep.com/). This can read a wider variety of webpages. Create a new [Webscraper](http://localhost:42110/server/admin/database/webscraper/add/). Set your Olostep API key to the Api Key field, and set the type to Olostep. +- For fast webpage reading, you can use [Exa](https://exa.ai). Create a new [Webscraper](http://localhost:42110/server/admin/database/webscraper/add/). Set your Exa API key to the Api Key field, and set the type to Exa. diff --git a/src/khoj/database/adapters/__init__.py b/src/khoj/database/adapters/__init__.py index ea9c773f..1807cdca 100644 --- a/src/khoj/database/adapters/__init__.py +++ b/src/khoj/database/adapters/__init__.py @@ -1437,6 +1437,16 @@ class ConversationAdapters: enabled_scrapers = [scraper async for scraper in WebScraper.objects.all().order_by("priority").aiterator()] if not enabled_scrapers: # Use scrapers enabled via environment variables + if os.getenv("EXA_API_KEY"): + api_url = os.getenv("EXA_API_URL", "https://api.exa.ai") + enabled_scrapers.append( + WebScraper( + type=WebScraper.WebScraperType.EXA, + name=WebScraper.WebScraperType.EXA.capitalize(), + api_key=os.getenv("EXA_API_KEY"), + api_url=api_url, + ) + ) if os.getenv("OLOSTEP_API_KEY"): api_url = os.getenv("OLOSTEP_API_URL", "https://agent.olostep.com/olostep-p2p-incomingAPI") enabled_scrapers.append( @@ -1457,17 +1467,6 @@ class ConversationAdapters: api_url=api_url, ) ) - # Jina is the default fallback scrapers to use as it does not require an API key - api_url = os.getenv("JINA_READER_API_URL", "https://r.jina.ai/") - enabled_scrapers.append( - WebScraper( - 
type=WebScraper.WebScraperType.JINA, - name=WebScraper.WebScraperType.JINA.capitalize(), - api_key=os.getenv("JINA_API_KEY"), - api_url=api_url, - ) - ) - # Only enable the direct web page scraper by default in self-hosted single user setups. # Useful for reading webpages on your intranet. if state.anonymous_mode or in_debug_mode(): diff --git a/src/khoj/database/migrations/0095_alter_webscraper_type.py b/src/khoj/database/migrations/0095_alter_webscraper_type.py new file mode 100644 index 00000000..de78da85 --- /dev/null +++ b/src/khoj/database/migrations/0095_alter_webscraper_type.py @@ -0,0 +1,26 @@ +# Generated by Django 5.1.14 on 2025-11-12 19:25 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("database", "0094_serverchatsettings_think_free_deep_and_more"), + ] + + operations = [ + migrations.AlterField( + model_name="webscraper", + name="type", + field=models.CharField( + choices=[ + ("Firecrawl", "Firecrawl"), + ("Olostep", "Olostep"), + ("Exa", "Exa"), + ("Direct", "Direct"), + ], + default="Direct", + max_length=20, + ), + ), + ] diff --git a/src/khoj/database/models/__init__.py b/src/khoj/database/models/__init__.py index 90ed67a8..bdeb9480 100644 --- a/src/khoj/database/models/__init__.py +++ b/src/khoj/database/models/__init__.py @@ -393,7 +393,7 @@ class WebScraper(DbBaseModel): class WebScraperType(models.TextChoices): FIRECRAWL = "Firecrawl" OLOSTEP = "Olostep" - JINA = "Jina" + EXA = "Exa" DIRECT = "Direct" name = models.CharField( @@ -404,7 +404,7 @@ class WebScraper(DbBaseModel): unique=True, help_text="Friendly name. 
If not set, it will be set to the type of the scraper.", ) - type = models.CharField(max_length=20, choices=WebScraperType.choices, default=WebScraperType.JINA) + type = models.CharField(max_length=20, choices=WebScraperType.choices, default=WebScraperType.DIRECT) api_key = models.CharField( max_length=200, default=None, @@ -436,8 +436,8 @@ class WebScraper(DbBaseModel): self.api_url = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev") elif self.type == self.WebScraperType.OLOSTEP: self.api_url = os.getenv("OLOSTEP_API_URL", "https://agent.olostep.com/olostep-p2p-incomingAPI") - elif self.type == self.WebScraperType.JINA: - self.api_url = os.getenv("JINA_READER_API_URL", "https://r.jina.ai/") + elif self.type == self.WebScraperType.EXA: + self.api_url = os.getenv("EXA_API_URL", "https://api.exa.ai") if self.api_key is None: if self.type == self.WebScraperType.FIRECRAWL: self.api_key = os.getenv("FIRECRAWL_API_KEY") @@ -447,8 +447,10 @@ class WebScraper(DbBaseModel): self.api_key = os.getenv("OLOSTEP_API_KEY") if self.api_key is None: error["api_key"] = "Set API key to use Olostep. Get API key from https://olostep.com/." - elif self.type == self.WebScraperType.JINA: - self.api_key = os.getenv("JINA_API_KEY") + elif self.type == self.WebScraperType.EXA: + self.api_key = os.getenv("EXA_API_KEY") + if self.api_key is None: + error["api_key"] = "Set API key to use Exa. Get API key from https://exa.ai/." if error: raise ValidationError(error) diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py index ef8e324c..8779df5a 100644 --- a/src/khoj/processor/tools/online_search.py +++ b/src/khoj/processor/tools/online_search.py @@ -156,7 +156,7 @@ async def search_online( link = organic.get("link") if link in webpages and idx < max_webpages_to_read: webpages[link]["queries"].add(subquery) - # Content of web pages is directly available when Jina is used for search. 
+ # Content of web pages can be directly available when Exa is used for search. elif idx < max_webpages_to_read: webpages[link] = {"queries": {subquery}, "content": organic.get("content")} # Only keep webpage content for up to max_webpages_to_read organic results. @@ -199,7 +199,8 @@ async def search_with_exa(query: str, location: LocationData) -> Tuple[str, Dict Tuple containing the original query and a dictionary of search results """ # Set up API endpoint and headers - exa_search_api_endpoint = "https://api.exa.ai/search" + exa_api_base = os.getenv("EXA_API_URL", "https://api.exa.ai") + exa_search_api_endpoint = f"{exa_api_base}/search" headers = {"Content-Type": "application/json", "x-api-key": EXA_API_KEY} # Prepare request payload @@ -507,6 +508,8 @@ async def read_webpage( return await read_webpage_with_firecrawl(url, api_key, api_url), None elif scraper_type == WebScraper.WebScraperType.OLOSTEP: return await read_webpage_with_olostep(url, api_key, api_url), None + elif scraper_type == WebScraper.WebScraperType.EXA: + return await read_webpage_with_exa(url, api_key, api_url), None else: return await read_webpage_at_url(url), None @@ -583,6 +586,23 @@ async def read_webpage_with_olostep(web_url: str, api_key: str, api_url: str) -> return response_json["markdown_content"] +async def read_webpage_with_exa(web_url: str, api_key: str, api_url: str) -> str: + exa_api_url = f"{api_url}/contents" + headers = {"Content-Type": "application/json", "x-api-key": api_key} + params = { + "urls": [web_url], + "text": True, + "livecrawl": "fallback", + "livecrawlTimeout": 15000, + } + + async with aiohttp.ClientSession() as session: + async with session.post(exa_api_url, json=params, headers=headers, timeout=WEBPAGE_REQUEST_TIMEOUT) as response: + response.raise_for_status() + response_json = await response.json() + return response_json["results"][0]["text"] + + async def read_webpage_with_firecrawl(web_url: str, api_key: str, api_url: str) -> str: firecrawl_api_url = 
f"{api_url}/v1/scrape" headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"} diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py index 1afa9675..7b54a1a1 100644 --- a/src/khoj/utils/helpers.py +++ b/src/khoj/utils/helpers.py @@ -813,7 +813,7 @@ def is_web_search_enabled(): for search_config in [ "GOOGLE_SEARCH_API_KEY", "SERPER_DEV_API_KEY", - "JINA_API_KEY", + "EXA_API_KEY", "FIRECRAWL_API_KEY", "KHOJ_SEARXNG_URL", ]