mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-05 05:39:11 +00:00
Fallback through enabled scrapers to reduce web page read failures
- Set up scrapers via API keys, explicitly adding them via admin panel or enabling only a single scraper to use via server chat settings. - Use validation to ensure only valid scrapers added via admin panel Example API key is present for scrapers that require it etc. - Modularize the read webpage functions to take api key, url as args Removes dependence on constants loaded in online_search. Functions are now mostly self contained - Improve ability to read webpages by using the speed, success rate of different scrapers. Optimal configuration needs to be discovered
This commit is contained in:
@@ -1,3 +1,4 @@
|
||||
import os
|
||||
import re
|
||||
import uuid
|
||||
from random import choice
|
||||
@@ -12,8 +13,6 @@ from django.utils.translation import gettext_lazy
|
||||
from pgvector.django import VectorField
|
||||
from phonenumber_field.modelfields import PhoneNumberField
|
||||
|
||||
from khoj.utils.helpers import ConversationCommand
|
||||
|
||||
|
||||
class BaseModel(models.Model):
|
||||
created_at = models.DateTimeField(auto_now_add=True)
|
||||
@@ -245,19 +244,58 @@ class GithubRepoConfig(BaseModel):
|
||||
github_config = models.ForeignKey(GithubConfig, on_delete=models.CASCADE, related_name="githubrepoconfig")
|
||||
|
||||
|
||||
class ServerChatSettings(BaseModel):
|
||||
class WebScraper(models.TextChoices):
|
||||
class WebScraper(BaseModel):
|
||||
class WebScraperType(models.TextChoices):
|
||||
FIRECRAWL = "firecrawl", gettext_lazy("Firecrawl")
|
||||
OLOSTEP = "olostep", gettext_lazy("Olostep")
|
||||
JINAAI = "jinaai", gettext_lazy("JinaAI")
|
||||
JINA = "jina", gettext_lazy("Jina")
|
||||
|
||||
name = models.CharField(max_length=200, default=None, null=True, blank=True, unique=True)
|
||||
type = models.CharField(max_length=20, choices=WebScraperType.choices, default=WebScraperType.JINA)
|
||||
api_key = models.CharField(max_length=200, default=None, null=True, blank=True)
|
||||
api_url = models.URLField(max_length=200, default=None, null=True, blank=True)
|
||||
|
||||
def clean(self):
|
||||
error = {}
|
||||
if self.name is None:
|
||||
self.name = self.type.capitalize()
|
||||
if self.api_url is None:
|
||||
if self.type == self.WebScraperType.FIRECRAWL:
|
||||
self.api_url = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev")
|
||||
elif self.type == self.WebScraperType.OLOSTEP:
|
||||
self.api_url = os.getenv("OLOSTEP_API_URL", "https://agent.olostep.com/olostep-p2p-incomingAPI")
|
||||
elif self.type == self.WebScraperType.JINA:
|
||||
self.api_url = os.getenv("JINA_READER_API_URL", "https://r.jina.ai/")
|
||||
if self.api_key is None:
|
||||
if self.type == self.WebScraperType.FIRECRAWL:
|
||||
self.api_key = os.getenv("FIRECRAWL_API_KEY")
|
||||
if not self.api_key and self.api_url == "https://api.firecrawl.dev":
|
||||
error["api_key"] = "Set API key to use default Firecrawl. Get API key from https://firecrawl.dev."
|
||||
elif self.type == self.WebScraperType.OLOSTEP:
|
||||
self.api_key = os.getenv("OLOSTEP_API_KEY")
|
||||
if self.api_key is None:
|
||||
error["api_key"] = "Set API key to use Olostep. Get API key from https://olostep.com/."
|
||||
elif self.type == self.WebScraperType.JINA:
|
||||
self.api_key = os.getenv("JINA_API_KEY")
|
||||
|
||||
if error:
|
||||
raise ValidationError(error)
|
||||
|
||||
def save(self, *args, **kwargs):
|
||||
self.clean()
|
||||
super().save(*args, **kwargs)
|
||||
|
||||
|
||||
class ServerChatSettings(BaseModel):
|
||||
chat_default = models.ForeignKey(
|
||||
ChatModelOptions, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name="chat_default"
|
||||
)
|
||||
chat_advanced = models.ForeignKey(
|
||||
ChatModelOptions, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name="chat_advanced"
|
||||
)
|
||||
web_scraper = models.CharField(max_length=20, choices=WebScraper.choices, default=WebScraper.JINAAI)
|
||||
web_scraper = models.ForeignKey(
|
||||
WebScraper, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name="web_scraper"
|
||||
)
|
||||
|
||||
|
||||
class LocalOrgConfig(BaseModel):
|
||||
|
||||
Reference in New Issue
Block a user