Fallback through enabled scrapers to reduce web page read failures

- Set up scrapers via API keys, explicitly adding them via admin panel
  or enabling only a single scraper to use via server chat settings.

- Use validation to ensure only valid scrapers added via admin panel
  Example API key is present for scrapers that require it etc.

- Modularize the read webpage functions to take api key, url as args
  Removes dependence on constants loaded in online_search. Functions
  are now mostly self contained

- Improve ability to read webpages by using the speed, success rate of
  different scrapers. Optimal configuration needs to be discovered
This commit is contained in:
Debanjum Singh Solanky
2024-10-16 00:37:46 -07:00
parent 11c64791aa
commit d94abba2dc
6 changed files with 196 additions and 76 deletions

View File

@@ -1,3 +1,4 @@
import os
import re
import uuid
from random import choice
@@ -12,8 +13,6 @@ from django.utils.translation import gettext_lazy
from pgvector.django import VectorField
from phonenumber_field.modelfields import PhoneNumberField
from khoj.utils.helpers import ConversationCommand
class BaseModel(models.Model):
created_at = models.DateTimeField(auto_now_add=True)
@@ -245,19 +244,58 @@ class GithubRepoConfig(BaseModel):
github_config = models.ForeignKey(GithubConfig, on_delete=models.CASCADE, related_name="githubrepoconfig")
class ServerChatSettings(BaseModel):
class WebScraper(models.TextChoices):
class WebScraper(BaseModel):
class WebScraperType(models.TextChoices):
FIRECRAWL = "firecrawl", gettext_lazy("Firecrawl")
OLOSTEP = "olostep", gettext_lazy("Olostep")
JINAAI = "jinaai", gettext_lazy("JinaAI")
JINA = "jina", gettext_lazy("Jina")
name = models.CharField(max_length=200, default=None, null=True, blank=True, unique=True)
type = models.CharField(max_length=20, choices=WebScraperType.choices, default=WebScraperType.JINA)
api_key = models.CharField(max_length=200, default=None, null=True, blank=True)
api_url = models.URLField(max_length=200, default=None, null=True, blank=True)
def clean(self):
error = {}
if self.name is None:
self.name = self.type.capitalize()
if self.api_url is None:
if self.type == self.WebScraperType.FIRECRAWL:
self.api_url = os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev")
elif self.type == self.WebScraperType.OLOSTEP:
self.api_url = os.getenv("OLOSTEP_API_URL", "https://agent.olostep.com/olostep-p2p-incomingAPI")
elif self.type == self.WebScraperType.JINA:
self.api_url = os.getenv("JINA_READER_API_URL", "https://r.jina.ai/")
if self.api_key is None:
if self.type == self.WebScraperType.FIRECRAWL:
self.api_key = os.getenv("FIRECRAWL_API_KEY")
if not self.api_key and self.api_url == "https://api.firecrawl.dev":
error["api_key"] = "Set API key to use default Firecrawl. Get API key from https://firecrawl.dev."
elif self.type == self.WebScraperType.OLOSTEP:
self.api_key = os.getenv("OLOSTEP_API_KEY")
if self.api_key is None:
error["api_key"] = "Set API key to use Olostep. Get API key from https://olostep.com/."
elif self.type == self.WebScraperType.JINA:
self.api_key = os.getenv("JINA_API_KEY")
if error:
raise ValidationError(error)
def save(self, *args, **kwargs):
self.clean()
super().save(*args, **kwargs)
class ServerChatSettings(BaseModel):
chat_default = models.ForeignKey(
ChatModelOptions, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name="chat_default"
)
chat_advanced = models.ForeignKey(
ChatModelOptions, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name="chat_advanced"
)
web_scraper = models.CharField(max_length=20, choices=WebScraper.choices, default=WebScraper.JINAAI)
web_scraper = models.ForeignKey(
WebScraper, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name="web_scraper"
)
class LocalOrgConfig(BaseModel):