Make web scraper priority configurable via admin panel

- Simplifies changing the order in which web scrapers are invoked to read
  a web page: just change their priority number on the admin panel.
  Previously you'd have to delete and re-add the scrapers to change
  their priority.

- Add help text for each scraper field to ease admin setup experience

- Friendlier env var to use Firecrawl's LLM to extract content

- Remove use of separate friendly name for scraper types.
  Reuse actual name and just make actual name better
This commit is contained in:
Debanjum Singh Solanky
2024-10-17 16:22:46 -07:00
parent 20b6f0c2f4
commit 0db52786ed
6 changed files with 133 additions and 61 deletions

View File

@@ -1054,10 +1054,10 @@ class ConversationAdapters:
(server_webscraper.type, server_webscraper.api_key, server_webscraper.api_url, server_webscraper.name)
]
if not enabled_scrapers:
# Use the enabled web scrapers, using the newest created scraper first, until get web page content
# Use the enabled web scrapers, ordered by priority, until get web page content
enabled_scrapers = [
(scraper.type, scraper.api_key, scraper.api_url, scraper.name)
async for scraper in WebScraper.objects.all().order_by("-created_at").aiterator()
async for scraper in WebScraper.objects.all().order_by("priority").aiterator()
]
if not enabled_scrapers:
# Use scrapers enabled via environment variables

View File

@@ -206,6 +206,7 @@ class ServerChatSettingsAdmin(admin.ModelAdmin):
@admin.register(WebScraper)
class WebScraperAdmin(admin.ModelAdmin):
list_display = (
"priority",
"name",
"type",
"api_key",
@@ -213,7 +214,7 @@ class WebScraperAdmin(admin.ModelAdmin):
"created_at",
)
search_fields = ("name", "api_key", "api_url", "type")
ordering = ("-created_at",)
ordering = ("priority",)
@admin.register(Conversation)

View File

@@ -1,47 +0,0 @@
# Generated by Django 5.0.8 on 2024-10-16 06:51
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
    """Create the WebScraper model and add a web_scraper FK to ServerChatSettings.

    NOTE(review): this file is shown as deleted in this commit; it is replaced
    by a regenerated migration that adds help text and a priority field.
    """

    # Must run after the agent style-icon migration.
    dependencies = [
        ("database", "0067_alter_agent_style_icon"),
    ]

    operations = [
        migrations.CreateModel(
            name="WebScraper",
            fields=[
                ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")),
                # Timestamps presumably inherited from the project's base model convention.
                ("created_at", models.DateTimeField(auto_now_add=True)),
                ("updated_at", models.DateTimeField(auto_now=True)),
                # Optional, unique display name for the scraper.
                ("name", models.CharField(blank=True, default=None, max_length=200, null=True, unique=True)),
                (
                    "type",
                    models.CharField(
                        choices=[("firecrawl", "Firecrawl"), ("olostep", "Olostep"), ("jina", "Jina")],
                        default="jina",
                        max_length=20,
                    ),
                ),
                # Optional credentials/endpoint for the scraper service.
                ("api_key", models.CharField(blank=True, default=None, max_length=200, null=True)),
                ("api_url", models.URLField(blank=True, default=None, null=True)),
            ],
            options={
                "abstract": False,
            },
        ),
        # Optional FK from ServerChatSettings to a WebScraper; deleting the
        # scraper cascades to the settings row.
        migrations.AddField(
            model_name="serverchatsettings",
            name="web_scraper",
            field=models.ForeignKey(
                blank=True,
                default=None,
                null=True,
                on_delete=django.db.models.deletion.CASCADE,
                related_name="web_scraper",
                to="database.webscraper",
            ),
        ),
    ]

View File

@@ -0,0 +1,89 @@
# Generated by Django 5.0.8 on 2024-10-18 00:41
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
    """Create the WebScraper model (with admin help text and a unique priority
    field) and add a web_scraper FK to ServerChatSettings.
    """

    # Must run after the agent output-modes migration.
    dependencies = [
        ("database", "0068_alter_agent_output_modes"),
    ]

    operations = [
        migrations.CreateModel(
            name="WebScraper",
            fields=[
                ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")),
                # Timestamps presumably inherited from the project's base model convention.
                ("created_at", models.DateTimeField(auto_now_add=True)),
                ("updated_at", models.DateTimeField(auto_now=True)),
                (
                    "name",
                    models.CharField(
                        blank=True,
                        default=None,
                        help_text="Friendly name. If not set, it will be set to the type of the scraper.",
                        max_length=200,
                        null=True,
                        unique=True,
                    ),
                ),
                (
                    "type",
                    models.CharField(
                        # Choice values are now the capitalized names themselves
                        # (no separate human-readable label).
                        choices=[
                            ("Firecrawl", "Firecrawl"),
                            ("Olostep", "Olostep"),
                            ("Jina", "Jina"),
                            ("Direct", "Direct"),
                        ],
                        default="Jina",
                        max_length=20,
                    ),
                ),
                (
                    "api_key",
                    models.CharField(
                        blank=True,
                        default=None,
                        help_text="API key of the web scraper. Only set if scraper service requires an API key. Default is set from env var.",
                        max_length=200,
                        null=True,
                    ),
                ),
                (
                    "api_url",
                    models.URLField(
                        blank=True,
                        default=None,
                        help_text="API URL of the web scraper. Only set if scraper service on non-default URL.",
                        null=True,
                    ),
                ),
                (
                    # Unique invocation order; nullable so the model's save()
                    # can auto-assign the next priority when unset.
                    "priority",
                    models.IntegerField(
                        blank=True,
                        default=None,
                        help_text="Priority of the web scraper. Lower numbers run first.",
                        null=True,
                        unique=True,
                    ),
                ),
            ],
            options={
                "abstract": False,
            },
        ),
        # Optional FK from ServerChatSettings to a WebScraper; deleting the
        # scraper cascades to the settings row.
        migrations.AddField(
            model_name="serverchatsettings",
            name="web_scraper",
            field=models.ForeignKey(
                blank=True,
                default=None,
                null=True,
                on_delete=django.db.models.deletion.CASCADE,
                related_name="web_scraper",
                to="database.webscraper",
            ),
        ),
    ]

View File

@@ -9,7 +9,6 @@ from django.core.exceptions import ValidationError
from django.db import models
from django.db.models.signals import pre_save
from django.dispatch import receiver
from django.utils.translation import gettext_lazy
from pgvector.django import VectorField
from phonenumber_field.modelfields import PhoneNumberField
@@ -246,15 +245,41 @@ class GithubRepoConfig(BaseModel):
class WebScraper(BaseModel):
class WebScraperType(models.TextChoices):
FIRECRAWL = "firecrawl", gettext_lazy("Firecrawl")
OLOSTEP = "olostep", gettext_lazy("Olostep")
JINA = "jina", gettext_lazy("Jina")
DIRECT = "direct", gettext_lazy("Direct")
FIRECRAWL = "Firecrawl"
OLOSTEP = "Olostep"
JINA = "Jina"
DIRECT = "Direct"
name = models.CharField(max_length=200, default=None, null=True, blank=True, unique=True)
name = models.CharField(
max_length=200,
default=None,
null=True,
blank=True,
unique=True,
help_text="Friendly name. If not set, it will be set to the type of the scraper.",
)
type = models.CharField(max_length=20, choices=WebScraperType.choices, default=WebScraperType.JINA)
api_key = models.CharField(max_length=200, default=None, null=True, blank=True)
api_url = models.URLField(max_length=200, default=None, null=True, blank=True)
api_key = models.CharField(
max_length=200,
default=None,
null=True,
blank=True,
help_text="API key of the web scraper. Only set if scraper service requires an API key. Default is set from env var.",
)
api_url = models.URLField(
max_length=200,
default=None,
null=True,
blank=True,
help_text="API URL of the web scraper. Only set if scraper service on non-default URL.",
)
priority = models.IntegerField(
default=None,
null=True,
blank=True,
unique=True,
help_text="Priority of the web scraper. Lower numbers run first.",
)
def clean(self):
error = {}
@@ -278,12 +303,16 @@ class WebScraper(BaseModel):
error["api_key"] = "Set API key to use Olostep. Get API key from https://olostep.com/."
elif self.type == self.WebScraperType.JINA:
self.api_key = os.getenv("JINA_API_KEY")
if error:
raise ValidationError(error)
def save(self, *args, **kwargs):
    """Validate the scraper, auto-assign a priority if unset, then persist.

    When no priority is given, the scraper is placed after the current
    highest priority (lower numbers run first).
    """
    self.clean()
    if self.priority is None:
        # aggregate(Max(...)) yields None when no scrapers exist yet.
        max_priority = WebScraper.objects.aggregate(models.Max("priority"))["priority__max"]
        # Explicit None check instead of truthiness: a stored priority of 0 is
        # a valid value, and intent here is "table empty", not "max is falsy".
        # NOTE(review): not concurrency-safe — two simultaneous saves could
        # compute the same next priority and trip the unique constraint.
        self.priority = max_priority + 1 if max_priority is not None else 1
    super().save(*args, **kwargs)

View File

@@ -36,7 +36,7 @@ SERPER_DEV_URL = "https://google.serper.dev/search"
JINA_SEARCH_API_URL = "https://s.jina.ai/"
JINA_API_KEY = os.getenv("JINA_API_KEY")
FIRECRAWL_TO_EXTRACT = is_env_var_true("FIRECRAWL_TO_EXTRACT")
FIRECRAWL_USE_LLM_EXTRACT = is_env_var_true("FIRECRAWL_USE_LLM_EXTRACT")
OLOSTEP_QUERY_PARAMS = {
"timeout": 35, # seconds
@@ -179,7 +179,7 @@ async def read_webpages(
async def read_webpage(
url, scraper_type=None, api_key=None, api_url=None, subqueries=None, agent=None
) -> Tuple[str | None, str | None]:
if scraper_type == WebScraper.WebScraperType.FIRECRAWL and FIRECRAWL_TO_EXTRACT:
if scraper_type == WebScraper.WebScraperType.FIRECRAWL and FIRECRAWL_USE_LLM_EXTRACT:
return None, await query_webpage_with_firecrawl(url, subqueries, api_key, api_url, agent)
elif scraper_type == WebScraper.WebScraperType.FIRECRAWL:
return await read_webpage_with_firecrawl(url, api_key, api_url), None