mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 21:19:12 +00:00
Make web scraper priority configurable via admin panel
- Simplifies changing the order in which web scrapers are invoked to read a web page: just change their priority number on the admin panel. Previously you had to delete and re-add the scrapers to change their priority. - Add help text for each scraper field to ease the admin setup experience. - Use a friendlier env var name to enable Firecrawl's LLM-based content extraction. - Remove the separate friendly name for scraper types; reuse the actual type name and make the actual name more readable instead.
This commit is contained in:
@@ -1054,10 +1054,10 @@ class ConversationAdapters:
|
||||
(server_webscraper.type, server_webscraper.api_key, server_webscraper.api_url, server_webscraper.name)
|
||||
]
|
||||
if not enabled_scrapers:
|
||||
# Use the enabled web scrapers, using the newest created scraper first, until get web page content
|
||||
# Use the enabled web scrapers, ordered by priority, until one of them returns the web page content
|
||||
enabled_scrapers = [
|
||||
(scraper.type, scraper.api_key, scraper.api_url, scraper.name)
|
||||
async for scraper in WebScraper.objects.all().order_by("-created_at").aiterator()
|
||||
async for scraper in WebScraper.objects.all().order_by("priority").aiterator()
|
||||
]
|
||||
if not enabled_scrapers:
|
||||
# Use scrapers enabled via environment variables
|
||||
|
||||
@@ -206,6 +206,7 @@ class ServerChatSettingsAdmin(admin.ModelAdmin):
|
||||
@admin.register(WebScraper)
|
||||
class WebScraperAdmin(admin.ModelAdmin):
|
||||
list_display = (
|
||||
"priority",
|
||||
"name",
|
||||
"type",
|
||||
"api_key",
|
||||
@@ -213,7 +214,7 @@ class WebScraperAdmin(admin.ModelAdmin):
|
||||
"created_at",
|
||||
)
|
||||
search_fields = ("name", "api_key", "api_url", "type")
|
||||
ordering = ("-created_at",)
|
||||
ordering = ("priority",)
|
||||
|
||||
|
||||
@admin.register(Conversation)
|
||||
|
||||
@@ -1,47 +0,0 @@
|
||||
# Generated by Django 5.0.8 on 2024-10-16 06:51
|
||||
|
||||
import django.db.models.deletion
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Create the ``WebScraper`` table and add ``ServerChatSettings.web_scraper``.

    The ``WebScraper`` model stores an optional unique name, the scraper
    type (one of the lowercase keys listed below), and an optional API key
    and API URL. ``ServerChatSettings`` gains an optional foreign key to it.
    """

    dependencies = [("database", "0067_alter_agent_style_icon")]

    # Scraper types selectable in this schema revision: (stored value, label).
    _scraper_type_choices = [("firecrawl", "Firecrawl"), ("olostep", "Olostep"), ("jina", "Jina")]

    # Columns of the WebScraper table, in creation order.
    _web_scraper_fields = [
        ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")),
        ("created_at", models.DateTimeField(auto_now_add=True)),
        ("updated_at", models.DateTimeField(auto_now=True)),
        ("name", models.CharField(max_length=200, unique=True, null=True, blank=True, default=None)),
        ("type", models.CharField(max_length=20, choices=_scraper_type_choices, default="jina")),
        ("api_key", models.CharField(max_length=200, null=True, blank=True, default=None)),
        ("api_url", models.URLField(null=True, blank=True, default=None)),
    ]

    # Optional reference from the server chat settings to a web scraper.
    # CASCADE: removing the referenced scraper removes the settings row too.
    _server_web_scraper_link = models.ForeignKey(
        to="database.webscraper",
        related_name="web_scraper",
        on_delete=django.db.models.deletion.CASCADE,
        null=True,
        blank=True,
        default=None,
    )

    operations = [
        migrations.CreateModel(
            name="WebScraper",
            fields=_web_scraper_fields,
            options={"abstract": False},
        ),
        migrations.AddField(
            model_name="serverchatsettings",
            name="web_scraper",
            field=_server_web_scraper_link,
        ),
    ]
|
||||
@@ -0,0 +1,89 @@
|
||||
# Generated by Django 5.0.8 on 2024-10-18 00:41
|
||||
|
||||
import django.db.models.deletion
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Create the ``WebScraper`` table and add ``ServerChatSettings.web_scraper``.

    The ``WebScraper`` model stores an optional unique name, the scraper
    type, an optional API key and API URL, and an optional unique integer
    priority. ``ServerChatSettings`` gains an optional foreign key to it.
    """

    dependencies = [("database", "0068_alter_agent_output_modes")]

    # Scraper types selectable in this schema revision: (stored value, label).
    _scraper_type_choices = [
        ("Firecrawl", "Firecrawl"),
        ("Olostep", "Olostep"),
        ("Jina", "Jina"),
        ("Direct", "Direct"),
    ]

    # Columns of the WebScraper table, in creation order.
    _web_scraper_fields = [
        ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")),
        ("created_at", models.DateTimeField(auto_now_add=True)),
        ("updated_at", models.DateTimeField(auto_now=True)),
        (
            "name",
            models.CharField(
                max_length=200,
                unique=True,
                null=True,
                blank=True,
                default=None,
                help_text="Friendly name. If not set, it will be set to the type of the scraper.",
            ),
        ),
        ("type", models.CharField(max_length=20, choices=_scraper_type_choices, default="Jina")),
        (
            "api_key",
            models.CharField(
                max_length=200,
                null=True,
                blank=True,
                default=None,
                help_text="API key of the web scraper. Only set if scraper service requires an API key. Default is set from env var.",
            ),
        ),
        (
            "api_url",
            models.URLField(
                null=True,
                blank=True,
                default=None,
                help_text="API URL of the web scraper. Only set if scraper service on non-default URL.",
            ),
        ),
        (
            "priority",
            models.IntegerField(
                unique=True,
                null=True,
                blank=True,
                default=None,
                help_text="Priority of the web scraper. Lower numbers run first.",
            ),
        ),
    ]

    # Optional reference from the server chat settings to a web scraper.
    # CASCADE: removing the referenced scraper removes the settings row too.
    _server_web_scraper_link = models.ForeignKey(
        to="database.webscraper",
        related_name="web_scraper",
        on_delete=django.db.models.deletion.CASCADE,
        null=True,
        blank=True,
        default=None,
    )

    operations = [
        migrations.CreateModel(
            name="WebScraper",
            fields=_web_scraper_fields,
            options={"abstract": False},
        ),
        migrations.AddField(
            model_name="serverchatsettings",
            name="web_scraper",
            field=_server_web_scraper_link,
        ),
    ]
|
||||
@@ -9,7 +9,6 @@ from django.core.exceptions import ValidationError
|
||||
from django.db import models
|
||||
from django.db.models.signals import pre_save
|
||||
from django.dispatch import receiver
|
||||
from django.utils.translation import gettext_lazy
|
||||
from pgvector.django import VectorField
|
||||
from phonenumber_field.modelfields import PhoneNumberField
|
||||
|
||||
@@ -246,15 +245,41 @@ class GithubRepoConfig(BaseModel):
|
||||
|
||||
class WebScraper(BaseModel):
|
||||
class WebScraperType(models.TextChoices):
|
||||
FIRECRAWL = "firecrawl", gettext_lazy("Firecrawl")
|
||||
OLOSTEP = "olostep", gettext_lazy("Olostep")
|
||||
JINA = "jina", gettext_lazy("Jina")
|
||||
DIRECT = "direct", gettext_lazy("Direct")
|
||||
FIRECRAWL = "Firecrawl"
|
||||
OLOSTEP = "Olostep"
|
||||
JINA = "Jina"
|
||||
DIRECT = "Direct"
|
||||
|
||||
name = models.CharField(max_length=200, default=None, null=True, blank=True, unique=True)
|
||||
name = models.CharField(
|
||||
max_length=200,
|
||||
default=None,
|
||||
null=True,
|
||||
blank=True,
|
||||
unique=True,
|
||||
help_text="Friendly name. If not set, it will be set to the type of the scraper.",
|
||||
)
|
||||
type = models.CharField(max_length=20, choices=WebScraperType.choices, default=WebScraperType.JINA)
|
||||
api_key = models.CharField(max_length=200, default=None, null=True, blank=True)
|
||||
api_url = models.URLField(max_length=200, default=None, null=True, blank=True)
|
||||
api_key = models.CharField(
|
||||
max_length=200,
|
||||
default=None,
|
||||
null=True,
|
||||
blank=True,
|
||||
help_text="API key of the web scraper. Only set if scraper service requires an API key. Default is set from env var.",
|
||||
)
|
||||
api_url = models.URLField(
|
||||
max_length=200,
|
||||
default=None,
|
||||
null=True,
|
||||
blank=True,
|
||||
help_text="API URL of the web scraper. Only set if scraper service on non-default URL.",
|
||||
)
|
||||
priority = models.IntegerField(
|
||||
default=None,
|
||||
null=True,
|
||||
blank=True,
|
||||
unique=True,
|
||||
help_text="Priority of the web scraper. Lower numbers run first.",
|
||||
)
|
||||
|
||||
def clean(self):
|
||||
error = {}
|
||||
@@ -278,12 +303,16 @@ class WebScraper(BaseModel):
|
||||
error["api_key"] = "Set API key to use Olostep. Get API key from https://olostep.com/."
|
||||
elif self.type == self.WebScraperType.JINA:
|
||||
self.api_key = os.getenv("JINA_API_KEY")
|
||||
|
||||
if error:
|
||||
raise ValidationError(error)
|
||||
|
||||
def save(self, *args, **kwargs):
    """Validate this scraper, give it a default priority if unset, then persist it.

    ``clean()`` runs first, so a ``ValidationError`` it raises aborts the save.
    A scraper saved without a priority is placed after all existing ones: it
    gets the current highest priority plus one, or 1 when no usable maximum
    exists (no scrapers yet, or the highest priority is 0).

    All positional and keyword arguments are forwarded to the parent ``save``.
    """
    self.clean()

    if self.priority is None:
        # Highest priority currently stored; None when no row has a priority.
        highest = WebScraper.objects.aggregate(models.Max("priority"))["priority__max"]
        if highest:
            self.priority = highest + 1
        else:
            self.priority = 1

    super().save(*args, **kwargs)
|
||||
|
||||
|
||||
|
||||
@@ -36,7 +36,7 @@ SERPER_DEV_URL = "https://google.serper.dev/search"
|
||||
JINA_SEARCH_API_URL = "https://s.jina.ai/"
|
||||
JINA_API_KEY = os.getenv("JINA_API_KEY")
|
||||
|
||||
FIRECRAWL_TO_EXTRACT = is_env_var_true("FIRECRAWL_TO_EXTRACT")
|
||||
FIRECRAWL_USE_LLM_EXTRACT = is_env_var_true("FIRECRAWL_USE_LLM_EXTRACT")
|
||||
|
||||
OLOSTEP_QUERY_PARAMS = {
|
||||
"timeout": 35, # seconds
|
||||
@@ -179,7 +179,7 @@ async def read_webpages(
|
||||
async def read_webpage(
|
||||
url, scraper_type=None, api_key=None, api_url=None, subqueries=None, agent=None
|
||||
) -> Tuple[str | None, str | None]:
|
||||
if scraper_type == WebScraper.WebScraperType.FIRECRAWL and FIRECRAWL_TO_EXTRACT:
|
||||
if scraper_type == WebScraper.WebScraperType.FIRECRAWL and FIRECRAWL_USE_LLM_EXTRACT:
|
||||
return None, await query_webpage_with_firecrawl(url, subqueries, api_key, api_url, agent)
|
||||
elif scraper_type == WebScraper.WebScraperType.FIRECRAWL:
|
||||
return await read_webpage_with_firecrawl(url, api_key, api_url), None
|
||||
|
||||
Reference in New Issue
Block a user