mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-09 21:29:11 +00:00
Make web scraper priority configurable via admin panel
- Simplifies changing the order in which web scrapers are invoked to read a web page: just change their priority number on the admin panel. Previously you'd have to delete and re-add the scrapers to change their priority. - Add help text for each scraper field to ease the admin setup experience - Friendlier env var to use Firecrawl's LLM to extract content - Remove use of a separate friendly name for scraper types. Reuse the actual name and just make the actual name better
This commit is contained in:
@@ -1054,10 +1054,10 @@ class ConversationAdapters:
|
|||||||
(server_webscraper.type, server_webscraper.api_key, server_webscraper.api_url, server_webscraper.name)
|
(server_webscraper.type, server_webscraper.api_key, server_webscraper.api_url, server_webscraper.name)
|
||||||
]
|
]
|
||||||
if not enabled_scrapers:
|
if not enabled_scrapers:
|
||||||
# Use the enabled web scrapers, using the newest created scraper first, until get web page content
|
# Use the enabled web scrapers, ordered by priority, until get web page content
|
||||||
enabled_scrapers = [
|
enabled_scrapers = [
|
||||||
(scraper.type, scraper.api_key, scraper.api_url, scraper.name)
|
(scraper.type, scraper.api_key, scraper.api_url, scraper.name)
|
||||||
async for scraper in WebScraper.objects.all().order_by("-created_at").aiterator()
|
async for scraper in WebScraper.objects.all().order_by("priority").aiterator()
|
||||||
]
|
]
|
||||||
if not enabled_scrapers:
|
if not enabled_scrapers:
|
||||||
# Use scrapers enabled via environment variables
|
# Use scrapers enabled via environment variables
|
||||||
|
|||||||
@@ -206,6 +206,7 @@ class ServerChatSettingsAdmin(admin.ModelAdmin):
|
|||||||
@admin.register(WebScraper)
|
@admin.register(WebScraper)
|
||||||
class WebScraperAdmin(admin.ModelAdmin):
|
class WebScraperAdmin(admin.ModelAdmin):
|
||||||
list_display = (
|
list_display = (
|
||||||
|
"priority",
|
||||||
"name",
|
"name",
|
||||||
"type",
|
"type",
|
||||||
"api_key",
|
"api_key",
|
||||||
@@ -213,7 +214,7 @@ class WebScraperAdmin(admin.ModelAdmin):
|
|||||||
"created_at",
|
"created_at",
|
||||||
)
|
)
|
||||||
search_fields = ("name", "api_key", "api_url", "type")
|
search_fields = ("name", "api_key", "api_url", "type")
|
||||||
ordering = ("-created_at",)
|
ordering = ("priority",)
|
||||||
|
|
||||||
|
|
||||||
@admin.register(Conversation)
|
@admin.register(Conversation)
|
||||||
|
|||||||
@@ -1,47 +0,0 @@
|
|||||||
# Generated by Django 5.0.8 on 2024-10-16 06:51
|
|
||||||
|
|
||||||
import django.db.models.deletion
|
|
||||||
from django.db import migrations, models
|
|
||||||
|
|
||||||
|
|
||||||
class Migration(migrations.Migration):
|
|
||||||
dependencies = [
|
|
||||||
("database", "0067_alter_agent_style_icon"),
|
|
||||||
]
|
|
||||||
|
|
||||||
operations = [
|
|
||||||
migrations.CreateModel(
|
|
||||||
name="WebScraper",
|
|
||||||
fields=[
|
|
||||||
("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")),
|
|
||||||
("created_at", models.DateTimeField(auto_now_add=True)),
|
|
||||||
("updated_at", models.DateTimeField(auto_now=True)),
|
|
||||||
("name", models.CharField(blank=True, default=None, max_length=200, null=True, unique=True)),
|
|
||||||
(
|
|
||||||
"type",
|
|
||||||
models.CharField(
|
|
||||||
choices=[("firecrawl", "Firecrawl"), ("olostep", "Olostep"), ("jina", "Jina")],
|
|
||||||
default="jina",
|
|
||||||
max_length=20,
|
|
||||||
),
|
|
||||||
),
|
|
||||||
("api_key", models.CharField(blank=True, default=None, max_length=200, null=True)),
|
|
||||||
("api_url", models.URLField(blank=True, default=None, null=True)),
|
|
||||||
],
|
|
||||||
options={
|
|
||||||
"abstract": False,
|
|
||||||
},
|
|
||||||
),
|
|
||||||
migrations.AddField(
|
|
||||||
model_name="serverchatsettings",
|
|
||||||
name="web_scraper",
|
|
||||||
field=models.ForeignKey(
|
|
||||||
blank=True,
|
|
||||||
default=None,
|
|
||||||
null=True,
|
|
||||||
on_delete=django.db.models.deletion.CASCADE,
|
|
||||||
related_name="web_scraper",
|
|
||||||
to="database.webscraper",
|
|
||||||
),
|
|
||||||
),
|
|
||||||
]
|
|
||||||
@@ -0,0 +1,89 @@
|
|||||||
|
# Generated by Django 5.0.8 on 2024-10-18 00:41
|
||||||
|
|
||||||
|
import django.db.models.deletion
|
||||||
|
from django.db import migrations, models
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
|
||||||
|
dependencies = [
|
||||||
|
("database", "0068_alter_agent_output_modes"),
|
||||||
|
]
|
||||||
|
|
||||||
|
operations = [
|
||||||
|
migrations.CreateModel(
|
||||||
|
name="WebScraper",
|
||||||
|
fields=[
|
||||||
|
("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")),
|
||||||
|
("created_at", models.DateTimeField(auto_now_add=True)),
|
||||||
|
("updated_at", models.DateTimeField(auto_now=True)),
|
||||||
|
(
|
||||||
|
"name",
|
||||||
|
models.CharField(
|
||||||
|
blank=True,
|
||||||
|
default=None,
|
||||||
|
help_text="Friendly name. If not set, it will be set to the type of the scraper.",
|
||||||
|
max_length=200,
|
||||||
|
null=True,
|
||||||
|
unique=True,
|
||||||
|
),
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"type",
|
||||||
|
models.CharField(
|
||||||
|
choices=[
|
||||||
|
("Firecrawl", "Firecrawl"),
|
||||||
|
("Olostep", "Olostep"),
|
||||||
|
("Jina", "Jina"),
|
||||||
|
("Direct", "Direct"),
|
||||||
|
],
|
||||||
|
default="Jina",
|
||||||
|
max_length=20,
|
||||||
|
),
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"api_key",
|
||||||
|
models.CharField(
|
||||||
|
blank=True,
|
||||||
|
default=None,
|
||||||
|
help_text="API key of the web scraper. Only set if scraper service requires an API key. Default is set from env var.",
|
||||||
|
max_length=200,
|
||||||
|
null=True,
|
||||||
|
),
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"api_url",
|
||||||
|
models.URLField(
|
||||||
|
blank=True,
|
||||||
|
default=None,
|
||||||
|
help_text="API URL of the web scraper. Only set if scraper service on non-default URL.",
|
||||||
|
null=True,
|
||||||
|
),
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"priority",
|
||||||
|
models.IntegerField(
|
||||||
|
blank=True,
|
||||||
|
default=None,
|
||||||
|
help_text="Priority of the web scraper. Lower numbers run first.",
|
||||||
|
null=True,
|
||||||
|
unique=True,
|
||||||
|
),
|
||||||
|
),
|
||||||
|
],
|
||||||
|
options={
|
||||||
|
"abstract": False,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
migrations.AddField(
|
||||||
|
model_name="serverchatsettings",
|
||||||
|
name="web_scraper",
|
||||||
|
field=models.ForeignKey(
|
||||||
|
blank=True,
|
||||||
|
default=None,
|
||||||
|
null=True,
|
||||||
|
on_delete=django.db.models.deletion.CASCADE,
|
||||||
|
related_name="web_scraper",
|
||||||
|
to="database.webscraper",
|
||||||
|
),
|
||||||
|
),
|
||||||
|
]
|
||||||
@@ -9,7 +9,6 @@ from django.core.exceptions import ValidationError
|
|||||||
from django.db import models
|
from django.db import models
|
||||||
from django.db.models.signals import pre_save
|
from django.db.models.signals import pre_save
|
||||||
from django.dispatch import receiver
|
from django.dispatch import receiver
|
||||||
from django.utils.translation import gettext_lazy
|
|
||||||
from pgvector.django import VectorField
|
from pgvector.django import VectorField
|
||||||
from phonenumber_field.modelfields import PhoneNumberField
|
from phonenumber_field.modelfields import PhoneNumberField
|
||||||
|
|
||||||
@@ -246,15 +245,41 @@ class GithubRepoConfig(BaseModel):
|
|||||||
|
|
||||||
class WebScraper(BaseModel):
|
class WebScraper(BaseModel):
|
||||||
class WebScraperType(models.TextChoices):
|
class WebScraperType(models.TextChoices):
|
||||||
FIRECRAWL = "firecrawl", gettext_lazy("Firecrawl")
|
FIRECRAWL = "Firecrawl"
|
||||||
OLOSTEP = "olostep", gettext_lazy("Olostep")
|
OLOSTEP = "Olostep"
|
||||||
JINA = "jina", gettext_lazy("Jina")
|
JINA = "Jina"
|
||||||
DIRECT = "direct", gettext_lazy("Direct")
|
DIRECT = "Direct"
|
||||||
|
|
||||||
name = models.CharField(max_length=200, default=None, null=True, blank=True, unique=True)
|
name = models.CharField(
|
||||||
|
max_length=200,
|
||||||
|
default=None,
|
||||||
|
null=True,
|
||||||
|
blank=True,
|
||||||
|
unique=True,
|
||||||
|
help_text="Friendly name. If not set, it will be set to the type of the scraper.",
|
||||||
|
)
|
||||||
type = models.CharField(max_length=20, choices=WebScraperType.choices, default=WebScraperType.JINA)
|
type = models.CharField(max_length=20, choices=WebScraperType.choices, default=WebScraperType.JINA)
|
||||||
api_key = models.CharField(max_length=200, default=None, null=True, blank=True)
|
api_key = models.CharField(
|
||||||
api_url = models.URLField(max_length=200, default=None, null=True, blank=True)
|
max_length=200,
|
||||||
|
default=None,
|
||||||
|
null=True,
|
||||||
|
blank=True,
|
||||||
|
help_text="API key of the web scraper. Only set if scraper service requires an API key. Default is set from env var.",
|
||||||
|
)
|
||||||
|
api_url = models.URLField(
|
||||||
|
max_length=200,
|
||||||
|
default=None,
|
||||||
|
null=True,
|
||||||
|
blank=True,
|
||||||
|
help_text="API URL of the web scraper. Only set if scraper service on non-default URL.",
|
||||||
|
)
|
||||||
|
priority = models.IntegerField(
|
||||||
|
default=None,
|
||||||
|
null=True,
|
||||||
|
blank=True,
|
||||||
|
unique=True,
|
||||||
|
help_text="Priority of the web scraper. Lower numbers run first.",
|
||||||
|
)
|
||||||
|
|
||||||
def clean(self):
|
def clean(self):
|
||||||
error = {}
|
error = {}
|
||||||
@@ -278,12 +303,16 @@ class WebScraper(BaseModel):
|
|||||||
error["api_key"] = "Set API key to use Olostep. Get API key from https://olostep.com/."
|
error["api_key"] = "Set API key to use Olostep. Get API key from https://olostep.com/."
|
||||||
elif self.type == self.WebScraperType.JINA:
|
elif self.type == self.WebScraperType.JINA:
|
||||||
self.api_key = os.getenv("JINA_API_KEY")
|
self.api_key = os.getenv("JINA_API_KEY")
|
||||||
|
|
||||||
if error:
|
if error:
|
||||||
raise ValidationError(error)
|
raise ValidationError(error)
|
||||||
|
|
||||||
def save(self, *args, **kwargs):
|
def save(self, *args, **kwargs):
|
||||||
self.clean()
|
self.clean()
|
||||||
|
|
||||||
|
if self.priority is None:
|
||||||
|
max_priority = WebScraper.objects.aggregate(models.Max("priority"))["priority__max"]
|
||||||
|
self.priority = max_priority + 1 if max_priority else 1
|
||||||
|
|
||||||
super().save(*args, **kwargs)
|
super().save(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -36,7 +36,7 @@ SERPER_DEV_URL = "https://google.serper.dev/search"
|
|||||||
JINA_SEARCH_API_URL = "https://s.jina.ai/"
|
JINA_SEARCH_API_URL = "https://s.jina.ai/"
|
||||||
JINA_API_KEY = os.getenv("JINA_API_KEY")
|
JINA_API_KEY = os.getenv("JINA_API_KEY")
|
||||||
|
|
||||||
FIRECRAWL_TO_EXTRACT = is_env_var_true("FIRECRAWL_TO_EXTRACT")
|
FIRECRAWL_USE_LLM_EXTRACT = is_env_var_true("FIRECRAWL_USE_LLM_EXTRACT")
|
||||||
|
|
||||||
OLOSTEP_QUERY_PARAMS = {
|
OLOSTEP_QUERY_PARAMS = {
|
||||||
"timeout": 35, # seconds
|
"timeout": 35, # seconds
|
||||||
@@ -179,7 +179,7 @@ async def read_webpages(
|
|||||||
async def read_webpage(
|
async def read_webpage(
|
||||||
url, scraper_type=None, api_key=None, api_url=None, subqueries=None, agent=None
|
url, scraper_type=None, api_key=None, api_url=None, subqueries=None, agent=None
|
||||||
) -> Tuple[str | None, str | None]:
|
) -> Tuple[str | None, str | None]:
|
||||||
if scraper_type == WebScraper.WebScraperType.FIRECRAWL and FIRECRAWL_TO_EXTRACT:
|
if scraper_type == WebScraper.WebScraperType.FIRECRAWL and FIRECRAWL_USE_LLM_EXTRACT:
|
||||||
return None, await query_webpage_with_firecrawl(url, subqueries, api_key, api_url, agent)
|
return None, await query_webpage_with_firecrawl(url, subqueries, api_key, api_url, agent)
|
||||||
elif scraper_type == WebScraper.WebScraperType.FIRECRAWL:
|
elif scraper_type == WebScraper.WebScraperType.FIRECRAWL:
|
||||||
return await read_webpage_with_firecrawl(url, api_key, api_url), None
|
return await read_webpage_with_firecrawl(url, api_key, api_url), None
|
||||||
|
|||||||
Reference in New Issue
Block a user