diff --git a/src/khoj/database/adapters/__init__.py b/src/khoj/database/adapters/__init__.py index 7be931c5..0f078a00 100644 --- a/src/khoj/database/adapters/__init__.py +++ b/src/khoj/database/adapters/__init__.py @@ -1054,10 +1054,10 @@ class ConversationAdapters: (server_webscraper.type, server_webscraper.api_key, server_webscraper.api_url, server_webscraper.name) ] if not enabled_scrapers: - # Use the enabled web scrapers, using the newest created scraper first, until get web page content + # Use the enabled web scrapers, ordered by priority, until one returns the web page content enabled_scrapers = [ (scraper.type, scraper.api_key, scraper.api_url, scraper.name) - async for scraper in WebScraper.objects.all().order_by("-created_at").aiterator() + async for scraper in WebScraper.objects.all().order_by("priority").aiterator() ] if not enabled_scrapers: # Use scrapers enabled via environment variables diff --git a/src/khoj/database/admin.py b/src/khoj/database/admin.py index 8e650922..5aa9204b 100644 --- a/src/khoj/database/admin.py +++ b/src/khoj/database/admin.py @@ -206,6 +206,7 @@ class ServerChatSettingsAdmin(admin.ModelAdmin): @admin.register(WebScraper) class WebScraperAdmin(admin.ModelAdmin): list_display = ( + "priority", "name", "type", "api_key", @@ -213,7 +214,7 @@ class WebScraperAdmin(admin.ModelAdmin): "created_at", ) search_fields = ("name", "api_key", "api_url", "type") - ordering = ("-created_at",) + ordering = ("priority",) @admin.register(Conversation) diff --git a/src/khoj/database/migrations/0068_webscraper_serverchatsettings_web_scraper.py b/src/khoj/database/migrations/0068_webscraper_serverchatsettings_web_scraper.py deleted file mode 100644 index 41d9c80b..00000000 --- a/src/khoj/database/migrations/0068_webscraper_serverchatsettings_web_scraper.py +++ /dev/null @@ -1,47 +0,0 @@ -# Generated by Django 5.0.8 on 2024-10-16 06:51 - -import django.db.models.deletion -from django.db import migrations, models - - -class Migration(migrations.Migration): - 
dependencies = [ - ("database", "0067_alter_agent_style_icon"), - ] - - operations = [ - migrations.CreateModel( - name="WebScraper", - fields=[ - ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), - ("created_at", models.DateTimeField(auto_now_add=True)), - ("updated_at", models.DateTimeField(auto_now=True)), - ("name", models.CharField(blank=True, default=None, max_length=200, null=True, unique=True)), - ( - "type", - models.CharField( - choices=[("firecrawl", "Firecrawl"), ("olostep", "Olostep"), ("jina", "Jina")], - default="jina", - max_length=20, - ), - ), - ("api_key", models.CharField(blank=True, default=None, max_length=200, null=True)), - ("api_url", models.URLField(blank=True, default=None, null=True)), - ], - options={ - "abstract": False, - }, - ), - migrations.AddField( - model_name="serverchatsettings", - name="web_scraper", - field=models.ForeignKey( - blank=True, - default=None, - null=True, - on_delete=django.db.models.deletion.CASCADE, - related_name="web_scraper", - to="database.webscraper", - ), - ), - ] diff --git a/src/khoj/database/migrations/0069_webscraper_serverchatsettings_web_scraper.py b/src/khoj/database/migrations/0069_webscraper_serverchatsettings_web_scraper.py new file mode 100644 index 00000000..3ea8ebe3 --- /dev/null +++ b/src/khoj/database/migrations/0069_webscraper_serverchatsettings_web_scraper.py @@ -0,0 +1,89 @@ +# Generated by Django 5.0.8 on 2024-10-18 00:41 + +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("database", "0068_alter_agent_output_modes"), + ] + + operations = [ + migrations.CreateModel( + name="WebScraper", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ( + "name", + models.CharField( 
+ blank=True, + default=None, + help_text="Friendly name. If not set, it will be set to the type of the scraper.", + max_length=200, + null=True, + unique=True, + ), + ), + ( + "type", + models.CharField( + choices=[ + ("Firecrawl", "Firecrawl"), + ("Olostep", "Olostep"), + ("Jina", "Jina"), + ("Direct", "Direct"), + ], + default="Jina", + max_length=20, + ), + ), + ( + "api_key", + models.CharField( + blank=True, + default=None, + help_text="API key of the web scraper. Only set if scraper service requires an API key. Default is set from env var.", + max_length=200, + null=True, + ), + ), + ( + "api_url", + models.URLField( + blank=True, + default=None, + help_text="API URL of the web scraper. Only set if scraper service on non-default URL.", + null=True, + ), + ), + ( + "priority", + models.IntegerField( + blank=True, + default=None, + help_text="Priority of the web scraper. Lower numbers run first.", + null=True, + unique=True, + ), + ), + ], + options={ + "abstract": False, + }, + ), + migrations.AddField( + model_name="serverchatsettings", + name="web_scraper", + field=models.ForeignKey( + blank=True, + default=None, + null=True, + on_delete=django.db.models.deletion.CASCADE, + related_name="web_scraper", + to="database.webscraper", + ), + ), + ] diff --git a/src/khoj/database/models/__init__.py b/src/khoj/database/models/__init__.py index 56f482ae..2b2fde2d 100644 --- a/src/khoj/database/models/__init__.py +++ b/src/khoj/database/models/__init__.py @@ -9,7 +9,6 @@ from django.core.exceptions import ValidationError from django.db import models from django.db.models.signals import pre_save from django.dispatch import receiver -from django.utils.translation import gettext_lazy from pgvector.django import VectorField from phonenumber_field.modelfields import PhoneNumberField @@ -246,15 +245,41 @@ class GithubRepoConfig(BaseModel): class WebScraper(BaseModel): class WebScraperType(models.TextChoices): - FIRECRAWL = "firecrawl", gettext_lazy("Firecrawl") - OLOSTEP = 
"olostep", gettext_lazy("Olostep") - JINA = "jina", gettext_lazy("Jina") - DIRECT = "direct", gettext_lazy("Direct") + FIRECRAWL = "Firecrawl" + OLOSTEP = "Olostep" + JINA = "Jina" + DIRECT = "Direct" - name = models.CharField(max_length=200, default=None, null=True, blank=True, unique=True) + name = models.CharField( + max_length=200, + default=None, + null=True, + blank=True, + unique=True, + help_text="Friendly name. If not set, it will be set to the type of the scraper.", + ) type = models.CharField(max_length=20, choices=WebScraperType.choices, default=WebScraperType.JINA) - api_key = models.CharField(max_length=200, default=None, null=True, blank=True) - api_url = models.URLField(max_length=200, default=None, null=True, blank=True) + api_key = models.CharField( + max_length=200, + default=None, + null=True, + blank=True, + help_text="API key of the web scraper. Only set if scraper service requires an API key. Default is set from env var.", + ) + api_url = models.URLField( + max_length=200, + default=None, + null=True, + blank=True, + help_text="API URL of the web scraper. Only set if scraper service on non-default URL.", + ) + priority = models.IntegerField( + default=None, + null=True, + blank=True, + unique=True, + help_text="Priority of the web scraper. Lower numbers run first.", + ) def clean(self): error = {} @@ -278,12 +303,16 @@ class WebScraper(BaseModel): error["api_key"] = "Set API key to use Olostep. Get API key from https://olostep.com/." 
elif self.type == self.WebScraperType.JINA: self.api_key = os.getenv("JINA_API_KEY") - if error: raise ValidationError(error) def save(self, *args, **kwargs): self.clean() + + if self.priority is None: + max_priority = WebScraper.objects.aggregate(models.Max("priority"))["priority__max"] + self.priority = max_priority + 1 if max_priority else 1 + super().save(*args, **kwargs) diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py index c00660e3..fee0fa03 100644 --- a/src/khoj/processor/tools/online_search.py +++ b/src/khoj/processor/tools/online_search.py @@ -36,7 +36,7 @@ SERPER_DEV_URL = "https://google.serper.dev/search" JINA_SEARCH_API_URL = "https://s.jina.ai/" JINA_API_KEY = os.getenv("JINA_API_KEY") -FIRECRAWL_TO_EXTRACT = is_env_var_true("FIRECRAWL_TO_EXTRACT") +FIRECRAWL_USE_LLM_EXTRACT = is_env_var_true("FIRECRAWL_USE_LLM_EXTRACT") OLOSTEP_QUERY_PARAMS = { "timeout": 35, # seconds @@ -179,7 +179,7 @@ async def read_webpages( async def read_webpage( url, scraper_type=None, api_key=None, api_url=None, subqueries=None, agent=None ) -> Tuple[str | None, str | None]: - if scraper_type == WebScraper.WebScraperType.FIRECRAWL and FIRECRAWL_TO_EXTRACT: + if scraper_type == WebScraper.WebScraperType.FIRECRAWL and FIRECRAWL_USE_LLM_EXTRACT: return None, await query_webpage_with_firecrawl(url, subqueries, api_key, api_url, agent) elif scraper_type == WebScraper.WebScraperType.FIRECRAWL: return await read_webpage_with_firecrawl(url, api_key, api_url), None