Deprecate support for using Firecrawl webpage summarizer
Better speed and control by using Khoj's own webpage summarizer. Reduce code cruft by removing unused features.
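In practice the change splits webpage reading into two steps: every scraper now only fetches raw page content, and relevance extraction always runs on the Khoj side via its own summarizer (`extract_relevant_info`, still imported in the first hunk below). A minimal, standalone sketch of that split, with purely illustrative function names rather than the repository's API:

import asyncio
from typing import Optional, Tuple


async def fetch_page(url: str) -> Optional[str]:
    """Illustrative scraper stub: return raw page text/markdown, no LLM involved."""
    # A real scraper (Firecrawl, Olostep, or direct HTTP) would fetch and convert the page here.
    return f"raw markdown for {url}"


async def summarize_locally(content: str, query: str) -> str:
    """Illustrative stand-in for Khoj's own webpage summarizer (extract_relevant_info)."""
    return f"relevant extract of {len(content)} chars for query: {query}"


async def read_and_extract(url: str, query: str) -> Tuple[Optional[str], Optional[str]]:
    # Step 1: fetch raw content only (what read_webpage now does for every scraper type).
    content = await fetch_page(url)
    if not content:
        return None, None
    # Step 2: extract relevant information with the local summarizer, not the scraper's LLM.
    extracted = await summarize_locally(content, query)
    return content, extracted


if __name__ == "__main__":
    print(asyncio.run(read_and_extract("https://example.com", "what is this page about?")))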
@@ -16,7 +16,6 @@ from khoj.database.models import (
     KhojUser,
     WebScraper,
 )
-from khoj.processor.conversation import prompts
 from khoj.routers.helpers import (
     ChatEvent,
     extract_relevant_info,
@@ -41,7 +40,6 @@ AUTO_READ_WEBPAGE = is_env_var_true("KHOJ_AUTO_READ_WEBPAGE")
 SERPER_DEV_URL = "https://google.serper.dev/search"
 
 FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")
-FIRECRAWL_USE_LLM_EXTRACT = is_env_var_true("FIRECRAWL_USE_LLM_EXTRACT")
 
 SEARXNG_URL = os.getenv("KHOJ_SEARXNG_URL")
 
@@ -500,12 +498,8 @@ async def read_webpages_content(
     yield response
 
 
-async def read_webpage(
-    url, scraper_type=None, api_key=None, api_url=None, subqueries=None, agent=None
-) -> Tuple[str | None, str | None]:
-    if scraper_type == WebScraper.WebScraperType.FIRECRAWL and FIRECRAWL_USE_LLM_EXTRACT:
-        return None, await query_webpage_with_firecrawl(url, subqueries, api_key, api_url, agent)
-    elif scraper_type == WebScraper.WebScraperType.FIRECRAWL:
+async def read_webpage(url, scraper_type=None, api_key=None, api_url=None) -> Tuple[str | None, str | None]:
+    if scraper_type == WebScraper.WebScraperType.FIRECRAWL:
         return await read_webpage_with_firecrawl(url, api_key, api_url), None
     elif scraper_type == WebScraper.WebScraperType.OLOSTEP:
         return await read_webpage_with_olostep(url, api_key, api_url), None
@@ -536,9 +530,7 @@ async def read_webpage_and_extract_content(
             # Read the web page
             if is_none_or_empty(content):
                 with timer(f"Reading web page with {scraper.type} at '{url}' took", logger, log_level=logging.INFO):
-                    content, extracted_info = await read_webpage(
-                        url, scraper.type, scraper.api_key, scraper.api_url, subqueries, agent
-                    )
+                    content, extracted_info = await read_webpage(url, scraper.type, scraper.api_key, scraper.api_url)
 
             # Extract relevant information from the web page
             if is_none_or_empty(extracted_info):
@@ -624,44 +616,6 @@ async def read_webpage_with_firecrawl(web_url: str, api_key: str, api_url: str)
             return response_json["data"]["markdown"]
 
 
-async def query_webpage_with_firecrawl(
-    web_url: str, queries: set[str], api_key: str, api_url: str, agent: Agent = None
-) -> str:
-    firecrawl_api_url = f"{api_url}/v1/scrape"
-    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
-    schema = {
-        "type": "object",
-        "properties": {
-            "relevant_extract": {"type": "string"},
-        },
-        "required": [
-            "relevant_extract",
-        ],
-    }
-
-    personality_context = (
-        prompts.personality_context.format(personality=agent.personality) if agent and agent.personality else ""
-    )
-    system_prompt = f"""
-{prompts.system_prompt_extract_relevant_information}
-
-{personality_context}
-User Query: {", ".join(queries)}
-
-Collate only relevant information from the website to answer the target query and in the provided JSON schema.
-""".strip()
-
-    params = {"url": web_url, "formats": ["extract"], "extract": {"systemPrompt": system_prompt, "schema": schema}}
-
-    async with aiohttp.ClientSession() as session:
-        async with session.post(
-            firecrawl_api_url, json=params, headers=headers, timeout=WEBPAGE_REQUEST_TIMEOUT
-        ) as response:
-            response.raise_for_status()
-            response_json = await response.json()
-            return response_json["data"]["extract"]["relevant_extract"]
-
-
 def deduplicate_organic_results(online_results: dict) -> dict:
     """Deduplicate organic search results based on links across all queries."""
     # Keep track of seen links to filter out duplicates across queries
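For reference, a minimal standalone sketch of the plain Firecrawl scrape path that remains after this change. The /v1/scrape endpoint, Bearer auth header, FIRECRAWL_API_KEY env var, and data.markdown response field are taken from the hunks above; the formats request body, the 30-second timeout value, and the standalone structure are assumptions rather than the repository's exact implementation:

import asyncio
import os
from typing import Optional

import aiohttp

# Assumed timeout value; the repository defines WEBPAGE_REQUEST_TIMEOUT elsewhere.
WEBPAGE_REQUEST_TIMEOUT = 30


async def scrape_markdown(web_url: str, api_key: str, api_url: str) -> Optional[str]:
    """Fetch a webpage as markdown via a Firecrawl-style /v1/scrape endpoint.

    Mirrors the plain (non LLM-extract) path this commit keeps: no schema,
    no system prompt, just the rendered markdown from the response.
    """
    firecrawl_api_url = f"{api_url}/v1/scrape"
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
    # Request body shape is an assumption modeled on Firecrawl's v1 scrape API.
    params = {"url": web_url, "formats": ["markdown"]}

    async with aiohttp.ClientSession() as session:
        async with session.post(
            firecrawl_api_url, json=params, headers=headers, timeout=WEBPAGE_REQUEST_TIMEOUT
        ) as response:
            response.raise_for_status()
            response_json = await response.json()
            return response_json.get("data", {}).get("markdown")


if __name__ == "__main__":
    # Example usage with a hypothetical API host; FIRECRAWL_API_KEY matches the env var in the diff.
    content = asyncio.run(
        scrape_markdown("https://example.com", os.getenv("FIRECRAWL_API_KEY", ""), "https://api.firecrawl.dev")
    )
    print(content[:200] if content else "No content returned")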