mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-09 13:25:11 +00:00
Limit the number of URLs the web scraper can extract for scraping
This commit is contained in:
@@ -54,6 +54,7 @@ OLOSTEP_QUERY_PARAMS = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
DEFAULT_MAX_WEBPAGES_TO_READ = 1
|
DEFAULT_MAX_WEBPAGES_TO_READ = 1
|
||||||
|
MAX_WEBPAGES_TO_INFER = 10
|
||||||
|
|
||||||
|
|
||||||
async def search_online(
|
async def search_online(
|
||||||
@@ -157,6 +158,7 @@ async def read_webpages(
|
|||||||
query_images: List[str] = None,
|
query_images: List[str] = None,
|
||||||
agent: Agent = None,
|
agent: Agent = None,
|
||||||
tracer: dict = {},
|
tracer: dict = {},
|
||||||
|
max_webpages_to_read: int = DEFAULT_MAX_WEBPAGES_TO_READ,
|
||||||
):
|
):
|
||||||
"Infer web pages to read from the query and extract relevant information from them"
|
"Infer web pages to read from the query and extract relevant information from them"
|
||||||
logger.info(f"Inferring web pages to read")
|
logger.info(f"Inferring web pages to read")
|
||||||
@@ -165,6 +167,9 @@ async def read_webpages(
|
|||||||
yield {ChatEvent.STATUS: event}
|
yield {ChatEvent.STATUS: event}
|
||||||
urls = await infer_webpage_urls(query, conversation_history, location, user, query_images)
|
urls = await infer_webpage_urls(query, conversation_history, location, user, query_images)
|
||||||
|
|
||||||
|
# Keep only the first max_webpages_to_read inferred URLs to read
|
||||||
|
urls = urls[:max_webpages_to_read]
|
||||||
|
|
||||||
logger.info(f"Reading web pages at: {urls}")
|
logger.info(f"Reading web pages at: {urls}")
|
||||||
if send_status_func:
|
if send_status_func:
|
||||||
webpage_links_str = "\n- " + "\n- ".join(list(urls))
|
webpage_links_str = "\n- " + "\n- ".join(list(urls))
|
||||||
|
|||||||
Reference in New Issue
Block a user