mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-06 05:39:12 +00:00
Read webpages directly when the Olostep proxy is not set up
This is useful for self-hosted, individual-user, low-traffic setups where a proxy service is not required.
This commit is contained in:
@@ -6,6 +6,8 @@ from typing import Dict, Union
|
||||
|
||||
import aiohttp
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from markdownify import markdownify
|
||||
|
||||
from khoj.routers.helpers import extract_relevant_info, generate_online_subqueries
|
||||
from khoj.utils.helpers import is_none_or_empty, timer
|
||||
@@ -101,7 +103,7 @@ async def search_with_google(query: str, conversation_history: dict, location: L
|
||||
async def read_webpage_and_extract_content(subquery, url):
|
||||
try:
|
||||
with timer(f"Reading web page at '{url}' took", logger):
|
||||
content = await read_webpage_with_olostep(url)
|
||||
content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage(url)
|
||||
with timer(f"Extracting relevant information from web page at '{url}' took", logger):
|
||||
extracted_info = await extract_relevant_info(subquery, {subquery: [content.strip()]}) if content else None
|
||||
return subquery, extracted_info
|
||||
@@ -110,6 +112,20 @@ async def read_webpage_and_extract_content(subquery, url):
|
||||
return subquery, None
|
||||
|
||||
|
||||
async def read_webpage(web_url: str) -> str:
    """Fetch a web page directly and return its text content.

    The page is requested with a browser-like User-Agent header, parsed with
    BeautifulSoup's ``html.parser`` and reduced to its text, one text node per
    line, before being passed through ``markdownify``.

    Args:
        web_url: Address of the page to download.

    Returns:
        The text extracted from the page's ``<body>``, or from the whole
        document when the response has no ``<body>`` element.

    Raises:
        aiohttp.ClientResponseError: If the server answers with an HTTP error
            status (raised by ``response.raise_for_status()``).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
    }
    # Same 30 second overall limit as a bare `timeout=30`, spelled with the
    # explicit timeout object instead of the legacy numeric form.
    request_timeout = aiohttp.ClientTimeout(total=30)

    async with aiohttp.ClientSession() as session:
        async with session.get(web_url, headers=headers, timeout=request_timeout) as response:
            response.raise_for_status()
            html = await response.text()

    parsed_html = BeautifulSoup(html, "html.parser")
    # html.parser does not synthesize a <body> for fragments or plain-text
    # responses, so `.body` can be None; fall back to the document root
    # instead of failing with AttributeError.
    content_root = parsed_html.body if parsed_html.body is not None else parsed_html
    body = content_root.get_text(separator="\n", strip=True)
    return markdownify(body)
|
||||
|
||||
|
||||
async def read_webpage_with_olostep(web_url: str) -> str:
|
||||
headers = {"Authorization": f"Bearer {OLOSTEP_API_KEY}"}
|
||||
web_scraping_params: Dict[str, Union[str, int, bool]] = OLOSTEP_QUERY_PARAMS.copy() # type: ignore
|
||||
|
||||
Reference in New Issue
Block a user