mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-09 05:39:12 +00:00
Read webpages directly when the Olostep proxy is not set up
This is useful for self-hosted, individual-user, low-traffic setups where a proxy service is not required.
This commit is contained in:
@@ -36,7 +36,7 @@ classifiers = [
|
|||||||
"Topic :: Text Processing :: Linguistic",
|
"Topic :: Text Processing :: Linguistic",
|
||||||
]
|
]
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bs4 >= 0.0.1",
|
"beautifulsoup4 ~= 4.12.3",
|
||||||
"dateparser >= 1.1.1",
|
"dateparser >= 1.1.1",
|
||||||
"defusedxml == 0.7.1",
|
"defusedxml == 0.7.1",
|
||||||
"fastapi >= 0.104.1",
|
"fastapi >= 0.104.1",
|
||||||
@@ -58,7 +58,6 @@ dependencies = [
|
|||||||
"langchain <= 0.2.0",
|
"langchain <= 0.2.0",
|
||||||
"langchain-openai >= 0.0.5",
|
"langchain-openai >= 0.0.5",
|
||||||
"requests >= 2.26.0",
|
"requests >= 2.26.0",
|
||||||
"bs4 >= 0.0.1",
|
|
||||||
"anyio == 3.7.1",
|
"anyio == 3.7.1",
|
||||||
"pymupdf >= 1.23.5",
|
"pymupdf >= 1.23.5",
|
||||||
"django == 4.2.10",
|
"django == 4.2.10",
|
||||||
@@ -76,6 +75,7 @@ dependencies = [
|
|||||||
"openai-whisper >= 20231117",
|
"openai-whisper >= 20231117",
|
||||||
"django-phonenumber-field == 7.3.0",
|
"django-phonenumber-field == 7.3.0",
|
||||||
"phonenumbers == 8.13.27",
|
"phonenumbers == 8.13.27",
|
||||||
|
"markdownify ~= 0.11.6",
|
||||||
]
|
]
|
||||||
dynamic = ["version"]
|
dynamic = ["version"]
|
||||||
|
|
||||||
|
|||||||
@@ -6,6 +6,8 @@ from typing import Dict, Union
|
|||||||
|
|
||||||
import aiohttp
|
import aiohttp
|
||||||
import requests
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from markdownify import markdownify
|
||||||
|
|
||||||
from khoj.routers.helpers import extract_relevant_info, generate_online_subqueries
|
from khoj.routers.helpers import extract_relevant_info, generate_online_subqueries
|
||||||
from khoj.utils.helpers import is_none_or_empty, timer
|
from khoj.utils.helpers import is_none_or_empty, timer
|
||||||
@@ -101,7 +103,7 @@ async def search_with_google(query: str, conversation_history: dict, location: L
|
|||||||
async def read_webpage_and_extract_content(subquery, url):
|
async def read_webpage_and_extract_content(subquery, url):
|
||||||
try:
|
try:
|
||||||
with timer(f"Reading web page at '{url}' took", logger):
|
with timer(f"Reading web page at '{url}' took", logger):
|
||||||
content = await read_webpage_with_olostep(url)
|
content = await read_webpage_with_olostep(url) if OLOSTEP_API_KEY else await read_webpage(url)
|
||||||
with timer(f"Extracting relevant information from web page at '{url}' took", logger):
|
with timer(f"Extracting relevant information from web page at '{url}' took", logger):
|
||||||
extracted_info = await extract_relevant_info(subquery, {subquery: [content.strip()]}) if content else None
|
extracted_info = await extract_relevant_info(subquery, {subquery: [content.strip()]}) if content else None
|
||||||
return subquery, extracted_info
|
return subquery, extracted_info
|
||||||
@@ -110,6 +112,20 @@ async def read_webpage_and_extract_content(subquery, url):
|
|||||||
return subquery, None
|
return subquery, None
|
||||||
|
|
||||||
|
|
||||||
|
async def read_webpage(web_url: str) -> str:
    """Fetch a web page directly and return its text content.

    Downloads ``web_url`` with aiohttp, parses the HTML with BeautifulSoup's
    built-in ``html.parser``, extracts the visible text (one line per text
    node, surrounding whitespace stripped) and passes it through
    ``markdownify``.

    Args:
        web_url: Address of the page to read.

    Returns:
        The page text after ``markdownify`` processing.

    Raises:
        aiohttp.ClientResponseError: If the server answers with an HTTP
            error status (raised by ``response.raise_for_status()``).
    """
    # Send a desktop browser User-Agent string with the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
    }

    async with aiohttp.ClientSession() as session:
        async with session.get(web_url, headers=headers, timeout=30) as response:
            response.raise_for_status()
            html = await response.text()

    parsed_html = BeautifulSoup(html, "html.parser")
    # html.parser does not add a missing <body>, so `.body` is None for
    # fragments or body-less documents. Fall back to the whole document
    # instead of failing with AttributeError on `None.get_text`.
    content_root = parsed_html.body if parsed_html.body is not None else parsed_html
    body = content_root.get_text(separator="\n", strip=True)
    return markdownify(body)
|
||||||
|
|
||||||
|
|
||||||
async def read_webpage_with_olostep(web_url: str) -> str:
|
async def read_webpage_with_olostep(web_url: str) -> str:
|
||||||
headers = {"Authorization": f"Bearer {OLOSTEP_API_KEY}"}
|
headers = {"Authorization": f"Bearer {OLOSTEP_API_KEY}"}
|
||||||
web_scraping_params: Dict[str, Union[str, int, bool]] = OLOSTEP_QUERY_PARAMS.copy() # type: ignore
|
web_scraping_params: Dict[str, Union[str, int, bool]] = OLOSTEP_QUERY_PARAMS.copy() # type: ignore
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ import pytest
|
|||||||
from scipy.stats import linregress
|
from scipy.stats import linregress
|
||||||
|
|
||||||
from khoj.processor.embeddings import EmbeddingsModel
|
from khoj.processor.embeddings import EmbeddingsModel
|
||||||
from khoj.processor.tools.online_search import read_webpage_with_olostep
|
from khoj.processor.tools.online_search import read_webpage, read_webpage_with_olostep
|
||||||
from khoj.utils import helpers
|
from khoj.utils import helpers
|
||||||
|
|
||||||
|
|
||||||
@@ -84,13 +84,29 @@ def test_encode_docs_memory_leak():
|
|||||||
assert slope < 2, f"Memory leak suspected on {device}. Memory usage increased at ~{slope:.2f} MB per iteration"
|
assert slope < 2, f"Memory leak suspected on {device}. Memory usage increased at ~{slope:.2f} MB per iteration"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(os.getenv("OLOSTEP_API_KEY") is None, reason="OLOSTEP_API_KEY is not set")
|
@pytest.mark.asyncio
|
||||||
def test_olostep_api():
|
async def test_reading_webpage():
|
||||||
# Arrange
|
# Arrange
|
||||||
website = "https://en.wikipedia.org/wiki/Great_Chicago_Fire"
|
website = "https://en.wikipedia.org/wiki/Great_Chicago_Fire"
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
response = read_webpage_with_olostep(website)
|
response = await read_webpage(website)
|
||||||
|
|
||||||
|
# Assert
|
||||||
|
assert (
|
||||||
|
"An alarm sent from the area near the fire also failed to register at the courthouse where the fire watchmen were"
|
||||||
|
in response
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(os.getenv("OLOSTEP_API_KEY") is None, reason="OLOSTEP_API_KEY is not set")
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_reading_webpage_with_olostep():
|
||||||
|
# Arrange
|
||||||
|
website = "https://en.wikipedia.org/wiki/Great_Chicago_Fire"
|
||||||
|
|
||||||
|
# Act
|
||||||
|
response = await read_webpage_with_olostep(website)
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert (
|
assert (
|
||||||
|
|||||||
Reference in New Issue
Block a user