Scrape results from Serper results using Olostep (#627)

* Initailize changes to incporate web scraping logic after getting SERP results
- Do some minor refactors to pass a symptom prompt to the openai model when making a query
- integrate Olostep in order to perform the webscraping
* Fix truncation error with new line, fix typing in olostep code
* Use the authorization header for the token
* Add a small hint/indicator for how to use Khojs other modalities in the welcome prompt
* Add more detailed error message if Olostep query fails
* Add unit tests which invoke Olostep in chat director
* Add test for olostep tool
This commit is contained in:
sabaimran
2024-01-29 00:46:50 -08:00
committed by GitHub
parent 360b59cdb2
commit b782683e60
12 changed files with 214 additions and 30 deletions

View File

@@ -57,11 +57,11 @@ def test_chat_with_no_chat_history_or_retrieved_content_gpt4all(client_offline_c
@pytest.mark.skipif(os.getenv("SERPER_DEV_API_KEY") is None, reason="requires SERPER_DEV_API_KEY")
@pytest.mark.chatquality
@pytest.mark.django_db(transaction=True)
def test_chat_with_online_content(chat_client):
def test_chat_with_online_content(client_offline_chat):
# Act
q = "/online give me the link to paul graham's essay how to do great work"
encoded_q = quote(q, safe="")
response = chat_client.get(f"/api/chat?q={encoded_q}&stream=true")
response = client_offline_chat.get(f"/api/chat?q={encoded_q}&stream=true")
response_message = response.content.decode("utf-8")
response_message = response_message.split("### compiled references")[0]
@@ -70,7 +70,31 @@ def test_chat_with_online_content(chat_client):
expected_responses = ["http://www.paulgraham.com/greatwork.html"]
assert response.status_code == 200
assert any([expected_response in response_message for expected_response in expected_responses]), (
"Expected assistants name, [K|k]hoj, in response but got: " + response_message
"Expected links or serper not setup in response but got: " + response_message
)
# ----------------------------------------------------------------------------------------------------
@pytest.mark.skipif(
os.getenv("SERPER_DEV_API_KEY") is None or os.getenv("OLOSTEP_API_KEY") is None,
reason="requires SERPER_DEV_API_KEY and OLOSTEP_API_KEY",
)
@pytest.mark.chatquality
@pytest.mark.django_db(transaction=True)
def test_chat_with_online_webpage_content(client_offline_chat):
# Act
q = "/online how many firefighters were involved in the great chicago fire and which year did it take place?"
encoded_q = quote(q, safe="")
response = client_offline_chat.get(f"/api/chat?q={encoded_q}&stream=true")
response_message = response.content.decode("utf-8")
response_message = response_message.split("### compiled references")[0]
# Assert
expected_responses = ["185", "1871", "horse"]
assert response.status_code == 200
assert any([expected_response in response_message for expected_response in expected_responses]), (
"Expected links or serper not setup in response but got: " + response_message
)

View File

@@ -1,3 +1,4 @@
import os
import secrets
import numpy as np
@@ -6,6 +7,7 @@ import pytest
from scipy.stats import linregress
from khoj.processor.embeddings import EmbeddingsModel
from khoj.processor.tools.online_search import search_with_olostep
from khoj.utils import helpers
@@ -80,3 +82,18 @@ def test_encode_docs_memory_leak():
# If slope is positive memory utilization is increasing
# Positive threshold of 2, from observing memory usage trend on MPS vs CPU device
assert slope < 2, f"Memory leak suspected on {device}. Memory usage increased at ~{slope:.2f} MB per iteration"
@pytest.mark.skipif(os.getenv("OLOSTEP_API_KEY") is None, reason="OLOSTEP_API_KEY is not set")
def test_olostep_api():
# Arrange
website = "https://en.wikipedia.org/wiki/Great_Chicago_Fire"
# Act
response = search_with_olostep(website)
# Assert
assert (
"An alarm sent from the area near the fire also failed to register at the courthouse where the fire watchmen were"
in response
)

View File

@@ -73,6 +73,30 @@ def test_chat_with_online_content(chat_client):
)
# ----------------------------------------------------------------------------------------------------
@pytest.mark.skipif(
os.getenv("SERPER_DEV_API_KEY") is None or os.getenv("OLOSTEP_API_KEY") is None,
reason="requires SERPER_DEV_API_KEY and OLOSTEP_API_KEY",
)
@pytest.mark.chatquality
@pytest.mark.django_db(transaction=True)
def test_chat_with_online_webpage_content(chat_client):
# Act
q = "/online how many firefighters were involved in the great chicago fire and which year did it take place?"
encoded_q = quote(q, safe="")
response = chat_client.get(f"/api/chat?q={encoded_q}&stream=true")
response_message = response.content.decode("utf-8")
response_message = response_message.split("### compiled references")[0]
# Assert
expected_responses = ["185", "1871", "horse"]
assert response.status_code == 200
assert any([expected_response in response_message for expected_response in expected_responses]), (
"Expected links or serper not setup in response but got: " + response_message
)
# ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db(transaction=True)
@pytest.mark.chatquality