mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-08 05:39:13 +00:00
Scrape results from Serper results using Olostep (#627)
* Initialize changes to incorporate web scraping logic after getting SERP results - Do some minor refactors to pass a symptom prompt to the OpenAI model when making a query - Integrate Olostep in order to perform the web scraping * Fix truncation error with new line, fix typing in Olostep code * Use the authorization header for the token * Add a small hint/indicator for how to use Khoj's other modalities in the welcome prompt * Add more detailed error message if Olostep query fails * Add unit tests which invoke Olostep in chat director * Add test for Olostep tool
This commit is contained in:
@@ -57,11 +57,11 @@ def test_chat_with_no_chat_history_or_retrieved_content_gpt4all(client_offline_c
|
||||
@pytest.mark.skipif(os.getenv("SERPER_DEV_API_KEY") is None, reason="requires SERPER_DEV_API_KEY")
|
||||
@pytest.mark.chatquality
|
||||
@pytest.mark.django_db(transaction=True)
|
||||
def test_chat_with_online_content(chat_client):
|
||||
def test_chat_with_online_content(client_offline_chat):
|
||||
# Act
|
||||
q = "/online give me the link to paul graham's essay how to do great work"
|
||||
encoded_q = quote(q, safe="")
|
||||
response = chat_client.get(f"/api/chat?q={encoded_q}&stream=true")
|
||||
response = client_offline_chat.get(f"/api/chat?q={encoded_q}&stream=true")
|
||||
response_message = response.content.decode("utf-8")
|
||||
|
||||
response_message = response_message.split("### compiled references")[0]
|
||||
@@ -70,7 +70,31 @@ def test_chat_with_online_content(chat_client):
|
||||
expected_responses = ["http://www.paulgraham.com/greatwork.html"]
|
||||
assert response.status_code == 200
|
||||
assert any([expected_response in response_message for expected_response in expected_responses]), (
|
||||
"Expected assistants name, [K|k]hoj, in response but got: " + response_message
|
||||
"Expected links or serper not setup in response but got: " + response_message
|
||||
)
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.skipif(
|
||||
os.getenv("SERPER_DEV_API_KEY") is None or os.getenv("OLOSTEP_API_KEY") is None,
|
||||
reason="requires SERPER_DEV_API_KEY and OLOSTEP_API_KEY",
|
||||
)
|
||||
@pytest.mark.chatquality
|
||||
@pytest.mark.django_db(transaction=True)
|
||||
def test_chat_with_online_webpage_content(client_offline_chat):
|
||||
# Act
|
||||
q = "/online how many firefighters were involved in the great chicago fire and which year did it take place?"
|
||||
encoded_q = quote(q, safe="")
|
||||
response = client_offline_chat.get(f"/api/chat?q={encoded_q}&stream=true")
|
||||
response_message = response.content.decode("utf-8")
|
||||
|
||||
response_message = response_message.split("### compiled references")[0]
|
||||
|
||||
# Assert
|
||||
expected_responses = ["185", "1871", "horse"]
|
||||
assert response.status_code == 200
|
||||
assert any([expected_response in response_message for expected_response in expected_responses]), (
|
||||
"Expected links or serper not setup in response but got: " + response_message
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import os
|
||||
import secrets
|
||||
|
||||
import numpy as np
|
||||
@@ -6,6 +7,7 @@ import pytest
|
||||
from scipy.stats import linregress
|
||||
|
||||
from khoj.processor.embeddings import EmbeddingsModel
|
||||
from khoj.processor.tools.online_search import search_with_olostep
|
||||
from khoj.utils import helpers
|
||||
|
||||
|
||||
@@ -80,3 +82,18 @@ def test_encode_docs_memory_leak():
|
||||
# If slope is positive memory utilization is increasing
|
||||
# Positive threshold of 2, from observing memory usage trend on MPS vs CPU device
|
||||
assert slope < 2, f"Memory leak suspected on {device}. Memory usage increased at ~{slope:.2f} MB per iteration"
|
||||
|
||||
|
||||
@pytest.mark.skipif(os.getenv("OLOSTEP_API_KEY") is None, reason="OLOSTEP_API_KEY is not set")
|
||||
def test_olostep_api():
|
||||
# Arrange
|
||||
website = "https://en.wikipedia.org/wiki/Great_Chicago_Fire"
|
||||
|
||||
# Act
|
||||
response = search_with_olostep(website)
|
||||
|
||||
# Assert
|
||||
assert (
|
||||
"An alarm sent from the area near the fire also failed to register at the courthouse where the fire watchmen were"
|
||||
in response
|
||||
)
|
||||
|
||||
@@ -73,6 +73,30 @@ def test_chat_with_online_content(chat_client):
|
||||
)
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.skipif(
|
||||
os.getenv("SERPER_DEV_API_KEY") is None or os.getenv("OLOSTEP_API_KEY") is None,
|
||||
reason="requires SERPER_DEV_API_KEY and OLOSTEP_API_KEY",
|
||||
)
|
||||
@pytest.mark.chatquality
|
||||
@pytest.mark.django_db(transaction=True)
|
||||
def test_chat_with_online_webpage_content(chat_client):
|
||||
# Act
|
||||
q = "/online how many firefighters were involved in the great chicago fire and which year did it take place?"
|
||||
encoded_q = quote(q, safe="")
|
||||
response = chat_client.get(f"/api/chat?q={encoded_q}&stream=true")
|
||||
response_message = response.content.decode("utf-8")
|
||||
|
||||
response_message = response_message.split("### compiled references")[0]
|
||||
|
||||
# Assert
|
||||
expected_responses = ["185", "1871", "horse"]
|
||||
assert response.status_code == 200
|
||||
assert any([expected_response in response_message for expected_response in expected_responses]), (
|
||||
"Expected links or serper not setup in response but got: " + response_message
|
||||
)
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.django_db(transaction=True)
|
||||
@pytest.mark.chatquality
|
||||
|
||||
Reference in New Issue
Block a user