Create Tests to Measure Chat Quality, Capabilities

Create Rubric to Test Chat Quality and Capabilities

### Issues
- Previously, improvements in the quality of Khoj Chat across changes were uncertain
- Manual testing on my evolving set of notes was slow and didn't assess all expected, desired capabilities

### Fix
1. Create an Evaluation Dataset to assess Chat Capabilities
   - Create custom notes for a fictitious person (I'll publish a book with these soon 😅😋)
   - Add a few of Paul Graham's more personal essays. *[Easy to get as markdown](https://github.com/ofou/graham-essays)*
2. Write Unit Tests to Measure Chat Capabilities
   - Measure quality at 2 separate layers
     - **Chat Actor**: These are the narrow agents made of LLM + Prompt. E.g `summarize`, `converse` in `gpt.py`
     - **Chat Director**: This is the chat orchestration agent. It calls on the required chat actors and searches through the user-provided knowledge base (i.e. notes, ledger, images) to respond appropriately to the user's message. This is what the `/api/chat` API exposes.
   - Mark desired but not currently available capabilities as expected to fail <br />
     This still allows measuring the chat capability score/percentage, while any change to chat only fails tests for capabilities that were passing before
This commit is contained in:
Debanjum
2023-03-16 11:30:52 -06:00
committed by GitHub
31 changed files with 1787 additions and 359 deletions

View File

@@ -9,7 +9,8 @@ import openai
# Internal Packages
from khoj.utils.constants import empty_escape_sequences
from khoj.utils.helpers import merge_dicts
from khoj.processor.conversation.utils import message_to_prompt, generate_chatml_messages_with_context
logger = logging.getLogger(__name__)
@@ -121,7 +122,7 @@ A:{ "search-type": "notes" }"""
return json.loads(story.strip(empty_escape_sequences))
def converse(text, user_query, conversation_log=None, api_key=None, temperature=0):
def converse(text, user_query, conversation_log={}, api_key=None, temperature=0.2):
"""
Converse with user using OpenAI's ChatGPT
"""
@@ -129,9 +130,9 @@ def converse(text, user_query, conversation_log=None, api_key=None, temperature=
model = "gpt-3.5-turbo"
openai.api_key = api_key or os.getenv("OPENAI_API_KEY")
personality_primer = "You are a friendly, helpful personal assistant."
personality_primer = "You are Khoj, a friendly, smart and helpful personal assistant."
conversation_primer = f"""
Using the notes and our chats as context, answer the following question.
Using the notes and our past conversations as context, answer the following question.
Current Date: {datetime.now().strftime("%Y-%m-%d")}
Notes:
@@ -157,60 +158,3 @@ Question: {user_query}"""
# Extract, Clean Message from GPT's Response
story = str(response["choices"][0]["message"]["content"])
return story.strip(empty_escape_sequences)
def generate_chatml_messages_with_context(user_message, system_message, conversation_log=None):
    """Generate messages for ChatGPT with context from previous conversation.

    Interleaves up to the last two user/assistant exchanges from
    conversation_log around the system prompt, with the most recent
    exchange placed closest to the new user message.
    """
    # Bug fix: the default was None but the body called .get() on it
    # unguarded, so calling without a conversation_log raised AttributeError.
    conversation_log = conversation_log or {}

    # Extract Chat History for Context. Each log entry is rendered as the
    # message followed by any notes that grounded it.
    chat_logs = [f'{chat["message"]}\n\nNotes:\n{chat.get("context","")}' for chat in conversation_log.get("chat", [])]
    last_backnforth = reciprocal_conversation_to_chatml(chat_logs[-2:])
    rest_backnforth = reciprocal_conversation_to_chatml(chat_logs[-4:-2])

    # Format user and system messages to chatml format
    system_chatml_message = [message_to_chatml(system_message, "system")]
    user_chatml_message = [message_to_chatml(user_message, "user")]

    return rest_backnforth + system_chatml_message + last_backnforth + user_chatml_message


def reciprocal_conversation_to_chatml(message_pair):
    """Convert a single back and forth between user and assistant to chatml format"""
    return [message_to_chatml(message, role) for message, role in zip(message_pair, ["user", "assistant"])]


def message_to_chatml(message, role="assistant"):
    """Create chatml message from message and role"""
    return {"role": role, "content": message}
def message_to_prompt(
    user_message, conversation_history="", gpt_message=None, start_sequence="\nAI:", restart_sequence="\nHuman:"
):
    """Build a single text prompt for GPT from the new message and prior conversation history."""
    # Only prefix the assistant's reply with a space when one was actually provided
    assistant_suffix = "" if not gpt_message else f" {gpt_message}"
    return f"{conversation_history}{restart_sequence} {user_message}{start_sequence}{assistant_suffix}"
def message_to_log(user_message, gpt_message, khoj_message_metadata=None, conversation_log=None):
    """Create json logs from messages, metadata for conversation log.

    Appends one entry for the human message and one for Khoj's response to
    conversation_log and returns it.
    """
    # Bug fix: the previous mutable defaults ({} and []) were shared across
    # calls; `conversation_log.extend(...)` mutated the shared default list,
    # so calls without an explicit log silently accumulated entries.
    khoj_message_metadata = khoj_message_metadata or {}
    conversation_log = conversation_log if conversation_log is not None else []

    default_khoj_message_metadata = {
        "intent": {"type": "remember", "memory-type": "notes", "query": user_message},
        "trigger-emotion": "calm",
    }
    current_dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Create json log from Human's message
    human_log = {"message": user_message, "by": "you", "created": current_dt}

    # Create json log from GPT's response; caller-supplied metadata takes
    # precedence over the defaults (merge_dicts keeps keys of its first arg)
    khoj_log = merge_dicts(khoj_message_metadata, default_khoj_message_metadata)
    khoj_log = merge_dicts({"message": gpt_message, "by": "khoj", "created": current_dt}, khoj_log)

    conversation_log.extend([human_log, khoj_log])
    return conversation_log
def extract_summaries(metadata):
    """Concatenate the summary of every session, each preceded by a newline."""
    summary_lines = (f'\n{session["summary"]}' for session in metadata)
    return "".join(summary_lines)

View File

@@ -0,0 +1,62 @@
# Standard Packages
from datetime import datetime
# Internal Packages
from khoj.utils.helpers import merge_dicts
def generate_chatml_messages_with_context(user_message, system_message, conversation_log=None):
    """Generate messages for ChatGPT with context from previous conversation.

    Interleaves up to the last two user/assistant exchanges from
    conversation_log around the system prompt, with the most recent
    exchange placed closest to the new user message.
    """
    # Use a None default instead of the mutable `{}` default: a shared dict
    # default is a classic Python pitfall, even when it is only read here.
    conversation_log = conversation_log or {}

    # Extract Chat History for Context. Each log entry is rendered as the
    # message followed by any notes that grounded it.
    chat_logs = [f'{chat["message"]}\n\nNotes:\n{chat.get("context","")}' for chat in conversation_log.get("chat", [])]
    last_backnforth = reciprocal_conversation_to_chatml(chat_logs[-2:])
    rest_backnforth = reciprocal_conversation_to_chatml(chat_logs[-4:-2])

    # Format user and system messages to chatml format
    system_chatml_message = [message_to_chatml(system_message, "system")]
    user_chatml_message = [message_to_chatml(user_message, "user")]

    return rest_backnforth + system_chatml_message + last_backnforth + user_chatml_message


def reciprocal_conversation_to_chatml(message_pair):
    """Convert a single back and forth between user and assistant to chatml format"""
    return [message_to_chatml(message, role) for message, role in zip(message_pair, ["user", "assistant"])]


def message_to_chatml(message, role="assistant"):
    """Create chatml message from message and role"""
    return {"role": role, "content": message}
def message_to_prompt(
    user_message, conversation_history="", gpt_message=None, start_sequence="\nAI:", restart_sequence="\nHuman:"
):
    """Build a single text prompt for GPT from the new message and prior conversation history."""
    # Only prefix the assistant's reply with a space when one was actually provided
    assistant_suffix = "" if not gpt_message else f" {gpt_message}"
    return f"{conversation_history}{restart_sequence} {user_message}{start_sequence}{assistant_suffix}"
def message_to_log(user_message, gpt_message, khoj_message_metadata=None, conversation_log=None):
    """Create json logs from messages, metadata for conversation log.

    Appends one entry for the human message and one for Khoj's response to
    conversation_log and returns it.
    """
    # Bug fix: the previous mutable defaults ({} and []) were shared across
    # calls; `conversation_log.extend(...)` mutated the shared default list,
    # so calls without an explicit log silently accumulated entries.
    khoj_message_metadata = khoj_message_metadata or {}
    conversation_log = conversation_log if conversation_log is not None else []

    default_khoj_message_metadata = {
        "intent": {"type": "remember", "memory-type": "notes", "query": user_message},
        "trigger-emotion": "calm",
    }
    current_dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Create json log from Human's message
    human_log = {"message": user_message, "by": "you", "created": current_dt}

    # Create json log from GPT's response; caller-supplied metadata takes
    # precedence over the defaults (merge_dicts keeps keys of its first arg)
    khoj_log = merge_dicts(khoj_message_metadata, default_khoj_message_metadata)
    khoj_log = merge_dicts({"message": gpt_message, "by": "khoj", "created": current_dt}, khoj_log)

    conversation_log.extend([human_log, khoj_log])
    return conversation_log
def extract_summaries(metadata):
    """Concatenate the summary of every session, each preceded by a newline."""
    summary_lines = (f'\n{session["summary"]}' for session in metadata)
    return "".join(summary_lines)

View File

@@ -10,7 +10,8 @@ from fastapi import HTTPException
# Internal Packages
from khoj.configure import configure_processor, configure_search
from khoj.processor.conversation.gpt import converse, message_to_log, message_to_prompt
from khoj.processor.conversation.gpt import converse
from khoj.processor.conversation.utils import message_to_log, message_to_prompt
from khoj.search_type import image_search, text_search
from khoj.utils.helpers import timer
from khoj.utils.rawconfig import FullConfig, SearchResponse