Create Tests to Measure Chat Quality, Capabilities

Create Rubric to Test Chat Quality and Capabilities

### Issues
- Previously, improvements in the quality of Khoj Chat across changes were uncertain
- Manual testing on my evolving set of notes was slow and didn't assess all expected, desired capabilities

### Fix
1. Create an Evaluation Dataset to assess Chat Capabilities
   - Create custom notes for a fictitious person (I'll publish a book with these soon 😅😋)
   - Add a few of Paul Graham's more personal essays. *[Easy to get as markdown](https://github.com/ofou/graham-essays)*
2. Write Unit Tests to Measure Chat Capabilities
   - Measure quality at 2 separate layers
     - **Chat Actor**: These are the narrow agents made of LLM + Prompt. E.g `summarize`, `converse` in `gpt.py`
     - **Chat Director**: This is the chat orchestration agent. It calls on the required chat actors and searches through the user-provided knowledge base (i.e. notes, ledger, images) to respond appropriately to the user's message. This is what the `/api/chat` API exposes.
   - Mark desired but not currently available capabilities as expected to fail <br />
     This still allows measuring the chat capability score/percentage, while any change to chat only fails tests for capabilities that were passing before
This commit is contained in:
Debanjum
2023-03-16 11:30:52 -06:00
committed by GitHub
31 changed files with 1787 additions and 359 deletions

View File

@@ -9,7 +9,8 @@ import openai
# Internal Packages
from khoj.utils.constants import empty_escape_sequences
from khoj.utils.helpers import merge_dicts
from khoj.processor.conversation.utils import message_to_prompt, generate_chatml_messages_with_context
logger = logging.getLogger(__name__)
@@ -121,7 +122,7 @@ A:{ "search-type": "notes" }"""
return json.loads(story.strip(empty_escape_sequences))
def converse(text, user_query, conversation_log=None, api_key=None, temperature=0):
def converse(text, user_query, conversation_log={}, api_key=None, temperature=0.2):
"""
Converse with user using OpenAI's ChatGPT
"""
@@ -129,9 +130,9 @@ def converse(text, user_query, conversation_log=None, api_key=None, temperature=
model = "gpt-3.5-turbo"
openai.api_key = api_key or os.getenv("OPENAI_API_KEY")
personality_primer = "You are a friendly, helpful personal assistant."
personality_primer = "You are Khoj, a friendly, smart and helpful personal assistant."
conversation_primer = f"""
Using the notes and our chats as context, answer the following question.
Using the notes and our past conversations as context, answer the following question.
Current Date: {datetime.now().strftime("%Y-%m-%d")}
Notes:
@@ -157,60 +158,3 @@ Question: {user_query}"""
# Extract, Clean Message from GPT's Response
story = str(response["choices"][0]["message"]["content"])
return story.strip(empty_escape_sequences)
def generate_chatml_messages_with_context(user_message, system_message, conversation_log=None):
    """Generate messages for ChatGPT with context from previous conversation.

    Interleaves up to the last two user/assistant exchanges from
    conversation_log around the system prompt, with the most recent
    exchange placed closest to the new user message.
    """
    # Bug fix: the default was None but the body called .get() on it
    # unguarded, so calling without a conversation_log raised AttributeError.
    conversation_log = conversation_log or {}

    # Extract Chat History for Context. Each log entry is rendered as the
    # message followed by any notes that grounded it.
    chat_logs = [f'{chat["message"]}\n\nNotes:\n{chat.get("context","")}' for chat in conversation_log.get("chat", [])]
    last_backnforth = reciprocal_conversation_to_chatml(chat_logs[-2:])
    rest_backnforth = reciprocal_conversation_to_chatml(chat_logs[-4:-2])

    # Format user and system messages to chatml format
    system_chatml_message = [message_to_chatml(system_message, "system")]
    user_chatml_message = [message_to_chatml(user_message, "user")]

    return rest_backnforth + system_chatml_message + last_backnforth + user_chatml_message


def reciprocal_conversation_to_chatml(message_pair):
    """Convert a single back and forth between user and assistant to chatml format"""
    return [message_to_chatml(message, role) for message, role in zip(message_pair, ["user", "assistant"])]


def message_to_chatml(message, role="assistant"):
    """Create chatml message from message and role"""
    return {"role": role, "content": message}
def message_to_prompt(
    user_message, conversation_history="", gpt_message=None, start_sequence="\nAI:", restart_sequence="\nHuman:"
):
    """Build a single text prompt for GPT from the new message and prior conversation history."""
    # Only prefix the assistant's reply with a space when one was actually provided
    assistant_suffix = "" if not gpt_message else f" {gpt_message}"
    return f"{conversation_history}{restart_sequence} {user_message}{start_sequence}{assistant_suffix}"
def message_to_log(user_message, gpt_message, khoj_message_metadata=None, conversation_log=None):
    """Create json logs from messages, metadata for conversation log.

    Appends one entry for the human message and one for Khoj's response to
    conversation_log and returns it.
    """
    # Bug fix: the previous mutable defaults ({} and []) were shared across
    # calls; `conversation_log.extend(...)` mutated the shared default list,
    # so calls without an explicit log silently accumulated entries.
    khoj_message_metadata = khoj_message_metadata or {}
    conversation_log = conversation_log if conversation_log is not None else []

    default_khoj_message_metadata = {
        "intent": {"type": "remember", "memory-type": "notes", "query": user_message},
        "trigger-emotion": "calm",
    }
    current_dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Create json log from Human's message
    human_log = {"message": user_message, "by": "you", "created": current_dt}

    # Create json log from GPT's response; caller-supplied metadata takes
    # precedence over the defaults (merge_dicts keeps keys of its first arg)
    khoj_log = merge_dicts(khoj_message_metadata, default_khoj_message_metadata)
    khoj_log = merge_dicts({"message": gpt_message, "by": "khoj", "created": current_dt}, khoj_log)

    conversation_log.extend([human_log, khoj_log])
    return conversation_log
def extract_summaries(metadata):
    """Concatenate the summary of every session, each preceded by a newline."""
    summary_lines = (f'\n{session["summary"]}' for session in metadata)
    return "".join(summary_lines)

View File

@@ -0,0 +1,62 @@
# Standard Packages
from datetime import datetime
# Internal Packages
from khoj.utils.helpers import merge_dicts
def generate_chatml_messages_with_context(user_message, system_message, conversation_log=None):
    """Generate messages for ChatGPT with context from previous conversation.

    Interleaves up to the last two user/assistant exchanges from
    conversation_log around the system prompt, with the most recent
    exchange placed closest to the new user message.
    """
    # Use a None default instead of the mutable `{}` default: a shared dict
    # default is a classic Python pitfall, even when it is only read here.
    conversation_log = conversation_log or {}

    # Extract Chat History for Context. Each log entry is rendered as the
    # message followed by any notes that grounded it.
    chat_logs = [f'{chat["message"]}\n\nNotes:\n{chat.get("context","")}' for chat in conversation_log.get("chat", [])]
    last_backnforth = reciprocal_conversation_to_chatml(chat_logs[-2:])
    rest_backnforth = reciprocal_conversation_to_chatml(chat_logs[-4:-2])

    # Format user and system messages to chatml format
    system_chatml_message = [message_to_chatml(system_message, "system")]
    user_chatml_message = [message_to_chatml(user_message, "user")]

    return rest_backnforth + system_chatml_message + last_backnforth + user_chatml_message


def reciprocal_conversation_to_chatml(message_pair):
    """Convert a single back and forth between user and assistant to chatml format"""
    return [message_to_chatml(message, role) for message, role in zip(message_pair, ["user", "assistant"])]


def message_to_chatml(message, role="assistant"):
    """Create chatml message from message and role"""
    return {"role": role, "content": message}
def message_to_prompt(
    user_message, conversation_history="", gpt_message=None, start_sequence="\nAI:", restart_sequence="\nHuman:"
):
    """Build a single text prompt for GPT from the new message and prior conversation history."""
    # Only prefix the assistant's reply with a space when one was actually provided
    assistant_suffix = "" if not gpt_message else f" {gpt_message}"
    return f"{conversation_history}{restart_sequence} {user_message}{start_sequence}{assistant_suffix}"
def message_to_log(user_message, gpt_message, khoj_message_metadata=None, conversation_log=None):
    """Create json logs from messages, metadata for conversation log.

    Appends one entry for the human message and one for Khoj's response to
    conversation_log and returns it.
    """
    # Bug fix: the previous mutable defaults ({} and []) were shared across
    # calls; `conversation_log.extend(...)` mutated the shared default list,
    # so calls without an explicit log silently accumulated entries.
    khoj_message_metadata = khoj_message_metadata or {}
    conversation_log = conversation_log if conversation_log is not None else []

    default_khoj_message_metadata = {
        "intent": {"type": "remember", "memory-type": "notes", "query": user_message},
        "trigger-emotion": "calm",
    }
    current_dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Create json log from Human's message
    human_log = {"message": user_message, "by": "you", "created": current_dt}

    # Create json log from GPT's response; caller-supplied metadata takes
    # precedence over the defaults (merge_dicts keeps keys of its first arg)
    khoj_log = merge_dicts(khoj_message_metadata, default_khoj_message_metadata)
    khoj_log = merge_dicts({"message": gpt_message, "by": "khoj", "created": current_dt}, khoj_log)

    conversation_log.extend([human_log, khoj_log])
    return conversation_log
def extract_summaries(metadata):
    """Concatenate the summary of every session, each preceded by a newline."""
    summary_lines = (f'\n{session["summary"]}' for session in metadata)
    return "".join(summary_lines)

View File

@@ -10,7 +10,8 @@ from fastapi import HTTPException
# Internal Packages
from khoj.configure import configure_processor, configure_search
from khoj.processor.conversation.gpt import converse, message_to_log, message_to_prompt
from khoj.processor.conversation.gpt import converse
from khoj.processor.conversation.utils import message_to_log, message_to_prompt
from khoj.search_type import image_search, text_search
from khoj.utils.helpers import timer
from khoj.utils.rawconfig import FullConfig, SearchResponse