mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-07 21:29:13 +00:00
Optionally pass the references used by the agent for its response to the eval scorers.
This allows the eval framework to evaluate retrieval quality too.
This commit is contained in:
@@ -238,14 +238,18 @@ def get_agent_response(prompt: str) -> Dict[str, Any]:
|
|||||||
)
|
)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
response_json = response.json()
|
response_json = response.json()
|
||||||
return {"response": response_json.get("response", ""), "usage": response_json.get("usage", {})}
|
return {
|
||||||
|
"response": response_json.get("response", ""),
|
||||||
|
"usage": response_json.get("usage", {}),
|
||||||
|
"references": response_json.get("references", {}),
|
||||||
|
}
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error getting agent response: {e}")
|
logger.error(f"Error getting agent response: {e}")
|
||||||
return {"response": "", "usage": {}}
|
return {"response": "", "usage": {}, "references": {}}
|
||||||
|
|
||||||
|
|
||||||
def evaluate_response_with_mcq_match(
|
def evaluate_response_with_mcq_match(
|
||||||
query: str, agent_response: str, ground_truth: str
|
query: str, agent_response: str, ground_truth: str, agent_references: dict = {}
|
||||||
) -> tuple[bool | None, str, float]:
|
) -> tuple[bool | None, str, float]:
|
||||||
"""Evaluate Khoj response against benchmark ground truth using string matching"""
|
"""Evaluate Khoj response against benchmark ground truth using string matching"""
|
||||||
try:
|
try:
|
||||||
@@ -266,7 +270,7 @@ def evaluate_response_with_mcq_match(
|
|||||||
|
|
||||||
|
|
||||||
def evaluate_response_with_gemini(
|
def evaluate_response_with_gemini(
|
||||||
query: str, agent_response: str, ground_truth: str, eval_model=GEMINI_EVAL_MODEL
|
query: str, agent_response: str, ground_truth: str, agent_references: dict = {}, eval_model=GEMINI_EVAL_MODEL
|
||||||
) -> tuple[bool | None, str, float]:
|
) -> tuple[bool | None, str, float]:
|
||||||
"""Evaluate Khoj response against benchmark ground truth using Gemini"""
|
"""Evaluate Khoj response against benchmark ground truth using Gemini"""
|
||||||
evaluation_prompt = f"""
|
evaluation_prompt = f"""
|
||||||
@@ -331,13 +335,14 @@ def process_batch(batch, batch_start, results, dataset_length, response_evaluato
|
|||||||
response = get_agent_response(prompt)
|
response = get_agent_response(prompt)
|
||||||
agent_response = response["response"]
|
agent_response = response["response"]
|
||||||
agent_usage = response["usage"]
|
agent_usage = response["usage"]
|
||||||
|
agent_references = response["references"]
|
||||||
|
|
||||||
# Evaluate response
|
# Evaluate response
|
||||||
if is_none_or_empty(agent_response):
|
if is_none_or_empty(agent_response):
|
||||||
decision = None
|
decision = None
|
||||||
explanation = "Agent response is empty. This maybe due to a service error."
|
explanation = "Agent response is empty. This maybe due to a service error."
|
||||||
else:
|
else:
|
||||||
decision, explanation, eval_cost = response_evaluator(prompt, agent_response, answer)
|
decision, explanation, eval_cost = response_evaluator(prompt, agent_response, answer, agent_references)
|
||||||
|
|
||||||
# Store results
|
# Store results
|
||||||
results.append(
|
results.append(
|
||||||
@@ -350,6 +355,7 @@ def process_batch(batch, batch_start, results, dataset_length, response_evaluato
|
|||||||
"evaluation_explanation": explanation,
|
"evaluation_explanation": explanation,
|
||||||
"reasoning_type": reasoning_type,
|
"reasoning_type": reasoning_type,
|
||||||
"usage": agent_usage,
|
"usage": agent_usage,
|
||||||
|
"references": agent_references,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user