From 8231f4bb6e864d7338cf9c63e23fc3355fab96b7 Mon Sep 17 00:00:00 2001
From: Debanjum
Date: Fri, 3 Jan 2025 12:11:19 +0700
Subject: [PATCH] Return accuracy as decision to generalize across IR & standard scorers

---
 tests/evals/eval.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/tests/evals/eval.py b/tests/evals/eval.py
index eee3656f..629df91e 100644
--- a/tests/evals/eval.py
+++ b/tests/evals/eval.py
@@ -58,7 +58,7 @@ class Counter:
 # Track running metrics while evaluating
 running_cost = Counter()
 running_true_count = Counter(0)
-running_false_count = Counter(0)
+running_total_count = Counter(0)
 
 
 def load_frames_dataset():
@@ -259,7 +259,7 @@ def evaluate_response_with_mcq_match(
         explanation = f"Agent response {'matches' if decision else 'does not match'} ground truth {ground_truth}"
 
         # Return decision, explanation and cost in structured form
-        return decision, explanation, 0.0
+        return float(decision), explanation, 0.0
     except Exception as e:
         logger.error(f"Error in evaluation: {e}")
         return None, f"Evaluation failed: {str(e)}", 0.0
@@ -306,7 +306,7 @@ def evaluate_response_with_gemini(
         eval_response: dict[str, str] = json.loads(
             clean_json(response_json["candidates"][0]["content"]["parts"][0]["text"])
         )
-        decision = str(eval_response.get("decision", "")).upper() == "TRUE"
+        decision = float(str(eval_response.get("decision", "")).upper() == "TRUE")
         explanation = eval_response.get("explanation", "")
         # Handle evaluation service errors
         if "503 Service Error" in explanation:
@@ -360,11 +360,12 @@ def process_batch(batch, batch_start, results, dataset_length, response_evaluato
         # Update running accuracy
         running_accuracy = 0.0
         if decision is not None:
-            running_true_count.add(1) if decision == True else running_false_count.add(1)
-            running_accuracy = running_true_count.get() / (running_true_count.get() + running_false_count.get())
+            running_true_count.add(decision)
+            running_total_count.add(1)
+            running_accuracy = running_true_count.get() / running_total_count.get()
 
         ## Log results
-        decision_color = {True: "green", None: "blue", False: "red"}[decision]
+        decision_color = {True: "green", None: "blue", False: "red"}[decision > 0.5]
         colored_decision = color_text(str(decision), decision_color)
         result_to_print = f"""
 ---------
@@ -466,12 +467,10 @@ def main():
     # Calculate metrics
     df = pd.DataFrame(results)
     eval_df = df.dropna(subset=["evaluation_decision"])  # Exclude rows with missing evaluation decision
-    accuracy = (eval_df["evaluation_decision"] == True).mean()
+    accuracy = (eval_df["evaluation_decision"]).mean()
 
     # Calculate accuracy by reasoning type
-    reasoning_type_accuracy = eval_df.groupby("reasoning_type")["evaluation_decision"].apply(
-        lambda x: (x == True).mean()
-    )
+    reasoning_type_accuracy = (eval_df.groupby("reasoning_type")["evaluation_decision"]).apply(lambda x: x.mean())
 
     # Collect summary
     colored_accuracy = color_text(f"{accuracy:.2%}", "blue")
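
The idea behind the change, sketched below: once every evaluator returns its decision as a float, a standard (boolean) scorer contributes 0.0 or 1.0 per sample while an IR-style scorer can contribute a fractional score, and the same running mean covers both without separate true/false counters. This is a minimal standalone sketch, not code from eval.py; the simplified Counter stand-in and the example scorer outputs are assumptions made for illustration.

# Minimal sketch of the generalized running-accuracy update.
# The Counter stand-in and the example decisions below are illustrative
# assumptions, not code copied from tests/evals/eval.py.

class Counter:
    """Simplified accumulator; the real Counter in eval.py may differ (e.g. locking)."""

    def __init__(self, value: float = 0.0):
        self.value = value

    def add(self, amount: float) -> None:
        self.value += amount

    def get(self) -> float:
        return self.value


running_true_count = Counter(0)   # sum of per-sample decisions in [0.0, 1.0]
running_total_count = Counter(0)  # number of samples evaluated

# A boolean scorer yields 0.0 or 1.0; an IR-style scorer may yield a
# fractional score; None marks a failed evaluation and is skipped.
decisions = [1.0, 0.0, 0.75, None]

for decision in decisions:
    if decision is not None:
        running_true_count.add(decision)
        running_total_count.add(1)

running_accuracy = running_true_count.get() / running_total_count.get()
print(f"Running accuracy: {running_accuracy:.2%}")  # (1.0 + 0.0 + 0.75) / 3 = 58.33%

The same reasoning applies to the final metrics in main(): eval_df["evaluation_decision"].mean() yields plain accuracy when decisions are 0/1 and a mean score when they are fractional, so one code path serves both kinds of scorers.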