Commit

include all evaluators in results
semio committed Sep 16, 2024
1 parent 21edc6f commit eea145b
Showing 1 changed file with 5 additions and 31 deletions.
36 changes: 5 additions & 31 deletions automation-api/yival_experiments/scripts/generate_result.py
@@ -12,9 +12,9 @@
# all Yival experiment results are exported into pickle files.
# you can use the following code to explore the structure.
# change the fp variable to the pickle file path
# fp = "gpt4_0.pkl"
# fp = "path/to/result.pkl"
# data: Experiment = pickle.load(open(fp, "rb"))
# data.group_experiment_results[:2]
# data.group_experiment_results[0].asdict()
# result = data.group_experiment_results[1]
# rs = result.experiment_results
# len(rs)
@@ -24,28 +24,6 @@
# In this script, we store all responses into an excel file.
output_dir = current_script_path / "../output"

option_score_mapping = {"Correct": 3, "Wrong": 2, "Very Wrong": 1}


def exact_match_correctness(answer, options, correctness):
option_occurance = [0, 0, 0]
scores = [option_score_mapping[x] for x in correctness]
for i, o in zip(range(3), options):
if o.strip().lower() in answer.strip().lower():
option_occurance[i] = 1
if sum(option_occurance) == 1:
score = scores[option_occurance.index(1)]
else:
score = 0

return score


def extract_correct_answer(options, correctness):
for t, c in zip(options, correctness):
if c == "Correct":
return t


if __name__ == "__main__":
output_list = []
@@ -71,29 +49,25 @@ def extract_correct_answer(options, correctness):
option_b_correctness,
option_c_correctness,
]
auto_mark_correctness = exact_match_correctness(
answer, options, correctness
)
correct_answer = extract_correct_answer(options, correctness)
result_dict = dict(
experiment_date=expr_date,
question_id=str(result.input_data.content["question_id"]),
model_id=result.combination["model_config"]["model_id"],
model_params=str(result.combination["model_config"]["params"]),
prompt_template=result.combination["prompt_template"],
question=result.input_data.content["question_text"],
correct_answer=correct_answer,
raw_output=result.raw_output.text_output,
auto_mark_correctness=auto_mark_correctness,
)
for eval_output in result.evaluator_outputs:
result_dict[eval_output.display_name] = eval_output.result
col_name = f"{eval_output.name}_{eval_output.display_name}"
result_dict[col_name] = eval_output.result

output_list.append(result_dict)

output_df = pd.DataFrame.from_records(output_list)
# add a human rating column
output_df["human_rating_score"] = np.nan
output_df.to_excel(osp.join(output_dir, "results.xlsx"), index=False)
output_df.to_parquet(osp.join(output_dir, "results.parquet"), index=False)

print("done")
