Commit

include all evaluators in results
semio committed Sep 16, 2024
1 parent 21edc6f commit eea145b
Showing 1 changed file with 5 additions and 31 deletions.
36 changes: 5 additions & 31 deletions automation-api/yival_experiments/scripts/generate_result.py
@@ -12,9 +12,9 @@
# all Yival experiment results are exported into pickle files.
# you can use the following code to explore the structure.
# change the fp variable to the pickle file path
# fp = "gpt4_0.pkl"
# fp = "path/to/result.pkl"
# data: Experiment = pickle.load(open(fp, "rb"))
# data.group_experiment_results[:2]
# data.group_experiment_results[0].asdict()
# result = data.group_experiment_results[1]
# rs = result.experiment_results
# len(rs)
@@ -24,28 +24,6 @@
# In this script, we store all responses into an excel file.
output_dir = current_script_path / "../output"

option_score_mapping = {"Correct": 3, "Wrong": 2, "Very Wrong": 1}


def exact_match_correctness(answer, options, correctness):
option_occurance = [0, 0, 0]
scores = [option_score_mapping[x] for x in correctness]
for i, o in zip(range(3), options):
if o.strip().lower() in answer.strip().lower():
option_occurance[i] = 1
if sum(option_occurance) == 1:
score = scores[option_occurance.index(1)]
else:
score = 0

return score


def extract_correct_answer(options, correctness):
for t, c in zip(options, correctness):
if c == "Correct":
return t


if __name__ == "__main__":
output_list = []
@@ -71,29 +49,25 @@ def extract_correct_answer(options, correctness):
option_b_correctness,
option_c_correctness,
]
auto_mark_correctness = exact_match_correctness(
answer, options, correctness
)
correct_answer = extract_correct_answer(options, correctness)
result_dict = dict(
experiment_date=expr_date,
question_id=str(result.input_data.content["question_id"]),
model_id=result.combination["model_config"]["model_id"],
model_params=str(result.combination["model_config"]["params"]),
prompt_template=result.combination["prompt_template"],
question=result.input_data.content["question_text"],
correct_answer=correct_answer,
raw_output=result.raw_output.text_output,
auto_mark_correctness=auto_mark_correctness,
)
for eval_output in result.evaluator_outputs:
result_dict[eval_output.display_name] = eval_output.result
col_name = f"{eval_output.name}_{eval_output.display_name}"
result_dict[col_name] = eval_output.result

output_list.append(result_dict)

output_df = pd.DataFrame.from_records(output_list)
# add a human rating column
output_df["human_rating_score"] = np.nan
output_df.to_excel(osp.join(output_dir, "results.xlsx"), index=False)
output_df.to_parquet(osp.join(output_dir, "results.parquet"), index=False)

print("done")
