diff --git a/yival_experiments/output/generate_report.py b/yival_experiments/output/generate_report.py index e7c7fb0..1dead14 100644 --- a/yival_experiments/output/generate_report.py +++ b/yival_experiments/output/generate_report.py @@ -18,6 +18,7 @@ # rs[1].asdict() # We will combine all pickle files in output dir and calculate final scores. +# TODO: follow the format in `Latest Results` sheet of AI eval spreadsheet # 1. Store all responses into excel file. output_list = [] @@ -45,6 +46,7 @@ # 2. calculate a final score per model configuration +# TODO: I think it's possible to convert these into a Yival Evaluator. def is_correct_p(round_results): c = Counter(round_results) top2 = c.most_common(2)