diff --git a/automation-api/yival_experiments/notebooks/compare_evaluators.py b/automation-api/yival_experiments/notebooks/compare_evaluators.py index d0d0540..c80dbed 100644 --- a/automation-api/yival_experiments/notebooks/compare_evaluators.py +++ b/automation-api/yival_experiments/notebooks/compare_evaluators.py @@ -12,8 +12,12 @@ conn = duckdb.connect() -conn.query("select * from df") +simple_eval_check = conn.query("select * from df where simple_evaluator_matching <> auto_mark_correctness") +simple_eval_check +simple_eval_check.to_csv("./simple_eval_check.csv") + +# NEXT: review the query and begin to check results. q = """select * from @@ -22,19 +26,35 @@ not ( llama3_evaluator_correctness = vertex_ai_evaluator_correctness and gpt4_evaluator_correctness = vertex_ai_evaluator_correctness + ) + or ( + auto_mark_correctness <> 0 + and ( + llama3_evaluator_correctness = vertex_ai_evaluator_correctness + and gpt4_evaluator_correctness = vertex_ai_evaluator_correctness + ) + and auto_mark_correctness <> gpt4_evaluator_correctness + ) + or ( + simple_evaluator_matching <> 0 and ( - simple_evaluator_matching <> 0 - and simple_evaluator_matching = gpt4_evaluator_correctness + llama3_evaluator_correctness = vertex_ai_evaluator_correctness + and gpt4_evaluator_correctness = vertex_ai_evaluator_correctness ) + and simple_evaluator_matching <> gpt4_evaluator_correctness ) - and simple_evaluator_matching <> 0""" + + """ diffs = conn.query(q) conn.query("select count(*) from df") conn.query("select count(*) from diffs") -diffs.to_csv("to_check.csv") +diffs.to_csv("to_check_all.csv") + +410 / 30780 -1281 / 30780 +# FIXME: the simple evaluator seems not working very well? +# just use the exact matching algo diff --git a/automation-api/yival_experiments/notebooks/result_data_analysis.py b/automation-api/yival_experiments/notebooks/result_data_analysis.py index a93872a..e7815db 100644 --- a/automation-api/yival_experiments/notebooks/result_data_analysis.py +++ b/automation-api/yival_experiments/notebooks/result_data_analysis.py @@ -2,7 +2,7 @@ # # This notebook is for producing tables listed in https://docs.google.com/spreadsheets/d/1ln5ui3f13AfAQkBuEMbNomBXlZLhkQPYVEpBlZjUtu0/edit?pli=1#gid=0 # -# Results are from the experiments in Apr and May 2023 +# Latest Update: 2024-10-02 # going to use duckdb # %load_ext sql @@ -10,6 +10,7 @@ # %sql duckdb:// import pandas as pd +import polars as pl from lib.pilot.helpers import read_ai_eval_spreadsheet, get_questions, get_model_configs, get_prompt_variants from lib.config import read_config import matplotlib.pyplot as plt @@ -23,7 +24,10 @@ # results to be analyzed # manually download from AI eval spreadsheet. 
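# Before loading, it can help to sanity-check the manually downloaded export(s).
# A minimal sketch (hypothetical helper; the required column names are assumed
# from the SQL queries used later in this notebook):
import pandas as pd

REQUIRED_COLUMNS = {'question_id', 'prompt_variation_id',
                    'model_configuration_id', 'result', 'score'}


def check_export(path: str) -> pd.DataFrame:
    df = pd.read_csv(path)
    missing = REQUIRED_COLUMNS - set(df.columns)
    if missing:
        raise ValueError(f"{path} is missing expected columns: {missing}")
    return df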
-result = pd.read_csv('./data/Gapminder AI evaluations - Master Output.csv') +result = pd.concat([ + pd.read_csv('./data/Gapminder AI evaluations - Master Output.csv'), + # pd.read_csv('./data/Gapminder AI evaluations - Latest Results.csv'), +]) # load ai eval spreadsheet ai_eval_sheet = read_ai_eval_spreadsheet() @@ -50,28 +54,11 @@ # from result # where model_configuration_id != 'mc026' -- exclude qwen 1201 -# + magic_args="--save result_to_analyze_latest_only" language="sql" -# select * from result_to_analyze -# where -# model_configuration_id = 'mc030' -# OR model_configuration_id = 'mc035' -# OR model_configuration_id = 'mc032' -# OR model_configuration_id = 'mc033' -# OR model_configuration_id = 'mc034' - # + magic_args="--with result_to_analyze --save result_chn_prompt_renamed" language="sql" # select # * exclude (prompt_variation_id), # replace(prompt_variation_id, '_zh', '') as prompt_variation_id # from result_to_analyze -# + magic_args="--save result_chn_prompt_renamed_latest_only" language="sql" -# select * from result_chn_prompt_renamed -# where -# model_configuration_id = 'mc030' -# OR model_configuration_id = 'mc035' -# OR model_configuration_id = 'mc032' -# OR model_configuration_id = 'mc033' -# OR model_configuration_id = 'mc034' # - @@ -166,27 +153,11 @@ +# ## Summary +# ### Correctness Break Down by Model - -# ## Experiment Total - -# + language="sql" -# select -# 'AI' as name, -# count(*) as total_count, -# count(*) filter (result != 'fail') as total_count_exclude_indecisive, -# count(*) filter (result = 'correct') / total_count_exclude_indecisive * 100 as correct_rate, -# 100 - correct_rate as wrong_rate, -# count(*) filter (result = 'fail') / total_count * 100 as indecisive_rate -# from result_to_analyze_latest_only -# - - - - -# ### Break down by Model - -# + language="sql" +# + magic_args="result_by_models <<" language="sql" # select # m.model_id as model_id, # count(*) as total_count, @@ -194,103 +165,16 @@ # count(*) filter (result = 'correct') / total_count_exclude_indecisive * 100 as correct_rate, # 100 - correct_rate as wrong_rate, # count(*) filter (result = 'fail') / total_count * 100 as indecisive_rate -# from result_to_analyze_latest_only r left join all_models m on r.model_configuration_id = m.model_config_id +# from result_to_analyze r left join all_models m on r.model_configuration_id = m.model_config_id # GROUP BY m.model_id # order by correct_rate desc # - +result_by_models_df = result_by_models.DataFrame() +result_by_models_df -# ### break down by prompt and prompt family - -# + magic_args="by_prompt_family <<" language="sql" -# select -# p.prompt_family as prompt_family, -# count(DISTINCT p.variation_id) as number_of_prompts, -# -- count(DISTINCT p.variation_id) / 2 as number_of_prompts, -- uncomment this to treat chinese prompt and english prompt the same. 
-# count(*) as total_count, -# count(*) filter (result != 'fail') as total_count_exclude_indecisive, -# count(*) filter (result = 'correct') / total_count_exclude_indecisive * 100 as correct_rate, -# 100 - correct_rate as wrong_rate, -# count(*) filter (result = 'fail') / total_count * 100 as indecisive_rate -# from result_to_analyze_latest_only r left join all_prompts p on r.prompt_variation_id = p.variation_id -# GROUP BY p.prompt_family -# ORDER BY correct_rate desc -# - - -by_prompt_family.DataFrame().set_index('prompt_family') - -# + magic_args="by_prompt <<" language="sql" -# select -# any_value(p.prompt_family) as prompt_family, -# prompt_variation_id, -# count(*) as total_count, -# count(*) filter (result != 'fail') as total_count_exclude_indecisive, -# count(*) filter (result = 'correct') / total_count_exclude_indecisive * 100 as correct_rate, -# 100 - correct_rate as wrong_rate, -# count(*) filter (result = 'fail') / total_count * 100 as indecisive_rate -# from result_chn_prompt_renamed_latest_only r left join all_prompts p on r.prompt_variation_id = p.variation_id -# GROUP BY r.prompt_variation_id -# ORDER BY correct_rate desc -# - - -by_prompt.DataFrame().to_csv('./data/outputs/new_total_by_prompts.csv', index=False) - - - - - - - -# ### break down by topics - -# + magic_args="by_topics_1 <<" language="sql" -# select -# q.sdg_topic as sdg_topic, -# count(DISTINCT q.question_id) as number_of_questions, -- treat chinese prompt and english prompt the same. -# count(*) as total_count, -# count(*) filter (result != 'fail') as total_count_exclude_indecisive, -# count(*) filter (result = 'correct') / total_count_exclude_indecisive * 100 as correct_rate, -# 100 - correct_rate as wrong_rate, -# count(*) filter (result = 'fail') / total_count * 100 as indecisive_rate -# from result_to_analyze_latest_only r left join q_and_t q on r.question_id = q.question_id -# GROUP BY q.sdg_topic -# ORDER BY sdg_topic -# - - -by_topics_1.DataFrame().set_index('sdg_topic') - -# + -# other topics - -# + magic_args="--save res_with_other_topics" language="sql" -# select -# r.*, -# unnest(q.other_topics) as topic -# from result_to_analyze_latest_only r left join q_and_t q on r.question_id = q.question_id -# - - - - -# + magic_args="--with res_with_other_topics by_topics_2 <<" language="sql" -# select -# topic, -# count(DISTINCT question_id) as number_of_questions, -- treat chinese prompt and english prompt the same. 
-# count(*) as total_count, -# count(*) filter (result != 'fail') as total_count_exclude_indecisive, -# count(*) filter (result = 'correct') / total_count_exclude_indecisive * 100 as correct_rate, -# 100 - correct_rate as wrong_rate, -# count(*) filter (result = 'fail') / total_count * 100 as indecisive_rate -# from res_with_other_topics -# GROUP BY topic -# ORDER BY topic -# - - -by_topics_2.DataFrame().set_index('topic') - - - -# ## The Top 5 and Bottom 5 prompts of a model +# ### The Top 5 and Bottom 5 prompts of a model # + magic_args="--save by_prompt_and_model" language="sql" # select @@ -309,7 +193,7 @@ # select * # from by_prompt_and_model # where -# list_contains([1,2,3,4,5, 108, 107, 106, 105, 104], rank) +# list_contains([1,2,3,4,5, 108, 107, 106, 105, 104], rank) # order by model_configuration_id, rank # - @@ -319,12 +203,16 @@ by_prompt_and_model_with_rank_df.to_csv('./data/outputs/new_prompt_model_bottoms.csv') -# + language="sql" +# + magic_args="avg_model_correct_rate <<" language="sql" # select model_configuration_id, mean(correct_rate) # from by_prompt_and_model # group by model_configuration_id # order by model_configuration_id # - +avg_model_correct_rate_df = avg_model_correct_rate.DataFrame() + +avg_model_correct_rate_df + @@ -332,6 +220,16 @@ # ## Model, Prompt Family, Topic aggregations +# + magic_args="--save res_with_prompt_family" language="sql" +# select +# r.*, +# p.prompt_family +# from result_to_analyze r left join all_prompts p on r.prompt_variation_id = p.variation_id + +# + magic_args="--save res_with_prompt_family_exclude_ind" language="sql" +# select * from res_with_prompt_family where score != 0 +# - + # ### highest variance by model # + magic_args="--save prompt_variance_stat" language="sql" @@ -378,15 +276,6 @@ # I need to check the variance cause by Prompt Family for each Model. # So I will first check the answer variance of each question, then get the average variance of all questions. -# + magic_args="--save res_with_prompt_family" language="sql" -# select -# r.*, -# p.prompt_family -# from result_to_analyze_latest_only r left join all_prompts p on r.prompt_variation_id = p.variation_id - -# + magic_args="--save res_with_prompt_family_exclude_ind" language="sql" -# select * from res_with_prompt_family where score != 0 - # + magic_args="--save model_prompt_stat1" language="sql" # select # prompt_family, @@ -405,14 +294,6 @@ # question_id # order by # "correct_rate" desc -# - - - - - - - - # + magic_args="--save model_prompt_stat2" language="sql" # select # r.prompt_family, @@ -432,24 +313,6 @@ # r.model_configuration_id, # r.question_id -# + -# # %%sql -# select -# r.prompt_family, -# r.model_configuration_id, -# r.prompt_variation_id, -# r.question_id, -# r.score, -# s1.mode_score -# from -# res_with_prompt_family_exclude_ind r -# left join model_prompt_stat1 s1 -# on -# r.prompt_family = s1.prompt_family AND -# r.model_configuration_id = s1.model_configuration_id AND -# r.question_id = s1.question_id -# where r.prompt_family = 'geo' and r.question_id = '41' - # + magic_args="--save model_prompt_stat3" language="sql" # select # prompt_family, @@ -589,188 +452,460 @@ -# ### Topic vs Prompt Family - -# + -# we will reuse the res_with_prompt_family_exclude_ind and res_with_prompt_family queries defined above. 
- -# + magic_args="--save question_prompt_family_stat1" language="sql" -# select -# question_id, -# prompt_family, -# count(*) filter (score = 3) / count(*) * 100 as correct_rate, -# stddev_pop(score) / mean (score) * 100 as variance -# -- count(DISTINCT score) as variance -# from -# res_with_prompt_family_exclude_ind -# group by -# question_id, -# prompt_family - -# + magic_args="--save question_prompt_family_stat2" language="sql" -# select -# question_id, -# prompt_family, -# count(*) filter (score = 1) / count(*) * 100 as indecisive_rate -# from -# res_with_prompt_family -# group by -# question_id, -# prompt_family -# - - - - -# + magic_args="--save question_prompt_family_stat_all" language="sql" -# select -# r1.question_id, -# r1.prompt_family, -# mean (correct_rate) as correct_rate, -# mean (indecisive_rate) as indecisive_rate, -# mode (variance) as variance -# from -# question_prompt_family_stat1 r1 -# left join question_prompt_family_stat2 r2 on r1.question_id = r2.question_id -# and r1.prompt_family = r2.prompt_family -# group by -# r1.question_id, -# r1.prompt_family - -# + magic_args="--save topic_prompt_family_stat" language="sql" -# select -# r.*, -# q.sdg_topic, -# q.other_topics, -# case -# when q.sdg_topic is null then other_topics -# else list_append (q.other_topics, q.sdg_topic) -# end as all_topics -# from -# question_prompt_family_stat_all r -# left join q_and_t q on r.question_id = q.question_id -# + magic_args="--with topic_prompt_family_stat topic_prompt_family_res <<" language="sql" -# select -# topic, -# -- count(*) as "number of qs", -# prompt_family, -# mean (correct_rate) as correct_rate, -# mean (indecisive_rate) as indecisive_rate, -# median (variance) as variance -# from -# (select -# * exclude (all_topics, sdg_topic, other_topics), -# unnest(all_topics) as topic -# from topic_prompt_family_stat) -# group by -# topic, -# prompt_family -# order by -# topic, -# prompt_family -# - - -topic_prompt_family_df = topic_prompt_family_res.DataFrame().set_index(['topic', 'prompt_family']) -topic_prompt_family_df.to_csv('./data/outputs/new_topic_vs_prompt.csv') +# ## Questions where AI worse than human and monkey -topic_prompt_family_df.describe() +# ### human score +100 - all_questions['wrongPercentage'].mean() -# ## Questions where AI worse than human and monkey # + language="sql" # select * from model_topic_stat; -# + magic_args="model_topic_human_diff <<" language="sql" +# + magic_args="model_topic_diff <<" language="sql" # select # question_id, # model_configuration_id, # (100 - correct_rate) as ai_wrong_percentage, # human_wrong_percentage, -# ai_wrong_percentage - human_wrong_percentage as diff, +# 2/3 * 100 as monkey_wrong_percentage, +# ai_wrong_percentage - human_wrong_percentage as compare_to_human, +# ai_wrong_percentage - monkey_wrong_percentage as compare_to_monkey, # sdg_topic, # other_topics # from model_topic_stat -# where diff > 0 +# where compare_to_human > 0 OR compare_to_monkey > 0 # order by # "sdg_topic", # cast(other_topics as varchar), # "model_configuration_id" # - -model_topic_human_diff_df = model_topic_human_diff.DataFrame() +model_topic_diff -model_topic_human_diff_df.to_csv('./data/outputs/new_ai_worse_human.csv', index=False) +model_topic_diff_df = model_topic_diff.DataFrame() +model_topic_diff_df.shape +model_topic_diff_df.to_csv('./data/outputs/new_ai_worse_all.csv', index=False) -# + magic_args="model_topic_monkey_diff <<" language="sql" -# select -# question_id, -# model_configuration_id, -# (100 - correct_rate) as ai_wrong_percentage, 
-# 100 * (2/3) as monkey_wrong_percentage, -# ai_wrong_percentage - monkey_wrong_percentage as diff, -# sdg_topic, -# other_topics -# from model_topic_stat -# where diff > 0 -# order by -# "sdg_topic", -# cast(other_topics as varchar), -# "model_configuration_id" +# + +# make a complete list combining worse than human and worse than monkey + +# + magic_args="all_worse_questions <<" language="sql" +# select question_id, model_configuration_id +# from +# model_topic_diff_df + +# + magic_args="very_wrong_res <<" language="sql" +# select * from result_to_analyze where result = 'very_wrong' # - -model_topic_monkey_diff_df = model_topic_monkey_diff.DataFrame() -model_topic_monkey_diff_df.to_csv('./data/outputs/new_ai_worse_monkey.csv', index=False) +# + +# now find one case for very wrong for these questions. +# - +r1 = all_worse_questions.DataFrame() +r2 = very_wrong_res.DataFrame() +r2_ = r2.groupby(['question_id', 'model_configuration_id']).agg(lambda x: x.sample(1)).reset_index() +# + magic_args="--save all_worse_very_wrong" language="sql" +# select +# r1.question_id, r1.model_configuration_id, prompt_variation_id +# from +# r1 +# left join +# r2_ +# on +# r1.question_id = r2_.question_id and r1.model_configuration_id = r2_.model_configuration_id + +# + language="sql" +# select * +# from r1 +# where +# question_id = '1640' and model_configuration_id = 'mc039' + +# + language="sql" +# select * +# from r2_ +# where +# question_id = '1640' and model_configuration_id = 'mc039' # + -# summary stats for human and monkey vs ai +# Why??? Because there is no very wrong answer for this combination! +# - -# + magic_args="summary_human_ai <<" language="sql" -# select -# question_id, -# count(*) as num_of_models, -# mean(diff) as average_diff, -# from -# model_topic_human_diff_df -# group by -# question_id -# ORDER BY -# num_of_models desc, -# average_diff desc + + +# all_worse_very_wrong_df = %sql select * from all_worse_very_wrong + +all_worse_very_wrong_df = all_worse_very_wrong_df.DataFrame() + +all_worse_very_wrong_df[pd.isnull(all_worse_very_wrong_df['prompt_variation_id'])] + + + + + +# + +# query example responses +# but first, we need to read all result data... +# - + +# FIXME: change ../output/results.parquet to correct archive path. +raw_data_fs = [ + '../output/results.parquet', # for mc039 + '../output/archives/20240521/results.xlsx', + '../output/archives/20240401/results.xlsx', + '../output/archives/20240501/results.xlsx', + '../output/archives/20240516/results.xlsx', + '../output/archives/20240601/results.xlsx', + '../output/archives/20240910/results.xlsx' +] + +pd.read_parquet(raw_data_fs[0]).columns + +# + +cols = ['experiment_date', 'question_id', 'model_id', 'prompt_template', 'question', 'raw_output'] + +raw_data_lst = list() + +for x in raw_data_fs: + if 'parquet' in x: + raw_data_lst.append(pd.read_parquet(x)[cols]) + else: + raw_data_lst.append(pd.read_excel(x)[cols]) +# - + +raw_data = pd.concat(raw_data_lst, ignore_index=True) + +raw_data + +# fix a few experiment model id +raw_data.loc[raw_data['model_id'] == 'gpt-4', 'model_id'] = 'gpt-4-0613' +raw_data.loc[raw_data['model_id'] == 'gpt-4o', 'model_id'] = 'gpt-4o-2024-05-13' + + + + + +# + +# now we should make all columns we needed +# 1. question and answers +# 2. prompt template +# 3. model configuration id # - -summary_human_ai.DataFrame() +# first do prompt template +# load all configuration files and get a mapping. 
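# The goal of this step, summarised as one helper: build a dict of
# {prompt template text -> prompt_variation_id}, so every raw_data row can be
# mapped back to its prompt_variation_id by the exact template string it was
# run with. A minimal sketch (it assumes, like the loop below, that
# variations[1] of each experiment config holds the prompt templates, each
# with a `value` and a `variation_id`):
import yaml


def template_mapping(config_path: str) -> dict:
    cfg = yaml.safe_load(open(config_path, 'r'))
    return {
        v['value']: v['variation_id']
        for v in cfg['variations'][1]['variations']
    }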
+import yaml + +sorted([str(x) for x in raw_data['experiment_date'].unique()]) + +configuration_list = [ + '../experiment_configurations/experiment_202403291214_gpt-4-0125-preview_en-US.yaml', + '../experiment_configurations/experiment_202403291248_gemini_gemini-1-0-pro_en-US.yaml', + '../experiment_configurations/experiment_202403291536_gemini_gemini-1-0-pro_en-US.yaml', + '../experiment_configurations/experiment_202404011622_qwen-max-1201_zh-CN.yaml', + '../experiment_configurations/experiment_202404051719_gpt-4-0125-preview_en-US.yaml', + '../experiment_configurations/experiment_202404102325_qwen-max-1201_zh-CN.yaml', + '../experiment_configurations/experiment_202404201136_vertex_ai_gemini-1-5-pro_en-US.yaml', + '../experiment_configurations/experiment_202404201344_vertex_ai_gemini-1-5-pro-preview-0409_en-US.yaml', + '../experiment_configurations/experiment_202405012311_qwen-max-0403_zh-CN.yaml', + '../experiment_configurations/experiment_202405162215_vertex_ai_gemini-1-5-pro-preview-0409_en-US.yaml', + '../experiment_configurations/experiment_202405162248_qwen-max-0403_zh-CN.yaml', + '../experiment_configurations/experiment_202405162244_qwen-max-0403_zh-CN.yaml', + '../experiment_configurations/experiment_202405242125_gpt-4o-2024-05-13_en-US.yaml', + '../experiment_configurations/experiment_202405281300_replicate_meta_meta-llama-3-70b-instruct_en-US.yaml', + '../experiment_configurations/experiment_202405291053_vertex_ai_claude-3-opus@20240229_en-US.yaml', + '../experiment_configurations/experiment_202406040141_qwen-max-0428_en-US.yaml', + '../experiment_configurations/experiment_202408291204_gpt-4o-2024-08-06_en-US.yaml', + '../experiment_configurations/experiment_202408310828_vertex_ai_claude-3-5-sonnet@20240620_en-US.yaml', + '../experiment_configurations/experiment_202409102304_fireworks_ai_accounts_fireworks_models_llama-v3p1-405b-instruct_en-US.yaml', + '../experiment_configurations/experiment_202409211350_qwen-max-2024-09-19_en-US.yaml', +] + +# + +prompt_template_list = list() + +for x in configuration_list: + c = yaml.safe_load(open(x, 'r')) + p = pd.DataFrame.from_records(c['variations'][1]['variations']) + prompt_template_list.append(p) +# - + +all_prompt_templates = pd.concat(prompt_template_list, ignore_index=True) + +all_prompt_templates = all_prompt_templates.drop_duplicates(subset=['value']) + +all_prompt_templates_mapping = all_prompt_templates.set_index('value')['variation_id'].to_dict() + +for k, v in all_prompt_templates_mapping.items(): + print(k) + print(v) + break + + -summary_human_ai.DataFrame().to_csv('./data/outputs/new_summary_human_ai.csv') +raw_data['prompt_template'].map(all_prompt_templates_mapping).hasnans # should be False +raw_data['prompt_variation_id'] = raw_data['prompt_template'].map(all_prompt_templates_mapping) -# + magic_args="summary_monkey_ai <<" language="sql" + +# + +# next add model_configuration_id +# - + +# all_models_ = %sql select * from all_models where repeat_times = 1 + +all_models_ = all_models_.DataFrame() + +all_models_mapping = all_models_.set_index('model_id')['model_config_id'].to_dict() + +raw_data['model_id'].map(all_models_mapping).hasnans + +raw_data['model_configuration_id'] = raw_data['model_id'].map(all_models_mapping) + +raw_data + + + +# + language="sql" # select -# question_id, -# count(*) as num_of_models, -# mean(diff) as average_diff, -# from -# model_topic_monkey_diff_df -# group by -# question_id -# ORDER BY -# num_of_models desc, -# average_diff desc +# DISTINCT model_id +# from +# raw_data +# where +# 
prompt_variation_id like '%zh%' +# - + + + + + + + + + +# + +# questions and answers mapping +# - + +all_questions.columns + +qs = ai_eval_sheet.questions.data.df.copy() +qs = qs[['question_id', 'language', 'published_version_of_question']] + +qs + +q_dict = qs.set_index(["question_id", "language"])["published_version_of_question"].to_dict() + +# + +ans = ai_eval_sheet.question_options.data.df.copy() +ans_dict = dict() + +for qid, adf in ans.groupby(["question_id", "language"]): + adict = adf.set_index('letter')['question_option'].to_dict() + ans_dict[qid] = adict +# - + +ans_dict[("1", "en-US")] + +q_dict[("1", "en-US")] + + + +# + +# create final output +# - + +all_worse_very_wrong_df + +raw_data.dtypes + +raw_data['experiment_date'] = raw_data['experiment_date'].map(lambda x: str(x)) +raw_data['question_id'] = raw_data['question_id'].map(lambda x: str(x)) +raw_data['model_id'] = raw_data['model_id'].map(lambda x: str(x)) + +raw_data_pl = pl.from_pandas(raw_data) + +# + +raw_output_lst = list() +prompt_lst = list() + + +for _, row in all_worse_very_wrong_df.iterrows(): + question_id = row['question_id'] + model_configuration_id = row['model_configuration_id'] + prompt_variation_id = row['prompt_variation_id'] + # print(question_id, model_configuration_id, prompt_variation_id) + + raw_data_row = raw_data_pl.filter( + (pl.col('question_id') == question_id) & (pl.col('model_configuration_id') == model_configuration_id) & (pl.col('prompt_variation_id') == prompt_variation_id) + ) + + if raw_data_row.is_empty(): + raw_output_lst.append(None) + prompt_lst.append(None) + else: + question_text = raw_data_row['question'].item() + question_id = raw_data_row['question_id'].item() + language = 'zh-CN' if '_zh' in prompt_variation_id else 'en-US' + answers = ans_dict[(question_id, language)] + option_a = answers['A'] + option_b = answers['B'] + option_c = answers['C'] + + prompt_template = raw_data_row['prompt_template'].item() + prompt = prompt_template.format(question_text=question_text, option_a=option_a, option_b=option_b, option_c=option_c) + # print(prompt) + + prompt_lst.append(prompt) + raw_output_lst.append(raw_data_row['raw_output'].item()) + +# - +raw_data_row + +all_worse_very_wrong_df['prompt'] = prompt_lst +all_worse_very_wrong_df['model_output'] = raw_output_lst + +all_worse_very_wrong_df + +all_worse_very_wrong_df.to_csv('./data/outputs/new_ai_worse_sample.csv', index=False) + + + + + + + +# ## Examples for high variance questions + +high_variance_questions_df + +# + language="sql" +# select * from result_to_analyze # - -summary_monkey_ai.DataFrame().to_csv('./data/outputs/new_summary_monkey_ai.csv') + + +question_id = '1792' +model_configuration_id = 'mc039' +grade = 'very_wrong' + + +# + magic_args="--save grade_example" language="sql" +# +# select * from +# ( +# select * from result_to_analyze +# where +# question_id = '{{question_id}}' +# and model_configuration_id = '{{model_configuration_id}}' +# and result = '{{grade}}' +# ) +# using sample 1 +# - + +def filter_grade(question_id, model_configuration_id, grade): + # res = %sql select * from (select * from result_to_analyze where question_id = '{{question_id}}' and model_configuration_id = '{{model_configuration_id}}' and result = '{{grade}}') using sample 1 + return res + + +filter_grade(question_id, model_configuration_id, grade) + + + + + +# + +correct_lst = list() +wrong_lst = list() +very_wrong_lst = list() +correct_prompt_lst = list() +wrong_prompt_lst = list() +very_wrong_prompt_lst = list() + +output_lists = [correct_lst, 
wrong_lst, very_wrong_lst] +prompt_lists = [correct_prompt_lst, wrong_prompt_lst, very_wrong_prompt_lst] + +for _, row in high_variance_questions_df.iterrows(): + question_id = row['question_id'] + model_configuration_id = row['model_configuration_id'] + # prompt_variation_id = row['prompt_variation_id'] + # print(question_id, model_configuration_id) + + examples = list() + for g in ['correct', 'wrong', 'very_wrong']: + grade = g + example = filter_grade(question_id, model_configuration_id, grade) + # print(example) + if len(example) > 0: + e = next(example.dicts()) + assert e['result'] == grade + examples.append(e) + else: + examples.append(None) + + for i, e in enumerate(examples): + if e: + prompt_variation_id = e['prompt_variation_id'] + raw_data_row = raw_data_pl.filter( + (pl.col('question_id') == question_id) + & (pl.col('model_configuration_id') == model_configuration_id) + & (pl.col('prompt_variation_id') == prompt_variation_id) + ) + if raw_data_row.is_empty(): + print(question_id, model_configuration_id, prompt_variation_id) + output_lists[i].append(None) + prompt_lists[i].append(None) + continue + question_text = raw_data_row['question'].item() + language = 'zh-CN' if '_zh' in prompt_variation_id else 'en-US' + answers = ans_dict[(question_id, language)] + option_a = answers['A'] + option_b = answers['B'] + option_c = answers['C'] + prompt_template = raw_data_row['prompt_template'].item() + prompt = prompt_template.format(question_text=question_text, option_a=option_a, option_b=option_b, option_c=option_c) + output_lists[i].append(raw_data_row['raw_output'].item()) + prompt_lists[i].append(prompt) + else: + output_lists[i].append(None) + prompt_lists[i].append(None) + +# - +prompt_lists[0][0] + +prompt_lists[1][0] + + + + + + + +high_variance_questions_df['correct_prompt_example'] = prompt_lists[0] +high_variance_questions_df['correct_answer_example'] = output_lists[0] +high_variance_questions_df['wrong_prompt_example'] = prompt_lists[1] +high_variance_questions_df['wrong_answer_example'] = output_lists[1] +high_variance_questions_df['very_wrong_prompt_example'] = prompt_lists[2] +high_variance_questions_df['very_wrong_answer_example'] = output_lists[2] + +high_variance_questions_df + +high_variance_questions_df.to_csv('./data/outputs/new_high_variance_questions_sample.csv', index=False) + + @@ -794,16 +929,6 @@ -# ## Question vs Prompt Family - -# + magic_args="question_prompt_family_stat << " language="sql" -# select * from question_prompt_family_stat_all -# - - -question_prompt_family_stat_df = question_prompt_family_stat.DataFrame() - -question_prompt_family_stat_df.to_csv('./data/outputs/new_question_prompt_family_stat.csv') - # # for double checking the evaluators diff --git a/automation-api/yival_experiments/notebooks/upload_to_ai_eval_sheet.py b/automation-api/yival_experiments/notebooks/upload_to_ai_eval_sheet.py index d3453e5..36083e1 100644 --- a/automation-api/yival_experiments/notebooks/upload_to_ai_eval_sheet.py +++ b/automation-api/yival_experiments/notebooks/upload_to_ai_eval_sheet.py @@ -145,7 +145,7 @@ def suggest_language(q_text): # to get the most accurate mapping, we will load the prompts from the experiment files # be sure to change the name cn_exp_config = yaml.safe_load(open('../experiment_configurations/experiment_202405162248_qwen-max-0403_zh-CN.yaml', 'r')) -en_exp_config = yaml.safe_load(open('../experiment_configurations/experiment_202405281300_replicate_meta_meta-llama-3-70b-instruct_en-US.yaml', 'r')) +en_exp_config = 
yaml.safe_load(open('../experiment_configurations/experiment_202409211350_qwen-max-2024-09-19_en-US.yaml', 'r')) assert cn_exp_config['variations'][1]['name'] == 'prompt_template' assert en_exp_config['variations'][1]['name'] == 'prompt_template' @@ -173,13 +173,25 @@ def suggest_language(q_text): (row['model_id'], row['model_params'])] for _, row in result.iterrows()] # update the correctness column with human scores -result['final_score'] = result['human_rating_score'].fillna(result['correctness']) +result['final_score'] = (result['human_rating_score'] + .fillna(result['vertex_ai_evaluator_correctness']) + .fillna(result['gpt4_evaluator_correctness']) +) + +result[pd.isnull(result["final_score"])] # this sould be empty # counting # let's use polars from now result = pl.DataFrame(result) result +# concat all evaluation results as list +result = result.with_columns( + pl.concat_list(pl.col(['gpt4_evaluator_gpt4_eval_correctness', + 'vertex_ai_evaluator_gemini_eval_correctness', + 'vertex_ai_evaluator_claude_eval_correctness',])).alias("evaluation_results") +) + # + # result.group_by( # ['question_id', 'language', 'prompt_variant_id', 'model_conf_id'] @@ -197,14 +209,12 @@ def suggest_language(q_text): -result_counts = result.group_by( - ['question_id', 'language', 'prompt_variant_id', 'model_conf_id', 'experiment_date'] -).agg( - pl.col('final_score').filter(pl.col('final_score') == 0).count().alias('fail'), - pl.col('final_score').filter(pl.col('final_score') == 1).count().alias('very_wrong'), - pl.col('final_score').filter(pl.col('final_score') == 2).count().alias('wrong'), - pl.col('final_score').filter(pl.col('final_score') == 3).count().alias('correct'), - pl.col('final_score').count().alias('rounds') +# then calculate the distribution +result_counts = result.with_columns( + pl.col('evaluation_results').list.count_matches(0).alias('fail'), + pl.col('evaluation_results').list.count_matches(1).alias('very_wrong'), + pl.col('evaluation_results').list.count_matches(2).alias('wrong'), + pl.col('evaluation_results').list.count_matches(3).alias('correct'), ) result_counts @@ -212,11 +222,14 @@ def suggest_language(q_text): result_counts['rounds'].max() +# set the number of evaluators +num_of_evaluators = 3 + result_pct = result_counts.with_columns( - pl.col('fail') / pl.col('rounds') * 100, - pl.col('very_wrong') / pl.col('rounds') * 100, - pl.col('wrong') / pl.col('rounds') * 100, - pl.col('correct') / pl.col('rounds') * 100, + pl.col('fail') / num_of_evaluators * 100, + pl.col('very_wrong') / num_of_evaluators * 100, + pl.col('wrong') / num_of_evaluators * 100, + pl.col('correct') / num_of_evaluators * 100, ) result_pct @@ -234,13 +247,31 @@ def get_grade(dictionary): result_full = result_pct.with_columns( pl.struct(pl.col(['fail', 'very_wrong', 'wrong', 'correct'])).map_elements(get_grade).alias('result'), + pl.lit(1).alias('rounds') ) + +# then if we have human ratings, update the results. 
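# Illustration of the override implemented just below (hypothetical helper, not
# part of the notebook): a non-null human rating wins over the grade derived
# from the three evaluators, while rows without a rating keep the computed
# result. The rating scale is assumed to match the evaluator scores used above:
# 0 = fail, 1 = very_wrong, 2 = wrong, 3 = correct.
HUMAN_SCORE_TO_GRADE = dict(enumerate(['fail', 'very_wrong', 'wrong', 'correct']))


def apply_human_override(result: str, human_rating_score):
    if human_rating_score is None:
        return result
    return HUMAN_SCORE_TO_GRADE[int(human_rating_score)]


# apply_human_override('wrong', 3)    -> 'correct'
# apply_human_override('wrong', None) -> 'wrong'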
+result_full = result_full.with_columns( + pl.col('human_rating_score').replace( + dict(enumerate(['fail', 'very_wrong', 'wrong', 'correct'])) + ).fill_null(pl.col('result')).alias('result') +) + + result_full result_full_df = result_full.to_pandas() result_full_df.columns + +result_full_df = result_full_df.loc[:, + [ + 'question_id', 'language', 'prompt_variant_id', 'model_conf_id', 'experiment_date', + 'fail', 'very_wrong', 'wrong', 'correct', 'rounds', 'result', + ] +] + result_full_df.columns = ['question_id', 'language', 'prompt_variation_id', 'model_configuration_id', 'last_evaluation_datetime', 'percent_eval_failed', 'percent_very_wrong', 'percent_wrong', diff --git a/automation-api/yival_experiments/scripts/generate_result.py b/automation-api/yival_experiments/scripts/generate_result.py index c978830..3ca1e9d 100644 --- a/automation-api/yival_experiments/scripts/generate_result.py +++ b/automation-api/yival_experiments/scripts/generate_result.py @@ -24,6 +24,28 @@ # In this script, we store all responses into an excel file. output_dir = current_script_path / "../output" +option_score_mapping = {"Correct": 3, "Wrong": 2, "Very Wrong": 1} + + +def exact_match_correctness(answer, options, correctness): + option_occurance = [0, 0, 0] + scores = [option_score_mapping[x] for x in correctness] + for i, o in zip(range(3), options): + if o.strip().lower() in answer.strip().lower(): + option_occurance[i] = 1 + if sum(option_occurance) == 1: + score = scores[option_occurance.index(1)] + else: + score = 0 + + return score + + +def extract_correct_answer(options, correctness): + for t, c in zip(options, correctness): + if c == "Correct": + return t + if __name__ == "__main__": output_list = [] @@ -49,6 +71,10 @@ option_b_correctness, option_c_correctness, ] + auto_mark_correctness = exact_match_correctness( + answer, options, correctness + ) + correct_answer = extract_correct_answer(options, correctness) result_dict = dict( experiment_date=expr_date, question_id=str(result.input_data.content["question_id"]), @@ -57,6 +83,8 @@ prompt_template=result.combination["prompt_template"], question=result.input_data.content["question_text"], raw_output=result.raw_output.text_output, + correct_answer=correct_answer, + auto_mark_correctness=auto_mark_correctness, ) for eval_output in result.evaluator_outputs: col_name = f"{eval_output.name}_{eval_output.display_name}"
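For reference, a minimal usage sketch of the exact-match helpers added in generate_result.py above. The option texts and answers are hypothetical, and it assumes the two functions are importable from the script:

options = ["20 percent", "50 percent", "80 percent"]
correctness = ["Wrong", "Very Wrong", "Correct"]

# exactly one option string occurs in the answer -> that option's score (3 = Correct)
exact_match_correctness("The answer is 80 percent.", options, correctness)  # -> 3
# none (or more than one) of the options occur -> 0, i.e. counted as indecisive
exact_match_correctness("Roughly four out of five.", options, correctness)  # -> 0
# the reference answer text recorded alongside each result row
extract_correct_answer(options, correctness)  # -> "80 percent"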