diff --git a/automation-api/yival_experiments/notebooks/compare_evaluators.py b/automation-api/yival_experiments/notebooks/compare_evaluators.py index d0d0540..c80dbed 100644 --- a/automation-api/yival_experiments/notebooks/compare_evaluators.py +++ b/automation-api/yival_experiments/notebooks/compare_evaluators.py @@ -12,8 +12,12 @@ conn = duckdb.connect() -conn.query("select * from df") +simple_eval_check = conn.query("select * from df where simple_evaluator_matching <> auto_mark_correctness") +simple_eval_check +simple_eval_check.to_csv("./simple_eval_check.csv") + +# NEXT: review the query and begin to check results. q = """select * from @@ -22,19 +26,35 @@ not ( llama3_evaluator_correctness = vertex_ai_evaluator_correctness and gpt4_evaluator_correctness = vertex_ai_evaluator_correctness + ) + or ( + auto_mark_correctness <> 0 + and ( + llama3_evaluator_correctness = vertex_ai_evaluator_correctness + and gpt4_evaluator_correctness = vertex_ai_evaluator_correctness + ) + and auto_mark_correctness <> gpt4_evaluator_correctness + ) + or ( + simple_evaluator_matching <> 0 and ( - simple_evaluator_matching <> 0 - and simple_evaluator_matching = gpt4_evaluator_correctness + llama3_evaluator_correctness = vertex_ai_evaluator_correctness + and gpt4_evaluator_correctness = vertex_ai_evaluator_correctness ) + and simple_evaluator_matching <> gpt4_evaluator_correctness ) - and simple_evaluator_matching <> 0""" + + """ diffs = conn.query(q) conn.query("select count(*) from df") conn.query("select count(*) from diffs") -diffs.to_csv("to_check.csv") +diffs.to_csv("to_check_all.csv") + +410 / 30780 -1281 / 30780 +# FIXME: the simple evaluator seems not working very well? +# just use the exact matching algo diff --git a/automation-api/yival_experiments/notebooks/result_data_analysis.py b/automation-api/yival_experiments/notebooks/result_data_analysis.py index a93872a..e7815db 100644 --- a/automation-api/yival_experiments/notebooks/result_data_analysis.py +++ b/automation-api/yival_experiments/notebooks/result_data_analysis.py @@ -2,7 +2,7 @@ # # This notebook is for producing tables listed in https://docs.google.com/spreadsheets/d/1ln5ui3f13AfAQkBuEMbNomBXlZLhkQPYVEpBlZjUtu0/edit?pli=1#gid=0 # -# Results are from the experiments in Apr and May 2023 +# Latest Update: 2024-10-02 # going to use duckdb # %load_ext sql @@ -10,6 +10,7 @@ # %sql duckdb:// import pandas as pd +import polars as pl from lib.pilot.helpers import read_ai_eval_spreadsheet, get_questions, get_model_configs, get_prompt_variants from lib.config import read_config import matplotlib.pyplot as plt @@ -23,7 +24,10 @@ # results to be analyzed # manually download from AI eval spreadsheet. 
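# Before loading, it can help to sanity-check the manually downloaded export(s).
# A minimal sketch (hypothetical helper; the required column names are assumed
# from the SQL queries used later in this notebook):
import pandas as pd

REQUIRED_COLUMNS = {'question_id', 'prompt_variation_id',
                    'model_configuration_id', 'result', 'score'}


def check_export(path: str) -> pd.DataFrame:
    df = pd.read_csv(path)
    missing = REQUIRED_COLUMNS - set(df.columns)
    if missing:
        raise ValueError(f"{path} is missing expected columns: {missing}")
    return df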
-result = pd.read_csv('./data/Gapminder AI evaluations - Master Output.csv') +result = pd.concat([ + pd.read_csv('./data/Gapminder AI evaluations - Master Output.csv'), + # pd.read_csv('./data/Gapminder AI evaluations - Latest Results.csv'), +]) # load ai eval spreadsheet ai_eval_sheet = read_ai_eval_spreadsheet() @@ -50,28 +54,11 @@ # from result # where model_configuration_id != 'mc026' -- exclude qwen 1201 -# + magic_args="--save result_to_analyze_latest_only" language="sql" -# select * from result_to_analyze -# where -# model_configuration_id = 'mc030' -# OR model_configuration_id = 'mc035' -# OR model_configuration_id = 'mc032' -# OR model_configuration_id = 'mc033' -# OR model_configuration_id = 'mc034' - # + magic_args="--with result_to_analyze --save result_chn_prompt_renamed" language="sql" # select # * exclude (prompt_variation_id), # replace(prompt_variation_id, '_zh', '') as prompt_variation_id # from result_to_analyze -# + magic_args="--save result_chn_prompt_renamed_latest_only" language="sql" -# select * from result_chn_prompt_renamed -# where -# model_configuration_id = 'mc030' -# OR model_configuration_id = 'mc035' -# OR model_configuration_id = 'mc032' -# OR model_configuration_id = 'mc033' -# OR model_configuration_id = 'mc034' # - @@ -166,27 +153,11 @@ +# ## Summary +# ### Correctness Break Down by Model - -# ## Experiment Total - -# + language="sql" -# select -# 'AI' as name, -# count(*) as total_count, -# count(*) filter (result != 'fail') as total_count_exclude_indecisive, -# count(*) filter (result = 'correct') / total_count_exclude_indecisive * 100 as correct_rate, -# 100 - correct_rate as wrong_rate, -# count(*) filter (result = 'fail') / total_count * 100 as indecisive_rate -# from result_to_analyze_latest_only -# - - - - -# ### Break down by Model - -# + language="sql" +# + magic_args="result_by_models <<" language="sql" # select # m.model_id as model_id, # count(*) as total_count, @@ -194,103 +165,16 @@ # count(*) filter (result = 'correct') / total_count_exclude_indecisive * 100 as correct_rate, # 100 - correct_rate as wrong_rate, # count(*) filter (result = 'fail') / total_count * 100 as indecisive_rate -# from result_to_analyze_latest_only r left join all_models m on r.model_configuration_id = m.model_config_id +# from result_to_analyze r left join all_models m on r.model_configuration_id = m.model_config_id # GROUP BY m.model_id # order by correct_rate desc # - +result_by_models_df = result_by_models.DataFrame() +result_by_models_df -# ### break down by prompt and prompt family - -# + magic_args="by_prompt_family <<" language="sql" -# select -# p.prompt_family as prompt_family, -# count(DISTINCT p.variation_id) as number_of_prompts, -# -- count(DISTINCT p.variation_id) / 2 as number_of_prompts, -- uncomment this to treat chinese prompt and english prompt the same. 
-# count(*) as total_count, -# count(*) filter (result != 'fail') as total_count_exclude_indecisive, -# count(*) filter (result = 'correct') / total_count_exclude_indecisive * 100 as correct_rate, -# 100 - correct_rate as wrong_rate, -# count(*) filter (result = 'fail') / total_count * 100 as indecisive_rate -# from result_to_analyze_latest_only r left join all_prompts p on r.prompt_variation_id = p.variation_id -# GROUP BY p.prompt_family -# ORDER BY correct_rate desc -# - - -by_prompt_family.DataFrame().set_index('prompt_family') - -# + magic_args="by_prompt <<" language="sql" -# select -# any_value(p.prompt_family) as prompt_family, -# prompt_variation_id, -# count(*) as total_count, -# count(*) filter (result != 'fail') as total_count_exclude_indecisive, -# count(*) filter (result = 'correct') / total_count_exclude_indecisive * 100 as correct_rate, -# 100 - correct_rate as wrong_rate, -# count(*) filter (result = 'fail') / total_count * 100 as indecisive_rate -# from result_chn_prompt_renamed_latest_only r left join all_prompts p on r.prompt_variation_id = p.variation_id -# GROUP BY r.prompt_variation_id -# ORDER BY correct_rate desc -# - - -by_prompt.DataFrame().to_csv('./data/outputs/new_total_by_prompts.csv', index=False) - - - - - - - -# ### break down by topics - -# + magic_args="by_topics_1 <<" language="sql" -# select -# q.sdg_topic as sdg_topic, -# count(DISTINCT q.question_id) as number_of_questions, -- treat chinese prompt and english prompt the same. -# count(*) as total_count, -# count(*) filter (result != 'fail') as total_count_exclude_indecisive, -# count(*) filter (result = 'correct') / total_count_exclude_indecisive * 100 as correct_rate, -# 100 - correct_rate as wrong_rate, -# count(*) filter (result = 'fail') / total_count * 100 as indecisive_rate -# from result_to_analyze_latest_only r left join q_and_t q on r.question_id = q.question_id -# GROUP BY q.sdg_topic -# ORDER BY sdg_topic -# - - -by_topics_1.DataFrame().set_index('sdg_topic') - -# + -# other topics - -# + magic_args="--save res_with_other_topics" language="sql" -# select -# r.*, -# unnest(q.other_topics) as topic -# from result_to_analyze_latest_only r left join q_and_t q on r.question_id = q.question_id -# - - - - -# + magic_args="--with res_with_other_topics by_topics_2 <<" language="sql" -# select -# topic, -# count(DISTINCT question_id) as number_of_questions, -- treat chinese prompt and english prompt the same. 
-# count(*) as total_count, -# count(*) filter (result != 'fail') as total_count_exclude_indecisive, -# count(*) filter (result = 'correct') / total_count_exclude_indecisive * 100 as correct_rate, -# 100 - correct_rate as wrong_rate, -# count(*) filter (result = 'fail') / total_count * 100 as indecisive_rate -# from res_with_other_topics -# GROUP BY topic -# ORDER BY topic -# - - -by_topics_2.DataFrame().set_index('topic') - - - -# ## The Top 5 and Bottom 5 prompts of a model +# ### The Top 5 and Bottom 5 prompts of a model # + magic_args="--save by_prompt_and_model" language="sql" # select @@ -309,7 +193,7 @@ # select * # from by_prompt_and_model # where -# list_contains([1,2,3,4,5, 108, 107, 106, 105, 104], rank) +# list_contains([1,2,3,4,5, 108, 107, 106, 105, 104], rank) # order by model_configuration_id, rank # - @@ -319,12 +203,16 @@ by_prompt_and_model_with_rank_df.to_csv('./data/outputs/new_prompt_model_bottoms.csv') -# + language="sql" +# + magic_args="avg_model_correct_rate <<" language="sql" # select model_configuration_id, mean(correct_rate) # from by_prompt_and_model # group by model_configuration_id # order by model_configuration_id # - +avg_model_correct_rate_df = avg_model_correct_rate.DataFrame() + +avg_model_correct_rate_df + @@ -332,6 +220,16 @@ # ## Model, Prompt Family, Topic aggregations +# + magic_args="--save res_with_prompt_family" language="sql" +# select +# r.*, +# p.prompt_family +# from result_to_analyze r left join all_prompts p on r.prompt_variation_id = p.variation_id + +# + magic_args="--save res_with_prompt_family_exclude_ind" language="sql" +# select * from res_with_prompt_family where score != 0 +# - + # ### highest variance by model # + magic_args="--save prompt_variance_stat" language="sql" @@ -378,15 +276,6 @@ # I need to check the variance cause by Prompt Family for each Model. # So I will first check the answer variance of each question, then get the average variance of all questions. -# + magic_args="--save res_with_prompt_family" language="sql" -# select -# r.*, -# p.prompt_family -# from result_to_analyze_latest_only r left join all_prompts p on r.prompt_variation_id = p.variation_id - -# + magic_args="--save res_with_prompt_family_exclude_ind" language="sql" -# select * from res_with_prompt_family where score != 0 - # + magic_args="--save model_prompt_stat1" language="sql" # select # prompt_family, @@ -405,14 +294,6 @@ # question_id # order by # "correct_rate" desc -# - - - - - - - - # + magic_args="--save model_prompt_stat2" language="sql" # select # r.prompt_family, @@ -432,24 +313,6 @@ # r.model_configuration_id, # r.question_id -# + -# # %%sql -# select -# r.prompt_family, -# r.model_configuration_id, -# r.prompt_variation_id, -# r.question_id, -# r.score, -# s1.mode_score -# from -# res_with_prompt_family_exclude_ind r -# left join model_prompt_stat1 s1 -# on -# r.prompt_family = s1.prompt_family AND -# r.model_configuration_id = s1.model_configuration_id AND -# r.question_id = s1.question_id -# where r.prompt_family = 'geo' and r.question_id = '41' - # + magic_args="--save model_prompt_stat3" language="sql" # select # prompt_family, @@ -589,188 +452,460 @@ -# ### Topic vs Prompt Family - -# + -# we will reuse the res_with_prompt_family_exclude_ind and res_with_prompt_family queries defined above. 
- -# + magic_args="--save question_prompt_family_stat1" language="sql" -# select -# question_id, -# prompt_family, -# count(*) filter (score = 3) / count(*) * 100 as correct_rate, -# stddev_pop(score) / mean (score) * 100 as variance -# -- count(DISTINCT score) as variance -# from -# res_with_prompt_family_exclude_ind -# group by -# question_id, -# prompt_family - -# + magic_args="--save question_prompt_family_stat2" language="sql" -# select -# question_id, -# prompt_family, -# count(*) filter (score = 1) / count(*) * 100 as indecisive_rate -# from -# res_with_prompt_family -# group by -# question_id, -# prompt_family -# - - - - -# + magic_args="--save question_prompt_family_stat_all" language="sql" -# select -# r1.question_id, -# r1.prompt_family, -# mean (correct_rate) as correct_rate, -# mean (indecisive_rate) as indecisive_rate, -# mode (variance) as variance -# from -# question_prompt_family_stat1 r1 -# left join question_prompt_family_stat2 r2 on r1.question_id = r2.question_id -# and r1.prompt_family = r2.prompt_family -# group by -# r1.question_id, -# r1.prompt_family - -# + magic_args="--save topic_prompt_family_stat" language="sql" -# select -# r.*, -# q.sdg_topic, -# q.other_topics, -# case -# when q.sdg_topic is null then other_topics -# else list_append (q.other_topics, q.sdg_topic) -# end as all_topics -# from -# question_prompt_family_stat_all r -# left join q_and_t q on r.question_id = q.question_id -# + magic_args="--with topic_prompt_family_stat topic_prompt_family_res <<" language="sql" -# select -# topic, -# -- count(*) as "number of qs", -# prompt_family, -# mean (correct_rate) as correct_rate, -# mean (indecisive_rate) as indecisive_rate, -# median (variance) as variance -# from -# (select -# * exclude (all_topics, sdg_topic, other_topics), -# unnest(all_topics) as topic -# from topic_prompt_family_stat) -# group by -# topic, -# prompt_family -# order by -# topic, -# prompt_family -# - - -topic_prompt_family_df = topic_prompt_family_res.DataFrame().set_index(['topic', 'prompt_family']) -topic_prompt_family_df.to_csv('./data/outputs/new_topic_vs_prompt.csv') +# ## Questions where AI worse than human and monkey -topic_prompt_family_df.describe() +# ### human score +100 - all_questions['wrongPercentage'].mean() -# ## Questions where AI worse than human and monkey # + language="sql" # select * from model_topic_stat; -# + magic_args="model_topic_human_diff <<" language="sql" +# + magic_args="model_topic_diff <<" language="sql" # select # question_id, # model_configuration_id, # (100 - correct_rate) as ai_wrong_percentage, # human_wrong_percentage, -# ai_wrong_percentage - human_wrong_percentage as diff, +# 2/3 * 100 as monkey_wrong_percentage, +# ai_wrong_percentage - human_wrong_percentage as compare_to_human, +# ai_wrong_percentage - monkey_wrong_percentage as compare_to_monkey, # sdg_topic, # other_topics # from model_topic_stat -# where diff > 0 +# where compare_to_human > 0 OR compare_to_monkey > 0 # order by # "sdg_topic", # cast(other_topics as varchar), # "model_configuration_id" # - -model_topic_human_diff_df = model_topic_human_diff.DataFrame() +model_topic_diff -model_topic_human_diff_df.to_csv('./data/outputs/new_ai_worse_human.csv', index=False) +model_topic_diff_df = model_topic_diff.DataFrame() +model_topic_diff_df.shape +model_topic_diff_df.to_csv('./data/outputs/new_ai_worse_all.csv', index=False) -# + magic_args="model_topic_monkey_diff <<" language="sql" -# select -# question_id, -# model_configuration_id, -# (100 - correct_rate) as ai_wrong_percentage, 
-# 100 * (2/3) as monkey_wrong_percentage, -# ai_wrong_percentage - monkey_wrong_percentage as diff, -# sdg_topic, -# other_topics -# from model_topic_stat -# where diff > 0 -# order by -# "sdg_topic", -# cast(other_topics as varchar), -# "model_configuration_id" +# + +# make a complete list combining worse than human and worse than monkey + +# + magic_args="all_worse_questions <<" language="sql" +# select question_id, model_configuration_id +# from +# model_topic_diff_df + +# + magic_args="very_wrong_res <<" language="sql" +# select * from result_to_analyze where result = 'very_wrong' # - -model_topic_monkey_diff_df = model_topic_monkey_diff.DataFrame() -model_topic_monkey_diff_df.to_csv('./data/outputs/new_ai_worse_monkey.csv', index=False) +# + +# now find one case for very wrong for these questions. +# - +r1 = all_worse_questions.DataFrame() +r2 = very_wrong_res.DataFrame() +r2_ = r2.groupby(['question_id', 'model_configuration_id']).agg(lambda x: x.sample(1)).reset_index() +# + magic_args="--save all_worse_very_wrong" language="sql" +# select +# r1.question_id, r1.model_configuration_id, prompt_variation_id +# from +# r1 +# left join +# r2_ +# on +# r1.question_id = r2_.question_id and r1.model_configuration_id = r2_.model_configuration_id + +# + language="sql" +# select * +# from r1 +# where +# question_id = '1640' and model_configuration_id = 'mc039' + +# + language="sql" +# select * +# from r2_ +# where +# question_id = '1640' and model_configuration_id = 'mc039' # + -# summary stats for human and monkey vs ai +# Why??? Because there is no very wrong answer for this combination! +# - -# + magic_args="summary_human_ai <<" language="sql" -# select -# question_id, -# count(*) as num_of_models, -# mean(diff) as average_diff, -# from -# model_topic_human_diff_df -# group by -# question_id -# ORDER BY -# num_of_models desc, -# average_diff desc + + +# all_worse_very_wrong_df = %sql select * from all_worse_very_wrong + +all_worse_very_wrong_df = all_worse_very_wrong_df.DataFrame() + +all_worse_very_wrong_df[pd.isnull(all_worse_very_wrong_df['prompt_variation_id'])] + + + + + +# + +# query example responses +# but first, we need to read all result data... +# - + +# FIXME: change ../output/results.parquet to correct archive path. +raw_data_fs = [ + '../output/results.parquet', # for mc039 + '../output/archives/20240521/results.xlsx', + '../output/archives/20240401/results.xlsx', + '../output/archives/20240501/results.xlsx', + '../output/archives/20240516/results.xlsx', + '../output/archives/20240601/results.xlsx', + '../output/archives/20240910/results.xlsx' +] + +pd.read_parquet(raw_data_fs[0]).columns + +# + +cols = ['experiment_date', 'question_id', 'model_id', 'prompt_template', 'question', 'raw_output'] + +raw_data_lst = list() + +for x in raw_data_fs: + if 'parquet' in x: + raw_data_lst.append(pd.read_parquet(x)[cols]) + else: + raw_data_lst.append(pd.read_excel(x)[cols]) +# - + +raw_data = pd.concat(raw_data_lst, ignore_index=True) + +raw_data + +# fix a few experiment model id +raw_data.loc[raw_data['model_id'] == 'gpt-4', 'model_id'] = 'gpt-4-0613' +raw_data.loc[raw_data['model_id'] == 'gpt-4o', 'model_id'] = 'gpt-4o-2024-05-13' + + + + + +# + +# now we should make all columns we needed +# 1. question and answers +# 2. prompt template +# 3. model configuration id # - -summary_human_ai.DataFrame() +# first do prompt template +# load all configuration files and get a mapping. 
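# The goal of this step, summarised as one helper: build a dict of
# {prompt template text -> prompt_variation_id}, so every raw_data row can be
# mapped back to its prompt_variation_id by the exact template string it was
# run with. A minimal sketch (it assumes, like the loop below, that
# variations[1] of each experiment config holds the prompt templates, each
# with a `value` and a `variation_id`):
import yaml


def template_mapping(config_path: str) -> dict:
    cfg = yaml.safe_load(open(config_path, 'r'))
    return {
        v['value']: v['variation_id']
        for v in cfg['variations'][1]['variations']
    }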
+import yaml + +sorted([str(x) for x in raw_data['experiment_date'].unique()]) + +configuration_list = [ + '../experiment_configurations/experiment_202403291214_gpt-4-0125-preview_en-US.yaml', + '../experiment_configurations/experiment_202403291248_gemini_gemini-1-0-pro_en-US.yaml', + '../experiment_configurations/experiment_202403291536_gemini_gemini-1-0-pro_en-US.yaml', + '../experiment_configurations/experiment_202404011622_qwen-max-1201_zh-CN.yaml', + '../experiment_configurations/experiment_202404051719_gpt-4-0125-preview_en-US.yaml', + '../experiment_configurations/experiment_202404102325_qwen-max-1201_zh-CN.yaml', + '../experiment_configurations/experiment_202404201136_vertex_ai_gemini-1-5-pro_en-US.yaml', + '../experiment_configurations/experiment_202404201344_vertex_ai_gemini-1-5-pro-preview-0409_en-US.yaml', + '../experiment_configurations/experiment_202405012311_qwen-max-0403_zh-CN.yaml', + '../experiment_configurations/experiment_202405162215_vertex_ai_gemini-1-5-pro-preview-0409_en-US.yaml', + '../experiment_configurations/experiment_202405162248_qwen-max-0403_zh-CN.yaml', + '../experiment_configurations/experiment_202405162244_qwen-max-0403_zh-CN.yaml', + '../experiment_configurations/experiment_202405242125_gpt-4o-2024-05-13_en-US.yaml', + '../experiment_configurations/experiment_202405281300_replicate_meta_meta-llama-3-70b-instruct_en-US.yaml', + '../experiment_configurations/experiment_202405291053_vertex_ai_claude-3-opus@20240229_en-US.yaml', + '../experiment_configurations/experiment_202406040141_qwen-max-0428_en-US.yaml', + '../experiment_configurations/experiment_202408291204_gpt-4o-2024-08-06_en-US.yaml', + '../experiment_configurations/experiment_202408310828_vertex_ai_claude-3-5-sonnet@20240620_en-US.yaml', + '../experiment_configurations/experiment_202409102304_fireworks_ai_accounts_fireworks_models_llama-v3p1-405b-instruct_en-US.yaml', + '../experiment_configurations/experiment_202409211350_qwen-max-2024-09-19_en-US.yaml', +] + +# + +prompt_template_list = list() + +for x in configuration_list: + c = yaml.safe_load(open(x, 'r')) + p = pd.DataFrame.from_records(c['variations'][1]['variations']) + prompt_template_list.append(p) +# - + +all_prompt_templates = pd.concat(prompt_template_list, ignore_index=True) + +all_prompt_templates = all_prompt_templates.drop_duplicates(subset=['value']) + +all_prompt_templates_mapping = all_prompt_templates.set_index('value')['variation_id'].to_dict() + +for k, v in all_prompt_templates_mapping.items(): + print(k) + print(v) + break + + -summary_human_ai.DataFrame().to_csv('./data/outputs/new_summary_human_ai.csv') +raw_data['prompt_template'].map(all_prompt_templates_mapping).hasnans # should be False +raw_data['prompt_variation_id'] = raw_data['prompt_template'].map(all_prompt_templates_mapping) -# + magic_args="summary_monkey_ai <<" language="sql" + +# + +# next add model_configuration_id +# - + +# all_models_ = %sql select * from all_models where repeat_times = 1 + +all_models_ = all_models_.DataFrame() + +all_models_mapping = all_models_.set_index('model_id')['model_config_id'].to_dict() + +raw_data['model_id'].map(all_models_mapping).hasnans + +raw_data['model_configuration_id'] = raw_data['model_id'].map(all_models_mapping) + +raw_data + + + +# + language="sql" # select -# question_id, -# count(*) as num_of_models, -# mean(diff) as average_diff, -# from -# model_topic_monkey_diff_df -# group by -# question_id -# ORDER BY -# num_of_models desc, -# average_diff desc +# DISTINCT model_id +# from +# raw_data +# where +# 
prompt_variation_id like '%zh%' +# - + + + + + + + + + +# + +# questions and answers mapping +# - + +all_questions.columns + +qs = ai_eval_sheet.questions.data.df.copy() +qs = qs[['question_id', 'language', 'published_version_of_question']] + +qs + +q_dict = qs.set_index(["question_id", "language"])["published_version_of_question"].to_dict() + +# + +ans = ai_eval_sheet.question_options.data.df.copy() +ans_dict = dict() + +for qid, adf in ans.groupby(["question_id", "language"]): + adict = adf.set_index('letter')['question_option'].to_dict() + ans_dict[qid] = adict +# - + +ans_dict[("1", "en-US")] + +q_dict[("1", "en-US")] + + + +# + +# create final output +# - + +all_worse_very_wrong_df + +raw_data.dtypes + +raw_data['experiment_date'] = raw_data['experiment_date'].map(lambda x: str(x)) +raw_data['question_id'] = raw_data['question_id'].map(lambda x: str(x)) +raw_data['model_id'] = raw_data['model_id'].map(lambda x: str(x)) + +raw_data_pl = pl.from_pandas(raw_data) + +# + +raw_output_lst = list() +prompt_lst = list() + + +for _, row in all_worse_very_wrong_df.iterrows(): + question_id = row['question_id'] + model_configuration_id = row['model_configuration_id'] + prompt_variation_id = row['prompt_variation_id'] + # print(question_id, model_configuration_id, prompt_variation_id) + + raw_data_row = raw_data_pl.filter( + (pl.col('question_id') == question_id) & (pl.col('model_configuration_id') == model_configuration_id) & (pl.col('prompt_variation_id') == prompt_variation_id) + ) + + if raw_data_row.is_empty(): + raw_output_lst.append(None) + prompt_lst.append(None) + else: + question_text = raw_data_row['question'].item() + question_id = raw_data_row['question_id'].item() + language = 'zh-CN' if '_zh' in prompt_variation_id else 'en-US' + answers = ans_dict[(question_id, language)] + option_a = answers['A'] + option_b = answers['B'] + option_c = answers['C'] + + prompt_template = raw_data_row['prompt_template'].item() + prompt = prompt_template.format(question_text=question_text, option_a=option_a, option_b=option_b, option_c=option_c) + # print(prompt) + + prompt_lst.append(prompt) + raw_output_lst.append(raw_data_row['raw_output'].item()) + +# - +raw_data_row + +all_worse_very_wrong_df['prompt'] = prompt_lst +all_worse_very_wrong_df['model_output'] = raw_output_lst + +all_worse_very_wrong_df + +all_worse_very_wrong_df.to_csv('./data/outputs/new_ai_worse_sample.csv', index=False) + + + + + + + +# ## Examples for high variance questions + +high_variance_questions_df + +# + language="sql" +# select * from result_to_analyze # - -summary_monkey_ai.DataFrame().to_csv('./data/outputs/new_summary_monkey_ai.csv') + + +question_id = '1792' +model_configuration_id = 'mc039' +grade = 'very_wrong' + + +# + magic_args="--save grade_example" language="sql" +# +# select * from +# ( +# select * from result_to_analyze +# where +# question_id = '{{question_id}}' +# and model_configuration_id = '{{model_configuration_id}}' +# and result = '{{grade}}' +# ) +# using sample 1 +# - + +def filter_grade(question_id, model_configuration_id, grade): + # res = %sql select * from (select * from result_to_analyze where question_id = '{{question_id}}' and model_configuration_id = '{{model_configuration_id}}' and result = '{{grade}}') using sample 1 + return res + + +filter_grade(question_id, model_configuration_id, grade) + + + + + +# + +correct_lst = list() +wrong_lst = list() +very_wrong_lst = list() +correct_prompt_lst = list() +wrong_prompt_lst = list() +very_wrong_prompt_lst = list() + +output_lists = [correct_lst, 
wrong_lst, very_wrong_lst] +prompt_lists = [correct_prompt_lst, wrong_prompt_lst, very_wrong_prompt_lst] + +for _, row in high_variance_questions_df.iterrows(): + question_id = row['question_id'] + model_configuration_id = row['model_configuration_id'] + # prompt_variation_id = row['prompt_variation_id'] + # print(question_id, model_configuration_id) + + examples = list() + for g in ['correct', 'wrong', 'very_wrong']: + grade = g + example = filter_grade(question_id, model_configuration_id, grade) + # print(example) + if len(example) > 0: + e = next(example.dicts()) + assert e['result'] == grade + examples.append(e) + else: + examples.append(None) + + for i, e in enumerate(examples): + if e: + prompt_variation_id = e['prompt_variation_id'] + raw_data_row = raw_data_pl.filter( + (pl.col('question_id') == question_id) + & (pl.col('model_configuration_id') == model_configuration_id) + & (pl.col('prompt_variation_id') == prompt_variation_id) + ) + if raw_data_row.is_empty(): + print(question_id, model_configuration_id, prompt_variation_id) + output_lists[i].append(None) + prompt_lists[i].append(None) + continue + question_text = raw_data_row['question'].item() + language = 'zh-CN' if '_zh' in prompt_variation_id else 'en-US' + answers = ans_dict[(question_id, language)] + option_a = answers['A'] + option_b = answers['B'] + option_c = answers['C'] + prompt_template = raw_data_row['prompt_template'].item() + prompt = prompt_template.format(question_text=question_text, option_a=option_a, option_b=option_b, option_c=option_c) + output_lists[i].append(raw_data_row['raw_output'].item()) + prompt_lists[i].append(prompt) + else: + output_lists[i].append(None) + prompt_lists[i].append(None) + +# - +prompt_lists[0][0] + +prompt_lists[1][0] + + + + + + + +high_variance_questions_df['correct_prompt_example'] = prompt_lists[0] +high_variance_questions_df['correct_answer_example'] = output_lists[0] +high_variance_questions_df['wrong_prompt_example'] = prompt_lists[1] +high_variance_questions_df['wrong_answer_example'] = output_lists[1] +high_variance_questions_df['very_wrong_prompt_example'] = prompt_lists[2] +high_variance_questions_df['very_wrong_answer_example'] = output_lists[2] + +high_variance_questions_df + +high_variance_questions_df.to_csv('./data/outputs/new_high_variance_questions_sample.csv', index=False) + + @@ -794,16 +929,6 @@ -# ## Question vs Prompt Family - -# + magic_args="question_prompt_family_stat << " language="sql" -# select * from question_prompt_family_stat_all -# - - -question_prompt_family_stat_df = question_prompt_family_stat.DataFrame() - -question_prompt_family_stat_df.to_csv('./data/outputs/new_question_prompt_family_stat.csv') - # # for double checking the evaluators diff --git a/automation-api/yival_experiments/notebooks/upload_to_ai_eval_sheet.py b/automation-api/yival_experiments/notebooks/upload_to_ai_eval_sheet.py index d3453e5..36083e1 100644 --- a/automation-api/yival_experiments/notebooks/upload_to_ai_eval_sheet.py +++ b/automation-api/yival_experiments/notebooks/upload_to_ai_eval_sheet.py @@ -145,7 +145,7 @@ def suggest_language(q_text): # to get the most accurate mapping, we will load the prompts from the experiment files # be sure to change the name cn_exp_config = yaml.safe_load(open('../experiment_configurations/experiment_202405162248_qwen-max-0403_zh-CN.yaml', 'r')) -en_exp_config = yaml.safe_load(open('../experiment_configurations/experiment_202405281300_replicate_meta_meta-llama-3-70b-instruct_en-US.yaml', 'r')) +en_exp_config = 
yaml.safe_load(open('../experiment_configurations/experiment_202409211350_qwen-max-2024-09-19_en-US.yaml', 'r')) assert cn_exp_config['variations'][1]['name'] == 'prompt_template' assert en_exp_config['variations'][1]['name'] == 'prompt_template' @@ -173,13 +173,25 @@ def suggest_language(q_text): (row['model_id'], row['model_params'])] for _, row in result.iterrows()] # update the correctness column with human scores -result['final_score'] = result['human_rating_score'].fillna(result['correctness']) +result['final_score'] = (result['human_rating_score'] + .fillna(result['vertex_ai_evaluator_correctness']) + .fillna(result['gpt4_evaluator_correctness']) +) + +result[pd.isnull(result["final_score"])] # this sould be empty # counting # let's use polars from now result = pl.DataFrame(result) result +# concat all evaluation results as list +result = result.with_columns( + pl.concat_list(pl.col(['gpt4_evaluator_gpt4_eval_correctness', + 'vertex_ai_evaluator_gemini_eval_correctness', + 'vertex_ai_evaluator_claude_eval_correctness',])).alias("evaluation_results") +) + # + # result.group_by( # ['question_id', 'language', 'prompt_variant_id', 'model_conf_id'] @@ -197,14 +209,12 @@ def suggest_language(q_text): -result_counts = result.group_by( - ['question_id', 'language', 'prompt_variant_id', 'model_conf_id', 'experiment_date'] -).agg( - pl.col('final_score').filter(pl.col('final_score') == 0).count().alias('fail'), - pl.col('final_score').filter(pl.col('final_score') == 1).count().alias('very_wrong'), - pl.col('final_score').filter(pl.col('final_score') == 2).count().alias('wrong'), - pl.col('final_score').filter(pl.col('final_score') == 3).count().alias('correct'), - pl.col('final_score').count().alias('rounds') +# then calculate the distribution +result_counts = result.with_columns( + pl.col('evaluation_results').list.count_matches(0).alias('fail'), + pl.col('evaluation_results').list.count_matches(1).alias('very_wrong'), + pl.col('evaluation_results').list.count_matches(2).alias('wrong'), + pl.col('evaluation_results').list.count_matches(3).alias('correct'), ) result_counts @@ -212,11 +222,14 @@ def suggest_language(q_text): result_counts['rounds'].max() +# set the number of evaluators +num_of_evaluators = 3 + result_pct = result_counts.with_columns( - pl.col('fail') / pl.col('rounds') * 100, - pl.col('very_wrong') / pl.col('rounds') * 100, - pl.col('wrong') / pl.col('rounds') * 100, - pl.col('correct') / pl.col('rounds') * 100, + pl.col('fail') / num_of_evaluators * 100, + pl.col('very_wrong') / num_of_evaluators * 100, + pl.col('wrong') / num_of_evaluators * 100, + pl.col('correct') / num_of_evaluators * 100, ) result_pct @@ -234,13 +247,31 @@ def get_grade(dictionary): result_full = result_pct.with_columns( pl.struct(pl.col(['fail', 'very_wrong', 'wrong', 'correct'])).map_elements(get_grade).alias('result'), + pl.lit(1).alias('rounds') ) + +# then if we have human ratings, update the results. 
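# Illustration of the override implemented just below (hypothetical helper, not
# part of the notebook): a non-null human rating wins over the grade derived
# from the three evaluators, while rows without a rating keep the computed
# result. The rating scale is assumed to match the evaluator scores used above:
# 0 = fail, 1 = very_wrong, 2 = wrong, 3 = correct.
HUMAN_SCORE_TO_GRADE = dict(enumerate(['fail', 'very_wrong', 'wrong', 'correct']))


def apply_human_override(result: str, human_rating_score):
    if human_rating_score is None:
        return result
    return HUMAN_SCORE_TO_GRADE[int(human_rating_score)]


# apply_human_override('wrong', 3)    -> 'correct'
# apply_human_override('wrong', None) -> 'wrong'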
+result_full = result_full.with_columns( + pl.col('human_rating_score').replace( + dict(enumerate(['fail', 'very_wrong', 'wrong', 'correct'])) + ).fill_null(pl.col('result')).alias('result') +) + + result_full result_full_df = result_full.to_pandas() result_full_df.columns + +result_full_df = result_full_df.loc[:, + [ + 'question_id', 'language', 'prompt_variant_id', 'model_conf_id', 'experiment_date', + 'fail', 'very_wrong', 'wrong', 'correct', 'rounds', 'result', + ] +] + result_full_df.columns = ['question_id', 'language', 'prompt_variation_id', 'model_configuration_id', 'last_evaluation_datetime', 'percent_eval_failed', 'percent_very_wrong', 'percent_wrong', diff --git a/automation-api/yival_experiments/scripts/generate_result.py b/automation-api/yival_experiments/scripts/generate_result.py index c978830..3ca1e9d 100644 --- a/automation-api/yival_experiments/scripts/generate_result.py +++ b/automation-api/yival_experiments/scripts/generate_result.py @@ -24,6 +24,28 @@ # In this script, we store all responses into an excel file. output_dir = current_script_path / "../output" +option_score_mapping = {"Correct": 3, "Wrong": 2, "Very Wrong": 1} + + +def exact_match_correctness(answer, options, correctness): + option_occurance = [0, 0, 0] + scores = [option_score_mapping[x] for x in correctness] + for i, o in zip(range(3), options): + if o.strip().lower() in answer.strip().lower(): + option_occurance[i] = 1 + if sum(option_occurance) == 1: + score = scores[option_occurance.index(1)] + else: + score = 0 + + return score + + +def extract_correct_answer(options, correctness): + for t, c in zip(options, correctness): + if c == "Correct": + return t + if __name__ == "__main__": output_list = [] @@ -49,6 +71,10 @@ option_b_correctness, option_c_correctness, ] + auto_mark_correctness = exact_match_correctness( + answer, options, correctness + ) + correct_answer = extract_correct_answer(options, correctness) result_dict = dict( experiment_date=expr_date, question_id=str(result.input_data.content["question_id"]), @@ -57,6 +83,8 @@ prompt_template=result.combination["prompt_template"], question=result.input_data.content["question_text"], raw_output=result.raw_output.text_output, + correct_answer=correct_answer, + auto_mark_correctness=auto_mark_correctness, ) for eval_output in result.evaluator_outputs: col_name = f"{eval_output.name}_{eval_output.display_name}"
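For reference, a minimal usage sketch of the exact-match helpers added in generate_result.py above. The option texts and answers are hypothetical, and it assumes the two functions are importable from the script:

options = ["20 percent", "50 percent", "80 percent"]
correctness = ["Wrong", "Very Wrong", "Correct"]

# exactly one option string occurs in the answer -> that option's score (3 = Correct)
exact_match_correctness("The answer is 80 percent.", options, correctness)  # -> 3
# none (or more than one) of the options occur -> 0, i.e. counted as indecisive
exact_match_correctness("Roughly four out of five.", options, correctness)  # -> 0
# the reference answer text recorded alongside each result row
extract_correct_answer(options, correctness)  # -> "80 percent"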