"""Compare evaluator verdicts on llama results and export the disagreements.

Originally added as
automation-api/yival_experiments/notebooks/compare_evaluators.py.

Steps:
1. Load the experiment results from ``../output/results.parquet``.
2. Keep only rows whose ``model_id`` contains "llama".
3. Select the rows where the evaluators do not all agree (see the query
   below) and write them to ``to_check.csv`` for manual inspection.
4. Report the share of filtered rows that need checking.

Blank-line-separated paragraphs and bare trailing expressions are kept so
the file still reads as notebook cells.
NOTE(review): presumably this is consumed as a paragraph-per-cell notebook
script (e.g. jupytext light format) - confirm.
"""

import duckdb
import polars as pl


# The path is relative to the current working directory.
results = pl.read_parquet("../output/results.parquet")

results.columns

# Keep only rows whose model_id contains "llama".
df = results.filter(pl.col("model_id").str.contains("llama"))


conn = duckdb.connect()


# `df` is not registered explicitly: DuckDB resolves the table name from the
# Python variable of the same name (replacement scan).
conn.query("select * from df")

# A row is selected when simple_evaluator_matching is non-zero and it is NOT
# the case that llama3, gpt4 and vertex_ai agree with each other and with
# simple_evaluator_matching.
# NOTE(review): comparisons against NULL follow SQL three-valued logic and
# may exclude rows - confirm the evaluator columns are never NULL.
q = """select
    *
from
    df
where
    not (
        llama3_evaluator_correctness = vertex_ai_evaluator_correctness
        and gpt4_evaluator_correctness = vertex_ai_evaluator_correctness
        and (
            simple_evaluator_matching <> 0
            and simple_evaluator_matching = gpt4_evaluator_correctness
        )
    )
    and simple_evaluator_matching <> 0"""

diffs = conn.query(q)

# Fetch the actual counts instead of hand-copying them from displayed output,
# so the ratio below always matches the data that was loaded.
total_count = conn.query("select count(*) from df").fetchone()[0]
diff_count = conn.query("select count(*) from diffs").fetchone()[0]
total_count, diff_count

diffs.to_csv("to_check.csv")

# Share of filtered rows that need checking; 0.0 when the filter matched
# nothing, to avoid a ZeroDivisionError.
diff_ratio = diff_count / total_count if total_count else 0.0
diff_ratio