update scripts and notebooks

Gapminder · Nov 21, 2024 · b3abfa9 · b3abfa9
1 parent 7d77230
commit b3abfa9
Show file tree

Hide file tree

Showing 4 changed files with 533 additions and 329 deletions.
diff --git a/automation-api/yival_experiments/notebooks/compare_evaluators.py b/automation-api/yival_experiments/notebooks/compare_evaluators.py
@@ -12,8 +12,12 @@
 conn = duckdb.connect()
 
 
-conn.query("select * from df")
+simple_eval_check = conn.query("select * from df where simple_evaluator_matching <> auto_mark_correctness")
+simple_eval_check
+simple_eval_check.to_csv("./simple_eval_check.csv")
 
+
+# NEXT: review the query below, then start checking its results.
 q = """select
   *
 from
@@ -22,19 +26,35 @@
   not (
     llama3_evaluator_correctness = vertex_ai_evaluator_correctness
     and gpt4_evaluator_correctness = vertex_ai_evaluator_correctness
+  )
+  or (
+    auto_mark_correctness <> 0
+    and (
+      llama3_evaluator_correctness = vertex_ai_evaluator_correctness
+      and gpt4_evaluator_correctness = vertex_ai_evaluator_correctness
+    )
+    and auto_mark_correctness <> gpt4_evaluator_correctness 
+  )
+  or (
+    simple_evaluator_matching <> 0
     and (
-      simple_evaluator_matching <> 0
-      and simple_evaluator_matching = gpt4_evaluator_correctness
+      llama3_evaluator_correctness = vertex_ai_evaluator_correctness
+      and gpt4_evaluator_correctness = vertex_ai_evaluator_correctness
     )
+    and simple_evaluator_matching <> gpt4_evaluator_correctness 
   )
-  and simple_evaluator_matching <> 0"""
+  
+  """
 
 diffs = conn.query(q)
 
 conn.query("select count(*) from df")
 conn.query("select count(*) from diffs")
 
-diffs.to_csv("to_check.csv")
+diffs.to_csv("to_check_all.csv")
+
+410 / 30780
 
-1281 / 30780
+# FIXME: the simple evaluator does not seem to work very well.
+# Consider using only the exact-matching algorithm instead.