modified merge script

openml-labs · Jul 12, 2024 · 8800c96 · 8800c96
1 parent e29d229
commit 8800c96
Show file tree

Hide file tree

Showing 3 changed files with 71 additions and 9 deletions.
diff --git a/.gitignore b/.gitignore
@@ -17,3 +17,4 @@ tools/data/*
 *.pid
 tools/data/
 tools/data/*
+.idea/
diff --git a/tools/consistence_eval.py b/tools/consistence_eval.py
@@ -0,0 +1,45 @@
+import json
+import numpy as np
+from sklearn.metrics import cohen_kappa_score
+import pandas as pd
+
+# Collect the topic names from the query sheet, trimmed and lower-cased so
+# they can be compared against normalised labels.
+topic_path = "data/LLM Evaluation - Topic Queries.csv"
+df = pd.read_csv(topic_path)
+CLS = [topic.strip().lower() for topic in df['Topic'].unique().tolist()]
+
+# Load the two label files whose agreement is measured.
+with open('data/merged_labels_1.json', 'r') as f:
+    labels_1 = json.load(f)
+with open('data/merged_labels_2.json', 'r') as f:
+    labels_2 = json.load(f)
+
+
+# Create label matrix
+def create_label_matrix(labels, CLS):
+    """Build a binary sample-by-category indicator matrix.
+
+    Args:
+        labels: Mapping from sample id to an iterable of label strings.
+        CLS: Ordered list of category names; column ``j`` of the matrix
+            corresponds to ``CLS[j]``. Labels are stripped and lower-cased
+            before lookup, so entries here must already be in that form to
+            be matched.
+
+    Returns:
+        A tuple ``(ids, label_matrix)``: ``ids`` is the sorted list of sample
+        ids and ``label_matrix[i, j]`` is 1 when sample ``ids[i]`` carries
+        category ``CLS[j]``, else 0. Labels not found in ``CLS`` are ignored.
+    """
+    # Map every category to its first column once, instead of scanning CLS
+    # twice (``in`` + ``index``) for each label.
+    column_of = {}
+    for column, name in enumerate(CLS):
+        column_of.setdefault(name, column)
+
+    ids = sorted(labels.keys())
+    label_matrix = np.zeros((len(ids), len(CLS)))
+    for row, sample_id in enumerate(ids):
+        for label in labels[sample_id]:
+            column = column_of.get(label.strip().lower())
+            if column is not None:
+                label_matrix[row, column] = 1
+    return ids, label_matrix
+
+
+# Generate label matrices for both files
+ids_1, label_matrix_1 = create_label_matrix(labels_1, CLS)
+ids_2, label_matrix_2 = create_label_matrix(labels_2, CLS)
+
+# The matrices are compared row by row, so both files must cover the same
+# ids. Raise explicitly: an ``assert`` is stripped when Python runs with -O.
+if ids_1 != ids_2:
+    raise ValueError("IDs in both files do not match")
+
+# Calculate Cohen's Kappa for each category (one binary column per category)
+kappas = [
+    cohen_kappa_score(column_1, column_2, labels=[0, 1])
+    for column_1, column_2 in zip(label_matrix_1.T, label_matrix_2.T)
+]
+
+# Kappa is undefined (0/0) and can come back as NaN for a category on which
+# both files hold one identical value for every id, e.g. a topic neither
+# file assigns. Average over the defined categories only, so a single such
+# column does not turn the whole result into NaN.
+average_kappa = np.nanmean(kappas)
+print(f"Cohen's Kappa: {average_kappa}")
diff --git a/tools/query_csv_to_json.py b/tools/query_csv_to_json.py
@@ -21,30 +21,46 @@ def read_query_csv_and_convert_to_json(file_path: str):
 
 
 def merge_dict_and_old_json_and_save(
-    dict1: dict,
-    file_path_2: str = "data/labels.json",
-    file_path_save: str = "data/merged_labels.json",
+        dict1: dict,
+        file_path_2: str = "data/labels.json",
+        file_path_save: str = "data/merged_labels.json",
+        conflict: bool = False,
 ):
-
+    """Merge ``dict1`` with the labels stored in a JSON file and save the result.
+
+    Args:
+        dict1: Mapping from key to a list of labels. Updated in place.
+        file_path_2: JSON file with previously saved labels. When it does not
+            exist, ``dict1`` is saved unchanged.
+        file_path_save: Path the merged mapping is written to as JSON; its
+            parent directory is created if needed.
+        conflict: For keys present on both sides, keep only the labels common
+            to both (``True``) or the de-duplicated union (``False``).
+
+    Returns:
+        ``dict1`` after merging. Keys that exist only in the file are copied
+        over as-is.
+    """
     # json_dict = json.dumps(dict1)
     if os.path.exists(file_path_2):
         with open(file_path_2, "r") as f:
             labels = json.load(f)
 
         for key in labels:
             if key in dict1:
-                dict1[key] = dict1[key].extend(labels[key])
+                if not conflict:
+                    # list.extend() returns None, so build the union from
+                    # sets instead of wrapping the extend() call.
+                    dict1[key] = list(set(dict1[key]) | set(labels[key]))
+                else:
+                    intersection = list(set(dict1[key]).intersection(set(labels[key])))
+                    dict1[key] = intersection
+            else:
+                dict1[key] = labels[key]
 
     # make evaluation dir
     os.makedirs(os.path.dirname(file_path_save), exist_ok=True)
 
     with open(file_path_save, "w") as f:
         json.dump(dict1, f)
+    return dict1
+
 
+# Convert the topic-query CSV into a label dict and merge it with the
+# function's default old-labels file, using the default conflict=False.
+# The merged result is written to save_path and kept in dict1.
+file = "data/LLM Evaluation - Topic Queries.csv"
+dict_ids = read_query_csv_and_convert_to_json(file)
+save_path = "data/merged_labels.json"
+dict1 = merge_dict_and_old_json_and_save(
+    dict_ids, file_path_save=save_path
+)
 
-file_paths = ["data/LLM Evaluation - Topic Queries.csv"]
+# Merge each additional JSON label file into the running result with
+# conflict=True, so keys present on both sides keep only the labels the two
+# sides share. Every pass overwrites save_path with the latest result.
+file_paths = ["data/labels.json"]
 for file in file_paths:
-    dict_ids = read_query_csv_and_convert_to_json(file)
-    merge_dict_and_old_json_and_save(
-        dict_ids, file_path_save="../data/evaluation/merged_labels.json"
+    print(f"Merging {file}")
+    dict1 = merge_dict_and_old_json_and_save(
+        dict1, file, file_path_save=save_path, conflict=True
     )
-Original file line number
+Diff line change
@@ Expand Up / @@ -17,3 +17,4 @@ tools/data/* @@
     *.pid
     tools/data/
     tools/data/*
+    .idea/