Skip to content

Commit

Permalink
modified merge script
Browse files Browse the repository at this point in the history
  • Loading branch information
LiinXemmon committed Jul 12, 2024
1 parent e29d229 commit 8800c96
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 9 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ tools/data/*
*.pid
tools/data/
tools/data/*
.idea/
45 changes: 45 additions & 0 deletions tools/consistence_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import json
import numpy as np
from sklearn.metrics import cohen_kappa_score
import pandas as pd

# Get all category names, normalized (strip + lowercase) the same way the
# labels are normalized when they are matched against this list.
topic_path = "data/LLM Evaluation - Topic Queries.csv"
df = pd.read_csv(topic_path)
# Drop empty cells first (NaN has no .strip()), then de-duplicate AFTER
# normalizing: "Foo" and " foo" would otherwise yield two identical category
# names, and the second one could never be matched by a label.
# dict.fromkeys keeps first-appearance order, like Series.unique().
CLS = list(
    dict.fromkeys(str(c).strip().lower() for c in df['Topic'].dropna())
)

# Read the two JSON label files that are compared against each other.
# JSON is UTF-8 by specification, so do not rely on the locale encoding.
with open('data/merged_labels_1.json', 'r', encoding='utf-8') as f:
    labels_1 = json.load(f)
with open('data/merged_labels_2.json', 'r', encoding='utf-8') as f:
    labels_2 = json.load(f)


# Create label matrix
def create_label_matrix(labels, CLS):
    """Build a binary sample-by-category indicator matrix.

    Args:
        labels: Mapping from sample id to an iterable of label strings.
        CLS: Sequence of category names. Column ``j`` of the result
            corresponds to ``CLS[j]``. Labels are compared after
            ``strip().lower()``, so entries of ``CLS`` must already be
            normalized that way to be matched.

    Returns:
        A tuple ``(ids, label_matrix)``. ``ids`` is the sorted list of keys
        of ``labels``; ``label_matrix`` is a float array of shape
        ``(len(ids), len(CLS))`` where entry ``[i, j]`` is 1 when sample
        ``ids[i]`` carries category ``CLS[j]`` and 0 otherwise. Labels that
        are not in ``CLS`` are ignored.
    """
    ids = sorted(labels.keys())

    # Resolve category -> column once instead of scanning CLS for every
    # label. setdefault keeps the FIRST column for a repeated name, which is
    # what list.index() would return.
    column_of = {}
    for col, name in enumerate(CLS):
        column_of.setdefault(name, col)

    label_matrix = np.zeros((len(ids), len(CLS)))
    for row, sample_id in enumerate(ids):
        for label in labels[sample_id]:
            col = column_of.get(label.strip().lower())
            if col is not None:
                label_matrix[row, col] = 1
    return ids, label_matrix


# Generate label matrices for both files
ids_1, label_matrix_1 = create_label_matrix(labels_1, CLS)
ids_2, label_matrix_2 = create_label_matrix(labels_2, CLS)

# Both files must cover the same ids, otherwise the matrix rows would not
# line up. Raise explicitly: an assert is stripped under `python -O`.
if ids_1 != ids_2:
    raise ValueError("IDs in both files do not match")

# Calculate Cohen's Kappa for each category (one matrix column per category)
kappas = [
    cohen_kappa_score(col_1, col_2, labels=[0, 1])
    for col_1, col_2 in zip(label_matrix_1.T, label_matrix_2.T)
]

# Calculate average Cohen's Kappa.
# A category that neither file ever assigns has an undefined kappa (0/0,
# returned as NaN); a plain mean would turn the whole average into NaN, so
# average only over the categories where kappa is defined.
average_kappa = np.nanmean(kappas)
print(f"Cohen's Kappa: {average_kappa}")
34 changes: 25 additions & 9 deletions tools/query_csv_to_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,30 +21,46 @@ def read_query_csv_and_convert_to_json(file_path: str):


def merge_dict_and_old_json_and_save(
    dict1: dict,
    file_path_2: str = "data/labels.json",
    file_path_save: str = "data/merged_labels.json",
    conflict: bool = False,
) -> dict:
    """Merge the labels of a JSON file into ``dict1`` and save the result.

    Args:
        dict1: Mapping from key to a list of labels. It is updated in place.
        file_path_2: Path of a JSON file holding a mapping of the same
            shape. If the file does not exist, nothing is merged.
        file_path_save: Path the merged mapping is written to as JSON. Its
            parent directory is created when missing.
        conflict: How to combine a key present in both mappings. ``False``
            keeps the union of both label lists; ``True`` keeps only the
            labels present in both (the intersection).

    Returns:
        ``dict1`` after merging. Keys found only in the file are copied over
        unchanged in both modes. Combined lists contain each label once, in
        first-seen order.
    """
    if os.path.exists(file_path_2):
        with open(file_path_2, "r", encoding="utf-8") as f:
            labels = json.load(f)

        for key, other_labels in labels.items():
            if key not in dict1:
                dict1[key] = other_labels
            elif conflict:
                shared = set(other_labels)
                dict1[key] = list(
                    dict.fromkeys(v for v in dict1[key] if v in shared)
                )
            else:
                # list.extend() returns None, so its result must not be
                # used; build the de-duplicated union explicitly instead.
                dict1[key] = list(dict.fromkeys([*dict1[key], *other_labels]))

    # make evaluation dir (dirname is "" for a bare file name, and
    # os.makedirs("") raises, so only create it when there is one)
    save_dir = os.path.dirname(file_path_save)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    with open(file_path_save, "w", encoding="utf-8") as f:
        json.dump(dict1, f)
    return dict1


# The first file (fixed): convert the topic-query CSV and save it as the
# initial merged label file.
# NOTE(review): this call leaves file_path_2 at its default
# ("data/labels.json"), so that file is already merged in here (union mode)
# when it exists - confirm this is intended.
file = "data/LLM Evaluation - Topic Queries.csv"
dict_ids = read_query_csv_and_convert_to_json(file)
save_path = "data/merged_labels.json"
dict1 = merge_dict_and_old_json_and_save(
    dict_ids, file_path_save=save_path
)

# More files to merge, handle conflicts: for keys present on both sides only
# the labels found in both are kept (conflict=True), and the running result
# is re-saved to save_path after every file.
file_paths = ["data/labels.json"]
for file in file_paths:
    print(f"Merging {file}")
    dict1 = merge_dict_and_old_json_and_save(
        dict1, file, file_path_save=save_path, conflict=True
    )

0 comments on commit 8800c96

Please sign in to comment.