-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathtsv_to_json.py
75 lines (70 loc) · 2.6 KB
/
tsv_to_json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import json
from tqdm import tqdm
import os
def get_image_file_location(root, row):
    """Return the image path for one benchmark row, or None if it has no image.

    Args:
        root: Directory that the returned path is joined onto.
        row: Mapping with the string fields ``visual_input``, ``category``,
            ``subcategory``, ``set_id`` and ``figure_id``.

    Returns:
        ``<root>/<category>/<subcategory>/<set_id>_<figure_id>.png``, or
        ``None`` when ``visual_input`` parses to 0.

    Raises:
        KeyError: If ``row`` lacks one of the required fields.
        ValueError: If ``visual_input`` is not an integer literal.
    """
    if int(row['visual_input']) == 0:
        return None
    # TSV cells may carry surrounding whitespace.  Strip every component
    # before it becomes part of the path: padding on set_id / figure_id would
    # otherwise end up *inside* the file name, where a later strip of the
    # finished path cannot remove it.
    category, subcategory, set_id, figure_id = (
        row[key].strip()
        for key in ('category', 'subcategory', 'set_id', 'figure_id')
    )
    img_file = set_id + "_" + figure_id + ".png"
    return os.path.join(root, category, subcategory, img_file)
# Maps each exported field name to its zero-based column position in the TSV.
# The position of a name in the tuple below IS its column index, so the order
# must not be changed.  Columns 10-13 (gpt4v_output, gpt4v_output_human_check,
# llava_1_5_output, llava_1_5_output_human_check) are not mapped.
col_idx = {
    name: position
    for position, name in enumerate((
        'category',
        'subcategory',
        'visual_input',
        'set_id',
        'figure_id',
        'sample_note',
        'question_id',
        'question',
        'gt_answer_details',
        'gt_answer',
    ))
}
import csv
import json
# Convert HallusionBench.tsv into HallusionBench.json.
#
# Every TSV row whose first cell contains "VD" or "VS" becomes one record
# holding the columns listed in col_idx plus a "filename" entry.  Records are
# written VS rows first, then VD rows, with surrounding whitespace stripped
# from every value.
data_vd = []
data_vs = []
root_dir = "."
input_file_name = 'HallusionBench.tsv'
# newline="" is what the csv module asks for so that newlines embedded in
# quoted cells are parsed correctly.
with open(input_file_name, newline="", encoding="utf-8") as file:
    tsv_file = csv.reader(file, delimiter="\t")
    for line in tsv_file:
        # csv.reader yields an empty list for a blank line; indexing it
        # would raise IndexError.
        if not line:
            continue
        # Substring test (not equality) so that a padded category cell is
        # still recognised.  The same test decides the target list below, so
        # filtering and routing can never disagree.
        is_vd = "VD" in line[0]
        if not is_vd and "VS" not in line[0]:
            continue
        try:
            data_dict = {k: line[v] for k, v in col_idx.items()}
            gt_answer = int(data_dict["gt_answer"])
        except (IndexError, ValueError) as err:
            # Too few columns, or a gt_answer that is not an integer.
            raise ValueError(
                "%s line %d is malformed: %r"
                % (input_file_name, tsv_file.line_num, line)
            ) from err
        # Explicit check instead of assert: asserts vanish under ``python -O``.
        if gt_answer not in (0, 1, 2):
            raise ValueError(
                "%s line %d: gt_answer must be 0, 1 or 2, got %r"
                % (input_file_name, tsv_file.line_num, data_dict["gt_answer"])
            )
        data_dict["filename"] = get_image_file_location(root_dir, data_dict)
        if is_vd:
            data_vd.append(data_dict)
        else:
            data_vs.append(data_dict)
result = data_vs + data_vd
print(len(result))
# Whitespace-stripped copy of every record.  "filename" is None for rows
# without an image and is passed through unchanged.
result1 = [
    {
        "category": rec['category'].strip(),
        "subcategory": rec['subcategory'].strip(),
        "visual_input": rec['visual_input'].strip(),
        "set_id": rec['set_id'].strip(),
        "figure_id": rec['figure_id'].strip(),
        "sample_note": rec['sample_note'].strip(),
        "question_id": rec['question_id'].strip(),
        "question": rec['question'].strip(),
        "gt_answer": rec['gt_answer'].strip(),
        "gt_answer_details": rec["gt_answer_details"].strip(),
        "filename": rec['filename'].strip() if rec['filename'] else rec['filename'],
    }
    for rec in result
]
print(len(result1))
# Write the cleaned records; previously the unstripped list was dumped and
# the cleaned copy was thrown away.
with open('./HallusionBench.json', 'w', encoding="utf-8") as f:
    json.dump(result1, f)