-
Notifications
You must be signed in to change notification settings - Fork 5
/
json2spreadsheet.py
194 lines (159 loc) · 7.25 KB
/
json2spreadsheet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
from oauth2client.service_account import ServiceAccountCredentials
import gspread
import json
import glob
import re
import os
import pandas as pd
import html
# OAuth scopes requested for the service account that talks to Google Sheets.
scope = ['https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']
# The service-account key is read from 'api_key.json' in the working directory.
credentials = ServiceAccountCredentials.from_json_keyfile_name(
    'api_key.json', scope)
gc = gspread.authorize(credentials)

# Name of the Google spreadsheet you created, and the worksheet inside it.
sh = gc.open('반려동물_데이터셋생성')
sheet = sh.worksheet('시트1')

# --- tagtog export ---------------------------------------------------------
# Root folder of the export.
folder_name = "./"

# Context (document-group) folder names found in the annotation pool.
# Only non-hidden directories are kept: stray entries such as '.DS_Store'
# would otherwise be paired with the wrong folder (or fail in os.listdir)
# when the three lists below are zipped together.
_pool_dir = folder_name + "ann.json/master/pool"
context_name_list = [
    name for name in os.listdir(_pool_dir)
    if not name.startswith(".") and os.path.isdir(os.path.join(_pool_dir, name))
]

# Relation (annotation) folders and context (plain HTML) folders.  Both are
# derived from context_name_list so that index i of every list refers to the
# same context folder.
relation_folder_paths = [folder_name + "ann.json/master/pool/" + c for c in context_name_list]
contexts_folders_paths = [folder_name + "plain.html/pool/" + c for c in context_name_list]

# Annotation legend: maps tagtog class ids to their names.
annotation_legend_path = folder_name + "annotations-legend.json"
with open(annotation_legend_path, "r", encoding="utf-8") as f:
    annotation_legend = json.load(f)
def chg_num2number_of_members(label):
    """Map any label containing ``num`` to ``number_of_members``.

    Labels that do not contain the substring ``num`` are returned unchanged.
    """
    return "number_of_members" if 'num' in label else label
def get_needed_relation_data(tmp_relation):
    """Return the ``(subject, object)`` entity records of a tagtog annotation.

    The legend name of the first relation is looked up in the module-level
    ``annotation_legend``; the text inside its parentheses is split on ``|``
    and the first token is taken as the class id of the subject entity.

    Args:
        tmp_relation: Parsed ``ann.json`` dict.  It must contain exactly two
            items under ``'entities'`` and at least one under ``'relations'``.

    Returns:
        A 2-tuple of dicts, each with the keys ``start``, ``end``, ``text``
        and ``type`` (in that order).  The pair is swapped when the subject's
        type is ``upper_group``.

    Raises:
        KeyError, IndexError, ValueError: If the annotation or the legend
            does not have the expected shape.
    """
    relation_name = annotation_legend[tmp_relation["relations"][0]['classId']]
    # Raw string, and a single (.+) instead of the nested (.+)+: the capture
    # is the same on a match, but a name without ')' no longer triggers
    # exponential backtracking.
    subject_token = re.findall(r"\(+(.+)\)", relation_name)[0].split("|")[0]

    first_entity, second_entity = tmp_relation['entities']
    if subject_token == first_entity['classId']:
        sub_entity, obj_entity = first_entity, second_entity
    else:
        sub_entity, obj_entity = second_entity, first_entity

    def _get_entity(entity):
        """Flatten one tagtog entity into a start/end/text/type record."""
        first_offset = entity['offsets'][0]
        start = first_offset['start']
        text = first_offset['text']
        return {
            'start': start,
            # Inclusive index of the entity's last character.
            'end': start + len(text) - 1,
            'text': text,
            # The second '-'-separated part of the legend name is the type.
            'type': chg_num2number_of_members(
                annotation_legend[entity['classId']].split("-")[1].lower()),
        }

    output_subject = _get_entity(sub_entity)
    output_object = _get_entity(obj_entity)

    # Guideline change made during data creation: upper_group was dropped in
    # favour of sub_group, so the roles of the two entities are swapped.
    if output_subject['type'] == 'upper_group':
        return output_object, output_subject
    return output_subject, output_object
# Build the class label of a sentence from its relation annotation.
def get_label(relation_json):
    """Return the class label for the first relation of a tagtog annotation.

    The relation's legend name is looked up in the module-level
    ``annotation_legend``; the text between its opening parenthesis and the
    last ``|`` is used as a key back into the legend, and that entry is split
    on ``-``.

    Args:
        relation_json: Parsed ``ann.json`` dict with at least one item under
            ``'relations'``.

    Returns:
        ``"no_relation"`` when the legend entry has only two parts or its
        label is ``no_relation``; otherwise ``"<sub_type>:<label>"``.

    Raises:
        KeyError, IndexError: If a lookup in the annotation or legend fails.
        ValueError: If the legend entry has neither two nor three parts.
    """
    label_tag = relation_json['relations'][0]['classId']  # e.g. r_6
    # Raw string and a single (.+) instead of (.+)+: same capture on a match,
    # without exponential backtracking when no '|' follows.
    subject_key = re.findall(r"\(+(.+)\|", annotation_legend[label_tag])[0]
    parts = annotation_legend[subject_key].split("-")

    if len(parts) == 2:
        # The entry carries no label component: fall back to no_relation.
        print("Change to no_relation.")
        return "no_relation"
    if len(parts) != 3:
        raise ValueError(
            f"unexpected legend entry {annotation_legend[subject_key]!r} "
            f"for {subject_key!r}")

    _, sub_type, label = parts
    # num -> number_of_members
    label = chg_num2number_of_members(label)
    if label == 'no_relation':
        return label
    if label == 'number_of_members':
        # Guideline change made during data creation: ORG-NOH -> ANM-NOH.
        sub_type = 'anm'
    elif label == 'upper_group':
        # Guideline change made during data creation: upper_group was
        # dropped and replaced by sub_group.
        label = 'sub_group'
    return f"{sub_type}:{label}"
def get_context_from_html(html_file):
    """Extract the sentence text from the ``<pre>`` element of an HTML string.

    Args:
        html_file: The HTML document as a string (its content, not a path).

    Returns:
        The text of the last ``<pre>`` element, with newlines removed and
        HTML entities (``&quot;``, ``&gt;`` ...) decoded.

    Raises:
        IndexError: If the document contains no ``<pre>`` element.
    """
    flattened = html_file.replace("\n", "")
    # Match on the still-escaped markup and decode afterwards.  Decoding
    # first turns '&gt;' inside the sentence into a literal '>', which a
    # greedy tag pattern then mistakes for the end of the opening tag and
    # truncates the sentence.
    pre_bodies = re.findall(r"<pre\b[^>]*>(.*?)</pre>", flattened)
    if not pre_bodies:
        raise IndexError("no <pre> element found in the HTML document")
    return html.unescape(pre_bodies[-1])
def get_sentence_with_entites(subject_entity, object_entity, sentence, label):
    """Return ``sentence`` with both entities wrapped in role markers.

    The subject span is rewritten as ``<sbj:...>`` and the object span as
    ``<obj:...>``.  Spans are taken from each entity's ``'start'`` offset
    and the length of its ``'text'``.  When the subject starts strictly
    before the object it is treated as the first span; otherwise the object
    is.  ``label`` is accepted but not used.
    """
    subject_first = subject_entity['start'] < object_entity['start']
    if subject_first:
        first, second = subject_entity, object_entity
    else:
        first, second = object_entity, subject_entity

    # Span boundaries of the earlier and the later entity.
    first_begin = first['start']
    first_end = first_begin + len(first['text'])
    second_begin = second['start']
    second_end = second_begin + len(second['text'])

    # Cut the sentence into head / entity / middle / entity / tail.
    head = sentence[:first_begin]
    first_text = sentence[first_begin:first_end]
    between = sentence[first_end:second_begin]
    second_text = sentence[second_begin:second_end]
    tail = sentence[second_end:]

    if subject_first:
        first_marked = f"<sbj:{first_text}>"
        second_marked = f"<obj:{second_text}>"
    else:
        first_marked = f"<obj:{first_text}>"
        second_marked = f"<sbj:{second_text}>"

    return head + first_marked + between + second_marked + tail
# Columns collected for the spreadsheet:
#   sentence without entity markers
#   sentence with entity markers
#   subject_entity
#   object_entity
#   class
id_list = []
sentence_list = []
sentence_with_entities_list = []
subject_entity_list = []
object_entity_list = []
relation_list = []

# Convert the tagtog export into parallel, row-wise lists.
for context_name, relation_folder, contexts_folder in zip(context_name_list, relation_folder_paths, contexts_folders_paths):
    # Each annotation file "<id>.txt.ann.json" is paired with the context
    # file "<id>.txt.plain.html" that carries the same id.
    file_ids = [file_name.split(".txt.")[0] for file_name in os.listdir(relation_folder)]

    for file_id in file_ids:
        relation_file = relation_folder + "/" + file_id + ".txt.ann.json"
        context_file = contexts_folder + "/" + file_id + ".txt.plain.html"

        # Read the relation annotation (JSON) and extract subject/object.
        with open(relation_file, "r", encoding="utf-8") as f:
            relation_json = json.load(f)
        try:
            tmp_subject, tmp_object = get_needed_relation_data(relation_json)
            tmp_label = get_label(relation_json)
        except Exception:
            # Best effort: skip annotations that cannot be interpreted, but
            # let KeyboardInterrupt / SystemExit through.
            print(f"Can't get relations.{relation_file}")
            continue

        # Read the context (HTML) and build both sentence variants.
        with open(context_file, "r", encoding="utf-8") as f:
            context_html = f.read()
        tmp_sentence = get_context_from_html(context_html)
        tmp_sentence_w_entities = get_sentence_with_entites(tmp_subject, tmp_object, tmp_sentence, tmp_label)

        # Store one value per column list.
        id_list.append(f"{context_name}")
        sentence_list.append(tmp_sentence)
        sentence_with_entities_list.append(tmp_sentence_w_entities)
        subject_entity_list.append(tmp_subject)
        object_entity_list.append(tmp_object)
        relation_list.append(tmp_label.lower())
# Write the collected rows to the Google sheet.
# The sheet must already have a header row containing every name in
# column_list; the current contents are only read to report how many rows
# exist before the upload.
values = sheet.get_all_values()
header, rows = values[0], values[1:]
data = pd.DataFrame(rows, columns=header)
column_list = ["sentence", "sentence_with_entity", "subject_entity", "object_entity", "class"]
data = data[column_list]
sen_list = list(data.sentence_with_entity.values)
print(f'sentence list : {len(sen_list)}')

# One header row plus one row per collected sentence, 10 columns wide.
row_count = len(sentence_list)
sheet.resize(row_count + 1, 10)
col_num = len(column_list)

# With no rows the range would be the invalid "a2:e1", so there is nothing
# to write after the resize.
if row_count:
    # Columns a-e correspond to the five entries of column_list.
    list_range = f"a2:e{row_count+1}"
    cell_list = sheet.range(list_range)
    print(f'c : {len(cell_list)}, {col_num}')

    new_rows = zip(
        sentence_list,
        sentence_with_entities_list,
        map(str, subject_entity_list),
        map(str, object_entity_list),
        relation_list,
    )
    # The index arithmetic assumes cell_list is ordered row by row, with
    # col_num cells per row.
    for row_index, row_values in enumerate(new_rows):
        first_cell = col_num * row_index
        for column_offset, cell_value in enumerate(row_values):
            cell_list[first_cell + column_offset].value = cell_value

    sheet.update_cells(cell_list)