forked from microsoft/BioGPT
-
Notifications
You must be signed in to change notification settings - Fork 0
/
rebuild_data.py
82 lines (62 loc) · 2.39 KB
/
rebuild_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import os
import sys
import re
import json
# Positional command-line arguments:
#   argv[1] -> directory that holds the *_set.json input files
#   argv[2] -> file-name prefix used for the generated output files
data_dir, prefix = sys.argv[1], sys.argv[2]
def build_source_seq(question, context, long_answer=None):
    """Assemble the source (input) string for one question/context pair.

    ``question`` and ``context`` are stripped of surrounding whitespace and
    placed after the ``question:`` and ``context:`` markers.  When
    ``long_answer`` is truthy it is stripped and appended after an
    ``answer:`` marker; otherwise the result ends with the single space that
    follows the context.
    """
    head = f"question: {question.strip()} context: {context.strip()} "
    if not long_answer:
        # No answer text: the sequence ends right after the context.
        return head
    return f"{head}answer: {long_answer.strip()}"
def build_target_seq(tgt):
    """Wrap the label ``tgt`` in the fixed answer sentence and return it.

    ``tgt`` must be a string; the result is the fixed lead-in phrase, the
    label, and a closing period.
    """
    lead_in = 'the answer to the question given the context is '
    return ''.join((lead_in, tgt, '.'))
def loader(fname, fn, required_long_answer=False):
    """Load a JSON dataset file and turn it into ``[source, target]`` pairs.

    The file must contain a JSON object whose values are records with a
    ``QUESTION`` string and a ``CONTEXTS`` list of strings.  A record may
    also carry ``final_decision`` (the label) and ``LONG_ANSWER``.

    Args:
        fname: Path of the JSON file to read.
        fn: Callable applied to the ``final_decision`` label.  It may return
            a single target or a list of targets; a list yields one pair per
            element, all sharing the same source.
        required_long_answer: When true, the record's ``LONG_ANSWER`` is
            included in the source sequence (and must therefore be present).

    Returns:
        A list of ``[source, target]`` two-element lists.  Records without
        ``final_decision`` get an empty-string target.
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(fname, 'r', encoding='utf-8') as file:
        data = json.load(file)

    ret = []
    for content in data.values():
        question = content['QUESTION']
        context = ' '.join(sen.strip() for sen in content['CONTEXTS'])
        # Collapse every whitespace run (newlines included) into one space.
        context = re.sub(r'\s+', ' ', context)

        if required_long_answer:
            # Only touch LONG_ANSWER when it is actually needed, so records
            # lacking that key still load in the default mode.
            source = build_source_seq(question, context, content['LONG_ANSWER'])
        else:
            source = build_source_seq(question, context)

        if 'final_decision' in content:
            target = fn(content['final_decision'])
        else:
            target = ''

        if isinstance(target, list):
            ret.extend([source, single_target] for single_target in target)
        else:
            ret.append([source, target])

    print(f"{len(data)} samples in {fname} has been processed")
    return ret
def dumper(content_list, prefix):
    """Write pairs to the parallel files ``<prefix>.x`` and ``<prefix>.y``.

    Args:
        content_list: Iterable of indexable items; element 0 of each item is
            written as one line of ``<prefix>.x`` and element 1 as the
            corresponding line of ``<prefix>.y``.
        prefix: Output path without extension.  Existing files are
            overwritten.
    """
    # The context manager closes both files even if a write fails part-way;
    # explicit UTF-8 keeps non-ASCII text writable regardless of locale.
    with open(prefix + ".x", "w", encoding="utf-8") as fw_source, \
            open(prefix + ".y", "w", encoding="utf-8") as fw_target:
        for ele in content_list:
            print(ele[0], file=fw_source)
            print(ele[1], file=fw_target)
def worker(fname, prefix, fn):
    """Convert one JSON file into a ``.x`` / ``.y`` file pair.

    The pairs produced by ``loader(fname, fn)`` are handed straight to
    ``dumper``, which writes them under the output path ``prefix``.
    """
    dumper(loader(fname, fn), prefix)
# Convert each split in turn: (input file name, tag appended to the prefix).
for _json_name, _split_tag in (
    ("train_set.json", "train"),
    ("dev_set.json", "valid"),
    ("test_set.json", "test"),
):
    worker(
        os.path.join(f"{data_dir}", _json_name),
        os.path.join(f"{data_dir}", f"{prefix}_{_split_tag}"),
        build_target_seq,
    )