prepare_data4RACE.py
# -*- coding: utf-8 -*-
import json
import os

from transformers import AutoTokenizer

from utils import *  # provides multi_process and dump_file

# Set globally in prepare_bert_data before convert_to_features is called.
tokenizer = None
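# Expected per-sample record format, inferred from the field accesses in
# get_one_sample_features below (field values are illustrative only):
# {
#     "query":        "question text, optionally containing a '_' blank",
#     "passage":      "supporting passage",
#     "alternatives": "choice0|choice1|choice2|choice3",  # exactly 4, '|'-separated
#     "answer":       "0"                                 # index of the correct choice
# }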
def get_one_sample_features(one):
    # Build multiple-choice features for one sample: a
    # [CLS] query+choice [SEP] passage [SEP] sequence per alternative.
    alternatives = one['alternatives'].split('|')
    if len(alternatives) != 4:
        return []
    label = int(one['answer'])
    query = one['query']
    text_a = one['passage']
    choices_inputs = []
    for ending in alternatives:
        # If the query contains a blank ("_"), fill it with the choice;
        # otherwise append the choice to the query.
        if query.find("_") != -1:
            text_b = query.replace("_", ending)
        else:
            text_b = query + " " + ending
        inputs = {}
        inputs["input_ids"] = [tokenizer.cls_token_id] \
            + tokenizer.encode(text_b, add_special_tokens=False, max_length=128, truncation=True) \
            + [tokenizer.sep_token_id]
        inputs["token_type_ids"] = [0] * len(inputs["input_ids"])
        # tokenizer.max_len was removed in newer transformers releases;
        # model_max_length is the equivalent attribute.
        inputs["input_ids"] += tokenizer.encode(
            text_a,
            max_length=tokenizer.model_max_length - len(inputs["input_ids"]) - 1,
            truncation=True,
            add_special_tokens=False) + [tokenizer.sep_token_id]
        inputs["attention_mask"] = [1] * len(inputs["input_ids"])
        inputs["token_type_ids"] += [1] * (len(inputs["input_ids"]) - len(inputs["token_type_ids"]))
        # inputs = tokenizer(
        #     text_b,
        #     text_a,
        #     add_special_tokens=True,
        #     max_length=tokenizer.model_max_length,
        #     # padding="max_length",
        #     truncation=True,
        #     return_overflowing_tokens=True,
        # )
        choices_inputs.append(inputs)
    input_ids = [x["input_ids"] for x in choices_inputs]
    attention_mask = (
        [x["attention_mask"] for x in choices_inputs] if "attention_mask" in choices_inputs[0] else None)
    token_type_ids = (
        [x["token_type_ids"] for x in choices_inputs] if "token_type_ids" in choices_inputs[0] else None)
    global_attention_mask = None
    return [input_ids, label, attention_mask, token_type_ids, global_attention_mask]
def convert_to_features(filename):
    # Load the raw JSON samples and map get_one_sample_features over them;
    # multi_process is imported from utils via the wildcard import above.
    with open(filename, encoding='utf-8') as f:
        raw = json.load(f)
    data = multi_process(get_one_sample_features, raw)
    # for data in raw:
    #     get_one_sample_features(data)
    print('got {} with {} samples'.format(filename, len(data)))
    return data
def prepare_bert_data(model_type='bert-base-chinese', data_path="/home/bzw/data/ReCO"):
    # NOTE: data_path must point to a directory literally named 'RACE'
    # containing train.json, dev.json and test.json; the default path above
    # does not satisfy the assertion below.
    assert data_path.split('/')[-1] == 'RACE'
    global tokenizer
    # tokenizer = BertTokenizer.from_pretrained(model_type)
    tokenizer = AutoTokenizer.from_pretrained(model_type)
    # Cache the tokenized features to .obj files (dump_file comes from utils)
    # so repeated runs skip re-tokenizing the same split.
    if not os.path.exists(os.path.join(data_path, 'test.{}.obj'.format(model_type.replace('/', '.')))):
        test_data = convert_to_features(os.path.join(data_path, 'test.json'))
        dump_file(test_data, os.path.join(data_path, 'test.{}.obj'.format(model_type.replace('/', '.'))))
    if not os.path.exists(os.path.join(data_path, 'valid.{}.obj'.format(model_type.replace('/', '.')))):
        valid_data = convert_to_features(os.path.join(data_path, 'dev.json'))
        dump_file(valid_data, os.path.join(data_path, 'valid.{}.obj'.format(model_type.replace('/', '.'))))
    if not os.path.exists(os.path.join(data_path, 'train.{}.obj'.format(model_type.replace('/', '.')))):
        train_data = convert_to_features(os.path.join(data_path, 'train.json'))
        dump_file(train_data, os.path.join(data_path, 'train.{}.obj'.format(model_type.replace('/', '.'))))
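

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original script). The model name and path
# below are placeholders: data_path must be a directory named 'RACE' holding
# train.json, dev.json and test.json in the format described above.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    prepare_bert_data(model_type='bert-base-chinese',
                      data_path='/path/to/RACE')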