-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata2cv.py
115 lines (96 loc) · 3.24 KB
/
data2cv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import cPickle
import math
import time
from dataset import *
# Get new sentence with zero padding
def get_idx(sentence, filter_h=5, max_l=100):
    """Return ``sentence`` truncated to ``max_l`` items and zero-padded.

    The result is laid out as ``pad`` leading zeros, then at most ``max_l``
    items of ``sentence``, then trailing zeros, where
    ``pad = ceil(filter_h / 2)``. The returned list therefore always has
    ``max_l + 2 * pad`` elements.

    Args:
        sentence: sequence of word indices (must support ``len`` and slicing).
        filter_h: filter height used to derive the padding width.
        max_l: maximum number of sentence items to keep.

    Returns:
        A new list; ``sentence`` itself is not modified.
    """
    pad = int(math.ceil(filter_h / 2.0))
    x = [0] * pad
    # Keep at most max_l items; max(..., 0) stops a negative max_l from being
    # interpreted as a "drop from the end" slice bound.
    x.extend(sentence[:max(max_l, 0)])
    # Pad the end of the sentence up to the fixed total length.
    x.extend([0] * (max_l + 2 * pad - len(x)))
    return x
# TODO(swjung): check what is the purpose of this function
def get_pf(sentLen, allIndice, filter_h=5, max_l=100):
    """Build two padded relative-position feature lists for one sentence.

    For each token offset ``k`` in ``0 .. min(sentLen, max_l) - 1`` the
    feature is ``k - allIndice[j] + (max_l - 1)`` for ``j`` in ``{0, 1}``,
    i.e. the signed distance from a reference index shifted by ``max_l - 1``.
    Each list is surrounded by ``-1`` padding: ``pad = ceil(filter_h / 2)``
    entries in front, and enough at the end to reach ``max_l + 2 * pad``.

    NOTE(review): ``allIndice`` presumably holds the two entity positions of
    the sentence -- confirm against the caller's data.

    Args:
        sentLen: number of tokens in the sentence.
        allIndice: indexable with at least two reference indices.
        filter_h: filter height used to derive the padding width.
        max_l: maximum number of token positions to encode.

    Returns:
        ``[pf1, pf2]``, two lists of equal length.
    """
    pad = int(math.ceil(filter_h / 2.0))
    total_len = max_l + 2 * pad
    shift = max_l - 1
    # Token offsets, capped at max_l positions.
    offsets = np.arange(sentLen if sentLen < max_l else max_l)

    features = []
    for reference in (allIndice[0], allIndice[1]):
        # Padding at the beginning of the sentence.
        padded = [-1] * pad
        padded.extend(offsets - reference + shift)
        # Padding at the end of the sentence.
        padded.extend([-1] * (total_len - len(padded)))
        features.append(padded)
    return features
# output: list of InstanceBag
# InstanceBag: bag of instances in the form (entities, rels, num,...) related to one entity pair
def make_idx_data_cv(data, filter_h, max_l):
    """Convert instance bags into padded, fixed-length index form.

    For each bag in ``data``, sentences whose entity-position pair is
    unusable are dropped. Every remaining sentence is padded with
    ``get_idx`` and gets position features from ``get_pf``. Bags left with
    no sentences are skipped entirely.

    An entity-position pair ``pos`` is treated as unusable when either
    component is greater than ``max_l - 1``, either component is ``0``, or
    both components are equal.

    Args:
        data: iterable of bags exposing ``entities``, ``rel``, ``num``,
            ``sentences``, ``positions`` and ``entitiesPos``.
        filter_h: filter height forwarded to ``get_idx`` / ``get_pf``.
        max_l: maximum sentence length forwarded to ``get_idx`` / ``get_pf``.

    Returns:
        A list of new ``InstanceBag`` objects; ``num`` is the bag's original
        ``num`` minus the number of dropped entity-position pairs.
    """
    newData = []
    last_valid = max_l - 1
    for ins in data:
        sentences = ins.sentences
        positions = ins.positions
        entitiesPos = ins.entitiesPos

        # Indices with unusable entity positions. A set keeps the membership
        # test in the sentence loop below O(1).
        remove_idx = set(
            i for i, pos in enumerate(entitiesPos)
            if pos[0] > last_valid or pos[1] > last_valid
            or pos[0] == 0 or pos[1] == 0
            or pos[0] == pos[1]
        )
        num = ins.num - len(remove_idx)

        newSent = []
        newPos = []
        newEPos = []
        for i, sentence in enumerate(sentences):
            if i in remove_idx:
                continue
            newSent.append(get_idx(sentence, filter_h, max_l))
            newPos.append(get_pf(len(sentence), positions[i], filter_h, max_l))
            newEPos.append(entitiesPos[i])

        # Nothing usable left in this bag.
        if not newSent:
            continue
        newData.append(
            InstanceBag(ins.entities, ins.rel, num, newSent, newPos, newEPos))
    return newData
if __name__ == "__main__":
    # Script entry point: load the raw test/train bags, convert both with
    # make_idx_data_cv and pickle the results, printing a timestamp around
    # each conversion step.
    print("load test and train raw data...")
    # Pickles are binary data, so open them in binary mode; `with` closes
    # the handles, which the bare open(...) calls never did.
    with open('test_len_60_gap_40.p', 'rb') as f:
        testData = cPickle.load(f)
    # NOTE(review): this previously loaded 'test_len_60_gap_40.p' again, so
    # the "train" output was built from the test set. The train filename is
    # inferred from the test one -- confirm it matches the file on disk.
    with open('train_len_60_gap_40.p', 'rb') as f:
        trainData = cPickle.load(f)

    sentence_len = 60
    max_filter_len = 3

    now = time.strftime("%Y-%m-%d %H:%M:%S")
    print('point 0 time: ' + '\t\t' + str(now))
    test = make_idx_data_cv(testData, max_filter_len, sentence_len)
    now = time.strftime("%Y-%m-%d %H:%M:%S")
    print('point 1 time: ' + '\t\t' + str(now))
    train = make_idx_data_cv(trainData, max_filter_len, sentence_len)
    now = time.strftime("%Y-%m-%d %H:%M:%S")
    print('point 2 time: ' + '\t\t' + str(now))

    # Protocol -1 selects the highest (binary) pickle protocol, so the
    # output files must be opened with 'wb' rather than text-mode 'w'.
    with open('test_3_60.p', 'wb') as f:
        cPickle.dump(test, f, -1)
    with open('train_3_60.p', 'wb') as f:
        cPickle.dump(train, f, -1)