#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 7 14:00:10 2017
self.data_process
@author: xuweijia
"""
import os      # path handling
import glob    # list files matching a pattern, e.g. glob.glob(path)
import numpy as np
MAX_WORD_LEN=10
SYMB_S="@begin"
SYMB_E="@end"
data_dir='/media/xuweijia/00023F0D000406A9/fake_data/wdw'
class data_holder:
def __init__(self, dictionary, training, validation, test, n_entities):
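        """Container bundling the preprocessed splits with their vocabularies.

        dictionary is a (word_dict, char_dict) pair; training, validation and
        test are lists of samples as returned by preprocessing.parse_one_file.
        """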
self.dictionary = dictionary
        self.training = np.asarray(training)
self.validation = np.asarray(validation)
self.test = np.asarray(test)
self.vocab_size = len(dictionary[0])
self.n_chars = len(dictionary[1])
self.n_entities = n_entities
self.inv_dictionary = {v: k for k, v in dictionary[0].items()}
class preprocessing():
def preprocess(self,data_dir,use_chars=True,no_training_set=False):
        # the vocabulary file lives inside the dataset directory;
        # os.path.join only builds the path, it does not create the file
vocab_file=os.path.join(data_dir,'vocab.txt')
word_dict,char_dict,num_entities=self.make_dictionary(data_dir,vocab_file)
# training
if no_training_set:
training = None
else:
print ("preparing training data ...")
training_path=os.path.join(data_dir,'training')
training=self.parse_all_file(training_path,word_dict,char_dict, use_chars)
print ("preparing validation data ...")
validation_path=os.path.join(data_dir,'validation')
validation=self.parse_all_file(validation_path,word_dict,char_dict, use_chars)
print ("preparing test data ...")
test_path=os.path.join(data_dir,'test')
test=self.parse_all_file(test_path,word_dict,char_dict, use_chars)
dictionary=(word_dict,char_dict)
data=data_holder(dictionary, training, validation, test, num_entities)
return data
    # build the word/char dictionaries; the vocabulary is written to vocab_file
def make_dictionary(self,data_dir,vocab_file):
if os.path.exists(vocab_file):
            print('vocab file found, loading vocab list...')
            # one token per line; strip the trailing newlines
            vocabulary = [line.strip() for line in open(vocab_file)]
else:
vocab_set=set()
            print('no vocab file found, creating vocabulary from the dataset...')
            # collect every *.question file from the three splits
file_names=[]
file_names+=glob.glob(os.path.join(data_dir,'test/*.question'))
file_names+=glob.glob(os.path.join(data_dir,'validation/*.question'))
file_names+=glob.glob(os.path.join(data_dir,'training/*.question'))
n=0.
for file in file_names:
fp=open(file)
fp.readline()
fp.readline()
                # document tokens (split() already discards the trailing newline)
                document = fp.readline().split()
                fp.readline()
                # query tokens
                query = fp.readline().split()
fp.close()
vocab_set|=set(document)|set(query)
# show progress
n+=1
if n%10000==0:
                    print('%.2f %%' % (n / len(file_names) * 100))
            entities = {v for v in vocab_set if v.startswith('@entity')}
            # ordinary tokens include @placeholder, @begin and @end
tokens= vocab_set.difference(entities)
tokens.add(SYMB_S)
tokens.add(SYMB_E)
vocabulary=list(tokens)+list(entities)
            # vocabulary.insert(0, UNK)  # would reserve index 0 for out-of-vocabulary words
            # write the vocabulary to vocab_file inside the dataset folder,
            # one token per line
            with open(vocab_file, 'w') as f:
                f.write('\n'.join(vocabulary))
# word dictionary: word -> index
vocab_size=len(vocabulary)
word_dict=dict(zip(vocabulary,range(vocab_size)))
# char dictionary: char -> index
char_set=set([c for w in vocabulary for c in list(w)])
char_set.add(' ')
char_list=list(char_set)
char_dict=dict(zip(char_list,range(len(char_set))))
num_entities = len(
[v for v in vocabulary if v.startswith('@entity')])
print ("vocab_size = %d" % vocab_size)
print ("num characters = %d" % len(char_set))
print ("%d anonymoused entities" % num_entities)
print ("%d other tokens (including @placeholder, %s and %s)" % (
vocab_size-num_entities, SYMB_S, SYMB_E))
return word_dict,char_dict,num_entities
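    # Each *.question file is read by fixed line indices (see parse_one_file below):
    #   fp[2]  document tokens, space separated
    #   fp[4]  query tokens, with '@placeholder' marking the cloze position
    #   fp[6]  answer
    #   fp[8:] candidate answers, one per line; everything before ':' is the
    #          candidate (an @entity id, possibly more than one word)
    # The lines in between are blank separators.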
def parse_one_file(self,file,word_dict,char_dict, use_chars):
with open(file) as f:
fp=f.readlines()
        # split() already discards trailing whitespace, so no extra strip() is needed
        document_raw = fp[2].split()
        query_raw = fp[4].split()
        answer_raw = fp[6].strip()
        # candidate answers: the tokens before ':' (an @entity id, possibly more than one word)
        candidates_raw = list(map(lambda x: x.strip().split(':')[0].split(), fp[8:]))
# add SOS,EOS for query
query_raw.insert(0, SYMB_S)
query_raw.append(SYMB_E)
# cloze's position in query
try:
            cloze = query_raw.index('@placeholder')  # position of the cloze in the query
except ValueError:
            print('@placeholder not found in ', file, '. Fixing...')
            # the placeholder was split into '@' and 'placeholder'; merge the two tokens
            at = query_raw.index('@')
            query_raw = query_raw[:at] + [''.join(query_raw[at:at+2])] + query_raw[at+2:]
            cloze = query_raw.index('@placeholder')
        # map tokens/entities to vocabulary indices
        doc_words = list(map(lambda w: word_dict[w], document_raw))
        qry_words = list(map(lambda w: word_dict[w], query_raw))
        # answer/candidate words missing from the vocabulary default to index 0;
        # an answer may consist of more than one word
        answer_words = list(map(lambda w: word_dict.get(w, 0), answer_raw.split()))
        cand_words = [list(map(lambda w: word_dict.get(w, 0), c)) for c in candidates_raw]  # one index list per candidate
        # character-level representation: map each word to the list of its
        # character indices, truncated to MAX_WORD_LEN; unknown characters
        # fall back to the index of ' '
        char_2_index = lambda c: char_dict.get(c, char_dict[' '])
        charlist_2_index = lambda w: list(map(char_2_index, list(w)[:MAX_WORD_LEN]))
if use_chars:
doc_chars =list(map(charlist_2_index,document_raw))
qry_chars =list(map(charlist_2_index, query_raw))
else:
doc_chars, qry_chars = [], []
        return doc_words, qry_words, answer_words, cand_words, doc_chars, qry_chars, cloze
def parse_all_file(self,files_path,word_dict,char_dict, use_chars):
files=glob.glob(files_path+'/*.question')
        # each parsed sample is followed by its source file name
questions = [self.parse_one_file(f,word_dict,char_dict, use_chars) + (f,) for f in files]
return questions
    # write every sample's document and query, separated by spaces, into text_file
def gen_text_for_word2vec(self, data_dir, text_file):
fnames = []
fnames += glob.glob(data_dir + "/training/*.question")
out = open(text_file, "w")
for fname in fnames:
fp = open(fname)
fp.readline()
fp.readline()
document = fp.readline()
fp.readline()
query = fp.readline()
fp.close()
out.write(document.strip())
out.write(" ")
            out.write(query.strip())
            out.write("\n")  # newline so consecutive samples do not run together
        out.close()
# pre = preprocessing()
# data_hold = pre.preprocess(data_dir, use_chars=True, no_training_set=False)
# train, valid, test, dic, n_chars = data_hold.training, data_hold.validation, data_hold.test, data_hold.dictionary, data_hold.n_chars
# # each sample: doc, query, answer, candidates, doc_chars, query_chars, cloze, file name
# pre.gen_text_for_word2vec(data_dir, os.path.join(data_dir, 'cbtcn_D_Q_train.txt'))
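
# A minimal usage sketch: run the preprocessing end to end and inspect one
# parsed training sample. It assumes data_dir (defined at the top of this
# file) points at a wdw-style dataset with training/, validation/ and test/
# subfolders of *.question files.
if __name__ == '__main__':
    pre = preprocessing()
    data = pre.preprocess(data_dir, use_chars=True, no_training_set=False)
    print('vocab size:', data.vocab_size)
    print('number of entities:', data.n_entities)
    # each sample: (doc_words, qry_words, answer_words, cand_words,
    #               doc_chars, qry_chars, cloze, file_name)
    doc_w, qry_w, ans_w, cand_w, doc_c, qry_c, cloze, fname = data.training[0]
    print('first training file:', fname)
    # map index sequences back to tokens as a quick sanity check
    print('query :', ' '.join(data.inv_dictionary[w] for w in qry_w))
    print('answer:', ' '.join(data.inv_dictionary[w] for w in ans_w))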