diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..0647067
Binary files /dev/null and b/.DS_Store differ
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c34d28b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+__pycache__
+.DS_Store
\ No newline at end of file
diff --git a/README.md b/README.md
index 701caac..f7aacc4 100755
--- a/README.md
+++ b/README.md
@@ -1,3 +1,6 @@
+# Sequential Labeling
+Updated to Python 3.6 and TensorFlow 1.0.
+
 # Sequential Labeling - HMM
 
 
@@ -52,8 +55,8 @@ python test.py model test.in test.out -c char_emb -g 2
 The first line of the embedding file is the number of char and embedding dimension, seperating by space, e.g 5 10. The remaining line is the char and embedding vector, seperating by space, e.g N dim1 ... dim 10
 
 # Installation Dependencies
-- python 2.7
-- tensorflow 0.8
+- python 3.6
+- tensorflow 1.0
 - numpy
 - pandas
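For reference, a minimal sketch of a loader for the embedding format the README describes (a `count dim` header line, then one space-separated `char v1 ... v_dim` row per line). This is a hypothetical stand-in for `helper.getEmbedding`, not the repository's actual implementation:

```python
import numpy as np

def load_char_embedding(path):
    """Parse the README's embedding format: 'num_chars emb_dim' header,
    then one 'char v1 ... v_dim' row per line, space-separated."""
    with open(path, encoding="utf-8") as f:
        num_chars, emb_dim = map(int, f.readline().split())
        chars, vectors = [], []
        for line in f:
            parts = line.rstrip("\n").split(" ")
            chars.append(parts[0])
            vectors.append([float(v) for v in parts[1:1 + emb_dim]])
    assert len(chars) == num_chars, "header count must match the number of rows"
    return chars, np.asarray(vectors, dtype=np.float32)
```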
diff --git a/code/.DS_Store b/code/.DS_Store
new file mode 100644
index 0000000..061b4e3
Binary files /dev/null and b/code/.DS_Store differ
diff --git a/code/bilstm_crf/.DS_Store b/code/bilstm_crf/.DS_Store
new file mode 100644
index 0000000..bd211a1
Binary files /dev/null and b/code/bilstm_crf/.DS_Store differ
diff --git a/code/bilstm_crf/BILSTM_CRF.py b/code/bilstm_crf/BILSTM_CRF.py
index 55d18a2..d1e45c2 100755
--- a/code/bilstm_crf/BILSTM_CRF.py
+++ b/code/bilstm_crf/BILSTM_CRF.py
@@ -1,8 +1,10 @@
 import math
 import helper
 import numpy as np
+import os
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
 import tensorflow as tf
-from tensorflow.models.rnn import rnn, rnn_cell
+# from tensorflow.contrib import rnn
 
 class BILSTM_CRF(object):
 
@@ -19,13 +21,13 @@ def __init__(self, num_chars, num_classes, num_steps=200, num_epochs=100, embedd
         self.num_steps = num_steps
         self.num_chars = num_chars
         self.num_classes = num_classes
-        
+
         # placeholder of x, y and weight
         self.inputs = tf.placeholder(tf.int32, [None, self.num_steps])
         self.targets = tf.placeholder(tf.int32, [None, self.num_steps])
         self.targets_weight = tf.placeholder(tf.float32, [None, self.num_steps])
         self.targets_transition = tf.placeholder(tf.int32, [None])
-        
+
         # char embedding
         if embedding_matrix != None:
             self.embedding = tf.Variable(embedding_matrix, trainable=False, name="emb", dtype=tf.float32)
@@ -34,35 +36,35 @@ def __init__(self, num_chars, num_classes, num_steps=200, num_epochs=100, embedd
         self.inputs_emb = tf.nn.embedding_lookup(self.embedding, self.inputs)
         self.inputs_emb = tf.transpose(self.inputs_emb, [1, 0, 2])
         self.inputs_emb = tf.reshape(self.inputs_emb, [-1, self.emb_dim])
-        self.inputs_emb = tf.split(0, self.num_steps, self.inputs_emb)
+        self.inputs_emb = tf.split(self.inputs_emb, axis=0, num_or_size_splits=self.num_steps)
 
         # lstm cell
-        lstm_cell_fw = tf.nn.rnn_cell.BasicLSTMCell(self.hidden_dim)
-        lstm_cell_bw = tf.nn.rnn_cell.BasicLSTMCell(self.hidden_dim)
+        lstm_cell_fw = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim)
+        lstm_cell_bw = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim)
 
         # dropout
         if is_training:
-            lstm_cell_fw = tf.nn.rnn_cell.DropoutWrapper(lstm_cell_fw, output_keep_prob=(1 - self.dropout_rate))
-            lstm_cell_bw = tf.nn.rnn_cell.DropoutWrapper(lstm_cell_bw, output_keep_prob=(1 - self.dropout_rate))
+            lstm_cell_fw = tf.contrib.rnn.DropoutWrapper(lstm_cell_fw, output_keep_prob=(1 - self.dropout_rate))
+            lstm_cell_bw = tf.contrib.rnn.DropoutWrapper(lstm_cell_bw, output_keep_prob=(1 - self.dropout_rate))
 
-        lstm_cell_fw = tf.nn.rnn_cell.MultiRNNCell([lstm_cell_fw] * self.num_layers)
-        lstm_cell_bw = tf.nn.rnn_cell.MultiRNNCell([lstm_cell_bw] * self.num_layers)
+        lstm_cell_fw = tf.contrib.rnn.MultiRNNCell([lstm_cell_fw] * self.num_layers)
+        lstm_cell_bw = tf.contrib.rnn.MultiRNNCell([lstm_cell_bw] * self.num_layers)
 
         # get the length of each sample
         self.length = tf.reduce_sum(tf.sign(self.inputs), reduction_indices=1)
         self.length = tf.cast(self.length, tf.int32)
 
         # forward and backward
-        self.outputs, _, _ = rnn.bidirectional_rnn(
-            lstm_cell_fw, 
+        self.outputs, _, _ = tf.contrib.rnn.static_bidirectional_rnn(
+            lstm_cell_fw,
             lstm_cell_bw,
-            self.inputs_emb, 
+            self.inputs_emb,
             dtype=tf.float32,
             sequence_length=self.length
         )
 
         # softmax
-        self.outputs = tf.reshape(tf.concat(1, self.outputs), [-1, self.hidden_dim * 2])
+        self.outputs = tf.reshape(tf.concat(self.outputs, 1), [-1, self.hidden_dim * 2])
         self.softmax_w = tf.get_variable("softmax_w", [self.hidden_dim * 2, self.num_classes])
         self.softmax_b = tf.get_variable("softmax_b", [self.num_classes])
         self.logits = tf.matmul(self.outputs, self.softmax_w) + self.softmax_b
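The hunk above tracks two TensorFlow 1.0 API breaks: `tf.split` and `tf.concat` swapped their argument order (the tensor(s) now come first, the axis second), and the `rnn`/`rnn_cell` modules moved under `tf.contrib.rnn`. A minimal sketch of the new-style calls, assuming a TF 1.x runtime; note that reusing one cell object via `[cell] * num_layers` makes all layers share weights in later 1.x releases, so building a fresh cell per layer is the safer idiom:

```python
import tensorflow as tf

num_steps, emb_dim, hidden_dim, num_layers = 200, 100, 128, 2

x = tf.placeholder(tf.float32, [None, num_steps * emb_dim])
# TF 1.0: value first, then num_or_size_splits, then axis (0.x was axis first)
steps = tf.split(x, num_or_size_splits=num_steps, axis=1)
# TF 1.0: values first, axis second (0.x was tf.concat(axis, values))
merged = tf.concat(steps, axis=1)

def make_cell():
    # one fresh cell per layer avoids accidental weight sharing
    return tf.contrib.rnn.BasicLSTMCell(hidden_dim)

stacked = tf.contrib.rnn.MultiRNNCell([make_cell() for _ in range(num_layers)])
```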
@@ -75,14 +79,14 @@ def __init__(self, num_chars, num_classes, num_steps=200, num_epochs=100, embedd
 
         dummy_val = -1000
         class_pad = tf.Variable(dummy_val * np.ones((self.batch_size, self.num_steps, 1)), dtype=tf.float32)
-        self.observations = tf.concat(2, [self.tags_scores, class_pad])
+        self.observations = tf.concat([self.tags_scores, class_pad], 2)
 
         begin_vec = tf.Variable(np.array([[dummy_val] * self.num_classes + [0] for _ in range(self.batch_size)]), trainable=False, dtype=tf.float32)
         end_vec = tf.Variable(np.array([[0] + [dummy_val] * self.num_classes for _ in range(self.batch_size)]), trainable=False, dtype=tf.float32)
         begin_vec = tf.reshape(begin_vec, [self.batch_size, 1, self.num_classes + 1])
         end_vec = tf.reshape(end_vec, [self.batch_size, 1, self.num_classes + 1])
-        self.observations = tf.concat(1, [begin_vec, self.observations, end_vec])
+        self.observations = tf.concat([begin_vec, self.observations, end_vec], 1)
 
         self.mask = tf.cast(tf.reshape(tf.sign(self.targets), [self.batch_size * self.num_steps]), tf.float32)
@@ -103,8 +107,8 @@ def __init__(self, num_chars, num_classes, num_steps=200, num_epochs=100, embedd
         self.loss = - (self.target_path_score - self.total_path_score)
 
         # summary
-        self.train_summary = tf.scalar_summary("loss", self.loss)
-        self.val_summary = tf.scalar_summary("loss", self.loss)
+        self.train_summary = tf.summary.scalar("loss", self.loss)
+        self.val_summary = tf.summary.scalar("loss", self.loss)
 
         self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
@@ -115,7 +119,7 @@ def logsumexp(self, x, axis=None):
 
     def forward(self, observations, transitions, length, is_viterbi=True, return_best_seq=True):
         length = tf.reshape(length, [self.batch_size])
-        transitions = tf.reshape(tf.concat(0, [transitions] * self.batch_size), [self.batch_size, 6, 6])
+        transitions = tf.reshape(tf.concat([transitions] * self.batch_size, 0), [self.batch_size, 6, 6])
         observations = tf.reshape(observations, [self.batch_size, self.num_steps + 2, 6, 1])
         observations = tf.transpose(observations, [1, 0, 2, 3])
         previous = observations[0, :, :, :]
@@ -133,15 +137,15 @@ def forward(self, observations, transitions, length, is_viterbi=True, return_bes
             alphas.append(alpha_t)
             previous = alpha_t
 
-        alphas = tf.reshape(tf.concat(0, alphas), [self.num_steps + 2, self.batch_size, 6, 1])
+        alphas = tf.reshape(tf.concat(alphas, 0), [self.num_steps + 2, self.batch_size, 6, 1])
         alphas = tf.transpose(alphas, [1, 0, 2, 3])
         alphas = tf.reshape(alphas, [self.batch_size * (self.num_steps + 2), 6, 1])
 
         last_alphas = tf.gather(alphas, tf.range(0, self.batch_size) * (self.num_steps + 2) + length)
         last_alphas = tf.reshape(last_alphas, [self.batch_size, 6, 1])
 
-        max_scores = tf.reshape(tf.concat(0, max_scores), (self.num_steps + 1, self.batch_size, 6))
-        max_scores_pre = tf.reshape(tf.concat(0, max_scores_pre), (self.num_steps + 1, self.batch_size, 6))
+        max_scores = tf.reshape(tf.concat(max_scores, 0), (self.num_steps + 1, self.batch_size, 6))
+        max_scores_pre = tf.reshape(tf.concat(max_scores_pre, 0), (self.num_steps + 1, self.batch_size, 6))
         max_scores = tf.transpose(max_scores, [1, 0, 2])
         max_scores_pre = tf.transpose(max_scores_pre, [1, 0, 2])
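The `forward` method above implements the CRF forward pass: at each step the previous alphas are combined with transition and observation scores in log space via the class's `logsumexp`. An illustrative NumPy sketch of one recurrence step over `K` tags (the model hardcodes `K = 6`, i.e. `num_classes + 1` with the padding class); the real code does the same with TF ops over the whole batch:

```python
import numpy as np

def logsumexp(x, axis=None):
    # numerically stable log(sum(exp(x))): subtract the max before exponentiating
    x_max = np.max(x, axis=axis, keepdims=True)
    return np.squeeze(x_max, axis=axis) + np.log(np.sum(np.exp(x - x_max), axis=axis))

K = 6                                  # tags incl. padding class, as in the model
prev_alpha = np.random.randn(K)        # log-scores of paths ending in each tag
transitions = np.random.randn(K, K)    # transitions[i, j]: score of tag i -> tag j
observation = np.random.randn(K)       # per-tag observation scores at this step

# alpha_t[j] = logsumexp_i(prev_alpha[i] + transitions[i, j]) + observation[j]
alpha_t = logsumexp(prev_alpha[:, None] + transitions, axis=0) + observation
```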
@@ -153,9 +157,9 @@ def train(self, sess, save_file, X_train, y_train, X_val, y_val):
         char2id, id2char = helper.loadMap("char2id")
         label2id, id2label = helper.loadMap("label2id")
 
-        merged = tf.merge_all_summaries()
-        summary_writer_train = tf.train.SummaryWriter('loss_log/train_loss', sess.graph)
-        summary_writer_val = tf.train.SummaryWriter('loss_log/val_loss', sess.graph)
+        merged = tf.summary.merge_all()
+        summary_writer_train = tf.summary.FileWriter('loss_log/train_loss', sess.graph)
+        summary_writer_val = tf.summary.FileWriter('loss_log/val_loss', sess.graph)
 
         num_iterations = int(math.ceil(1.0 * len(X_train) / self.batch_size))
@@ -166,7 +170,7 @@ def train(self, sess, save_file, X_train, y_train, X_val, y_val):
             np.random.shuffle(sh_index)
             X_train = X_train[sh_index]
             y_train = y_train[sh_index]
-            print "current epoch: %d" % (epoch)
+            print("current epoch: %d" % (epoch))
             for iteration in range(num_iterations):
                 # train
                 X_train_batch, y_train_batch = helper.nextBatch(X_train, y_train, start_index=iteration * self.batch_size, batch_size=self.batch_size)
@@ -194,7 +198,7 @@ def train(self, sess, save_file, X_train, y_train, X_val, y_val):
                     cnt += 1
                     precision_train, recall_train, f1_train = self.evaluate(X_train_batch, y_train_batch, predicts_train, id2char, id2label)
                     summary_writer_train.add_summary(train_summary, cnt)
-                    print "iteration: %5d, train loss: %5d, train precision: %.5f, train recall: %.5f, train f1: %.5f" % (iteration, loss_train, precision_train, recall_train, f1_train)
+                    print("iteration: %5d, train loss: %5d, train precision: %.5f, train recall: %.5f, train f1: %.5f" % (iteration, loss_train, precision_train, recall_train, f1_train))
 
                 # validation
                 if iteration % 100 == 0:
@@ -220,21 +224,21 @@ def train(self, sess, save_file, X_train, y_train, X_val, y_val):
                     predicts_val = self.viterbi(max_scores, max_scores_pre, length, predict_size=self.batch_size)
                     precision_val, recall_val, f1_val = self.evaluate(X_val_batch, y_val_batch, predicts_val, id2char, id2label)
                     summary_writer_val.add_summary(val_summary, cnt)
-                    print "iteration: %5d, valid loss: %5d, valid precision: %.5f, valid recall: %.5f, valid f1: %.5f" % (iteration, loss_val, precision_val, recall_val, f1_val)
+                    print("iteration: %5d, valid loss: %5d, valid precision: %.5f, valid recall: %.5f, valid f1: %.5f" % (iteration, loss_val, precision_val, recall_val, f1_val))
 
                     if f1_val > self.max_f1:
                         self.max_f1 = f1_val
                         save_path = saver.save(sess, save_file)
-                        print "saved the best model with f1: %.5f" % (self.max_f1)
+                        print("saved the best model with f1: %.5f" % (self.max_f1))
 
     def test(self, sess, X_test, X_test_str, output_path):
         char2id, id2char = helper.loadMap("char2id")
         label2id, id2label = helper.loadMap("label2id")
         num_iterations = int(math.ceil(1.0 * len(X_test) / self.batch_size))
-        print "number of iteration: " + str(num_iterations)
+        print("number of iteration: " + str(num_iterations))
         with open(output_path, "wb") as outfile:
             for i in range(num_iterations):
-                print "iteration: " + str(i + 1)
+                print("iteration: " + str(i + 1))
                 results = []
                 X_test_batch = X_test[i * self.batch_size : (i + 1) * self.batch_size]
                 X_test_str_batch = X_test_str[i * self.batch_size : (i + 1) * self.batch_size]
diff --git a/code/bilstm_crf/__pycache__/BILSTM_CRF.cpython-36.pyc b/code/bilstm_crf/__pycache__/BILSTM_CRF.cpython-36.pyc
new file mode 100644
index 0000000..943a181
Binary files /dev/null and b/code/bilstm_crf/__pycache__/BILSTM_CRF.cpython-36.pyc differ
diff --git a/code/bilstm_crf/__pycache__/helper.cpython-36.pyc b/code/bilstm_crf/__pycache__/helper.cpython-36.pyc
new file mode 100644
index 0000000..fff7c1f
Binary files /dev/null and b/code/bilstm_crf/__pycache__/helper.cpython-36.pyc differ
diff --git a/code/bilstm_crf/char2id b/code/bilstm_crf/char2id
new file mode 100644
index 0000000..e232a9c
--- /dev/null
+++ b/code/bilstm_crf/char2id
@@ -0,0 +1,7 @@
+B	1
+N	2
+D	3
+Z	4
+A	5
+	0
+	6
diff --git a/code/bilstm_crf/helper.py b/code/bilstm_crf/helper.py
index 0a8391d..a3c2288 100755
--- a/code/bilstm_crf/helper.py
+++ b/code/bilstm_crf/helper.py
@@ -103,12 +103,12 @@ def extractEntity(sentence, labels):
 
 def loadMap(token2id_filepath):
     if not os.path.isfile(token2id_filepath):
-        print "file not exist, building map"
+        print("file not exist, building map")
         buildMap()
 
     token2id = {}
     id2token = {}
-    with open(token2id_filepath) as infile:
+    with open(token2id_filepath, 'rb') as infile:
         for row in infile:
             row = row.rstrip().decode("utf-8")
             token = row.split('\t')[0]
@@ -118,13 +118,13 @@ def loadMap(token2id_filepath):
     return token2id, id2token
 
 def saveMap(id2char, id2label):
-    with open("char2id", "wb") as outfile:
+    with open("char2id", "w") as outfile:
         for idx in id2char:
             outfile.write(id2char[idx] + "\t" + str(idx) + "\r\n")
-    with open("label2id", "wb") as outfile:
+    with open("label2id", "w") as outfile:
         for idx in id2label:
             outfile.write(id2label[idx] + "\t" + str(idx) + "\r\n")
-    print "saved map between token and id"
+    print("saved map between token and id")
 
 def buildMap(train_path="train.in"):
     df_train = pd.read_csv(train_path, delimiter='\t', quoting=csv.QUOTE_NONE, skip_blank_lines=False, header=None, names=["char", "label"])
@@ -174,7 +174,7 @@ def getTrain(train_path, val_path, train_val_ratio=0.99, use_custom_val=False, s
     X_val = X[int(num_samples * train_val_ratio):]
     y_val = y[int(num_samples * train_val_ratio):]
 
-    print "train size: %d, validation size: %d" % (len(X_train), len(y_val))
+    print("train size: %d, validation size: %d" % (len(X_train), len(y_val)))
 
     return X_train, y_train, X_val, y_val
@@ -202,7 +202,7 @@ def mapFunc(x, char2id):
     df_test["char"] = df_test.char.map(lambda x : -1 if str(x) == str(np.nan) else x)
     X_test, _ = prepare(df_test["char_id"], df_test["char_id"], seq_max_len)
     X_test_str, _ = prepare(df_test["char"], df_test["char_id"], seq_max_len, is_padding=False)
-    print "test size: %d" % (len(X_test))
+    print("test size: %d" % (len(X_test)))
     return X_test, X_test_str
 
 def getTransition(y_train_batch):
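The helper changes above track Python 3's bytes/str split: `loadMap` keeps an explicit `decode("utf-8")`, so its file must now be opened in binary mode, while `saveMap` writes `str` and therefore switches from `"wb"` to `"w"`. An equivalent and arguably simpler pattern, assuming the data is UTF-8, is to let text mode do the decoding:

```python
# Binary mode + manual decode, as in the patched loadMap:
with open("char2id", "rb") as infile:
    for row in infile:
        token, idx = row.rstrip().decode("utf-8").split("\t")

# Text mode with an explicit encoding does the same with less ceremony:
with open("char2id", encoding="utf-8") as infile:
    for row in infile:
        token, idx = row.rstrip("\r\n").split("\t")
```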
diff --git a/code/bilstm_crf/label2id b/code/bilstm_crf/label2id
new file mode 100644
index 0000000..4e5d484
--- /dev/null
+++ b/code/bilstm_crf/label2id
@@ -0,0 +1,5 @@
+O	1
+B	2
+M	3
+E	4
+	0
diff --git a/code/bilstm_crf/test.py b/code/bilstm_crf/test.py
index 92e27ae..cb31526 100755
--- a/code/bilstm_crf/test.py
+++ b/code/bilstm_crf/test.py
@@ -25,7 +25,7 @@
 
 start_time = time.time()
 
-print "preparing test data"
+print("preparing test data")
 X_test, X_test_str = helper.getTest(test_path=test_path, seq_max_len=num_steps)
 char2id, id2char = helper.loadMap("char2id")
 label2id, id2label = helper.loadMap("label2id")
@@ -36,7 +36,7 @@
 else:
     embedding_matrix = None
 
-print "building model"
+print("building model")
 config = tf.ConfigProto(allow_soft_placement=True)
 with tf.Session(config=config) as sess:
     with tf.device(gpu_config):
@@ -44,12 +44,12 @@
         with tf.variable_scope("model", reuse=None, initializer=initializer):
             model = BILSTM_CRF(num_chars=num_chars, num_classes=num_classes, num_steps=num_steps, embedding_matrix=embedding_matrix, is_training=False)
-            print "loading model parameter"
+            print("loading model parameter")
             saver = tf.train.Saver()
             saver.restore(sess, model_path)
-            print "testing"
+            print("testing")
             model.test(sess, X_test, X_test_str, output_path)
             end_time = time.time()
-            print "time used %f(hour)" % ((end_time - start_time) / 3600)
\ No newline at end of file
+            print("time used %f(hour)" % ((end_time - start_time) / 3600))
\ No newline at end of file
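One Python 2 leftover survives in `BILSTM_CRF.test` (earlier in this patch), which test.py calls here: `open(output_path, "wb")` opens the results file in binary mode, so writing `str` under Python 3 raises `TypeError: a bytes-like object is required`. Either open in text mode or encode each record; a minimal fix sketch, with `output_path` and `line` as hypothetical stand-ins for the method's variables:

```python
output_path = "test.out"   # as passed in from test.py
line = "char\tB"           # one hypothetical result record

# Text mode: Python 3 writes str directly
with open(output_path, "w", encoding="utf-8") as outfile:
    outfile.write(line + "\n")

# Or keep binary mode and encode explicitly
with open(output_path, "wb") as outfile:
    outfile.write((line + "\n").encode("utf-8"))
```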
diff --git a/code/bilstm_crf/train.py b/code/bilstm_crf/train.py
index 030ca7c..aee4a5a 100755
--- a/code/bilstm_crf/train.py
+++ b/code/bilstm_crf/train.py
@@ -3,6 +3,8 @@
 import argparse
 import numpy as np
 import pandas as pd
+import os
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
 import tensorflow as tf
 
 from BILSTM_CRF import BILSTM_CRF
@@ -23,34 +25,32 @@
 val_path = args.val_path
 num_epochs = args.epoch
 emb_path = args.char_emb
-gpu_config = "/gpu:"+str(args.gpu)
+# gpu_config = "/gpu:" + str(args.gpu)
 num_steps = 200 # it must consist with the test
 
 start_time = time.time()
-print "preparing train and validation data"
 X_train, y_train, X_val, y_val = helper.getTrain(train_path=train_path, val_path=val_path, seq_max_len=num_steps)
 char2id, id2char = helper.loadMap("char2id")
 label2id, id2label = helper.loadMap("label2id")
 num_chars = len(id2char.keys())
 num_classes = len(id2label.keys())
 if emb_path != None:
-	embedding_matrix = helper.getEmbedding(emb_path)
+    embedding_matrix = helper.getEmbedding(emb_path)
 else:
-	embedding_matrix = None
+    embedding_matrix = None
 
-print "building model"
 config = tf.ConfigProto(allow_soft_placement=True)
 with tf.Session(config=config) as sess:
-	with tf.device(gpu_config):
-		initializer = tf.random_uniform_initializer(-0.1, 0.1)
-		with tf.variable_scope("model", reuse=None, initializer=initializer):
-			model = BILSTM_CRF(num_chars=num_chars, num_classes=num_classes, num_steps=num_steps, num_epochs=num_epochs, embedding_matrix=embedding_matrix, is_training=True)
+# with tf.device(gpu_config):
+    initializer = tf.random_uniform_initializer(-0.1, 0.1)
+    with tf.variable_scope("model", reuse=None, initializer=initializer):
+        model = BILSTM_CRF(num_chars=num_chars, num_classes=num_classes, num_steps=num_steps, num_epochs=num_epochs, embedding_matrix=embedding_matrix, is_training=True)
 
-		print "training model"
-		tf.initialize_all_variables().run()
-		model.train(sess, save_path, X_train, y_train, X_val, y_val)
+        print("training model")
+        tf.global_variables_initializer().run()
+        model.train(sess, save_path, X_train, y_train, X_val, y_val)
 
-		print "final best f1 is: %f" % (model.max_f1)
+        print("final best f1 is: %f" % (model.max_f1))
 
-		end_time = time.time()
-		print "time used %f(hour)" % ((end_time - start_time) / 3600)
+        end_time = time.time()
+        print("time used %f(hour)" % ((end_time - start_time) / 3600))
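Two migration details in train.py worth noting. First, train.py comments out `gpu_config` while test.py above still wraps its graph in `with tf.device(gpu_config):`; the two scripts should probably agree. Second, `if emb_path != None:` and the matching `if embedding_matrix != None:` check in BILSTM_CRF.py are safer written with `is not None`: once the value is a NumPy array, `!= None` triggers elementwise comparison. A minimal illustration:

```python
import numpy as np

emb = np.zeros((5, 10))     # stands in for a loaded embedding matrix
print(emb != None)          # elementwise array of booleans, not a single bool
# if emb != None: ...       # would raise: truth value of an array is ambiguous
if emb is not None:         # identity check: always a plain bool
    print("have embedding matrix")
```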