updated a few things #20

Open: wants to merge 3 commits into base: master
Binary file added .DS_Store
Binary file not shown.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
__pycache__
.DS_Store
7 changes: 5 additions & 2 deletions README.md
@@ -1,3 +1,6 @@
# Sequential Labeling
Updated to py3.6, tf 1.0

# Sequential Labeling

- HMM
@@ -52,8 +55,8 @@ python test.py model test.in test.out -c char_emb -g 2
The first line of the embedding file gives the number of chars and the embedding dimension, separated by a space, e.g. 5 10. Each remaining line is a char followed by its embedding vector, separated by spaces, e.g. N dim1 ... dim10
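For concreteness, a minimal sketch of a reader for this format (`load_char_embedding` is a hypothetical name; the repo's actual loader is `helper.getEmbedding`, which may differ):

```python
import numpy as np

def load_char_embedding(path):
    # Hypothetical reader for the format described above.
    with open(path, encoding="utf-8") as f:
        num_chars, emb_dim = map(int, f.readline().split())  # e.g. "5 10"
        emb = {}
        for line in f:
            parts = line.rstrip().split()
            if len(parts) != emb_dim + 1:
                continue  # skip malformed lines
            emb[parts[0]] = np.array(parts[1:], dtype=np.float32)
    return num_chars, emb_dim, emb
```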

# Installation Dependencies
- python 2.7
- tensorflow 0.8
- python 3.6
- tensorflow 1.0
- numpy
- pandas

Binary file added code/.DS_Store
Binary file not shown.
Binary file added code/bilstm_crf/.DS_Store
Binary file not shown.
72 changes: 38 additions & 34 deletions code/bilstm_crf/BILSTM_CRF.py
@@ -1,8 +1,10 @@
import math
import helper
import numpy as np
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
import tensorflow as tf
from tensorflow.models.rnn import rnn, rnn_cell
# from tensorflow.contrib import rnn

class BILSTM_CRF(object):

@@ -19,13 +21,13 @@ def __init__(self, num_chars, num_classes, num_steps=200, num_epochs=100, embedd
self.num_steps = num_steps
self.num_chars = num_chars
self.num_classes = num_classes

# placeholder of x, y and weight
self.inputs = tf.placeholder(tf.int32, [None, self.num_steps])
self.targets = tf.placeholder(tf.int32, [None, self.num_steps])
self.targets_weight = tf.placeholder(tf.float32, [None, self.num_steps])
self.targets_transition = tf.placeholder(tf.int32, [None])

# char embedding
if embedding_matrix != None:
self.embedding = tf.Variable(embedding_matrix, trainable=False, name="emb", dtype=tf.float32)
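One caveat in this unchanged line: once `embedding_matrix` is a NumPy array, `!= None` compares elementwise, and using the result in `if` raises. The identity test `is not None` is the safe form, e.g.:

```python
import numpy as np

m = np.zeros((3, 3))
# "m != None" compares elementwise, so "if m != None:" raises
# ValueError: the truth value of an array is ambiguous.
if m is not None:  # identity test: always a single bool
    print("have an embedding matrix")
```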
@@ -34,35 +36,37 @@
self.inputs_emb = tf.nn.embedding_lookup(self.embedding, self.inputs)
self.inputs_emb = tf.transpose(self.inputs_emb, [1, 0, 2])
self.inputs_emb = tf.reshape(self.inputs_emb, [-1, self.emb_dim])
self.inputs_emb = tf.split(0, self.num_steps, self.inputs_emb)
self.inputs_emb = tf.split(self.inputs_emb,axis=0,num_or_size_splits=self.num_steps)
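The `tf.split` edit reflects the TF 1.0 argument reorder: the old signature was `tf.split(split_dim, num_split, value)`, the new one is `tf.split(value, num_or_size_splits, axis)`. A NumPy sketch of what this line does, with assumed sizes:

```python
import numpy as np

# TF 0.x:  tf.split(split_dim, num_split, value)
# TF 1.x:  tf.split(value, num_or_size_splits, axis=0)
x = np.zeros((200 * 32, 100))        # (num_steps * batch, emb_dim), assumed sizes
steps = np.split(x, 200, axis=0)     # list of 200 arrays, each (32, 100)
print(len(steps), steps[0].shape)
```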

# lstm cell
lstm_cell_fw = tf.nn.rnn_cell.BasicLSTMCell(self.hidden_dim)
lstm_cell_bw = tf.nn.rnn_cell.BasicLSTMCell(self.hidden_dim)
# # lstm cell
lstm_cell_fw = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim)
lstm_cell_bw = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim)

# dropout
if is_training:
lstm_cell_fw = tf.nn.rnn_cell.DropoutWrapper(lstm_cell_fw, output_keep_prob=(1 - self.dropout_rate))
lstm_cell_bw = tf.nn.rnn_cell.DropoutWrapper(lstm_cell_bw, output_keep_prob=(1 - self.dropout_rate))
lstm_cell_fw = tf.contrib.rnn.DropoutWrapper(lstm_cell_fw, output_keep_prob=(1 - self.dropout_rate))
lstm_cell_bw = tf.contrib.rnn.DropoutWrapper(lstm_cell_bw, output_keep_prob=(1 - self.dropout_rate))

lstm_cell_fw = tf.nn.rnn_cell.MultiRNNCell([lstm_cell_fw] * self.num_layers)
lstm_cell_bw = tf.nn.rnn_cell.MultiRNNCell([lstm_cell_bw] * self.num_layers)
lstm_cell_fw = tf.contrib.rnn.MultiRNNCell([lstm_cell_fw] * self.num_layers)
lstm_cell_bw = tf.contrib.rnn.MultiRNNCell([lstm_cell_bw] * self.num_layers)
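A caveat that this PR leaves in place: `[lstm_cell_fw] * self.num_layers` stacks the same cell object, so all layers share weights, and TF 1.1+ rejects this with a variable-reuse error. A sketch of the usual fix, one fresh cell per layer (assuming TF 1.x and illustrative hyperparameters):

```python
import tensorflow as tf

def make_cell(hidden_dim, keep_prob, is_training):
    # Build a fresh cell object per layer; reusing one object
    # shares weights across layers.
    cell = tf.contrib.rnn.BasicLSTMCell(hidden_dim)
    if is_training:
        cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)
    return cell

num_layers, hidden_dim = 2, 128  # assumed hyperparameters
stacked = tf.contrib.rnn.MultiRNNCell(
    [make_cell(hidden_dim, 0.5, True) for _ in range(num_layers)])
```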

# get the length of each sample
# # get the length of each sample
self.length = tf.reduce_sum(tf.sign(self.inputs), reduction_indices=1)
self.length = tf.cast(self.length, tf.int32)
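For clarity on this unchanged length computation: `<PAD>` maps to id 0 (see char2id below), so summing `tf.sign(inputs)` per row counts the non-pad positions. NumPy equivalent:

```python
import numpy as np

batch = np.array([[4, 2, 5, 0, 0],   # real char ids are > 0, PAD is 0
                  [1, 3, 0, 0, 0]])
lengths = np.sign(batch).sum(axis=1)  # -> [3, 2]
print(lengths)
```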

# forward and backward
self.outputs, _, _ = rnn.bidirectional_rnn(
lstm_cell_fw,
# # forward and backward
self.outputs, _, _ = tf.contrib.rnn.static_bidirectional_rnn(
lstm_cell_fw,
lstm_cell_bw,
self.inputs_emb,
self.inputs_emb,
dtype=tf.float32,
sequence_length=self.length
)
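For orientation, `tf.contrib.rnn.static_bidirectional_rnn` (TF 1.x) expects a Python list of `num_steps` tensors of shape `[batch, emb_dim]`, which is what the transpose/reshape/split above produces, and returns per-step outputs of shape `[batch, 2 * hidden_dim]` plus the two final states. A shape walk-through with assumed sizes:

```python
import numpy as np

num_steps, batch, emb_dim, hidden = 200, 32, 100, 128   # assumed sizes
x = np.zeros((batch, num_steps, emb_dim))               # embedded batch
x = x.transpose(1, 0, 2).reshape(-1, emb_dim)           # [num_steps*batch, emb_dim]
inputs = np.split(x, num_steps, axis=0)                 # list of [batch, emb_dim]
# static_bidirectional_rnn(cell_fw, cell_bw, inputs, ...) returns
# (outputs, state_fw, state_bw), where outputs is a list of num_steps
# tensors of shape [batch, 2 * hidden] (fw and bw halves concatenated).
print(len(inputs), inputs[0].shape)
```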

# softmax
self.outputs = tf.reshape(tf.concat(1, self.outputs), [-1, self.hidden_dim * 2])
# print(self.outputs)
# print(self.hidden_dim)
self.outputs = tf.reshape(tf.concat(self.outputs,1),[-1,self.hidden_dim*2])
self.softmax_w = tf.get_variable("softmax_w", [self.hidden_dim * 2, self.num_classes])
self.softmax_b = tf.get_variable("softmax_b", [self.num_classes])
self.logits = tf.matmul(self.outputs, self.softmax_w) + self.softmax_b
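All the `tf.concat` edits in this file are the same mechanical change: TF 0.x took the axis first, `tf.concat(axis, values)`, while TF 1.x takes the values first, `tf.concat(values, axis)`. A NumPy analogue of the softmax-input reshape above, with assumed sizes:

```python
import numpy as np

num_steps, batch, hidden = 200, 32, 128                  # assumed sizes
outputs = [np.zeros((batch, 2 * hidden)) for _ in range(num_steps)]
merged = np.concatenate(outputs, axis=1)                 # [batch, num_steps*2*hidden]
merged = merged.reshape(-1, 2 * hidden)                  # [batch*num_steps, 2*hidden]
print(merged.shape)
```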
@@ -75,14 +79,14 @@ def __init__(self, num_chars, num_classes, num_steps=200, num_epochs=100, embedd

dummy_val = -1000
class_pad = tf.Variable(dummy_val * np.ones((self.batch_size, self.num_steps, 1)), dtype=tf.float32)
self.observations = tf.concat(2, [self.tags_scores, class_pad])
self.observations = tf.concat([self.tags_scores, class_pad],2)

begin_vec = tf.Variable(np.array([[dummy_val] * self.num_classes + [0] for _ in range(self.batch_size)]), trainable=False, dtype=tf.float32)
end_vec = tf.Variable(np.array([[0] + [dummy_val] * self.num_classes for _ in range(self.batch_size)]), trainable=False, dtype=tf.float32)
begin_vec = tf.reshape(begin_vec, [self.batch_size, 1, self.num_classes + 1])
end_vec = tf.reshape(end_vec, [self.batch_size, 1, self.num_classes + 1])

self.observations = tf.concat(1, [begin_vec, self.observations, end_vec])
self.observations = tf.concat([begin_vec, self.observations, end_vec],1)

self.mask = tf.cast(tf.reshape(tf.sign(self.targets),[self.batch_size * self.num_steps]), tf.float32)

@@ -103,8 +107,8 @@ def __init__(self, num_chars, num_classes, num_steps=200, num_epochs=100, embedd
self.loss = - (self.target_path_score - self.total_path_score)

# summary
self.train_summary = tf.scalar_summary("loss", self.loss)
self.val_summary = tf.scalar_summary("loss", self.loss)
self.train_summary = tf.summary.scalar("loss", self.loss)
self.val_summary = tf.summary.scalar("loss", self.loss)

self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss)

@@ -115,7 +119,7 @@ def logsumexp(self, x, axis=None):

def forward(self, observations, transitions, length, is_viterbi=True, return_best_seq=True):
length = tf.reshape(length, [self.batch_size])
transitions = tf.reshape(tf.concat(0, [transitions] * self.batch_size), [self.batch_size, 6, 6])
transitions = tf.reshape(tf.concat([transitions] * self.batch_size, 0), [self.batch_size, 6, 6])
observations = tf.reshape(observations, [self.batch_size, self.num_steps + 2, 6, 1])
observations = tf.transpose(observations, [1, 0, 2, 3])
previous = observations[0, :, :, :]
@@ -133,15 +137,15 @@ def forward(self, observations, transitions, length, is_viterbi=True, return_bes
alphas.append(alpha_t)
previous = alpha_t

alphas = tf.reshape(tf.concat(0, alphas), [self.num_steps + 2, self.batch_size, 6, 1])
alphas = tf.reshape(tf.concat(alphas,0), [self.num_steps + 2, self.batch_size, 6, 1])
alphas = tf.transpose(alphas, [1, 0, 2, 3])
alphas = tf.reshape(alphas, [self.batch_size * (self.num_steps + 2), 6, 1])

last_alphas = tf.gather(alphas, tf.range(0, self.batch_size) * (self.num_steps + 2) + length)
last_alphas = tf.reshape(last_alphas, [self.batch_size, 6, 1])

max_scores = tf.reshape(tf.concat(0, max_scores), (self.num_steps + 1, self.batch_size, 6))
max_scores_pre = tf.reshape(tf.concat(0, max_scores_pre), (self.num_steps + 1, self.batch_size, 6))
max_scores = tf.reshape(tf.concat(max_scores,0), (self.num_steps + 1, self.batch_size, 6))
max_scores_pre = tf.reshape(tf.concat(max_scores_pre,0), (self.num_steps + 1, self.batch_size, 6))
max_scores = tf.transpose(max_scores, [1, 0, 2])
max_scores_pre = tf.transpose(max_scores_pre, [1, 0, 2])
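For readers new to this code: `forward` is the standard CRF forward/Viterbi recursion, where the hard-coded 6 is `num_classes + 1` (the extra padded class column added via `class_pad`). The per-step update is roughly `alpha_t = logsumexp(alpha_{t-1} + transitions) + observation_t`; a NumPy sketch of one step with assumed shapes:

```python
import numpy as np

def logsumexp(x, axis=None):
    m = x.max(axis=axis, keepdims=True)
    return m + np.log(np.exp(x - m).sum(axis=axis, keepdims=True))

batch = 32                                  # assumed batch size
prev = np.random.randn(batch, 6, 1)         # alpha_{t-1}
obs_t = np.random.randn(batch, 6, 1)        # emission scores at step t
trans = np.random.randn(batch, 6, 6)        # tiled transition matrix
# alpha_t[j] = logsumexp_i(alpha_{t-1}[i] + trans[i, j]) + obs_t[j]
alpha_t = logsumexp(prev + trans, axis=1).transpose(0, 2, 1) + obs_t
print(alpha_t.shape)                        # (32, 6, 1)
```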

@@ -153,9 +157,9 @@ def train(self, sess, save_file, X_train, y_train, X_val, y_val):
char2id, id2char = helper.loadMap("char2id")
label2id, id2label = helper.loadMap("label2id")

merged = tf.merge_all_summaries()
summary_writer_train = tf.train.SummaryWriter('loss_log/train_loss', sess.graph)
summary_writer_val = tf.train.SummaryWriter('loss_log/val_loss', sess.graph)
merged = tf.summary.merge_all()
summary_writer_train = tf.summary.FileWriter('loss_log/train_loss', sess.graph)
summary_writer_val = tf.summary.FileWriter('loss_log/val_loss', sess.graph)
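These three edits, together with the `tf.summary.scalar` changes above, are the TF 1.0 summary-API renames. For reference, the mapping (assuming TF 1.x):

```python
# TF 0.x                        ->  TF 1.x
# tf.scalar_summary(tag, v)     ->  tf.summary.scalar(tag, v)
# tf.merge_all_summaries()      ->  tf.summary.merge_all()
# tf.train.SummaryWriter(d, g)  ->  tf.summary.FileWriter(d, g)
```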

num_iterations = int(math.ceil(1.0 * len(X_train) / self.batch_size))

@@ -166,7 +170,7 @@ def train(self, sess, save_file, X_train, y_train, X_val, y_val):
np.random.shuffle(sh_index)
X_train = X_train[sh_index]
y_train = y_train[sh_index]
print "current epoch: %d" % (epoch)
print( "current epoch: %d" % (epoch))
for iteration in range(num_iterations):
# train
X_train_batch, y_train_batch = helper.nextBatch(X_train, y_train, start_index=iteration * self.batch_size, batch_size=self.batch_size)
@@ -194,7 +198,7 @@ def train(self, sess, save_file, X_train, y_train, X_val, y_val):
cnt += 1
precision_train, recall_train, f1_train = self.evaluate(X_train_batch, y_train_batch, predicts_train, id2char, id2label)
summary_writer_train.add_summary(train_summary, cnt)
print "iteration: %5d, train loss: %5d, train precision: %.5f, train recall: %.5f, train f1: %.5f" % (iteration, loss_train, precision_train, recall_train, f1_train)
print( "iteration: %5d, train loss: %5d, train precision: %.5f, train recall: %.5f, train f1: %.5f" % (iteration, loss_train, precision_train, recall_train, f1_train) )

# validation
if iteration % 100 == 0:
@@ -220,21 +224,21 @@ def train(self, sess, save_file, X_train, y_train, X_val, y_val):
predicts_val = self.viterbi(max_scores, max_scores_pre, length, predict_size=self.batch_size)
precision_val, recall_val, f1_val = self.evaluate(X_val_batch, y_val_batch, predicts_val, id2char, id2label)
summary_writer_val.add_summary(val_summary, cnt)
print "iteration: %5d, valid loss: %5d, valid precision: %.5f, valid recall: %.5f, valid f1: %.5f" % (iteration, loss_val, precision_val, recall_val, f1_val)
print( "iteration: %5d, valid loss: %5d, valid precision: %.5f, valid recall: %.5f, valid f1: %.5f" % (iteration, loss_val, precision_val, recall_val, f1_val))

if f1_val > self.max_f1:
self.max_f1 = f1_val
save_path = saver.save(sess, save_file)
print "saved the best model with f1: %.5f" % (self.max_f1)
print( "saved the best model with f1: %.5f" % (self.max_f1))

def test(self, sess, X_test, X_test_str, output_path):
char2id, id2char = helper.loadMap("char2id")
label2id, id2label = helper.loadMap("label2id")
num_iterations = int(math.ceil(1.0 * len(X_test) / self.batch_size))
print "number of iteration: " + str(num_iterations)
print( "number of iteration: " + str(num_iterations))
with open(output_path, "wb") as outfile:
for i in range(num_iterations):
print "iteration: " + str(i + 1)
print( "iteration: " + str(i + 1))
results = []
X_test_batch = X_test[i * self.batch_size : (i + 1) * self.batch_size]
X_test_str_batch = X_test_str[i * self.batch_size : (i + 1) * self.batch_size]
Binary file not shown.
Binary file added code/bilstm_crf/__pycache__/helper.cpython-36.pyc
Binary file not shown.
7 changes: 7 additions & 0 deletions code/bilstm_crf/char2id
@@ -0,0 +1,7 @@
B 1
N 2
D 3
Z 4
A 5
<PAD> 0
<NEW> 6
14 changes: 7 additions & 7 deletions code/bilstm_crf/helper.py
@@ -103,12 +103,12 @@ def extractEntity(sentence, labels):

def loadMap(token2id_filepath):
if not os.path.isfile(token2id_filepath):
print "file not exist, building map"
print( "file not exist, building map")
buildMap()

token2id = {}
id2token = {}
with open(token2id_filepath) as infile:
with open(token2id_filepath,'rb') as infile:
for row in infile:
row = row.rstrip().decode("utf-8")
token = row.split('\t')[0]
@@ -118,13 +118,13 @@ def loadMap(token2id_filepath):
return token2id, id2token

def saveMap(id2char, id2label):
with open("char2id", "wb") as outfile:
with open("char2id", "w") as outfile:
for idx in id2char:
outfile.write(id2char[idx] + "\t" + str(idx) + "\r\n")
with open("label2id", "wb") as outfile:
with open("label2id", "w") as outfile:
for idx in id2label:
outfile.write(id2label[idx] + "\t" + str(idx) + "\r\n")
print "saved map between token and id"
print( "saved map between token and id")

def buildMap(train_path="train.in"):
df_train = pd.read_csv(train_path, delimiter='\t', quoting=csv.QUOTE_NONE, skip_blank_lines=False, header=None, names=["char", "label"])
@@ -174,7 +174,7 @@ def getTrain(train_path, val_path, train_val_ratio=0.99, use_custom_val=False, s
X_val = X[int(num_samples * train_val_ratio):]
y_val = y[int(num_samples * train_val_ratio):]

print "train size: %d, validation size: %d" %(len(X_train), len(y_val))
print( "train size: %d, validation size: %d" %(len(X_train), len(y_val)))

return X_train, y_train, X_val, y_val

@@ -202,7 +202,7 @@ def mapFunc(x, char2id):
df_test["char"] = df_test.char.map(lambda x : -1 if str(x) == str(np.nan) else x)
X_test, _ = prepare(df_test["char_id"], df_test["char_id"], seq_max_len)
X_test_str, _ = prepare(df_test["char"], df_test["char_id"], seq_max_len, is_padding=False)
print "test size: %d" %(len(X_test))
print( "test size: %d" %(len(X_test)))
return X_test, X_test_str

def getTransition(y_train_batch):
5 changes: 5 additions & 0 deletions code/bilstm_crf/label2id
@@ -0,0 +1,5 @@
O 1
B 2
M 3
E 4
<PAD> 0
10 changes: 5 additions & 5 deletions code/bilstm_crf/test.py
@@ -25,7 +25,7 @@

start_time = time.time()

print "preparing test data"
print( "preparing test data")
X_test, X_test_str = helper.getTest(test_path=test_path, seq_max_len=num_steps)
char2id, id2char = helper.loadMap("char2id")
label2id, id2label = helper.loadMap("label2id")
@@ -36,20 +36,20 @@
else:
embedding_matrix = None

print "building model"
print( "building model")
config = tf.ConfigProto(allow_soft_placement=True)
with tf.Session(config=config) as sess:
with tf.device(gpu_config):
initializer = tf.random_uniform_initializer(-0.1, 0.1)
with tf.variable_scope("model", reuse=None, initializer=initializer):
model = BILSTM_CRF(num_chars=num_chars, num_classes=num_classes, num_steps=num_steps, embedding_matrix=embedding_matrix, is_training=False)

print "loading model parameter"
print( "loading model parameter")
saver = tf.train.Saver()
saver.restore(sess, model_path)

print "testing"
print( "testing")
model.test(sess, X_test, X_test_str, output_path)

end_time = time.time()
print "time used %f(hour)" % ((end_time - start_time) / 3600)
print( "time used %f(hour)" % ((end_time - start_time) / 3600))
30 changes: 15 additions & 15 deletions code/bilstm_crf/train.py
@@ -3,6 +3,8 @@
import argparse
import numpy as np
import pandas as pd
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
import tensorflow as tf
from BILSTM_CRF import BILSTM_CRF

@@ -23,34 +25,32 @@
val_path = args.val_path
num_epochs = args.epoch
emb_path = args.char_emb
gpu_config = "/gpu:"+str(args.gpu)
# gpu_config = "/gpu:"+str(args.gpu)
num_steps = 200 # must be consistent with the test

start_time = time.time()
print "preparing train and validation data"
X_train, y_train, X_val, y_val = helper.getTrain(train_path=train_path, val_path=val_path, seq_max_len=num_steps)
char2id, id2char = helper.loadMap("char2id")
label2id, id2label = helper.loadMap("label2id")
num_chars = len(id2char.keys())
num_classes = len(id2label.keys())
if emb_path != None:
embedding_matrix = helper.getEmbedding(emb_path)
embedding_matrix = helper.getEmbedding(emb_path)
else:
embedding_matrix = None
embedding_matrix = None

print "building model"
config = tf.ConfigProto(allow_soft_placement=True)
with tf.Session(config=config) as sess:
with tf.device(gpu_config):
initializer = tf.random_uniform_initializer(-0.1, 0.1)
with tf.variable_scope("model", reuse=None, initializer=initializer):
model = BILSTM_CRF(num_chars=num_chars, num_classes=num_classes, num_steps=num_steps, num_epochs=num_epochs, embedding_matrix=embedding_matrix, is_training=True)
# # with tf.device(gpu_config):
initializer = tf.random_uniform_initializer(-0.1, 0.1)
with tf.variable_scope("model", reuse=None, initializer=initializer):
model = BILSTM_CRF(num_chars=num_chars, num_classes=num_classes, num_steps=num_steps, num_epochs=num_epochs, embedding_matrix=embedding_matrix, is_training=True)

print "training model"
tf.initialize_all_variables().run()
model.train(sess, save_path, X_train, y_train, X_val, y_val)
print ("training model")
tf.global_variables_initializer().run()
model.train(sess, save_path, X_train, y_train, X_val, y_val)
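`tf.initialize_all_variables` was deprecated in TF 1.0 in favor of `tf.global_variables_initializer`. A minimal session skeleton with the new call (assuming TF 1.x):

```python
import tensorflow as tf

x = tf.get_variable("x", shape=[2, 2],
                    initializer=tf.random_uniform_initializer(-0.1, 0.1))
with tf.Session() as sess:
    # Replaces the deprecated tf.initialize_all_variables().
    sess.run(tf.global_variables_initializer())
    print(sess.run(x))
```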

print "final best f1 is: %f" % (model.max_f1)
print ("final best f1 is: %f" % (model.max_f1))

end_time = time.time()
print "time used %f(hour)" % ((end_time - start_time) / 3600)
end_time = time.time()
print ("time used %f(hour)" % ((end_time - start_time) / 3600))