Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bert train fix #16

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,15 @@ venv.bak/

# mypy
.mypy_cache/

# bert-embeddings
data/bert_da*

# Data Annotations
data/*.jsonl

# Data symbolic link
data/vcr1images

# Submission File
models/submission.csv
6 changes: 4 additions & 2 deletions data/get_bert_embeddings/extract_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ def _truncate_seq_pair(tokens_a, tokens_b, max_length):
config=run_config,
predict_batch_size=FLAGS.batch_size)

input_fn = input_fn_builder(
input_fn, input_init_hook = input_fn_builder(
features=features, seq_length=FLAGS.max_seq_length)

output_h5_qa = h5py.File(f'../{FLAGS.name}_answer_{FLAGS.split}.h5', 'w')
Expand Down Expand Up @@ -254,7 +254,9 @@ def alignment_gather(alignment, layer):
return output_embs


for result in tqdm(estimator.predict(input_fn, yield_single_examples=True)):
for result in tqdm(estimator.predict(input_fn,
hooks=[input_init_hook],
yield_single_examples=True)):
ind = unique_id_to_ind[int(result["unique_id"])]

text, ctx_alignment, choice_alignment = examples[ind]
Expand Down
60 changes: 40 additions & 20 deletions data/get_bert_embeddings/vcr_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,25 @@ def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids, is_
self.is_correct = is_correct


class IteratorInitializerHook(tf.train.SessionRunHook):
    """Hook that runs a dataset iterator's initializer once the Session exists.

    The owning builder is expected to assign ``iterator_initializer_func`` a
    callable of one argument (the ``tf.Session``); this hook invokes it in
    ``after_create_session`` so that placeholder-fed ``tf.data`` iterators are
    initialized before ``Estimator.predict`` starts pulling batches.
    """

    def __init__(self):
        super(IteratorInitializerHook, self).__init__()
        # BUG FIX: the original wrote `self.__init__iterator_initializer_func`,
        # a typo that name-mangles to a dead private attribute and leaves
        # `iterator_initializer_func` undefined until an external assignment.
        self.iterator_initializer_func = None

    def after_create_session(self, session, coord):
        """Initialise the iterator after the session has been created.

        ``coord`` is unused here; it is part of the SessionRunHook interface.
        Raises ``TypeError`` if ``iterator_initializer_func`` was never set
        (it is ``None`` by default).
        """
        self.iterator_initializer_func(session)


def input_fn_builder(features, seq_length):
"""Creates an `input_fn` closure to be passed to TPUEstimator."""

iterator_initializer_hook = IteratorInitializerHook()

num_examples = len(features)

all_unique_ids = []
all_input_ids = []
all_input_mask = []
Expand All @@ -39,38 +55,42 @@ def input_fn_builder(features, seq_length):
all_input_mask.append(feature.input_mask)
all_input_type_ids.append(feature.input_type_ids)


def input_fn(params):
"""The actual input function."""
batch_size = params["batch_size"]

num_examples = len(features)

# This is for demo purposes and does NOT scale to large data sets. We do
# not use Dataset.from_generator() because that uses tf.py_func which is
# not TPU compatible. The right way to load data is with TFRecordReader.

unique_ids_placeholder = tf.placeholder(tf.int32, [num_examples])
input_ids_placeholder = tf.placeholder(tf.int32, [num_examples, seq_length])
input_mask_placeholder = tf.placeholder(tf.int32, [num_examples, seq_length])
input_type_ids_placeholder = tf.placeholder(tf.int32, [num_examples, seq_length])


d = tf.data.Dataset.from_tensor_slices({
"unique_ids":
tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32),
"input_ids":
tf.constant(
all_input_ids, shape=[num_examples, seq_length],
dtype=tf.int32),
"input_mask":
tf.constant(
all_input_mask,
shape=[num_examples, seq_length],
dtype=tf.int32),
"input_type_ids":
tf.constant(
all_input_type_ids,
shape=[num_examples, seq_length],
dtype=tf.int32),
"unique_ids": unique_ids_placeholder,
"input_ids": input_ids_placeholder,
"input_mask": input_mask_placeholder,
"input_type_ids": input_type_ids_placeholder
})

d = d.batch(batch_size=batch_size, drop_remainder=False)
return d

feed_dict_d = {unique_ids_placeholder:all_unique_ids, input_ids_placeholder: all_input_ids,
input_mask_placeholder: all_input_mask, input_type_ids_placeholder: all_input_type_ids}

iterator = d.make_initializable_iterator()
feats = iterator.get_next()

iterator_initializer_hook.iterator_initializer_func = lambda sess: sess.run(iterator.initializer,
feed_dict=feed_dict_d)

return feats

return input_fn
return input_fn, iterator_initializer_hook


GENDER_NEUTRAL_NAMES = ['Casey', 'Riley', 'Jessie', 'Jackie', 'Avery', 'Jaime', 'Peyton', 'Kerry', 'Jody', 'Kendall',
Expand Down
7 changes: 4 additions & 3 deletions models/eval_for_leaderboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,14 @@
parser.add_argument(
'-answer_ckpt',
dest='answer_ckpt',
default='saves/flagship_answer/best.th',
default='/data/vcr/saves/flagship_answer/best.th',
help='Answer checkpoint',
type=str,
)
parser.add_argument(
'-rationale_ckpt',
dest='rationale_ckpt',
default='saves/flagship_rationale/best.th',
default='/data/vcr/saves/flagship_rationale/best.th',
help='Rationale checkpoint',
type=str,
)
Expand All @@ -71,7 +71,8 @@ def _to_gpu(td):
if NUM_GPUS > 1:
return td
for k in td:
td[k] = {k2: v.cuda(async=True) for k2, v in td[k].items()} if isinstance(td[k], dict) else td[k].cuda(
if k != 'metadata':
td[k] = {k2: v.cuda(async=True) for k2, v in td[k].items()} if isinstance(td[k], dict) else td[k].cuda(
async=True)
return td

Expand Down