diff --git a/README.md b/README.md
index ee7808c..32fe8d2 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,10 @@ After you download our precomputed features/model you can test it using:
 
     python main.py --test --spatial --resume runs/cite_spatial_k4/model_best
 
+You can test on the ReferIt dataset by setting the dataset flag and adjusting the number of embeddings to match the trained model:
+
+    python main.py --test --spatial --dataset referit --num_embeddings 12 --resume runs/referit_spatial_k12/model_best
+
 ### Training New Models
 Our code contains everything required to train or test models using precomputed features. You can train a new model on Flickr30K Entities using:
@@ -27,11 +31,11 @@ When it completes training it will output the localization accuracy using the be
 
 ### Precomputed Features
 
-Along with our example data processing script in `data_processing_example` you can download our precomputed (PASCAL) features for the Flickr30K Entities dataset [here](https://drive.google.com/file/d/1m5DQ3kh2rCkPremgM91chQgJYZxnEbZw/view?usp=sharing) (52G). Unpack the features in a folder named `data` or update the path in the data loader class.
+Along with our example data processing script in `data_processing_example` you can download our precomputed (PASCAL) features for the Flickr30K Entities dataset [here](https://drive.google.com/open?id=10h55xBQnaYAEwODsi8Wy5CEsajAoZuzc) (126G) and for the ReferIt dataset [here](https://drive.google.com/open?id=1tQNG4iUXiGatnbeaO6HV3por7U5WoruH) (88G). Unpack the features in a folder named `data` or update the path in the data loader class.
 
-Our best CITE model on Flickr30K Entities using these precomputed features can be found [here](https://drive.google.com/open?id=1rmeIqYTCIduNc2QWUEdXLHFGrlOzz2xO).
+Our best CITE model using these precomputed features can be found for Flickr30K Entities [here](https://drive.google.com/open?id=1vsFqVPVd3vtYfhYTcCmS3HvHOajTycbo) and for ReferIt [here](https://drive.google.com/open?id=1P9g9C-BjY-DWIptvV80HE-hEbCDMk6jM).
 
-You can download the raw Flickr30K Entities data [here](http://web.engr.illinois.edu/~bplumme2/Flickr30kEntities/), but isn't necessary to use our precomputed features.
+You can download the raw Flickr30K Entities data [here](http://web.engr.illinois.edu/~bplumme2/Flickr30kEntities/) and ReferIt [here](http://tamaraberg.com/referitgame/), but it isn't necessary to use our precomputed features.
 
 Many thanks to [Kevin Shih](https://scholar.google.com/citations?user=4x3DhzAAAAAJ&hl=en) and [Liwei Wang](https://scholar.google.com/citations?user=qnbdnZEAAAAJ&hl=en) for access to their [Similarity Network](https://arxiv.org/abs/1704.03470) code that was used as the basis for this implementation.
\ No newline at end of file
diff --git a/data_loader.py b/data_loader.py
index 4e7e520..d16b0d0 100644
--- a/data_loader.py
+++ b/data_loader.py
@@ -14,7 +14,7 @@ def __init__(self, args, region_dim, phrase_dim, plh, split):
             plh -- placeholder dictionary containing the tensor inputs
             split -- the data split (i.e. 'train', 'test', 'val')
         """
-        datafn = os.path.join('data', 'flickr', '%s_imfeats.h5' % split)
+        datafn = os.path.join('data', args.dataset, '%s_imfeats.h5' % split)
         self.data = h5py.File(datafn, 'r')
         vecs = np.array(self.data['phrase_features'], np.float32)
         phrases = list(self.data['phrases'])
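A note on the `data_loader.py` hunk above: replacing the hard-coded `'flickr'` path component with the `--dataset` flag is what lets the new README commands find `data/referit/`. Below is a minimal standalone sketch of the resulting lookup; the function name and the example call are illustrative, not part of the repo:

    import os
    import h5py
    import numpy as np

    def load_split_features(dataset, split):
        """Open data/<dataset>/<split>_imfeats.h5 the way the patched loader does.

        dataset -- value of the --dataset flag (e.g. 'flickr' or 'referit')
        split   -- 'train', 'test', or 'val'
        """
        datafn = os.path.join('data', dataset, '%s_imfeats.h5' % split)
        data = h5py.File(datafn, 'r')
        # Mirrors the loader: phrase features and their keys are read up front.
        vecs = np.array(data['phrase_features'], np.float32)
        phrases = list(data['phrases'])
        return data, vecs, phrases

    # e.g. load_split_features('referit', 'test') expects data/referit/test_imfeats.h5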
diff --git a/data_processing_example/README.md b/data_processing_example/README.md
index 7679da8..5b66322 100644
--- a/data_processing_example/README.md
+++ b/data_processing_example/README.md
@@ -5,7 +5,7 @@ The code currently assumes datasets are divided into three hdf5 files named `<split>_imfeats.h5`
 
-4. Each `<image>` should return a #num_boxes x feature_dimensional matrix of the visual features. The features should contain the visual representation as well as the spatial features for the box followed by its coordinates (i.e. the precomputed features we released are 4096 (VGG) + 5 (spatial) + 4 (box coordinates) = 4105 dimensional).
+4. Each `<image>` should return a #num_boxes x feature_dimensional matrix of the visual features. The features should contain the visual representation as well as the spatial features for the box followed by its coordinates (i.e. the precomputed features we released are 4096 (VGG) + 5 (spatial) + 4 (box coordinates) = 4105 dimensional for Flickr30K Entities and 4096 (VGG) + 8 (spatial) + 4 (box coordinates) = 4108 dimensional for ReferIt).
 
 5. Each `<image>_<phrase>_<pair index>` should contain a vector containing the intersection over union with the ground truth box followed by the box's coordinates (i.e. for N boxes the vector should be N + 4 dimensional).
diff --git a/main.py b/main.py
index 97e6f92..08b5fd4 100644
--- a/main.py
+++ b/main.py
@@ -41,8 +41,8 @@
                     help='minimum testing intersection-over-union threshold for success (default: 0.5)')
 parser.add_argument('--dim_embed', type=int, default=256,
                     help='how many dimensions in final embedding (default: 256)')
-parser.add_argument('--max_boxes', type=int, default=200,
-                    help='maximum number of edge boxes per image (default: 200)')
+parser.add_argument('--max_boxes', type=int, default=500,
+                    help='maximum number of edge boxes per image (default: 500)')
 parser.add_argument('--num_embeddings', type=int, default=4,
                     help='number of embeddings to train (default: 4)')
 parser.add_argument('--spatial', dest='spatial', action='store_true', default=False,
@@ -62,10 +62,10 @@ def main():
         region_feature_dim += 8
 
     # setup placeholders
-    labels_plh = tf.placeholder(tf.float32, shape=[args.batch_size, None])
+    labels_plh = tf.placeholder(tf.float32, shape=[args.batch_size, args.max_boxes])
     phrase_plh = tf.placeholder(tf.float32, shape=[args.batch_size, phrase_feature_dim])
-    region_plh = tf.placeholder(tf.float32, shape=[args.batch_size, None,
+    region_plh = tf.placeholder(tf.float32, shape=[args.batch_size, args.max_boxes,
                                                    region_feature_dim])
     train_phase_plh = tf.placeholder(tf.bool, name='train_phase')
     num_boxes_plh = tf.placeholder(tf.int32)
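A toy sketch of the HDF5 layout the data-processing README describes (the file path, the `image_001` key, and the item-5 key format here are hypothetical, since the original placeholder names were garbled): item 4 is one feature matrix per image, and item 5 is one IoU-plus-box vector per (image, phrase) pair. Note also that fixing the placeholder shapes to `args.max_boxes` in `main.py` means each image's region features must be padded or truncated to exactly that many boxes before being fed.

    import h5py
    import numpy as np

    num_boxes = 3
    feat_dim = 4096 + 5 + 4  # VGG + spatial + box coordinates = 4105 (Flickr30K Entities)

    with h5py.File('toy_val_imfeats.h5', 'w') as f:
        # Item 4: a #num_boxes x feature_dim matrix stored under each image key.
        f.create_dataset('image_001',
                         data=np.zeros((num_boxes, feat_dim), dtype=np.float32))

        # Item 5: N intersection-over-union values followed by the ground truth
        # box's 4 coordinates, i.e. an (N + 4)-dimensional vector per phrase.
        ious = np.random.rand(num_boxes).astype(np.float32)
        gt_box = np.array([10., 20., 110., 220.], dtype=np.float32)
        f.create_dataset('image_001_a_phrase_0',
                         data=np.concatenate([ious, gt_box]))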
diff --git a/model.py b/model.py
index a2a4aa3..2fa387b 100644
--- a/model.py
+++ b/model.py
@@ -1,10 +1,5 @@
 import tensorflow as tf
 
-from tensorflow.contrib.layers.python.layers import batch_norm
-from tensorflow.contrib.layers.python.layers import convolution2d
-from tensorflow.contrib.layers.python.layers import fully_connected
-from tensorflow.contrib.layers.python.layers import l2_regularizer
-
 def add_fc(x, outdim, train_phase_plh, scope_in):
     """Returns the output of a FC-BNORM-ReLU sequence.
@@ -62,9 +57,10 @@ def embedding_branch(x, embed_dim, train_phase_plh, scope_in, do_l2norm = True,
         outdim = embed_dim
 
     l2_reg = tf.contrib.layers.l2_regularizer(0.001)
-    embed_fc2 = fully_connected(embed_fc1, outdim, activation_fn = None,
-                                weights_regularizer = l2_reg,
-                                scope = scope_in + '_embed_2')
+    embed_fc2 = tf.contrib.layers.fully_connected(embed_fc1, outdim,
+                                                  activation_fn = None,
+                                                  weights_regularizer = l2_reg,
+                                                  scope = scope_in + '_embed_2')
     if do_l2norm:
         embed_fc2 = tf.nn.l2_normalize(embed_fc2, 1)
@@ -88,9 +84,6 @@ def setup_model(args, phrase_plh, region_plh, train_phase_plh, labels_plh, num_b
         concept_loss -- L1 loss for the output of the concept weight branch
         region_prob -- each row contains the probability a region is associated with a phrase
     """
-    labels_plh = tf.reshape(labels_plh, [-1, num_boxes_plh])
-    eb_fea_plh = tf.reshape(region_plh, [-1, num_boxes_plh, region_feature_dim])
-
     final_embed = args.dim_embed
     embed_dim = final_embed * 4
     phrase_embed = embedding_branch(phrase_plh, embed_dim, train_phase_plh, 'phrase')
@@ -108,9 +101,9 @@ def setup_model(args, phrase_plh, region_plh, train_phase_plh, labels_plh, num_b
                                              concept_id, concept_weights)
     joint_embed_2 = tf.reshape(joint_embed_2, [tf.shape(joint_embed_2)[0],
                                                num_boxes_plh, final_embed])
-    joint_embed_3 = fully_connected(joint_embed_2, 1, activation_fn=None,
-                                    weights_regularizer = l2_regularizer(0.005),
-                                    scope = 'joint_embed_3')
+    joint_embed_3 = tf.contrib.layers.fully_connected(joint_embed_2, 1, activation_fn=None,
+                                                      weights_regularizer = tf.contrib.layers.l2_regularizer(0.005),
+                                                      scope = 'joint_embed_3')
     joint_embed_3 = tf.squeeze(joint_embed_3, [2])
     region_prob = 1. / (1. + tf.exp(-joint_embed_3))
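Two details of the final hunk worth noting: `tf.contrib.layers.fully_connected` applies its weights to the last axis of a rank-3 input, so the 1-unit layer yields one score per (phrase, box) pair, and `1. / (1. + tf.exp(-x))` is the logistic sigmoid, so `region_prob` could equivalently use `tf.sigmoid`. A minimal TF 1.x sketch; the shapes and the `joint_embed_3_demo` scope are made up for illustration:

    import numpy as np
    import tensorflow as tf  # TF 1.x, matching the repo's tf.contrib usage

    batch, num_boxes, final_embed = 2, 5, 16
    joint_embed_2 = tf.placeholder(tf.float32, [batch, num_boxes, final_embed])

    # A 1-unit fully connected layer scores every (phrase, box) pair; the
    # trailing singleton dimension is then squeezed away, as in the hunk.
    joint_embed_3 = tf.contrib.layers.fully_connected(
        joint_embed_2, 1, activation_fn=None,
        weights_regularizer=tf.contrib.layers.l2_regularizer(0.005),
        scope='joint_embed_3_demo')
    joint_embed_3 = tf.squeeze(joint_embed_3, [2])

    # Hand-written logistic sigmoid, identical to tf.sigmoid(joint_embed_3).
    region_prob = 1. / (1. + tf.exp(-joint_embed_3))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        feats = np.random.rand(batch, num_boxes, final_embed).astype(np.float32)
        probs = sess.run(region_prob, {joint_embed_2: feats})
        print(probs.shape)  # (2, 5)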