Removed some unnecessary imports, updated parameters for most recent …

…arXiv version, and added best model for the ReferIt dataset
BryanPlummer · Mar 21, 2018 · 5dd1a08 · 5dd1a08
1 parent 73b3ded
commit 5dd1a08
Show file tree

Hide file tree

Showing 5 changed files with 20 additions and 23 deletions.
diff --git a/README.md b/README.md
@@ -16,6 +16,10 @@ After you download our precomputed features/model you can test it using:
 
     python main.py --test --spatial --resume runs/cite_spatial_k4/model_best
 
+You can test on the ReferIt dataset by setting the `--dataset` flag and adjusting the number of embeddings to match the trained model:
+
+    python main.py --test --spatial --dataset referit --num_embeddings 12 --resume runs/referit_spatial_k12/model_best
+
 ### Training New Models
 Our code contains everything required to train or test models using precomputed features.  You can train a new model on Flickr30K Entities using:
 
@@ -27,11 +31,11 @@ When it completes training it will output the localization accuracy using the be
 
 ### Precomputed Features
 
-Along with our example data processing script in `data_processing_example` you can download our precomputed (PASCAL) features for the Flickr30K Entities dataset [here](https://drive.google.com/file/d/1m5DQ3kh2rCkPremgM91chQgJYZxnEbZw/view?usp=sharing) (52G).  Unpack the features in a folder named `data` or update the path in the data loader class.
+Along with our example data processing script in `data_processing_example` you can download our precomputed (PASCAL) features for the Flickr30K Entities dataset [here](https://drive.google.com/open?id=10h55xBQnaYAEwODsi8Wy5CEsajAoZuzc) (126G) and for the ReferIt dataset [here](https://drive.google.com/open?id=1tQNG4iUXiGatnbeaO6HV3por7U5WoruH) (88G).  Unpack the features in a folder named `data` or update the path in the data loader class.
 
-Our best CITE model on Flickr30K Entities using these precomputed features can be found [here](https://drive.google.com/open?id=1rmeIqYTCIduNc2QWUEdXLHFGrlOzz2xO).
+Our best CITE models using these precomputed features can be found [here](https://drive.google.com/open?id=1vsFqVPVd3vtYfhYTcCmS3HvHOajTycbo) for Flickr30K Entities and [here](https://drive.google.com/open?id=1P9g9C-BjY-DWIptvV80HE-hEbCDMk6jM) for the ReferIt dataset.
 
-You can download the raw Flickr30K Entities data [here](http://web.engr.illinois.edu/~bplumme2/Flickr30kEntities/), but isn't necessary to use our precomputed features.
+You can download the raw Flickr30K Entities data [here](http://web.engr.illinois.edu/~bplumme2/Flickr30kEntities/) and the raw ReferIt data [here](http://tamaraberg.com/referitgame/), but this isn't necessary to use our precomputed features.
 
 
 Many thanks to [Kevin Shih](https://scholar.google.com/citations?user=4x3DhzAAAAAJ&hl=en) and [Liwei Wang](https://scholar.google.com/citations?user=qnbdnZEAAAAJ&hl=en) for access to their [Similarity Network](https://arxiv.org/abs/1704.03470) code that was used as the basis for this implementation.
diff --git a/data_loader.py b/data_loader.py
@@ -14,7 +14,7 @@ def __init__(self, args, region_dim, phrase_dim, plh, split):
         plh -- placeholder dictionary containing the tensor inputs
         split -- the data split (i.e. 'train', 'test', 'val')
         """
-        datafn = os.path.join('data', 'flickr', '%s_imfeats.h5' % split)
+        datafn = os.path.join('data', args.dataset, '%s_imfeats.h5' % split)
         self.data = h5py.File(datafn, 'r')
         vecs = np.array(self.data['phrase_features'], np.float32)
         phrases = list(self.data['phrases'])

diff --git a/data_processing_example/README.md b/data_processing_example/README.md
@@ -5,7 +5,7 @@ The code currently assumes datasets are divided into three hdf5 files named `<sp
 1. phrase_features: #num_phrase X 6000 dimensional matrix of phrase features
 2. phrases: array of #num_phrase strings corresponding to the phrase features
 3. pairs: 3 x M matrix where each column contains a string representation for the `[image name, phrase, pair identifier]` pairs in the split.
-4. Each `<image name>` should return a #num_boxes x feature_dimensional matrix of the visual features.  The features should contain the visual representation as well as the spatial features for the box followed by its coordinates (i.e. the precomputed features we released are 4096 (VGG) + 5 (spatial) + 4 (box coordinates) = 4105 dimensional).
+4. Each `<image name>` should return a #num_boxes x feature_dimensional matrix of the visual features.  The features should contain the visual representation as well as the spatial features for the box followed by its coordinates (i.e. the precomputed features we released are 4096 (VGG) + 5 (spatial) + 4 (box coordinates) = 4105 dimensional for Flickr30K Entities and 4096 (VGG) + 8 (spatial) + 4 (box coordinates) = 4108 dimensional for ReferIt).
 5. Each `<image name>_<phrase>_<pair identifier>` should contain a vector containing the intersection over union with the ground truth box followed by the box's coordinates (i.e. for N boxes the vector should be N + 4 dimensional).
 
 

diff --git a/main.py b/main.py
@@ -41,8 +41,8 @@
                     help='minimum testing intersection-over-union threshold for success (default: 0.5)')
 parser.add_argument('--dim_embed', type=int, default=256,
                     help='how many dimensions in final embedding (default: 256)')
-parser.add_argument('--max_boxes', type=int, default=200,
-                    help='maximum number of edge boxes per image (default: 200)')
+parser.add_argument('--max_boxes', type=int, default=500,
+                    help='maximum number of edge boxes per image (default: 500)')
 parser.add_argument('--num_embeddings', type=int, default=4,
                     help='number of embeddings to train (default: 4)')
 parser.add_argument('--spatial', dest='spatial', action='store_true', default=False,
@@ -62,10 +62,10 @@ def main():
             region_feature_dim += 8
 
     # setup placeholders
-    labels_plh = tf.placeholder(tf.float32, shape=[args.batch_size, None])
+    labels_plh = tf.placeholder(tf.float32, shape=[args.batch_size, args.max_boxes])
     phrase_plh = tf.placeholder(tf.float32, shape=[args.batch_size,
                                                    phrase_feature_dim])
-    region_plh = tf.placeholder(tf.float32, shape=[args.batch_size, None,
+    region_plh = tf.placeholder(tf.float32, shape=[args.batch_size, args.max_boxes,
                                                    region_feature_dim])
     train_phase_plh = tf.placeholder(tf.bool, name='train_phase')
     num_boxes_plh = tf.placeholder(tf.int32)

diff --git a/model.py b/model.py
@@ -1,10 +1,5 @@
 import tensorflow as tf
 
-from tensorflow.contrib.layers.python.layers import batch_norm
-from tensorflow.contrib.layers.python.layers import convolution2d
-from tensorflow.contrib.layers.python.layers import fully_connected
-from tensorflow.contrib.layers.python.layers import l2_regularizer
-
 def add_fc(x, outdim, train_phase_plh, scope_in):
     """Returns the output of a FC-BNORM-ReLU sequence.
 
@@ -62,9 +57,10 @@ def embedding_branch(x, embed_dim, train_phase_plh, scope_in, do_l2norm = True,
         outdim = embed_dim
 
     l2_reg = tf.contrib.layers.l2_regularizer(0.001)
-    embed_fc2 = fully_connected(embed_fc1, outdim, activation_fn = None,
-                                weights_regularizer = l2_reg,
-                                scope = scope_in + '_embed_2')
+    embed_fc2 = tf.contrib.layers.fully_connected(embed_fc1, outdim, 
+                                                  activation_fn = None,
+                                                  weights_regularizer = l2_reg,
+                                                  scope = scope_in + '_embed_2')
     if do_l2norm:
         embed_fc2 = tf.nn.l2_normalize(embed_fc2, 1)
 
@@ -88,9 +84,6 @@ def setup_model(args, phrase_plh, region_plh, train_phase_plh, labels_plh, num_b
     concept_loss -- L1 loss for the output of the concept weight branch
     region_prob -- each row contains the probability a region is associated with a phrase
     """
-    labels_plh = tf.reshape(labels_plh, [-1, num_boxes_plh])
-    eb_fea_plh = tf.reshape(region_plh, [-1, num_boxes_plh, region_feature_dim])
-
     final_embed = args.dim_embed
     embed_dim = final_embed * 4
     phrase_embed = embedding_branch(phrase_plh, embed_dim, train_phase_plh, 'phrase')
@@ -108,9 +101,9 @@ def setup_model(args, phrase_plh, region_plh, train_phase_plh, labels_plh, num_b
                                        concept_id, concept_weights)
 
     joint_embed_2 = tf.reshape(joint_embed_2, [tf.shape(joint_embed_2)[0], num_boxes_plh, final_embed])
-    joint_embed_3 = fully_connected(joint_embed_2, 1, activation_fn=None ,
-                                    weights_regularizer = l2_regularizer(0.005),
-                                    scope = 'joint_embed_3')
+    joint_embed_3 = tf.contrib.layers.fully_connected(joint_embed_2, 1, activation_fn=None ,
+                                                      weights_regularizer = tf.contrib.layers.l2_regularizer(0.005),
+                                                      scope = 'joint_embed_3')
     joint_embed_3 = tf.squeeze(joint_embed_3, [2])
     region_prob = 1. / (1. + tf.exp(-joint_embed_3))