From d771232f24c9ba6076dff620f51405ff27de19e5 Mon Sep 17 00:00:00 2001
From: William Song
Date: Wed, 19 Nov 2014 19:10:47 -0800
Subject: [PATCH] bb detection working

---
 models/brody/deploy.prototxt          | 238 ++++++++++++------
 models/brody/solver.prototxt          |   2 +-
 models/brody/train_val_brody.prototxt | 111 +++++---
 .../train_val.prototxt                |  10 +-
 python/caffe/classifier.py            |  19 ++
 src/caffe/solver.cpp                  |  43 ++--
 tools/convert_detection_label.cpp     |  40 ++-
 7 files changed, 322 insertions(+), 141 deletions(-)

diff --git a/models/brody/deploy.prototxt b/models/brody/deploy.prototxt
index 14634682829..85e72f10774 100644
--- a/models/brody/deploy.prototxt
+++ b/models/brody/deploy.prototxt
@@ -1,13 +1,15 @@
-# This is not brody's net.
-name: "AlexNet"
+name: "BrodyNet"
 input: "data"
 input_dim: 10
 input_dim: 3
-input_dim: 227
-input_dim: 227
+input_dim: 480
+input_dim: 640
+
 layers {
   name: "conv1"
   type: CONVOLUTION
+  bottom: "data"
+  top: "conv1"
   blobs_lr: 1
   blobs_lr: 2
   weight_decay: 1
@@ -16,9 +18,15 @@ layers {
     num_output: 96
     kernel_size: 11
     stride: 4
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
   }
-  bottom: "data"
-  top: "conv1"
 }
 layers {
   name: "relu1"
   type: RELU
   bottom: "conv1"
   top: "conv1"
 }
-layers {
-  name: "norm1"
-  type: LRN
-  lrn_param {
-    local_size: 5
-    alpha: 0.0001
-    beta: 0.75
-  }
-  bottom: "conv1"
-  top: "norm1"
-}
 layers {
   name: "pool1"
   type: POOLING
+  bottom: "conv1"
+  top: "pool1"
   pooling_param {
     pool: MAX
     kernel_size: 3
     stride: 2
   }
-  bottom: "norm1"
-  top: "pool1"
+}
+layers {
+  name: "norm1"
+  type: LRN
+  bottom: "pool1"
+  top: "norm1"
+  lrn_param {
+    local_size: 5
+    alpha: 0.0001
+    beta: 0.75
+  }
 }
 layers {
   name: "conv2"
   type: CONVOLUTION
+  bottom: "norm1"
+  top: "conv2"
   blobs_lr: 1
   blobs_lr: 2
   weight_decay: 1
@@ -60,9 +70,15 @@ layers {
     num_output: 256
     pad: 2
     kernel_size: 5
     group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
   }
-  bottom: "pool1"
-  top: "conv2"
 }
 layers {
   name: "relu2"
   type: RELU
   bottom: "conv2"
   top: "conv2"
 }
-layers {
-  name: "norm2"
-  type: LRN
-  lrn_param {
-    local_size: 5
-    alpha: 0.0001
-    beta: 0.75
-  }
-  bottom: "conv2"
-  top: "norm2"
-}
 layers {
   name: "pool2"
   type: POOLING
+  bottom: "conv2"
+  top: "pool2"
   pooling_param {
     pool: MAX
     kernel_size: 3
     stride: 2
   }
-  bottom: "norm2"
-  top: "pool2"
+}
+layers {
+  name: "norm2"
+  type: LRN
+  bottom: "pool2"
+  top: "norm2"
+  lrn_param {
+    local_size: 5
+    alpha: 0.0001
+    beta: 0.75
+  }
 }
 layers {
   name: "conv3"
   type: CONVOLUTION
+  bottom: "norm2"
+  top: "conv3"
   blobs_lr: 1
   blobs_lr: 2
   weight_decay: 1
@@ -103,9 +121,15 @@ layers {
     num_output: 384
     pad: 1
     kernel_size: 3
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
   }
-  bottom: "pool2"
-  top: "conv3"
 }
 layers {
   name: "relu3"
   type: RELU
   bottom: "conv3"
   top: "conv3"
 }
@@ -116,6 +140,8 @@ layers {
 layers {
   name: "conv4"
   type: CONVOLUTION
+  bottom: "conv3"
+  top: "conv4"
   blobs_lr: 1
   blobs_lr: 2
   weight_decay: 1
@@ -125,9 +151,15 @@ layers {
     num_output: 384
     pad: 1
     kernel_size: 3
     group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
   }
-  bottom: "conv3"
-  top: "conv4"
 }
 layers {
   name: "relu4"
   type: RELU
   bottom: "conv4"
   top: "conv4"
 }
@@ -138,6 +170,8 @@ layers {
 layers {
   name: "conv5"
   type: CONVOLUTION
+  bottom: "conv4"
+  top: "conv5"
   blobs_lr: 1
   blobs_lr: 2
   weight_decay: 1
@@ -147,9 +181,15 @@ layers {
     num_output: 256
     pad: 1
     kernel_size: 3
     group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
   }
-  bottom: "conv4"
-  top: "conv5"
 }
 layers {
   name: "relu5"
   type: RELU
   bottom: "conv5"
   top: "conv5"
 }
@@ -160,86 +200,140 @@ layers {
 layers {
   name: "pool5"
   type: POOLING
+  bottom: "conv5"
+  top: "pool5"
   pooling_param {
     pool: MAX
     kernel_size: 3
     stride: 2
   }
-  bottom: "conv5"
-  top: "pool5"
 }
 layers {
-  name: "fc6"
-  type: INNER_PRODUCT
+  name: "fc6-conv"
+  type: CONVOLUTION
+  bottom: "pool5"
+  top: "fc6-conv"
   blobs_lr: 1
   blobs_lr: 2
   weight_decay: 1
   weight_decay: 0
-  inner_product_param {
+  convolution_param {
     num_output: 4096
+    kernel_size: 6
+    pad: 3
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 1
+    }
   }
-  bottom: "pool5"
-  top: "fc6"
 }
 layers {
   name: "relu6"
   type: RELU
-  bottom: "fc6"
-  top: "fc6"
+  bottom: "fc6-conv"
+  top: "fc6-conv"
 }
 layers {
   name: "drop6"
   type: DROPOUT
+  bottom: "fc6-conv"
+  top: "fc6-conv"
   dropout_param {
-    dropout_ratio: 0.5
+    dropout_ratio: 0.0
   }
-  bottom: "fc6"
-  top: "fc6"
 }
 layers {
-  name: "fc7"
-  type: INNER_PRODUCT
+  name: "fc7-conv"
+  type: CONVOLUTION
+  bottom: "fc6-conv"
+  top: "fc7-conv"
   blobs_lr: 1
   blobs_lr: 2
   weight_decay: 1
   weight_decay: 0
-  inner_product_param {
+  convolution_param {
     num_output: 4096
+    kernel_size: 1
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 1
+    }
   }
-  bottom: "fc6"
-  top: "fc7"
 }
 layers {
   name: "relu7"
   type: RELU
-  bottom: "fc7"
-  top: "fc7"
+  bottom: "fc7-conv"
+  top: "fc7-conv"
 }
 layers {
   name: "drop7"
   type: DROPOUT
+  bottom: "fc7-conv"
+  top: "fc7-conv"
   dropout_param {
-    dropout_ratio: 0.5
+    dropout_ratio: 0.0
   }
-  bottom: "fc7"
-  top: "fc7"
 }
+
 layers {
-  name: "fc8"
-  type: INNER_PRODUCT
+  name: "bb-output"
+  type: CONVOLUTION
+  bottom: "fc7-conv"
+  top: "bb-output"
   blobs_lr: 1
   blobs_lr: 2
   weight_decay: 1
   weight_decay: 0
-  inner_product_param {
-    num_output: 1000
+  convolution_param {
+    num_output: 64
+    kernel_size: 1
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 1.0
+    }
+  }
+}
+
+layers {
+  name: "pixel-conv"
+  type: CONVOLUTION
+  bottom: "fc7-conv"
+  top: "pixel-conv"
+  blobs_lr: 1
+  blobs_lr: 2
+  weight_decay: 1
+  weight_decay: 0
+  convolution_param {
+    num_output: 16
+    kernel_size: 1
+    weight_filler {
+      type: "gaussian"
+      std: 0.005
+    }
+    bias_filler {
+      type: "constant"
+      value: 1.0
+    }
   }
-  bottom: "fc7"
-  top: "fc8"
 }
+
+# Pixel level logistic prediction.
 layers {
-  name: "prob"
-  type: SOFTMAX
-  bottom: "fc8"
-  top: "prob"
+  name: "pixel-prob"
+  type: SIGMOID
+  bottom: "pixel-conv"
+  top: "pixel-prob"
 }
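The deploy net above now takes full 640x480 frames and converts the fully connected head into convolutions, so it emits a prediction grid instead of a single vector. A quick sanity check of that grid size (a sketch, not part of the patch; it assumes Caffe's output-size arithmetic, where convolutions round down and poolings round up):

    # Trace spatial dims through the layer stack above (Python, illustrative).
    import math

    def conv(h, w, k, s=1, p=0):
        # Caffe convolution output size (floor).
        return (h + 2 * p - k) // s + 1, (w + 2 * p - k) // s + 1

    def pool(h, w, k, s):
        # Caffe pooling output size (ceil).
        return (int(math.ceil((h - k) / float(s))) + 1,
                int(math.ceil((w - k) / float(s))) + 1)

    h, w = 480, 640
    h, w = conv(h, w, k=11, s=4)  # conv1 -> 118 x 158
    h, w = pool(h, w, k=3, s=2)   # pool1 -> 59 x 79
    h, w = conv(h, w, k=5, p=2)   # conv2 -> 59 x 79
    h, w = pool(h, w, k=3, s=2)   # pool2 -> 29 x 39
    h, w = conv(h, w, k=3, p=1)   # conv3, conv4, conv5 -> 29 x 39
    h, w = pool(h, w, k=3, s=2)   # pool5 -> 14 x 19
    h, w = conv(h, w, k=6, p=3)   # fc6-conv -> 15 x 20
    print(h, w)  # (15, 20); the 1x1 layers keep this grid, which is the
                 # "20 * 15" in the loss comments in train_val_brody.prototxt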
diff --git a/models/brody/solver.prototxt b/models/brody/solver.prototxt
index 347fb123911..56bf6f47fe9 100644
--- a/models/brody/solver.prototxt
+++ b/models/brody/solver.prototxt
@@ -7,7 +7,7 @@ gamma: 0.1
 stepsize: 100000
 display: 20
 max_iter: 1450000
-momentum: 0.
+momentum: 0.9
 weight_decay: 0.00005
 snapshot: 10000
 snapshot_prefix: "models/brody/caffe_brody_train"
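Momentum was effectively disabled before; 0.9 is the usual AlexNet-style setting. For reference, a minimal sketch of the SGD update Caffe applies per learnable blob with these solver settings (illustrative Python, not repository code):

    # One SGD step with momentum and weight decay (illustrative).
    def sgd_step(w, grad, history, lr, momentum=0.9, weight_decay=0.00005):
        # Weight decay enters as an extra gradient term; momentum carries a
        # decaying average of past updates forward in the history buffer.
        history = momentum * history + lr * (grad + weight_decay * w)
        return w - history, history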
diff --git a/models/brody/train_val_brody.prototxt b/models/brody/train_val_brody.prototxt
index cad276f2ee2..085ccdf9a94 100644
--- a/models/brody/train_val_brody.prototxt
+++ b/models/brody/train_val_brody.prototxt
@@ -65,9 +65,13 @@ layers {
   bottom: "label"
   top: "pixel-label"
   top: "bb-label"
+  top: "height-label"
+  top: "norm-label"
   slice_param {
     slice_dim: 1
     slice_point: 16
+    slice_point: 80
+    slice_point: 96
   }
 }
@@ -86,6 +90,19 @@
   }
 }
 
+layers {
+  name: "height-block"
+  type: CONCAT
+  bottom: "height-label"
+  bottom: "height-label"
+  bottom: "height-label"
+  bottom: "height-label"
+  top: "height-block"
+  concat_param {
+    concat_dim: 1
+  }
+}
+
 layers {
   name: "conv1"
   type: CONVOLUTION
@@ -105,7 +122,7 @@ layers {
     }
     bias_filler {
       type: "constant"
-      value: 0
+      value: 0.1
     }
   }
 }
@@ -150,14 +167,14 @@ layers {
     num_output: 256
     pad: 2
     kernel_size: 5
-    group: 1
+    group: 2
     weight_filler {
       type: "gaussian"
       std: 0.01
     }
     bias_filler {
       type: "constant"
-      value: 1
+      value: 0.1
     }
   }
 }
@@ -208,7 +225,7 @@ layers {
     }
     bias_filler {
       type: "constant"
-      value: 0
+      value: 0.1
     }
   }
 }
@@ -231,14 +248,14 @@ layers {
     num_output: 384
     pad: 1
     kernel_size: 3
-    group: 1
+    group: 2
     weight_filler {
       type: "gaussian"
       std: 0.01
     }
     bias_filler {
       type: "constant"
-      value: 1
+      value: 0.1
     }
   }
 }
@@ -261,14 +278,14 @@ layers {
     num_output: 256
     pad: 1
     kernel_size: 3
-    group: 1
+    group: 2
     weight_filler {
       type: "gaussian"
       std: 0.01
     }
     bias_filler {
       type: "constant"
-      value: 1
+      value: 0.1
     }
   }
 }
@@ -304,7 +321,7 @@ layers {
     pad: 3
     weight_filler {
       type: "gaussian"
-      std: 0.005
+      std: 0.01
     }
     bias_filler {
       type: "constant"
@@ -324,7 +341,7 @@ layers {
   bottom: "fc6-conv"
   top: "fc6-conv"
   dropout_param {
-    dropout_ratio: 0.5
+    dropout_ratio: 0.0
   }
 }
 layers {
@@ -341,7 +358,7 @@ layers {
     kernel_size: 1
     weight_filler {
       type: "gaussian"
-      std: 0.005
+      std: 0.01
     }
     bias_filler {
       type: "constant"
@@ -361,7 +378,7 @@ layers {
   bottom: "fc7-conv"
   top: "fc7-conv"
   dropout_param {
-    dropout_ratio: 0.5
+    dropout_ratio: 0.0
   }
 }
 
@@ -370,16 +387,16 @@ layers {
   type: CONVOLUTION
   bottom: "fc7-conv"
   top: "bb-output"
-  blobs_lr: 0.001
-  blobs_lr: 2
-  weight_decay: 1
+  blobs_lr: 100
+  blobs_lr: 200
+  weight_decay: 0.00001
   weight_decay: 0
   convolution_param {
     num_output: 64
     kernel_size: 1
     weight_filler {
       type: "gaussian"
-      std: 0.005
+      std: 0.01
     }
     bias_filler {
       type: "constant"
@@ -441,37 +458,61 @@ layers {
 }
 
 # Squared loss on the bounding boxes.
+#layers {
+#  name: "bb-loss"
+#  type: EUCLIDEAN_LOSS
+#  bottom: "bb-masked-output"
+#  bottom: "bb-label"
+#  top: "bb-loss"
+#  loss_weight: 0.01
+#}
+
+# L1 error loss
 layers {
-  name: "bb-loss"
-  type: EUCLIDEAN_LOSS
+  name: "bb-diff"
+  type: ELTWISE
   bottom: "bb-masked-output"
   bottom: "bb-label"
-  top: "bb-loss"
+  eltwise_param {
+    operation: SUM
+    coeff: 1.0
+    coeff: -1.0
+  }
+  top: "bb-diff"
 }
 
-# L1 error loss
-#layers {
-#  name: "bb-diff"
-#  type: ELTWISE
-#  bottom: "bb-label"
-#  bottom: "bb-masked-output"
-#  eltwise_param {
-#    operation: SUM
-#    coeff: 1.0
-#    coeff: -1.0
-#  }
-#  top: "bb-diff"
-#}
-
 #layers {
 #  name: "bb-loss"
 #  type: ABSVAL
 #  bottom: "bb-diff"
 #  top: "bb-loss"
-#  # 1 / (batch_size * 20 * 15 * 64)
-#  loss_weight: 0.00001041666666666
+#  # 1 / (20 * 15 * 64)
+#  loss_weight: 0.00000000001
 #}
 
+layers {
+  name: "bb-loss-pow2"
+  type: POWER
+  bottom: "bb-diff"
+  top: "bb-loss-pow2"
+  # 1 / (20 * 15 * 64)
+  power_param {
+    power: 2
+  }
+}
+
+layers {
+  name: "bb-loss-normalize"
+  type: ELTWISE
+  bottom: "bb-loss-pow2"
+  bottom: "height-block"
+  eltwise_param {
+    operation: PROD
+  }
+  top: "bb-loss"
+  loss_weight: 0.001
+}
+
 #layers {
 #  name: "bb-loss-silence"
 #  type: SILENCE
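The new loss stack composes a height-normalized squared error out of primitive layers: bb-diff is an ELTWISE SUM with coefficients (1, -1), i.e. a subtraction; bb-loss-pow2 squares it; and bb-loss-normalize multiplies elementwise by height-block, the CONCAT defined earlier that tiles the 16 height-label channels (1/box-height inside a box, 0 elsewhere) four times to line up with the 64 bb channels. A numpy sketch of the objective this computes (illustrative only; shapes assume batch size N and the 20x15 grid):

    import numpy as np

    def bb_loss(bb_masked_output, bb_label, height_block, loss_weight=0.001):
        # bb-diff: ELTWISE SUM with coeffs (1, -1), i.e. a subtraction.
        diff = bb_masked_output - bb_label        # (N, 64, 15, 20)
        # bb-loss-pow2: POWER layer with power 2.
        sq = diff ** 2
        # bb-loss-normalize: ELTWISE PROD with height-block; Caffe then sums
        # the top blob and scales by loss_weight to form the scalar loss.
        return loss_weight * np.sum(sq * height_block)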
diff --git a/models/bvlc_reference_caffenet/train_val.prototxt b/models/bvlc_reference_caffenet/train_val.prototxt
index 073d8aeff4a..cdcfb61736c 100644
--- a/models/bvlc_reference_caffenet/train_val.prototxt
+++ b/models/bvlc_reference_caffenet/train_val.prototxt
@@ -5,7 +5,7 @@ layers {
   top: "data"
   top: "label"
   data_param {
-    source: "examples/imagenet/ilsvrc12_train_lmdb"
+    source: "/deep/u/willsong/data/ilsvrc12_train_lmdb"
     backend: LMDB
     batch_size: 256
   }
@@ -22,7 +22,7 @@ layers {
   top: "data"
   top: "label"
   data_param {
-    source: "examples/imagenet/ilsvrc12_val_lmdb"
+    source: "/deep/u/willsong/data/ilsvrc12_val_lmdb"
     backend: LMDB
     batch_size: 50
   }
@@ -97,7 +97,7 @@ layers {
     num_output: 256
     pad: 2
     kernel_size: 5
-    group: 2
+    group: 1
     weight_filler {
       type: "gaussian"
       std: 0.01
@@ -178,7 +178,7 @@ layers {
     num_output: 384
     pad: 1
     kernel_size: 3
-    group: 2
+    group: 1
     weight_filler {
       type: "gaussian"
       std: 0.01
@@ -208,7 +208,7 @@ layers {
     num_output: 256
     pad: 1
     kernel_size: 3
-    group: 2
+    group: 1
     weight_filler {
       type: "gaussian"
       std: 0.01
diff --git a/python/caffe/classifier.py b/python/caffe/classifier.py
index fe471ca13b1..ddc2467cbbb 100644
--- a/python/caffe/classifier.py
+++ b/python/caffe/classifier.py
@@ -92,3 +92,22 @@ def predict(self, inputs, oversample=True):
             predictions = predictions.mean(1)
 
         return predictions
+
+
+    def ff(self, inputs):
+        # Scale to standardize input dimensions.
+        input_ = np.zeros((len(inputs),
+            self.image_dims[0], self.image_dims[1], inputs[0].shape[2]),
+            dtype=np.float32)
+        for ix, in_ in enumerate(inputs):
+            input_[ix] = caffe.io.resize_image(in_, self.image_dims)
+
+        # Generate center, corner, and mirrored crops.
+        input_ = caffe.io.oversample(input_, self.crop_dims)
+
+        # Classify
+        caffe_in = np.zeros(np.array(input_.shape)[[0,3,1,2]],
+                            dtype=np.float32)
+        for ix, in_ in enumerate(input_):
+            caffe_in[ix] = self.preprocess(self.inputs[0], in_)
+        out = self.forward_all(**{self.inputs[0]: caffe_in})
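The new ff() mirrors predict() but, as committed, discards the result of forward_all(), so callers read the dense maps off the net's blobs after the call. A hypothetical driver for the detection net (the snapshot filename, image path, and shape comments are assumptions, not part of the patch):

    import caffe

    net = caffe.Classifier(
        'models/brody/deploy.prototxt',
        'models/brody/caffe_brody_train_iter_10000.caffemodel',  # hypothetical snapshot
        image_dims=(480, 640))
    img = caffe.io.load_image('frame.jpg')  # placeholder image
    net.ff([img])
    pixel_prob = net.blobs['pixel-prob'].data  # (N, 16, 15, 20) mask probabilities
    bb = net.blobs['bb-output'].data           # (N, 64, 15, 20) box coordinates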
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index ba262920a9a..bd28a54229a 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -197,16 +197,18 @@ void Solver<Dtype>::Solve(const char* resume_file) {
           net_->blob_names()[net_->output_blob_indices()[j]];
       const Dtype loss_weight =
           net_->blob_loss_weights()[net_->output_blob_indices()[j]];
+      double loss_sum = 0.0;
       for (int k = 0; k < result[j]->count(); ++k) {
-        ostringstream loss_msg_stream;
-        if (loss_weight) {
-          loss_msg_stream << " (* " << loss_weight
-                          << " = " << loss_weight * result_vec[k] << " loss)";
-        }
-        LOG(INFO) << "    Train net output #"
-            << score_index++ << ": " << output_name << " = "
-            << result_vec[k] << loss_msg_stream.str();
+        loss_sum += result_vec[k];
       }
+      ostringstream loss_msg_stream;
+      if (loss_weight) {
+        loss_msg_stream << " (* " << loss_weight
+                        << " = " << loss_weight * loss_sum << " loss)";
+      }
+      LOG(INFO) << "    Train net output #"
+          << score_index++ << ": " << output_name << " = "
+          << loss_sum << loss_msg_stream.str();
     }
   }
 
@@ -284,20 +286,23 @@ void Solver<Dtype>::Test(const int test_net_id) {
     loss /= param_.test_iter(test_net_id);
     LOG(INFO) << "Test loss: " << loss;
   }
+  double sum_mean_score = 0.0;
   for (int i = 0; i < test_score.size(); ++i) {
-    const int output_blob_index =
-        test_net->output_blob_indices()[test_score_output_id[i]];
-    const string& output_name = test_net->blob_names()[output_blob_index];
-    const Dtype loss_weight = test_net->blob_loss_weights()[output_blob_index];
-    ostringstream loss_msg_stream;
     const Dtype mean_score = test_score[i] / param_.test_iter(test_net_id);
-    if (loss_weight) {
-      loss_msg_stream << " (* " << loss_weight
-                      << " = " << loss_weight * mean_score << " loss)";
-    }
-    LOG(INFO) << "    Test net output #" << i << ": " << output_name << " = "
-        << mean_score << loss_msg_stream.str();
+    sum_mean_score += mean_score;
   }
+  const int output_blob_index =
+      test_net->output_blob_indices()[test_score_output_id[0]];
+  const string& output_name = test_net->blob_names()[output_blob_index];
+  const Dtype loss_weight = test_net->blob_loss_weights()[output_blob_index];
+  ostringstream loss_msg_stream;
+  if (loss_weight) {
+    loss_msg_stream << " (* " << loss_weight
+                    << " = " << loss_weight * sum_mean_score << " loss)";
+  }
+  LOG(INFO) << "    Test net output #" << 0 << ": " << output_name << " = "
+      << sum_mean_score << loss_msg_stream.str();
+
   Caffe::set_phase(Caffe::TRAIN);
 }
diff --git a/tools/convert_detection_label.cpp b/tools/convert_detection_label.cpp
index 8d1f6b9a562..a8d27f73b6c 100644
--- a/tools/convert_detection_label.cpp
+++ b/tools/convert_detection_label.cpp
@@ -59,13 +59,18 @@ bool ReadBoundingBoxLabelToDatum(
   const int img_height = height * grid_dim;
 
   // 1 pixel label, 4 bounding box coordinates.
+  const int num_mask_label = 1;
+  const int num_bb_labels = 4;
+  const int num_norm_labels = 2;
+  const int num_total_labels = num_mask_label + num_bb_labels + num_norm_labels;
   vector<cv::Mat*> labels;
-  for (int i = 0; i < 5; ++i) {
+  for (int i = 0; i < num_total_labels; ++i) {
     labels.push_back(
         new cv::Mat(img_height, img_width, CV_32F, cv::Scalar(0.0)));
   }
 
-  CHECK_EQ(bbs.size() % 4, 0);
+  CHECK_EQ(bbs.size() % num_bb_labels, 0);
+  int total_num_pixels = 0;
   for (int i = 0; i < bbs.size(); i += 4) {
     float xmin = bbs[i];
     float ymin = bbs[i + 1];
@@ -96,28 +101,45 @@ bool ReadBoundingBoxLabelToDatum(
     cv::Rect r(gxmin, gymin,
                gxmax - gxmin + (gxmax == gxmin && gxmax < img_width ? 1 : 0),
                gymax - gymin + (gymax == gymin && gymax < img_height ? 1 : 0));
-    float flabels[5] = {1.0, xmin, ymin, xmax, ymax};
-    for (int j = 0; j < 5; ++j) {
+
+    total_num_pixels += r.area();
+    int normalization_height = ymax - ymin == 0 ? 1 : ymax - ymin;
+    CHECK_GT(normalization_height, 0);
+    float flabels[num_total_labels] =
+        {1.0, xmin, ymin, xmax, ymax, 1.0 / normalization_height, 1.0};
+    for (int j = 0; j < num_total_labels; ++j) {
       cv::Mat roi(*labels[j], r);
       roi = cv::Scalar(flabels[j]);
     }
   }
 
-  datum->set_channels(5 * grid_dim * grid_dim);
+  if (total_num_pixels == 0) {
+    total_num_pixels = 1;
+  }
+  float reweight_value = 1.0 / total_num_pixels;
+  for (int y = 0; y < img_height; ++y) {
+    for (int x = 0; x < img_width; ++x) {
+      if (labels[num_total_labels - 1]->at<float>(y, x) == 1.0) {
+        labels[num_total_labels - 1]->at<float>(y, x) = reweight_value;
+      }
+    }
+  }
+
+  datum->set_channels(num_total_labels * grid_dim * grid_dim);
   datum->set_height(height);
   datum->set_width(width);
   datum->set_label(0);  // dummy label
   datum->clear_data();
   datum->clear_float_data();
-  for (int m = 0; m < 5; ++m) {
+  for (int m = 0; m < num_total_labels; ++m) {
     for (int dy = 0; dy < grid_dim; ++dy) {
       for (int dx = 0; dx < grid_dim; ++dx) {
         for (int y = 0; y < img_height; y += grid_dim) {
           for (int x = 0; x < img_width; x += grid_dim) {
             float adjustment = 0;
             float val = labels[m]->at<float>(y + dy, x + dx);
-            if (m == 0) {
+            if (m == 0 || m > 4) {
               // do nothing
             } else if (labels[0]->at<float>(y + dy, x + dx) == 0.0) {
               // do nothing
@@ -136,9 +158,9 @@ bool ReadBoundingBoxLabelToDatum(
       }
     }
   }
-  CHECK_EQ(datum->float_data_size(), 5 * img_height * img_width);
-  for (int i = 0; i < 5; ++i) {
+  CHECK_EQ(datum->float_data_size(), num_total_labels * img_height * img_width);
+  for (int i = 0; i < num_total_labels; ++i) {
     delete labels[i];
   }
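For reference, the datum written by ReadBoundingBoxLabelToDatum stores each of the 7 label maps pixel-interleaved into grid_dim^2 channels; the 16/80/96 slice points in train_val_brody.prototxt are consistent with grid_dim = 4, i.e. 112 channels in total. A numpy sketch of the inverse de-interleave, handy for inspecting converted labels (illustrative only; the bounding-box channels carry the writer's per-cell adjustment):

    import numpy as np

    def decode_label_datum(data, grid_dim=4, num_total_labels=7):
        # data: (num_total_labels * grid_dim**2, height, width) float array
        # pulled from the datum. Returns full-resolution maps of shape
        # (num_total_labels, height*grid_dim, width*grid_dim): mask, xmin,
        # ymin, xmax, ymax, 1/box-height, and the reweight channel.
        c, h, w = data.shape
        assert c == num_total_labels * grid_dim ** 2
        full = np.zeros((num_total_labels, h * grid_dim, w * grid_dim),
                        dtype=data.dtype)
        i = 0
        for m in range(num_total_labels):
            for dy in range(grid_dim):
                for dx in range(grid_dim):
                    full[m, dy::grid_dim, dx::grid_dim] = data[i]
                    i += 1
        return full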