From a6d13f0be2cbf46b95d656d23f84cc4c36dee0e3 Mon Sep 17 00:00:00 2001
From: William Song
Date: Mon, 26 Jan 2015 09:30:21 -0800
Subject: [PATCH] normalization network

---
 models/brody/solver_normalization.prototxt        |  15 +
 .../train_val_driving_normalization.prototxt      | 563 ++++++++++++++++++
 src/caffe/layers/driving_data_layer.cpp           |  36 +-
 3 files changed, 590 insertions(+), 24 deletions(-)
 create mode 100644 models/brody/solver_normalization.prototxt
 create mode 100644 models/brody/train_val_driving_normalization.prototxt

diff --git a/models/brody/solver_normalization.prototxt b/models/brody/solver_normalization.prototxt
new file mode 100644
index 00000000000..74e11e8cc07
--- /dev/null
+++ b/models/brody/solver_normalization.prototxt
@@ -0,0 +1,15 @@
+net: "models/brody/train_val_driving_normalization.prototxt"
+test_iter: 20
+test_interval: 5000
+test_compute_loss: true
+base_lr: 0.0000001
+lr_policy: "step"
+gamma: 0.1
+stepsize: 100000
+display: 20
+max_iter: 1450000
+momentum: 0.9
+weight_decay: 0.00005
+snapshot: 1000
+snapshot_prefix: "models/brody/driving_normalization"
+solver_mode: GPU
diff --git a/models/brody/train_val_driving_normalization.prototxt b/models/brody/train_val_driving_normalization.prototxt
new file mode 100644
index 00000000000..64d46a54cee
--- /dev/null
+++ b/models/brody/train_val_driving_normalization.prototxt
@@ -0,0 +1,563 @@
+name: "DrivingNet"
+
+# Training input.
+layers {
+  name: "data"
+  type: DRIVING_DATA
+  top: "data"
+  top: "label"
+  data_param {
+    source: "new_driving_train"
+    backend: LMDB
+    batch_size: 10
+  }
+  transform_param {
+    mean_file: "driving_mean.binaryproto"
+  }
+  include: { phase: TRAIN }
+}
+
+# Test input.
+layers {
+  name: "data"
+  type: DRIVING_DATA
+  top: "data"
+  top: "label"
+  data_param {
+    source: "new_driving_test"
+    backend: LMDB
+    batch_size: 10
+  }
+  transform_param {
+    mean_file: "driving_mean.binaryproto"
+  }
+  include: { phase: TEST }
+}
+
+# Slice the label blob into pixel, bounding box, size, and normalization labels.
+layers {
+  name: "slice-label"
+  type: SLICE
+  bottom: "label"
+  top: "pixel-label"
+  top: "bb-label"
+  top: "size-label"
+  top: "norm-label"
+  slice_param {
+    slice_dim: 1
+    slice_point: 1
+    slice_point: 5
+    slice_point: 7
+  }
+}
+
+# Concatenate the pixel labels four times so that they can be used to mask
+# all 4 coordinates of the bounding box predictions.
+layers { + name: "pixel-block" + type: CONCAT + bottom: "pixel-label" + bottom: "pixel-label" + bottom: "pixel-label" + bottom: "pixel-label" + top: "pixel-block" + concat_param { + concat_dim: 1 + } +} + +layers { + name: "size-block" + type: CONCAT + bottom: "size-label" + bottom: "size-label" + top: "size-block" + concat_param { + concat_dim: 1 + } +} + +layers { + name: "norm-block" + type: CONCAT + bottom: "norm-label" + bottom: "norm-label" + bottom: "norm-label" + bottom: "norm-label" + top: "norm-block" + concat_param { + concat_dim: 1 + } +} + +layers { + name: "bb-label-size-normalization" + type: ELTWISE + bottom: "bb-label" + bottom: "size-block" + top: "bb-label-sn" + eltwise_param { + operation: PROD + } +} + +layers { + name: "bb-label-num-pixel-normalization" + type: ELTWISE + bottom: "bb-label-sn" + bottom: "norm-block" + top: "bb-label-sn-nn" + eltwise_param { + operation: PROD + } +} + +layers { + name: "L0" + type: CONVOLUTION + bottom: "data" + top: "L0" + blobs_lr: 1 + blobs_lr: 2 + weight_decay: 1 + weight_decay: 0 + convolution_param { + num_output: 96 + kernel_size: 11 + stride: 4 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layers { + name: "relu1" + type: RELU + bottom: "L0" + top: "L0" +} +layers { + name: "norm1" + type: LRN_FIXED + bottom: "L0" + top: "norm1" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layers { + name: "pool1" + type: POOLING + bottom: "norm1" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layers { + name: "L1" + type: CONVOLUTION + bottom: "pool1" + top: "L1" + blobs_lr: 1 + blobs_lr: 2 + weight_decay: 1 + weight_decay: 0 + convolution_param { + num_output: 256 + pad: 2 + kernel_size: 5 + group: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layers { + name: "relu2" + type: RELU + bottom: "L1" + top: "L1" +} +layers { + name: "norm2" + type: LRN_FIXED + bottom: "L1" + top: "norm2" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layers { + name: "pool2" + type: POOLING + bottom: "norm2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layers { + name: "L2" + type: CONVOLUTION + bottom: "pool2" + top: "L2" + blobs_lr: 1 + blobs_lr: 2 + weight_decay: 1 + weight_decay: 0 + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layers { + name: "relu3" + type: RELU + bottom: "L2" + top: "L2" +} +layers { + name: "L3" + type: CONVOLUTION + bottom: "L2" + top: "L3" + blobs_lr: 1 + blobs_lr: 2 + weight_decay: 1 + weight_decay: 0 + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + group: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layers { + name: "relu4" + type: RELU + bottom: "L3" + top: "L3" +} +layers { + name: "L4" + type: CONVOLUTION + bottom: "L3" + top: "L4" + blobs_lr: 1 + blobs_lr: 2 + weight_decay: 1 + weight_decay: 0 + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + group: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layers { + name: "relu5" + type: RELU + bottom: "L4" + top: "L4" +} +layers { + name: "pool5" + type: POOLING + bottom: "L4" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 3 + 
stride: 2 + } +} + +layers { + name: "L5" + type: CONVOLUTION + bottom: "pool5" + top: "L5" + blobs_lr: 1 + blobs_lr: 2 + weight_decay: 1 + weight_decay: 0 + convolution_param { + num_output: 4096 + kernel_size: 6 + pad: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 1 + } + } +} +layers { + name: "relu6" + type: RELU + bottom: "L5" + top: "L5" +} +layers { + name: "drop6" + type: DROPOUT_FIXED + bottom: "L5" + top: "L5" + dropout_param { + dropout_ratio: 0.5 + } +} +layers { + name: "L6" + type: CONVOLUTION + bottom: "L5" + top: "L6" + blobs_lr: 1 + blobs_lr: 2 + weight_decay: 1 + weight_decay: 0 + convolution_param { + num_output: 4096 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 1 + } + } +} +layers { + name: "relu7" + type: RELU + bottom: "L6" + top: "L6" +} +layers { + name: "drop7" + type: DROPOUT_FIXED + bottom: "L6" + top: "L6" + dropout_param { + dropout_ratio: 0.5 + } +} + +layers { + name: "bb-output" + type: CONVOLUTION + bottom: "L6" + top: "bb-output" + blobs_lr: 1 + blobs_lr: 2 + weight_decay: 1 + weight_decay: 0 + convolution_param { + num_output: 256 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 1.0 + } + } +} + +layers { + name: "pixel-conv" + type: CONVOLUTION + bottom: "L6" + top: "pixel-conv" + blobs_lr: 1 + blobs_lr: 2 + weight_decay: 1 + weight_decay: 0 + convolution_param { + num_output: 128 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 1.0 + } + } +} + +layers { + name: "pixel-tile" + type: TILING + bottom: "pixel-conv" + top: "pixel-conv-tiled" + tiling_param { + tile_dim: 8 + } +} + +layers { + name: "bb-tile" + type: TILING + bottom: "bb-output" + top: "bb-output-tiled" + tiling_param { + tile_dim: 8 + } +} + +# Pixel level softmax loss. +layers { + name: "pixel-loss" + type: SOFTMAX_LOSS + bottom: "pixel-conv-tiled" + bottom: "pixel-label" + top: "pixel-loss" +} + +# Masking the bounding boxes with input label. 
+layers {
+  name: "bb-prob-mask"
+  type: ELTWISE
+  bottom: "bb-output-tiled"
+  bottom: "pixel-block"
+  top: "bb-masked-output"
+  eltwise_param {
+    operation: PROD
+  }
+}
+
+layers {
+  name: "bb-size-normalization"
+  type: ELTWISE
+  bottom: "bb-masked-output"
+  bottom: "size-block"
+  top: "bb-masked-output-sn"
+  eltwise_param {
+    operation: PROD
+  }
+}
+
+layers {
+  name: "bb-num-pixel-normalization"
+  type: ELTWISE
+  bottom: "bb-masked-output-sn"
+  bottom: "norm-block"
+  top: "bb-masked-output-sn-nn"
+  eltwise_param {
+    operation: PROD
+  }
+}
+
+layers {
+  name: "bb-loss"
+  type: L1_LOSS
+  bottom: "bb-masked-output-sn-nn"
+  bottom: "bb-label-sn-nn"
+  top: "bb-loss"
+  loss_weight: 10.0
+}
+
+# L1 error loss
+#layers {
+#  name: "bb-diff"
+#  type: ELTWISE
+#  bottom: "bb-masked-output"
+#  bottom: "bb-label"
+#  eltwise_param {
+#    operation: SUM
+#    coeff: 1.0
+#    coeff: -1.0
+#  }
+#  top: "bb-diff"
+#}
+
+#layers {
+#  name: "bb-loss"
+#  type: ABSVAL
+#  bottom: "bb-diff"
+#  top: "bb-loss"
+#  # 1 / (20 * 15 * 64)
+#  loss_weight: 0.00000000001
+#}
+
+#layers {
+#  name: "bb-loss-pow2"
+#  type: POWER
+#  bottom: "bb-diff"
+#  top: "bb-loss-pow2"
+#  # 1 / (20 * 15 * 64)
+#  power_param {
+#    power: 2
+#  }
+#}
+
+#layers {
+#  name: "bb-loss-height-normalize"
+#  type: ELTWISE
+#  bottom: "bb-loss-pow2"
+#  bottom: "height-block"
+#  eltwise_param {
+#    operation: PROD
+#  }
+#  top: "bb-loss"
+#  loss_weight: 0.1
+#}
+
+#layers {
+#  name: "bb-loss-silence"
+#  type: SILENCE
+#  bottom: "bb-loss"
+#}
diff --git a/src/caffe/layers/driving_data_layer.cpp b/src/caffe/layers/driving_data_layer.cpp
index e0a70a65e77..849e49ab9dc 100644
--- a/src/caffe/layers/driving_data_layer.cpp
+++ b/src/caffe/layers/driving_data_layer.cpp
@@ -34,7 +34,7 @@ enum TopLayerType {
 const int kNumData = 1;
 const int kNumLabels = 1;
 const int kNumBBRegressionCoords = 4;
-const int kNumRegressionMasks = 7;
+const int kNumRegressionMasks = 8;
 
 namespace caffe {
 
@@ -272,11 +272,8 @@ bool DrivingDataLayer<Dtype>::ReadBoundingBoxLabelToDatumLegacy(
   const int full_label_height = height * grid_dim;
   const float scaling =
       static_cast<float>(full_label_width) / data.car_cropped_width();
-  // 1 pixel label, 4 bounding box coordinates, 2 normalization labels.
-  const int num_mask_label = 1;
-  const int num_bb_labels = 4;
-  const int num_norm_labels = 2;
-  const int num_total_labels = num_mask_label + num_bb_labels + num_norm_labels;
+  // 1 pixel label, 4 bounding box coordinates, 3 normalization labels.
+  const int num_total_labels = kNumRegressionMasks;
   vector<cv::Mat *> labels;
   for (int i = 0; i < num_total_labels; ++i) {
     labels.push_back(
@@ -409,11 +406,8 @@ bool DrivingDataLayer<Dtype>::ReadBoundingBoxLabelToDatum(
   const float half_shrink_factor = data.car_shrink_factor() / 2;
   const float scaling =
       static_cast<float>(full_label_width) / data.car_cropped_width();
-  // 1 pixel label, 4 bounding box coordinates, 2 normalization labels.
-  const int num_mask_label = 1;
-  const int num_bb_labels = 4;
-  const int num_norm_labels = 2;
-  const int num_total_labels = num_mask_label + num_bb_labels + num_norm_labels;
+  // 1 pixel label, 4 bounding box coordinates, 3 normalization labels.
+  const int num_total_labels = kNumRegressionMasks;
   vector<cv::Mat *> labels;
   for (int i = 0; i < num_total_labels; ++i) {
     labels.push_back(
@@ -455,22 +449,16 @@ bool DrivingDataLayer<Dtype>::ReadBoundingBoxLabelToDatum(
         gxmax - gxmin + (gxmax == gxmin && gxmax < full_label_width ? 1 : 0),
         gymax - gymin + (gymax == gymin && gymax < full_label_height ? 1 : 0));
 
-    int normalization_height = ymax - ymin == 0 ? 1 : ymax - ymin;
-    CHECK_GT(normalization_height, 0);
     int normalization_width = xmax - xmin == 0 ? 1 : xmax - xmin;
     CHECK_GT(normalization_width, 0);
-    float default_flabels[num_total_labels] =
-        {1.0, xmin, ymin, xmax, ymax, 1.0 / normalization_width, 1.0};
-    float y_flabels[num_total_labels] =
-        {1.0, xmin, ymin, xmax, ymax, 1.0 / normalization_height, 1.0};
+    int normalization_height = ymax - ymin == 0 ? 1 : ymax - ymin;
+    CHECK_GT(normalization_height, 0);
+    float flabels[num_total_labels] =
+        {1.0, xmin, ymin, xmax, ymax, 1.0 / normalization_width,
+         1.0 / normalization_height, 1.0};
     for (int j = 0; j < num_total_labels; ++j) {
-      if (j == 2 || j == 4) {
-        cv::Mat roi(*labels[j], r);
-        roi = cv::Scalar(y_flabels[j]);
-      } else {
-        cv::Mat roi(*labels[j], r);
-        roi = cv::Scalar(default_flabels[j]);
-      }
+      cv::Mat roi(*labels[j], r);
+      roi = cv::Scalar(flabels[j]);
     }
   }
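Reviewer note on the new label layout (not part of the patch): with kNumRegressionMasks = 8, each pixel inside a ground-truth box gets the values [pixel mask, xmin, ymin, xmax, ymax, 1/width, 1/height, 1.0], which is exactly what slice-label's slice points 1/5/7 split into pixel-label, bb-label, size-label, and norm-label. The sketch below only illustrates that layout; the helper name MakeRegressionMasks and the example box are made up.

// Standalone illustration of the 8-channel mask layout -- not part of the patch.
#include <array>
#include <cstdio>

static std::array<float, 8> MakeRegressionMasks(float xmin, float ymin,
                                                float xmax, float ymax) {
  // Degenerate boxes fall back to a size of 1, mirroring the layer code.
  const float w = (xmax - xmin == 0.0f) ? 1.0f : xmax - xmin;
  const float h = (ymax - ymin == 0.0f) ? 1.0f : ymax - ymin;
  // [pixel mask, xmin, ymin, xmax, ymax, 1/width, 1/height, norm channel]
  return {{1.0f, xmin, ymin, xmax, ymax, 1.0f / w, 1.0f / h, 1.0f}};
}

int main() {
  const std::array<float, 8> m = MakeRegressionMasks(12.0f, 30.0f, 44.0f, 46.0f);
  for (float v : m) std::printf("%g ", v);
  std::printf("\n");  // prints: 1 12 30 44 46 0.03125 0.0625 1
  return 0;
}

Because size-block concatenates size-label twice, the eltwise products scale xmin/xmax by 1/width and ymin/ymax by 1/height; the trailing 1.0 is the channel consumed by the norm-block / bb-num-pixel-normalization path.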