From a6d13f0be2cbf46b95d656d23f84cc4c36dee0e3 Mon Sep 17 00:00:00 2001
From: William Song
Date: Mon, 26 Jan 2015 09:30:21 -0800
Subject: [PATCH] normalization network

---
 models/brody/solver_normalization.prototxt        |  15 +
 .../train_val_driving_normalization.prototxt      | 563 ++++++++++++++++++
 src/caffe/layers/driving_data_layer.cpp           |  36 +-
 3 files changed, 590 insertions(+), 24 deletions(-)
 create mode 100644 models/brody/solver_normalization.prototxt
 create mode 100644 models/brody/train_val_driving_normalization.prototxt

diff --git a/models/brody/solver_normalization.prototxt b/models/brody/solver_normalization.prototxt
new file mode 100644
index 00000000000..74e11e8cc07
--- /dev/null
+++ b/models/brody/solver_normalization.prototxt
@@ -0,0 +1,15 @@
+net: "models/brody/train_val_driving_normalization.prototxt"
+test_iter: 20
+test_interval: 5000
+test_compute_loss: true
+base_lr: 0.0000001
+lr_policy: "step"
+gamma: 0.1
+stepsize: 100000
+display: 20
+max_iter: 1450000
+momentum: 0.9
+weight_decay: 0.00005
+snapshot: 1000
+snapshot_prefix: "models/brody/driving_normalization"
+solver_mode: GPU
diff --git a/models/brody/train_val_driving_normalization.prototxt b/models/brody/train_val_driving_normalization.prototxt
new file mode 100644
index 00000000000..64d46a54cee
--- /dev/null
+++ b/models/brody/train_val_driving_normalization.prototxt
@@ -0,0 +1,563 @@
+name: "DrivingNet"
+
+# Training input.
+layers {
+  name: "data"
+  type: DRIVING_DATA
+  top: "data"
+  top: "label"
+  data_param {
+    source: "new_driving_train"
+    backend: LMDB
+    batch_size: 10
+  }
+  transform_param {
+    mean_file: "driving_mean.binaryproto"
+  }
+  include: { phase: TRAIN }
+}
+
+# Test input.
+layers {
+  name: "data"
+  type: DRIVING_DATA
+  top: "data"
+  top: "label"
+  data_param {
+    source: "new_driving_test"
+    backend: LMDB
+    batch_size: 10
+  }
+  transform_param {
+    mean_file: "driving_mean.binaryproto"
+  }
+  include: { phase: TEST }
+}
+
+# Slice the label blob into pixel, bounding box, size, and normalization labels.
+layers {
+  name: "slice-label"
+  type: SLICE
+  bottom: "label"
+  top: "pixel-label"
+  top: "bb-label"
+  top: "size-label"
+  top: "norm-label"
+  slice_param {
+    slice_dim: 1
+    slice_point: 1
+    slice_point: 5
+    slice_point: 7
+  }
+}
+
+# Concatenate the pixel labels four times so that they can be used to mask
+# all 4 coordinates of the bounding box predictions.
+layers { + name: "pixel-block" + type: CONCAT + bottom: "pixel-label" + bottom: "pixel-label" + bottom: "pixel-label" + bottom: "pixel-label" + top: "pixel-block" + concat_param { + concat_dim: 1 + } +} + +layers { + name: "size-block" + type: CONCAT + bottom: "size-label" + bottom: "size-label" + top: "size-block" + concat_param { + concat_dim: 1 + } +} + +layers { + name: "norm-block" + type: CONCAT + bottom: "norm-label" + bottom: "norm-label" + bottom: "norm-label" + bottom: "norm-label" + top: "norm-block" + concat_param { + concat_dim: 1 + } +} + +layers { + name: "bb-label-size-normalization" + type: ELTWISE + bottom: "bb-label" + bottom: "size-block" + top: "bb-label-sn" + eltwise_param { + operation: PROD + } +} + +layers { + name: "bb-label-num-pixel-normalization" + type: ELTWISE + bottom: "bb-label-sn" + bottom: "norm-block" + top: "bb-label-sn-nn" + eltwise_param { + operation: PROD + } +} + +layers { + name: "L0" + type: CONVOLUTION + bottom: "data" + top: "L0" + blobs_lr: 1 + blobs_lr: 2 + weight_decay: 1 + weight_decay: 0 + convolution_param { + num_output: 96 + kernel_size: 11 + stride: 4 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layers { + name: "relu1" + type: RELU + bottom: "L0" + top: "L0" +} +layers { + name: "norm1" + type: LRN_FIXED + bottom: "L0" + top: "norm1" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layers { + name: "pool1" + type: POOLING + bottom: "norm1" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layers { + name: "L1" + type: CONVOLUTION + bottom: "pool1" + top: "L1" + blobs_lr: 1 + blobs_lr: 2 + weight_decay: 1 + weight_decay: 0 + convolution_param { + num_output: 256 + pad: 2 + kernel_size: 5 + group: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layers { + name: "relu2" + type: RELU + bottom: "L1" + top: "L1" +} +layers { + name: "norm2" + type: LRN_FIXED + bottom: "L1" + top: "norm2" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layers { + name: "pool2" + type: POOLING + bottom: "norm2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layers { + name: "L2" + type: CONVOLUTION + bottom: "pool2" + top: "L2" + blobs_lr: 1 + blobs_lr: 2 + weight_decay: 1 + weight_decay: 0 + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layers { + name: "relu3" + type: RELU + bottom: "L2" + top: "L2" +} +layers { + name: "L3" + type: CONVOLUTION + bottom: "L2" + top: "L3" + blobs_lr: 1 + blobs_lr: 2 + weight_decay: 1 + weight_decay: 0 + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + group: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layers { + name: "relu4" + type: RELU + bottom: "L3" + top: "L3" +} +layers { + name: "L4" + type: CONVOLUTION + bottom: "L3" + top: "L4" + blobs_lr: 1 + blobs_lr: 2 + weight_decay: 1 + weight_decay: 0 + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + group: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layers { + name: "relu5" + type: RELU + bottom: "L4" + top: "L4" +} +layers { + name: "pool5" + type: POOLING + bottom: "L4" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 3 + 
stride: 2 + } +} + +layers { + name: "L5" + type: CONVOLUTION + bottom: "pool5" + top: "L5" + blobs_lr: 1 + blobs_lr: 2 + weight_decay: 1 + weight_decay: 0 + convolution_param { + num_output: 4096 + kernel_size: 6 + pad: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 1 + } + } +} +layers { + name: "relu6" + type: RELU + bottom: "L5" + top: "L5" +} +layers { + name: "drop6" + type: DROPOUT_FIXED + bottom: "L5" + top: "L5" + dropout_param { + dropout_ratio: 0.5 + } +} +layers { + name: "L6" + type: CONVOLUTION + bottom: "L5" + top: "L6" + blobs_lr: 1 + blobs_lr: 2 + weight_decay: 1 + weight_decay: 0 + convolution_param { + num_output: 4096 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 1 + } + } +} +layers { + name: "relu7" + type: RELU + bottom: "L6" + top: "L6" +} +layers { + name: "drop7" + type: DROPOUT_FIXED + bottom: "L6" + top: "L6" + dropout_param { + dropout_ratio: 0.5 + } +} + +layers { + name: "bb-output" + type: CONVOLUTION + bottom: "L6" + top: "bb-output" + blobs_lr: 1 + blobs_lr: 2 + weight_decay: 1 + weight_decay: 0 + convolution_param { + num_output: 256 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 1.0 + } + } +} + +layers { + name: "pixel-conv" + type: CONVOLUTION + bottom: "L6" + top: "pixel-conv" + blobs_lr: 1 + blobs_lr: 2 + weight_decay: 1 + weight_decay: 0 + convolution_param { + num_output: 128 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 1.0 + } + } +} + +layers { + name: "pixel-tile" + type: TILING + bottom: "pixel-conv" + top: "pixel-conv-tiled" + tiling_param { + tile_dim: 8 + } +} + +layers { + name: "bb-tile" + type: TILING + bottom: "bb-output" + top: "bb-output-tiled" + tiling_param { + tile_dim: 8 + } +} + +# Pixel level softmax loss. +layers { + name: "pixel-loss" + type: SOFTMAX_LOSS + bottom: "pixel-conv-tiled" + bottom: "pixel-label" + top: "pixel-loss" +} + +# Masking the bounding boxes with input label. 
+layers {
+  name: "bb-prob-mask"
+  type: ELTWISE
+  bottom: "bb-output-tiled"
+  bottom: "pixel-block"
+  top: "bb-masked-output"
+  eltwise_param {
+    operation: PROD
+  }
+}
+
+layers {
+  name: "bb-size-normalization"
+  type: ELTWISE
+  bottom: "bb-masked-output"
+  bottom: "size-block"
+  top: "bb-masked-output-sn"
+  eltwise_param {
+    operation: PROD
+  }
+}
+
+layers {
+  name: "bb-num-pixel-normalization"
+  type: ELTWISE
+  bottom: "bb-masked-output-sn"
+  bottom: "norm-block"
+  top: "bb-masked-output-sn-nn"
+  eltwise_param {
+    operation: PROD
+  }
+}
+
+layers {
+  name: "bb-loss"
+  type: L1_LOSS
+  bottom: "bb-masked-output-sn-nn"
+  bottom: "bb-label-sn-nn"
+  top: "bb-loss"
+  loss_weight: 10.0
+}
+
+# L1 error loss
+#layers {
+#  name: "bb-diff"
+#  type: ELTWISE
+#  bottom: "bb-masked-output"
+#  bottom: "bb-label"
+#  eltwise_param {
+#    operation: SUM
+#    coeff: 1.0
+#    coeff: -1.0
+#  }
+#  top: "bb-diff"
+#}
+
+#layers {
+#  name: "bb-loss"
+#  type: ABSVAL
+#  bottom: "bb-diff"
+#  top: "bb-loss"
+#  # 1 / (20 * 15 * 64)
+#  loss_weight: 0.00000000001
+#}
+
+#layers {
+#  name: "bb-loss-pow2"
+#  type: POWER
+#  bottom: "bb-diff"
+#  top: "bb-loss-pow2"
+#  # 1 / (20 * 15 * 64)
+#  power_param {
+#    power: 2
+#  }
+#}
+
+#layers {
+#  name: "bb-loss-height-normalize"
+#  type: ELTWISE
+#  bottom: "bb-loss-pow2"
+#  bottom: "height-block"
+#  eltwise_param {
+#    operation: PROD
+#  }
+#  top: "bb-loss"
+#  loss_weight: 0.1
+#}
+
+#layers {
+#  name: "bb-loss-silence"
+#  type: SILENCE
+#  bottom: "bb-loss"
+#}
diff --git a/src/caffe/layers/driving_data_layer.cpp b/src/caffe/layers/driving_data_layer.cpp
index e0a70a65e77..849e49ab9dc 100644
--- a/src/caffe/layers/driving_data_layer.cpp
+++ b/src/caffe/layers/driving_data_layer.cpp
@@ -34,7 +34,7 @@ enum TopLayerType {
 const int kNumData = 1;
 const int kNumLabels = 1;
 const int kNumBBRegressionCoords = 4;
-const int kNumRegressionMasks = 7;
+const int kNumRegressionMasks = 8;
 
 namespace caffe {
 
@@ -272,11 +272,8 @@ bool DrivingDataLayer<Dtype>::ReadBoundingBoxLabelToDatumLegacy(
   const int full_label_height = height * grid_dim;
   const float scaling =
       static_cast<float>(full_label_width) / data.car_cropped_width();
-  // 1 pixel label, 4 bounding box coordinates, 2 normalization labels.
-  const int num_mask_label = 1;
-  const int num_bb_labels = 4;
-  const int num_norm_labels = 2;
-  const int num_total_labels = num_mask_label + num_bb_labels + num_norm_labels;
+  // 1 pixel label, 4 bounding box coordinates, 3 normalization labels.
+  const int num_total_labels = kNumRegressionMasks;
   vector<cv::Mat *> labels;
   for (int i = 0; i < num_total_labels; ++i) {
     labels.push_back(
@@ -409,11 +406,8 @@ bool DrivingDataLayer<Dtype>::ReadBoundingBoxLabelToDatum(
   const float half_shrink_factor = data.car_shrink_factor() / 2;
   const float scaling =
       static_cast<float>(full_label_width) / data.car_cropped_width();
-  // 1 pixel label, 4 bounding box coordinates, 2 normalization labels.
-  const int num_mask_label = 1;
-  const int num_bb_labels = 4;
-  const int num_norm_labels = 2;
-  const int num_total_labels = num_mask_label + num_bb_labels + num_norm_labels;
+  // 1 pixel label, 4 bounding box coordinates, 3 normalization labels.
+  const int num_total_labels = kNumRegressionMasks;
   vector<cv::Mat *> labels;
   for (int i = 0; i < num_total_labels; ++i) {
     labels.push_back(
@@ -455,22 +449,16 @@ bool DrivingDataLayer<Dtype>::ReadBoundingBoxLabelToDatum(
         gxmax - gxmin + (gxmax == gxmin && gxmax < full_label_width ? 1 : 0),
         gymax - gymin + (gymax == gymin && gymax < full_label_height ? 1 : 0));
 
-    int normalization_height = ymax - ymin == 0 ? 1 : ymax - ymin;
-    CHECK_GT(normalization_height, 0);
     int normalization_width = xmax - xmin == 0 ? 1 : xmax - xmin;
     CHECK_GT(normalization_width, 0);
-    float default_flabels[num_total_labels] =
-        {1.0, xmin, ymin, xmax, ymax, 1.0 / normalization_width, 1.0};
-    float y_flabels[num_total_labels] =
-        {1.0, xmin, ymin, xmax, ymax, 1.0 / normalization_height, 1.0};
+    int normalization_height = ymax - ymin == 0 ? 1 : ymax - ymin;
+    CHECK_GT(normalization_height, 0);
+    float flabels[num_total_labels] =
+        {1.0, xmin, ymin, xmax, ymax, 1.0 / normalization_width,
+         1.0 / normalization_height, 1.0};
     for (int j = 0; j < num_total_labels; ++j) {
-      if (j == 2 || j == 4) {
-        cv::Mat roi(*labels[j], r);
-        roi = cv::Scalar(y_flabels[j]);
-      } else {
-        cv::Mat roi(*labels[j], r);
-        roi = cv::Scalar(default_flabels[j]);
-      }
+      cv::Mat roi(*labels[j], r);
+      roi = cv::Scalar(flabels[j]);
     }
   }
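Reviewer note on the new label layout (not part of the patch): with kNumRegressionMasks = 8, each pixel inside a ground-truth box gets the values [pixel mask, xmin, ymin, xmax, ymax, 1/width, 1/height, 1.0], which is exactly what slice-label's slice points 1/5/7 split into pixel-label, bb-label, size-label, and norm-label. The sketch below only illustrates that layout; the helper name MakeRegressionMasks and the example box are made up.

// Standalone illustration of the 8-channel mask layout -- not part of the patch.
#include <array>
#include <cstdio>

static std::array<float, 8> MakeRegressionMasks(float xmin, float ymin,
                                                float xmax, float ymax) {
  // Degenerate boxes fall back to a size of 1, mirroring the layer code.
  const float w = (xmax - xmin == 0.0f) ? 1.0f : xmax - xmin;
  const float h = (ymax - ymin == 0.0f) ? 1.0f : ymax - ymin;
  // [pixel mask, xmin, ymin, xmax, ymax, 1/width, 1/height, norm channel]
  return {{1.0f, xmin, ymin, xmax, ymax, 1.0f / w, 1.0f / h, 1.0f}};
}

int main() {
  const std::array<float, 8> m = MakeRegressionMasks(12.0f, 30.0f, 44.0f, 46.0f);
  for (float v : m) std::printf("%g ", v);
  std::printf("\n");  // prints: 1 12 30 44 46 0.03125 0.0625 1
  return 0;
}

Because size-block concatenates size-label twice, the eltwise products scale xmin/xmax by 1/width and ymin/ymax by 1/height; the trailing 1.0 is the channel consumed by the norm-block / bb-num-pixel-normalization path.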