From d771232f24c9ba6076dff620f51405ff27de19e5 Mon Sep 17 00:00:00 2001
From: William Song
Date: Wed, 19 Nov 2014 19:10:47 -0800
Subject: [PATCH] bb detection working

---
 models/brody/deploy.prototxt          | 238 ++++++++++++------
 models/brody/solver.prototxt          |   2 +-
 models/brody/train_val_brody.prototxt | 111 +++++---
 .../train_val.prototxt                |  10 +-
 python/caffe/classifier.py            |  19 ++
 src/caffe/solver.cpp                  |  43 ++--
 tools/convert_detection_label.cpp     |  40 ++-
 7 files changed, 322 insertions(+), 141 deletions(-)

diff --git a/models/brody/deploy.prototxt b/models/brody/deploy.prototxt
index 14634682829..85e72f10774 100644
--- a/models/brody/deploy.prototxt
+++ b/models/brody/deploy.prototxt
@@ -1,13 +1,15 @@
-# This is not brody's net.
-name: "AlexNet"
+name: "BrodyNet"
 input: "data"
 input_dim: 10
 input_dim: 3
-input_dim: 227
-input_dim: 227
+input_dim: 480
+input_dim: 640
+
 layers {
   name: "conv1"
   type: CONVOLUTION
+  bottom: "data"
+  top: "conv1"
   blobs_lr: 1
   blobs_lr: 2
   weight_decay: 1
@@ -16,9 +18,15 @@ layers {
     num_output: 96
     kernel_size: 11
     stride: 4
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
   }
-  bottom: "data"
-  top: "conv1"
 }
 layers {
   name: "relu1"
   type: RELU
   bottom: "conv1"
   top: "conv1"
 }
-layers {
-  name: "norm1"
-  type: LRN
-  lrn_param {
-    local_size: 5
-    alpha: 0.0001
-    beta: 0.75
-  }
-  bottom: "conv1"
-  top: "norm1"
-}
 layers {
   name: "pool1"
   type: POOLING
+  bottom: "conv1"
+  top: "pool1"
   pooling_param {
     pool: MAX
     kernel_size: 3
     stride: 2
   }
-  bottom: "norm1"
-  top: "pool1"
+}
+layers {
+  name: "norm1"
+  type: LRN
+  bottom: "pool1"
+  top: "norm1"
+  lrn_param {
+    local_size: 5
+    alpha: 0.0001
+    beta: 0.75
+  }
 }
 layers {
   name: "conv2"
   type: CONVOLUTION
+  bottom: "norm1"
+  top: "conv2"
   blobs_lr: 1
   blobs_lr: 2
   weight_decay: 1
@@ -60,9 +70,15 @@ layers {
     num_output: 256
     pad: 2
     kernel_size: 5
     group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
   }
-  bottom: "pool1"
-  top: "conv2"
 }
 layers {
   name: "relu2"
   type: RELU
   bottom: "conv2"
   top: "conv2"
 }
-layers {
-  name: "norm2"
-  type: LRN
-  lrn_param {
-    local_size: 5
-    alpha: 0.0001
-    beta: 0.75
-  }
-  bottom: "conv2"
-  top: "norm2"
-}
 layers {
   name: "pool2"
   type: POOLING
+  bottom: "conv2"
+  top: "pool2"
   pooling_param {
     pool: MAX
     kernel_size: 3
     stride: 2
   }
-  bottom: "norm2"
-  top: "pool2"
+}
+layers {
+  name: "norm2"
+  type: LRN
+  bottom: "pool2"
+  top: "norm2"
+  lrn_param {
+    local_size: 5
+    alpha: 0.0001
+    beta: 0.75
+  }
 }
 layers {
   name: "conv3"
   type: CONVOLUTION
+  bottom: "norm2"
+  top: "conv3"
   blobs_lr: 1
   blobs_lr: 2
   weight_decay: 1
@@ -103,9 +121,15 @@ layers {
     num_output: 384
     pad: 1
     kernel_size: 3
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
   }
-  bottom: "pool2"
-  top: "conv3"
 }
 layers {
   name: "relu3"
   type: RELU
   bottom: "conv3"
   top: "conv3"
 }
@@ -116,6 +140,8 @@ layers {
 layers {
   name: "conv4"
   type: CONVOLUTION
+  bottom: "conv3"
+  top: "conv4"
   blobs_lr: 1
   blobs_lr: 2
   weight_decay: 1
@@ -125,9 +151,15 @@ layers {
     num_output: 384
     pad: 1
     kernel_size: 3
     group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
   }
-  bottom: "conv3"
-  top: "conv4"
 }
 layers {
   name: "relu4"
   type: RELU
   bottom: "conv4"
   top: "conv4"
 }
@@ -138,6 +170,8 @@ layers {
 layers {
   name: "conv5"
   type: CONVOLUTION
+  bottom: "conv4"
+  top: "conv5"
   blobs_lr: 1
   blobs_lr: 2
   weight_decay: 1
@@ -147,9 +181,15 @@ layers {
     num_output: 256
     pad: 1
     kernel_size: 3
     group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
   }
-  bottom: "conv4"
-  top: "conv5"
 }
 layers {
   name: "relu5"
   type: RELU
   bottom: "conv5"
   top: "conv5"
 }
@@ -160,86 +200,140 @@ layers {
 layers {
   name: "pool5"
   type: POOLING
+  bottom: "conv5"
+  top: "pool5"
   pooling_param {
     pool: MAX
     kernel_size: 3
     stride: 2
   }
-  bottom: "conv5"
-  top: "pool5"
 }
 layers {
-  name: "fc6"
-  type: INNER_PRODUCT
+  name: "fc6-conv"
+  type: CONVOLUTION
+  bottom: "pool5"
+  top: "fc6-conv"
   blobs_lr: 1
   blobs_lr: 2
   weight_decay: 1
   weight_decay: 0
-  inner_product_param {
+  convolution_param {
     num_output: 4096
+    kernel_size: 6
+    pad: 3
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 1
+    }
   }
-  bottom: "pool5"
-  top: "fc6"
 }
 layers {
   name: "relu6"
   type: RELU
-  bottom: "fc6"
-  top: "fc6"
+  bottom: "fc6-conv"
+  top: "fc6-conv"
 }
 layers {
   name: "drop6"
   type: DROPOUT
+  bottom: "fc6-conv"
+  top: "fc6-conv"
   dropout_param {
-    dropout_ratio: 0.5
+    dropout_ratio: 0.0
   }
-  bottom: "fc6"
-  top: "fc6"
 }
 layers {
-  name: "fc7"
-  type: INNER_PRODUCT
+  name: "fc7-conv"
+  type: CONVOLUTION
+  bottom: "fc6-conv"
+  top: "fc7-conv"
   blobs_lr: 1
   blobs_lr: 2
   weight_decay: 1
   weight_decay: 0
-  inner_product_param {
+  convolution_param {
     num_output: 4096
+    kernel_size: 1
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 1
+    }
   }
-  bottom: "fc6"
-  top: "fc7"
 }
 layers {
   name: "relu7"
   type: RELU
-  bottom: "fc7"
-  top: "fc7"
+  bottom: "fc7-conv"
+  top: "fc7-conv"
 }
 layers {
   name: "drop7"
   type: DROPOUT
+  bottom: "fc7-conv"
+  top: "fc7-conv"
   dropout_param {
-    dropout_ratio: 0.5
+    dropout_ratio: 0.0
   }
-  bottom: "fc7"
-  top: "fc7"
 }
+
 layers {
-  name: "fc8"
-  type: INNER_PRODUCT
+  name: "bb-output"
+  type: CONVOLUTION
+  bottom: "fc7-conv"
+  top: "bb-output"
   blobs_lr: 1
   blobs_lr: 2
   weight_decay: 1
   weight_decay: 0
-  inner_product_param {
-    num_output: 1000
+  convolution_param {
+    num_output: 64
+    kernel_size: 1
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 1.0
+    }
+  }
+}
+
+layers {
+  name: "pixel-conv"
+  type: CONVOLUTION
+  bottom: "fc7-conv"
+  top: "pixel-conv"
+  blobs_lr: 1
+  blobs_lr: 2
+  weight_decay: 1
+  weight_decay: 0
+  convolution_param {
+    num_output: 16
+    kernel_size: 1
+    weight_filler {
+      type: "gaussian"
+      std: 0.005
+    }
+    bias_filler {
+      type: "constant"
+      value: 1.0
+    }
   }
-  bottom: "fc7"
-  top: "fc8"
 }
+
+# Pixel level logistic prediction.
 layers {
-  name: "prob"
-  type: SOFTMAX
-  bottom: "fc8"
-  top: "prob"
+  name: "pixel-prob"
+  type: SIGMOID
+  bottom: "pixel-conv"
+  top: "pixel-prob"
 }
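The deploy net above now takes full 640x480 frames and converts the fully connected head into convolutions, so it emits a prediction grid instead of a single vector. A quick sanity check of that grid size (a sketch, not part of the patch; it assumes Caffe's output-size arithmetic, where convolutions round down and poolings round up):

    # Trace spatial dims through the layer stack above (Python, illustrative).
    import math

    def conv(h, w, k, s=1, p=0):
        # Caffe convolution output size (floor).
        return (h + 2 * p - k) // s + 1, (w + 2 * p - k) // s + 1

    def pool(h, w, k, s):
        # Caffe pooling output size (ceil).
        return (int(math.ceil((h - k) / float(s))) + 1,
                int(math.ceil((w - k) / float(s))) + 1)

    h, w = 480, 640
    h, w = conv(h, w, k=11, s=4)  # conv1 -> 118 x 158
    h, w = pool(h, w, k=3, s=2)   # pool1 -> 59 x 79
    h, w = conv(h, w, k=5, p=2)   # conv2 -> 59 x 79
    h, w = pool(h, w, k=3, s=2)   # pool2 -> 29 x 39
    h, w = conv(h, w, k=3, p=1)   # conv3, conv4, conv5 -> 29 x 39
    h, w = pool(h, w, k=3, s=2)   # pool5 -> 14 x 19
    h, w = conv(h, w, k=6, p=3)   # fc6-conv -> 15 x 20
    print(h, w)  # (15, 20); the 1x1 layers keep this grid, which is the
                 # "20 * 15" in the loss comments in train_val_brody.prototxt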
diff --git a/models/brody/solver.prototxt b/models/brody/solver.prototxt
index 347fb123911..56bf6f47fe9 100644
--- a/models/brody/solver.prototxt
+++ b/models/brody/solver.prototxt
@@ -7,7 +7,7 @@ gamma: 0.1
 stepsize: 100000
 display: 20
 max_iter: 1450000
-momentum: 0.
+momentum: 0.9
 weight_decay: 0.00005
 snapshot: 10000
 snapshot_prefix: "models/brody/caffe_brody_train"
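Momentum was effectively disabled before; 0.9 is the usual AlexNet-style setting. For reference, a minimal sketch of the SGD update Caffe applies per learnable blob with these solver settings (illustrative Python, not repository code):

    # One SGD step with momentum and weight decay (illustrative).
    def sgd_step(w, grad, history, lr, momentum=0.9, weight_decay=0.00005):
        # Weight decay enters as an extra gradient term; momentum carries a
        # decaying average of past updates forward in the history buffer.
        history = momentum * history + lr * (grad + weight_decay * w)
        return w - history, history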
diff --git a/models/brody/train_val_brody.prototxt b/models/brody/train_val_brody.prototxt
index cad276f2ee2..085ccdf9a94 100644
--- a/models/brody/train_val_brody.prototxt
+++ b/models/brody/train_val_brody.prototxt
@@ -65,9 +65,13 @@ layers {
   bottom: "label"
   top: "pixel-label"
   top: "bb-label"
+  top: "height-label"
+  top: "norm-label"
   slice_param {
     slice_dim: 1
     slice_point: 16
+    slice_point: 80
+    slice_point: 96
   }
 }
@@ -86,6 +90,19 @@
   }
 }
 
+layers {
+  name: "height-block"
+  type: CONCAT
+  bottom: "height-label"
+  bottom: "height-label"
+  bottom: "height-label"
+  bottom: "height-label"
+  top: "height-block"
+  concat_param {
+    concat_dim: 1
+  }
+}
+
 layers {
   name: "conv1"
   type: CONVOLUTION
@@ -105,7 +122,7 @@ layers {
     }
     bias_filler {
       type: "constant"
-      value: 0
+      value: 0.1
     }
   }
 }
@@ -150,14 +167,14 @@ layers {
     num_output: 256
     pad: 2
     kernel_size: 5
-    group: 1
+    group: 2
     weight_filler {
       type: "gaussian"
       std: 0.01
     }
     bias_filler {
       type: "constant"
-      value: 1
+      value: 0.1
     }
   }
 }
@@ -208,7 +225,7 @@ layers {
     }
     bias_filler {
       type: "constant"
-      value: 0
+      value: 0.1
     }
   }
 }
@@ -231,14 +248,14 @@ layers {
     num_output: 384
     pad: 1
     kernel_size: 3
-    group: 1
+    group: 2
     weight_filler {
       type: "gaussian"
       std: 0.01
     }
     bias_filler {
       type: "constant"
-      value: 1
+      value: 0.1
     }
   }
 }
@@ -261,14 +278,14 @@ layers {
     num_output: 256
     pad: 1
     kernel_size: 3
-    group: 1
+    group: 2
     weight_filler {
       type: "gaussian"
       std: 0.01
     }
     bias_filler {
       type: "constant"
-      value: 1
+      value: 0.1
     }
   }
 }
@@ -304,7 +321,7 @@ layers {
     pad: 3
     weight_filler {
       type: "gaussian"
-      std: 0.005
+      std: 0.01
     }
     bias_filler {
       type: "constant"
@@ -324,7 +341,7 @@ layers {
   bottom: "fc6-conv"
   top: "fc6-conv"
   dropout_param {
-    dropout_ratio: 0.5
+    dropout_ratio: 0.0
   }
 }
 layers {
@@ -341,7 +358,7 @@ layers {
     kernel_size: 1
     weight_filler {
       type: "gaussian"
-      std: 0.005
+      std: 0.01
     }
     bias_filler {
       type: "constant"
@@ -361,7 +378,7 @@ layers {
   bottom: "fc7-conv"
   top: "fc7-conv"
   dropout_param {
-    dropout_ratio: 0.5
+    dropout_ratio: 0.0
   }
 }
 
@@ -370,16 +387,16 @@ layers {
   type: CONVOLUTION
   bottom: "fc7-conv"
   top: "bb-output"
-  blobs_lr: 0.001
-  blobs_lr: 2
-  weight_decay: 1
+  blobs_lr: 100
+  blobs_lr: 200
+  weight_decay: 0.00001
   weight_decay: 0
   convolution_param {
     num_output: 64
     kernel_size: 1
     weight_filler {
       type: "gaussian"
-      std: 0.005
+      std: 0.01
     }
     bias_filler {
       type: "constant"
@@ -441,37 +458,61 @@ layers {
 }
 
 # Squared loss on the bounding boxes.
+#layers {
+#  name: "bb-loss"
+#  type: EUCLIDEAN_LOSS
+#  bottom: "bb-masked-output"
+#  bottom: "bb-label"
+#  top: "bb-loss"
+#  loss_weight: 0.01
+#}
+
+# L1 error loss
 layers {
-  name: "bb-loss"
-  type: EUCLIDEAN_LOSS
+  name: "bb-diff"
+  type: ELTWISE
   bottom: "bb-masked-output"
   bottom: "bb-label"
-  top: "bb-loss"
+  eltwise_param {
+    operation: SUM
+    coeff: 1.0
+    coeff: -1.0
+  }
+  top: "bb-diff"
 }
 
-# L1 error loss
-#layers {
-#  name: "bb-diff"
-#  type: ELTWISE
-#  bottom: "bb-label"
-#  bottom: "bb-masked-output"
-#  eltwise_param {
-#    operation: SUM
-#    coeff: 1.0
-#    coeff: -1.0
-#  }
-#  top: "bb-diff"
-#}
-
 #layers {
 #  name: "bb-loss"
 #  type: ABSVAL
 #  bottom: "bb-diff"
 #  top: "bb-loss"
-#  # 1 / (batch_size * 20 * 15 * 64)
-#  loss_weight: 0.00001041666666666
+#  # 1 / (20 * 15 * 64)
+#  loss_weight: 0.00000000001
 #}
 
+layers {
+  name: "bb-loss-pow2"
+  type: POWER
+  bottom: "bb-diff"
+  top: "bb-loss-pow2"
+  # 1 / (20 * 15 * 64)
+  power_param {
+    power: 2
+  }
+}
+
+layers {
+  name: "bb-loss-normalize"
+  type: ELTWISE
+  bottom: "bb-loss-pow2"
+  bottom: "height-block"
+  eltwise_param {
+    operation: PROD
+  }
+  top: "bb-loss"
+  loss_weight: 0.001
+}
+
 #layers {
 #  name: "bb-loss-silence"
 #  type: SILENCE
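The new loss stack composes a height-normalized squared error out of primitive layers: bb-diff is an ELTWISE SUM with coefficients (1, -1), i.e. a subtraction; bb-loss-pow2 squares it; and bb-loss-normalize multiplies elementwise by height-block, the CONCAT defined earlier that tiles the 16 height-label channels (1/box-height inside a box, 0 elsewhere) four times to line up with the 64 bb channels. A numpy sketch of the objective this computes (illustrative only; shapes assume batch size N and the 20x15 grid):

    import numpy as np

    def bb_loss(bb_masked_output, bb_label, height_block, loss_weight=0.001):
        # bb-diff: ELTWISE SUM with coeffs (1, -1), i.e. a subtraction.
        diff = bb_masked_output - bb_label        # (N, 64, 15, 20)
        # bb-loss-pow2: POWER layer with power 2.
        sq = diff ** 2
        # bb-loss-normalize: ELTWISE PROD with height-block; Caffe then sums
        # the top blob and scales by loss_weight to form the scalar loss.
        return loss_weight * np.sum(sq * height_block)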
diff --git a/models/bvlc_reference_caffenet/train_val.prototxt b/models/bvlc_reference_caffenet/train_val.prototxt
index 073d8aeff4a..cdcfb61736c 100644
--- a/models/bvlc_reference_caffenet/train_val.prototxt
+++ b/models/bvlc_reference_caffenet/train_val.prototxt
@@ -5,7 +5,7 @@ layers {
   top: "data"
   top: "label"
   data_param {
-    source: "examples/imagenet/ilsvrc12_train_lmdb"
+    source: "/deep/u/willsong/data/ilsvrc12_train_lmdb"
     backend: LMDB
     batch_size: 256
   }
@@ -22,7 +22,7 @@ layers {
   top: "data"
   top: "label"
   data_param {
-    source: "examples/imagenet/ilsvrc12_val_lmdb"
+    source: "/deep/u/willsong/data/ilsvrc12_val_lmdb"
     backend: LMDB
     batch_size: 50
   }
@@ -97,7 +97,7 @@ layers {
     num_output: 256
     pad: 2
     kernel_size: 5
-    group: 2
+    group: 1
     weight_filler {
       type: "gaussian"
       std: 0.01
@@ -178,7 +178,7 @@ layers {
     num_output: 384
     pad: 1
     kernel_size: 3
-    group: 2
+    group: 1
     weight_filler {
       type: "gaussian"
       std: 0.01
@@ -208,7 +208,7 @@ layers {
     num_output: 256
     pad: 1
     kernel_size: 3
-    group: 2
+    group: 1
     weight_filler {
       type: "gaussian"
       std: 0.01
diff --git a/python/caffe/classifier.py b/python/caffe/classifier.py
index fe471ca13b1..ddc2467cbbb 100644
--- a/python/caffe/classifier.py
+++ b/python/caffe/classifier.py
@@ -92,3 +92,22 @@ def predict(self, inputs, oversample=True):
             predictions = predictions.mean(1)
 
         return predictions
+
+
+    def ff(self, inputs):
+        # Scale to standardize input dimensions.
+        input_ = np.zeros((len(inputs),
+            self.image_dims[0], self.image_dims[1], inputs[0].shape[2]),
+            dtype=np.float32)
+        for ix, in_ in enumerate(inputs):
+            input_[ix] = caffe.io.resize_image(in_, self.image_dims)
+
+        # Generate center, corner, and mirrored crops.
+        input_ = caffe.io.oversample(input_, self.crop_dims)
+
+        # Classify
+        caffe_in = np.zeros(np.array(input_.shape)[[0,3,1,2]],
+                            dtype=np.float32)
+        for ix, in_ in enumerate(input_):
+            caffe_in[ix] = self.preprocess(self.inputs[0], in_)
+        out = self.forward_all(**{self.inputs[0]: caffe_in})
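The new ff() mirrors predict() but, as committed, discards the result of forward_all(), so callers read the dense maps off the net's blobs after the call. A hypothetical driver for the detection net (the snapshot filename, image path, and shape comments are assumptions, not part of the patch):

    import caffe

    net = caffe.Classifier(
        'models/brody/deploy.prototxt',
        'models/brody/caffe_brody_train_iter_10000.caffemodel',  # hypothetical snapshot
        image_dims=(480, 640))
    img = caffe.io.load_image('frame.jpg')  # placeholder image
    net.ff([img])
    pixel_prob = net.blobs['pixel-prob'].data  # (N, 16, 15, 20) mask probabilities
    bb = net.blobs['bb-output'].data           # (N, 64, 15, 20) box coordinates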
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index ba262920a9a..bd28a54229a 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -197,16 +197,18 @@ void Solver<Dtype>::Solve(const char* resume_file) {
           net_->blob_names()[net_->output_blob_indices()[j]];
       const Dtype loss_weight =
           net_->blob_loss_weights()[net_->output_blob_indices()[j]];
+      double loss_sum = 0.0;
       for (int k = 0; k < result[j]->count(); ++k) {
-        ostringstream loss_msg_stream;
-        if (loss_weight) {
-          loss_msg_stream << " (* " << loss_weight
-                          << " = " << loss_weight * result_vec[k] << " loss)";
-        }
-        LOG(INFO) << "    Train net output #"
-            << score_index++ << ": " << output_name << " = "
-            << result_vec[k] << loss_msg_stream.str();
+        loss_sum += result_vec[k];
       }
+      ostringstream loss_msg_stream;
+      if (loss_weight) {
+        loss_msg_stream << " (* " << loss_weight
+                        << " = " << loss_weight * loss_sum << " loss)";
+      }
+      LOG(INFO) << "    Train net output #"
+          << score_index++ << ": " << output_name << " = "
+          << loss_sum << loss_msg_stream.str();
     }
   }
 
@@ -284,20 +286,23 @@ void Solver<Dtype>::Test(const int test_net_id) {
     loss /= param_.test_iter(test_net_id);
     LOG(INFO) << "Test loss: " << loss;
   }
+  double sum_mean_score = 0.0;
   for (int i = 0; i < test_score.size(); ++i) {
-    const int output_blob_index =
-        test_net->output_blob_indices()[test_score_output_id[i]];
-    const string& output_name = test_net->blob_names()[output_blob_index];
-    const Dtype loss_weight = test_net->blob_loss_weights()[output_blob_index];
-    ostringstream loss_msg_stream;
     const Dtype mean_score = test_score[i] / param_.test_iter(test_net_id);
-    if (loss_weight) {
-      loss_msg_stream << " (* " << loss_weight
-                      << " = " << loss_weight * mean_score << " loss)";
-    }
-    LOG(INFO) << "    Test net output #" << i << ": " << output_name << " = "
-        << mean_score << loss_msg_stream.str();
+    sum_mean_score += mean_score;
   }
+  const int output_blob_index =
+      test_net->output_blob_indices()[test_score_output_id[0]];
+  const string& output_name = test_net->blob_names()[output_blob_index];
+  const Dtype loss_weight = test_net->blob_loss_weights()[output_blob_index];
+  ostringstream loss_msg_stream;
+  if (loss_weight) {
+    loss_msg_stream << " (* " << loss_weight
+                    << " = " << loss_weight * sum_mean_score << " loss)";
+  }
+  LOG(INFO) << "    Test net output #" << 0 << ": " << output_name << " = "
+      << sum_mean_score << loss_msg_stream.str();
+
   Caffe::set_phase(Caffe::TRAIN);
 }
diff --git a/tools/convert_detection_label.cpp b/tools/convert_detection_label.cpp
index 8d1f6b9a562..a8d27f73b6c 100644
--- a/tools/convert_detection_label.cpp
+++ b/tools/convert_detection_label.cpp
@@ -59,13 +59,18 @@ bool ReadBoundingBoxLabelToDatum(
   const int img_height = height * grid_dim;
 
   // 1 pixel label, 4 bounding box coordinates.
+  const int num_mask_label = 1;
+  const int num_bb_labels = 4;
+  const int num_norm_labels = 2;
+  const int num_total_labels = num_mask_label + num_bb_labels + num_norm_labels;
   vector<cv::Mat*> labels;
-  for (int i = 0; i < 5; ++i) {
+  for (int i = 0; i < num_total_labels; ++i) {
     labels.push_back(
         new cv::Mat(img_height, img_width, CV_32F, cv::Scalar(0.0)));
   }
 
-  CHECK_EQ(bbs.size() % 4, 0);
+  CHECK_EQ(bbs.size() % num_bb_labels, 0);
+  int total_num_pixels = 0;
   for (int i = 0; i < bbs.size(); i += 4) {
     float xmin = bbs[i];
     float ymin = bbs[i + 1];
@@ -96,28 +101,45 @@ bool ReadBoundingBoxLabelToDatum(
     cv::Rect r(gxmin, gymin,
                gxmax - gxmin + (gxmax == gxmin && gxmax < img_width ? 1 : 0),
                gymax - gymin + (gymax == gymin && gymax < img_height ? 1 : 0));
-    float flabels[5] = {1.0, xmin, ymin, xmax, ymax};
-    for (int j = 0; j < 5; ++j) {
+
+    total_num_pixels += r.area();
+    int normalization_height = ymax - ymin == 0 ? 1 : ymax - ymin;
+    CHECK_GT(normalization_height, 0);
+    float flabels[num_total_labels] =
+        {1.0, xmin, ymin, xmax, ymax, 1.0 / normalization_height, 1.0};
+    for (int j = 0; j < num_total_labels; ++j) {
       cv::Mat roi(*labels[j], r);
       roi = cv::Scalar(flabels[j]);
     }
   }
 
-  datum->set_channels(5 * grid_dim * grid_dim);
+  if (total_num_pixels == 0) {
+    total_num_pixels = 1;
+  }
+  float reweight_value = 1.0 / total_num_pixels;
+  for (int y = 0; y < img_height; ++y) {
+    for (int x = 0; x < img_width; ++x) {
+      if (labels[num_total_labels - 1]->at<float>(y, x) == 1.0) {
+        labels[num_total_labels - 1]->at<float>(y, x) = reweight_value;
+      }
+    }
+  }
+
+  datum->set_channels(num_total_labels * grid_dim * grid_dim);
   datum->set_height(height);
   datum->set_width(width);
   datum->set_label(0);  // dummy label
   datum->clear_data();
   datum->clear_float_data();
-  for (int m = 0; m < 5; ++m) {
+  for (int m = 0; m < num_total_labels; ++m) {
     for (int dy = 0; dy < grid_dim; ++dy) {
       for (int dx = 0; dx < grid_dim; ++dx) {
         for (int y = 0; y < img_height; y += grid_dim) {
           for (int x = 0; x < img_width; x += grid_dim) {
             float adjustment = 0;
             float val = labels[m]->at<float>(y + dy, x + dx);
-            if (m == 0) {
+            if (m == 0 || m > 4) {
               // do nothing
             } else if (labels[0]->at<float>(y + dy, x + dx) == 0.0) {
               // do nothing
@@ -136,9 +158,9 @@ bool ReadBoundingBoxLabelToDatum(
       }
     }
   }
-  CHECK_EQ(datum->float_data_size(), 5 * img_height * img_width);
-  for (int i = 0; i < 5; ++i) {
+  CHECK_EQ(datum->float_data_size(), num_total_labels * img_height * img_width);
+  for (int i = 0; i < num_total_labels; ++i) {
     delete labels[i];
   }
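For reference, the datum written by ReadBoundingBoxLabelToDatum stores each of the 7 label maps pixel-interleaved into grid_dim^2 channels; the 16/80/96 slice points in train_val_brody.prototxt are consistent with grid_dim = 4, i.e. 112 channels in total. A numpy sketch of the inverse de-interleave, handy for inspecting converted labels (illustrative only; the bounding-box channels carry the writer's per-cell adjustment):

    import numpy as np

    def decode_label_datum(data, grid_dim=4, num_total_labels=7):
        # data: (num_total_labels * grid_dim**2, height, width) float array
        # pulled from the datum. Returns full-resolution maps of shape
        # (num_total_labels, height*grid_dim, width*grid_dim): mask, xmin,
        # ymin, xmax, ymax, 1/box-height, and the reweight channel.
        c, h, w = data.shape
        assert c == num_total_labels * grid_dim ** 2
        full = np.zeros((num_total_labels, h * grid_dim, w * grid_dim),
                        dtype=data.dtype)
        i = 0
        for m in range(num_total_labels):
            for dy in range(grid_dim):
                for dx in range(grid_dim):
                    full[m, dy::grid_dim, dx::grid_dim] = data[i]
                    i += 1
        return full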