more changes and bug fixes for driving network
cheeyos committed Jan 26, 2015
1 parent 3258f77 commit a149449
Showing 15 changed files with 207 additions and 122 deletions.
209 changes: 127 additions & 82 deletions examples/filter_visualization_car_deeppy.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions include/caffe/blob.hpp
@@ -100,6 +100,7 @@ class Blob {
   Dtype* mutable_gpu_diff();
   void Update();
   void FromProto(const BlobProto& proto);
+  void FromProtoDataOnly(const BlobProto& proto);
   void FromProtoReplicate(const BlobProto& proto, const int num_replicates);
   void ToProto(BlobProto* proto, bool write_diff = false) const;
 
2 changes: 1 addition & 1 deletion models/brody/solver_driving.prototxt
@@ -2,7 +2,7 @@ net: "models/brody/train_val_driving.prototxt"
 test_iter: 20
 test_interval: 5000
 test_compute_loss: true
-base_lr: 0.0000001
+base_lr: 0.001
 lr_policy: "step"
 gamma: 0.1
 stepsize: 100000
6 changes: 3 additions & 3 deletions models/brody/solver_driving_softmax.prototxt
@@ -2,14 +2,14 @@ net: "models/brody/train_val_driving_softmax.prototxt"
 test_iter: 20
 test_interval: 5000
 test_compute_loss: true
-base_lr: 0.0000001
+base_lr: 0.001
 lr_policy: "step"
 gamma: 0.1
 stepsize: 100000
 display: 20
 max_iter: 1450000
 momentum: 0.9
 weight_decay: 0.00005
-snapshot: 10000
-snapshot_prefix: "models/brody/caffe_driving_softmax"
+snapshot: 1000
+snapshot_prefix: "models/brody/driving_softmax_8x8"
 solver_mode: GPU
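
Note: both driving solvers use Caffe's "step" policy, under which the effective rate is lr = base_lr * gamma^floor(iter / stepsize): 1e-3 for the first 100k iterations, 1e-4 for the next 100k, and so on. The old base_lr of 1e-7 (presumably a debugging leftover) would have made the updates negligible from the start, and the more frequent snapshots plus the new _8x8 prefix suggest a fresh run at the higher label resolution.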
18 changes: 9 additions & 9 deletions models/brody/train_val_driving_softmax.prototxt
@@ -7,7 +7,7 @@ layers {
   top: "data"
   top: "label"
   data_param {
-    source: "driving_train"
+    source: "new_driving_train"
     backend: LMDB
     batch_size: 5
   }
@@ -24,7 +24,7 @@ layers {
   top: "data"
   top: "label"
   data_param {
-    source: "driving_test"
+    source: "new_driving_test"
     backend: LMDB
     batch_size: 5
   }
@@ -363,12 +363,12 @@ layers {
   type: CONVOLUTION
   bottom: "fc7-conv"
   top: "bb-output"
-  blobs_lr: 100
-  blobs_lr: 200
-  weight_decay: 0.00001
+  blobs_lr: 10
+  blobs_lr: 20
+  weight_decay: 0.1
   weight_decay: 0
   convolution_param {
-    num_output: 64
+    num_output: 256
     kernel_size: 1
     weight_filler {
       type: "gaussian"
@@ -391,7 +391,7 @@ layers {
   weight_decay: 1
   weight_decay: 0
   convolution_param {
-    num_output: 32
+    num_output: 128
     kernel_size: 1
     weight_filler {
       type: "gaussian"
@@ -410,7 +410,7 @@ layers {
   bottom: "pixel-conv"
   top: "pixel-conv-tiled"
   tiling_param {
-    tile_dim: 4
+    tile_dim: 8
   }
 }

@@ -420,7 +420,7 @@ layers {
   bottom: "bb-output"
   top: "bb-output-tiled"
   tiling_param {
-    tile_dim: 4
+    tile_dim: 8
   }
 }
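
Note: these constants are coupled. Assuming the TILING layers rearrange each group of tile_dim^2 channels into one tile_dim x tile_dim spatial block, pixel-conv's 128 outputs are 2 softmax classes x 8 x 8, bb-output's 256 are 4 box coordinates x 8 x 8 (the old 32 and 64 matched tile_dim 4), and grid_dim in tools/convert_driving_data.cpp changes to 8 below to keep the labels in step. A minimal shape-bookkeeping sketch of that assumption; the exact channel-to-pixel interleaving is hypothetical:

    // 128 channels @ 20x15 -> 2 channels @ 160x120 (element counts match: 38400).
    #include <cstdio>
    #include <vector>

    int main() {
      const int tile = 8, classes = 2, H = 15, W = 20;
      std::vector<float> in(classes * tile * tile * H * W);
      std::vector<float> out(classes * H * tile * W * tile);
      for (int k = 0; k < classes; ++k)
        for (int ty = 0; ty < tile; ++ty)
          for (int tx = 0; tx < tile; ++tx)
            for (int y = 0; y < H; ++y)
              for (int x = 0; x < W; ++x) {
                const int c = (k * tile + ty) * tile + tx;         // input channel
                const int oy = y * tile + ty, ox = x * tile + tx;  // output pixel
                out[(k * H * tile + oy) * W * tile + ox] = in[(c * H + y) * W + x];
              }
      printf("%d ch @ %dx%d -> %d ch @ %dx%d\n", classes * tile * tile, W, H,
             classes, W * tile, H * tile);
      return 0;
    }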

9 changes: 5 additions & 4 deletions python/convert_mean.py
@@ -21,20 +21,21 @@ def main(argv):
     mean_data = np.array(mean_data.data)
     print mean_data.shape
     mean_img = mean_data.reshape([3, 480, 640])
-    mean_img = mean_img[(2, 1, 0), :, :]
-    np.save(open('new_driving_mean.npy', 'wb'), mean_img)
+    # mean_img = mean_img[(2, 1, 0), :, :]
+    # np.save(open('new_driving_mean.npy', 'wb'), mean_img)
+
+    """
     mean_img = np.transpose(mean_img, (1, 2, 0))
-    Image.fromarray(mean_img.astype('uint8')).save('mean.png')
+    Image.fromarray(mean_img.astype('uint8')).save('test_mean.png')
 
-    """
     real_img = caffe.io.load_image( \
         '/deep/group/driving_data/andriluka/IMAGES/driving_data_q50_data/all_extracted/4-2-14-monterey-split_0_280S_a2/4-2-14-monterey-split_0_280S_a2_000341.jpeg')
     real_img = caffe.io.resize_image(real_img * 255, (480, 640, 3))
     Image.fromarray(real_img.astype('uint8')).save('original.png')
     Image.fromarray(np.clip(real_img - mean_img, 0, 255).astype('uint8')).save('sub.png')
     """
 
 
 if __name__ == '__main__':
     import sys
     main(sys.argv)
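
Note: the mean blob is stored channel-first (3 x 480 x 640), and a mean computed from an OpenCV-backed LMDB is typically in BGR channel order; the now-disabled (2, 1, 0) indexing reversed the channel axis to RGB before writing new_driving_mean.npy. Commenting out the np.save as well suggests the .npy had already been generated and this run was only for inspection.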
8 changes: 8 additions & 0 deletions src/caffe/blob.cpp
@@ -265,6 +265,14 @@ void Blob<Dtype>::FromProtoReplicate(const BlobProto& proto, const int num_repli
   }
 }
 
+template <typename Dtype>
+void Blob<Dtype>::FromProtoDataOnly(const BlobProto& proto) {
+  // copy data
+  Dtype* data_vec = mutable_cpu_data();
+  for (int i = 0; i < count_; ++i) {
+    data_vec[i] = proto.data(i);
+  }
+}
 
 template <typename Dtype>
 void Blob<Dtype>::ToProto(BlobProto* proto, bool write_diff) const {
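
Note: unlike FromProto, FromProtoDataOnly copies only the data array (it neither reshapes the blob nor copies the diff), so the proto's values are reinterpreted under the target blob's existing shape. It also assumes proto.data_size() matches count_; a CHECK_EQ(count_, proto.data_size()) at the top would make that explicit, since the caller in net.cpp below only verifies that total element counts agree.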
30 changes: 22 additions & 8 deletions src/caffe/layers/driving_data_layer.cpp
@@ -283,7 +283,6 @@ bool DrivingDataLayer<Dtype>::ReadBoundingBoxLabelToDatumLegacy(
         new cv::Mat(full_label_height, full_label_width, CV_32F, cv::Scalar(0.0)));
   }
 
-  int total_num_pixels = 0;
   for (int i = 0; i < data.car_boxes_size(); ++i) {
     int xmin = data.car_boxes(i).xmin();
     int ymin = data.car_boxes(i).ymin();
@@ -319,7 +318,6 @@ bool DrivingDataLayer<Dtype>::ReadBoundingBoxLabelToDatumLegacy(
         gxmax - gxmin + (gxmax == gxmin && gxmax < full_label_width ? 1 : 0),
         gymax - gymin + (gymax == gymin && gymax < full_label_height ? 1 : 0));
 
-    total_num_pixels += r.area();
     int normalization_height = ymax - ymin == 0 ? 1 : ymax - ymin;
     CHECK_GT(normalization_height, 0);
     int normalization_width = xmax - xmin == 0 ? 1 : xmax - xmin;
@@ -339,6 +337,15 @@ bool DrivingDataLayer<Dtype>::ReadBoundingBoxLabelToDatumLegacy(
     }
   }
 
+
+  int total_num_pixels = 0;
+  for (int y = 0; y < full_label_height; ++y) {
+    for (int x = 0; x < full_label_width; ++x) {
+      if (labels[num_total_labels - 1]->at<float>(y, x) == 1.0) {
+        total_num_pixels++;
+      }
+    }
+  }
   if (total_num_pixels != 0) {
     float reweight_value = 1.0 / total_num_pixels;
     for (int y = 0; y < full_label_height; ++y) {
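
Note: this recount is the bug fix. The removed code summed r.area() over all boxes, so pixels covered by overlapping boxes were counted more than once even though each mask pixel is only ever set to 1, which made reweight_value = 1.0 / total_num_pixels too small. Counting the pixels actually set in the last label channel yields the correct normalizer. A standalone OpenCV illustration (box coordinates made up):

    #include <opencv2/core/core.hpp>
    #include <cstdio>

    int main() {
      cv::Mat mask(60, 80, CV_32F, cv::Scalar(0.0));
      cv::Rect a(0, 0, 10, 10), b(5, 5, 10, 10);  // boxes overlap on a 5x5 patch
      mask(a) = cv::Scalar(1.0);
      mask(b) = cv::Scalar(1.0);
      printf("old normalizer: %d, new normalizer: %d\n",
             a.area() + b.area(),       // 200: the overlap is counted twice
             cv::countNonZero(mask));   // 175: distinct pixels actually set
      return 0;
    }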
@@ -399,6 +406,7 @@ bool DrivingDataLayer<Dtype>::ReadBoundingBoxLabelToDatum(
   const int height = data.car_label_height();
   const int full_label_width = width * grid_dim;
   const int full_label_height = height * grid_dim;
+  const float half_shrink_factor = data.car_shrink_factor() / 2;
   const float scaling = static_cast<float>(full_label_width) / data.car_cropped_width();
 
   // 1 pixel label, 4 bounding box coordinates, 2 normalization labels.
@@ -412,7 +420,6 @@ bool DrivingDataLayer<Dtype>::ReadBoundingBoxLabelToDatum(
         new cv::Mat(full_label_height, full_label_width, CV_32F, cv::Scalar(0.0)));
   }
 
-  int total_num_pixels = 0;
   for (int i = 0; i < data.car_boxes_size(); ++i) {
     int xmin = data.car_boxes(i).xmin();
     int ymin = data.car_boxes(i).ymin();
@@ -425,10 +432,10 @@ bool DrivingDataLayer<Dtype>::ReadBoundingBoxLabelToDatum(
     float w = xmax - xmin;
     float h = ymax - ymin;
     // shrink bboxes
-    int gxmin = cvRound((xmin + w / 4) * scaling);
-    int gxmax = cvRound((xmax - w / 4) * scaling);
-    int gymin = cvRound((ymin + h / 4) * scaling);
-    int gymax = cvRound((ymax - h / 4) * scaling);
+    int gxmin = cvRound((xmin + w * half_shrink_factor) * scaling);
+    int gxmax = cvRound((xmax - w * half_shrink_factor) * scaling);
+    int gymin = cvRound((ymin + h * half_shrink_factor) * scaling);
+    int gymax = cvRound((ymax - h * half_shrink_factor) * scaling);
 
     CHECK_LE(gxmin, gxmax);
     CHECK_LE(gymin, gymax);
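
Note: with the proto default car_shrink_factor = 0.5, half_shrink_factor is 0.25 and these lines reproduce the old hard-coded w / 4 and h / 4 shrink exactly. The converter below passes 0.75 instead, trimming 0.375 of the width and height from each side so the mask keeps only the central 1 - 2 * 0.375 = 0.25 of each box dimension.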
@@ -448,7 +455,6 @@ bool DrivingDataLayer<Dtype>::ReadBoundingBoxLabelToDatum(
         gxmax - gxmin + (gxmax == gxmin && gxmax < full_label_width ? 1 : 0),
         gymax - gymin + (gymax == gymin && gymax < full_label_height ? 1 : 0));
 
-    total_num_pixels += r.area();
     int normalization_height = ymax - ymin == 0 ? 1 : ymax - ymin;
     CHECK_GT(normalization_height, 0);
     int normalization_width = xmax - xmin == 0 ? 1 : xmax - xmin;
@@ -468,6 +474,14 @@ bool DrivingDataLayer<Dtype>::ReadBoundingBoxLabelToDatum(
     }
   }
 
+  int total_num_pixels = 0;
+  for (int y = 0; y < full_label_height; ++y) {
+    for (int x = 0; x < full_label_width; ++x) {
+      if (labels[num_total_labels - 1]->at<float>(y, x) == 1.0) {
+        total_num_pixels++;
+      }
+    }
+  }
   if (total_num_pixels != 0) {
     float reweight_value = 1.0 / total_num_pixels;
     for (int y = 0; y < full_label_height; ++y) {
2 changes: 2 additions & 0 deletions src/caffe/layers/l1_loss_layer.cu
@@ -20,6 +20,7 @@ void L1LossLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   caffe_gpu_asum(count, diff_.gpu_data(), &abs_sum);
   caffe_gpu_sign(count, diff_.gpu_data(), sign_.mutable_gpu_data());
   Dtype loss = abs_sum / bottom[0]->num();
+  // Dtype loss = abs_sum / bottom[0]->count();
   (*top)[0]->mutable_cpu_data()[0] = loss;
 }
 
@@ -30,6 +31,7 @@ void L1LossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     if (propagate_down[i]) {
       const Dtype sign = (i == 0) ? 1 : -1;
       const Dtype alpha = sign * top[0]->cpu_diff()[0] / (*bottom)[i]->num();
+      // const Dtype alpha = sign * top[0]->cpu_diff()[0] / (*bottom)[i]->count();
       caffe_gpu_axpby(
           (*bottom)[i]->count(),              // count
          alpha,                              // alpha
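
Note: the commented-out variants would normalize by count() (num x channels x height x width) rather than num() (batch size alone), i.e. average the L1 loss per element instead of per image. Whichever normalizer is chosen, Forward_gpu and Backward_gpu must use the same one (hence the paired comments), and per-element averaging would keep the gradient scale constant as the tiled label map grows.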
12 changes: 6 additions & 6 deletions src/caffe/layers/lrn_fixed_layer.cpp
@@ -110,12 +110,11 @@ void LRNFixedLayer<Dtype>::CrossChannelForward_cpu(
   Dtype* scale_data = scale_.mutable_cpu_data();
   // start with the constant value
   for (int i = 0; i < scale_.count(); ++i) {
-    scale_data[i] = 1.;
+    scale_data[i] = 2.;
   }
   Blob<Dtype> padded_square(1, channels_ + size_ - 1, height_, width_);
   Dtype* padded_square_data = padded_square.mutable_cpu_data();
   caffe_set(padded_square.count(), Dtype(0), padded_square_data);
-  Dtype alpha_over_size = alpha_ / size_;
   // go through the images
   for (int n = 0; n < num_; ++n) {
     // compute the padded square
@@ -124,7 +123,7 @@ void LRNFixedLayer<Dtype>::CrossChannelForward_cpu(
         padded_square_data + padded_square.offset(0, pre_pad_));
     // Create the first channel scale
     for (int c = 0; c < size_; ++c) {
-      caffe_axpy<Dtype>(height_ * width_, alpha_over_size,
+      caffe_axpy<Dtype>(height_ * width_, alpha_,
          padded_square_data + padded_square.offset(0, c),
          scale_data + scale_.offset(n, 0));
    }
@@ -134,11 +133,11 @@ void LRNFixedLayer<Dtype>::CrossChannelForward_cpu(
           scale_data + scale_.offset(n, c - 1),
           scale_data + scale_.offset(n, c));
       // add head
-      caffe_axpy<Dtype>(height_ * width_, alpha_over_size,
+      caffe_axpy<Dtype>(height_ * width_, alpha_,
           padded_square_data + padded_square.offset(0, c + size_ - 1),
           scale_data + scale_.offset(n, c));
       // subtract tail
-      caffe_axpy<Dtype>(height_ * width_, -alpha_over_size,
+      caffe_axpy<Dtype>(height_ * width_, -alpha_,
           padded_square_data + padded_square.offset(0, c - 1),
           scale_data + scale_.offset(n, c));
     }
@@ -190,7 +189,8 @@ void LRNFixedLayer<Dtype>::CrossChannelBackward_cpu(
   // We hack a little bit by using the diff() to store an additional result
   Dtype* accum_ratio_times_bottom = accum_ratio.mutable_cpu_diff();
   caffe_set(padded_ratio.count(), Dtype(0), padded_ratio_data);
-  Dtype cache_ratio_value = 2. * alpha_ * beta_ / size_;
+  // Dtype cache_ratio_value = 2. * alpha_ * beta_ / size_;
+  Dtype cache_ratio_value = 2. * alpha_ * beta_;
 
   caffe_powx<Dtype>(scale_.count(), scale_data, -beta_, bottom_diff);
   caffe_mul<Dtype>(scale_.count(), top_diff, bottom_diff, bottom_diff);
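
Note: together with the 1. -> 2. constant above, dropping / size_ switches this layer from stock Caffe's normalization, scale_i = 1 + (alpha/n) * sum_j x_j^2, to the AlexNet-paper parameterization, scale_i = 2 + alpha * sum_j x_j^2, with output x_i * scale_i^(-beta). The backward constant 2 * alpha * beta (updated here and in the CUDA kernel below) is the derivative bookkeeping for that same expression, keeping forward and backward consistent.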
2 changes: 1 addition & 1 deletion src/caffe/layers/lrn_fixed_layer.cu
@@ -183,7 +183,7 @@ void LRNFixedLayer<Dtype>::CrossChannelBackward_gpu(
   LRNFixedComputeDiff<<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS>>>(
       n_threads, (*bottom)[0]->gpu_data(), top[0]->gpu_data(),
       scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_,
-      size_, -beta_, Dtype(2. * alpha_ * beta_ / size_),
+      size_, -beta_, Dtype(2. * alpha_ * beta_),
       (*bottom)[0]->mutable_gpu_diff());
 }
 
4 changes: 2 additions & 2 deletions src/caffe/layers/softmax_loss_layer.cpp
@@ -48,7 +48,7 @@ void SoftmaxWithLossLayer<Dtype>::Forward_cpu(
                        Dtype(FLT_MIN)));
     }
   }
-  (*top)[0]->mutable_cpu_data()[0] = loss / num;
+  (*top)[0]->mutable_cpu_data()[0] = loss / num / spatial_dim;
   if (top->size() == 2) {
     (*top)[1]->ShareData(prob_);
   }
@@ -78,7 +78,7 @@ void SoftmaxWithLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
   // Scale gradient
   const Dtype loss_weight = top[0]->cpu_diff()[0];
-  caffe_scal(prob_.count(), loss_weight / num, bottom_diff);
+  caffe_scal(prob_.count(), loss_weight / num / spatial_dim, bottom_diff);
 }
 }
 
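
Note: in this fully convolutional setup the softmax loss is summed over all spatial_dim = height x width positions of the label map, so dividing by num alone makes the loss (and, via Backward_cpu, the gradient) scale with the output resolution. Dividing by num * spatial_dim averages per pixel instead, which keeps base_lr meaningful when the label map grows; the move from 4x4 to 8x8 tiles quadruples spatial_dim.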
20 changes: 15 additions & 5 deletions src/caffe/net.cpp
@@ -720,18 +720,28 @@ void Net<Dtype>::CopyTrainedLayersFrom(const NetParameter& param) {
           << source_layer.blobs(j).channels() << "x"
           << source_layer.blobs(j).height() << "x"
           << source_layer.blobs(j).width();
-      CHECK_EQ(target_blobs[j]->num(), source_layer.blobs(j).num());
-      CHECK_EQ(target_blobs[j]->channels(), source_layer.blobs(j).channels());
-      CHECK_EQ(target_blobs[j]->height(), source_layer.blobs(j).height());
       //CHECK_EQ(target_blobs[j]->width(), source_layer.blobs(j).width());
-      if (target_blobs[j]->width() == source_layer.blobs(j).width()) {
+      if (target_blobs[j]->width() == source_layer.blobs(j).width() &&
+          target_blobs[j]->height() == source_layer.blobs(j).height() &&
+          target_blobs[j]->channels() == source_layer.blobs(j).channels() &&
+          target_blobs[j]->num() == source_layer.blobs(j).num()) {
         target_blobs[j]->FromProto(source_layer.blobs(j));
-      } else if (target_blobs[j]->width() > source_layer.blobs(j).width()) {
+      } else if (target_blobs[j]->width() > source_layer.blobs(j).width() &&
+          target_blobs[j]->height() == source_layer.blobs(j).height() &&
+          target_blobs[j]->channels() == source_layer.blobs(j).channels() &&
+          target_blobs[j]->num() == source_layer.blobs(j).num()) {
         LOG(INFO) << "### WARNING: source target dimension is less than target";
         const int num_replicates = target_blobs[j]->width()
             / source_layer.blobs(j).width();
         CHECK_EQ(target_blobs[j]->width() % source_layer.blobs(j).width(), 0);
         target_blobs[j]->FromProtoReplicate(source_layer.blobs(j), num_replicates);
+      } else if (
+          target_blobs[j]->width() * target_blobs[j]->height() *
+          target_blobs[j]->channels() * target_blobs[j]->num() ==
+          source_layer.blobs(j).width() * source_layer.blobs(j).height() *
+          source_layer.blobs(j).channels() * source_layer.blobs(j).num()) {
+        LOG(INFO) << "### WARNING: source target dimension only match by total";
+        target_blobs[j]->FromProtoDataOnly(source_layer.blobs(j));
       } else {
         CHECK(false) << "dimension mismatched";
       }
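
Note: weight loading now falls through three cases: an exact shape match uses FromProto; a target that is an integer multiple wider (other dimensions equal) uses FromProtoReplicate; and shapes that agree only in total element count use the new FromProtoDataOnly, effectively a flat copy reinterpreted under the target shape. The last case is what allows net-surgery-style reuse of pretrained weights, e.g. recasting fully connected weights as convolution kernels with the same parameter count.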
2 changes: 2 additions & 0 deletions src/caffe/proto/caffe.proto
@@ -65,6 +65,8 @@ message DrivingData {
   optional int32 car_label_height = 10 [default = 15];
   // Tiling dimensions.
   optional int32 car_label_resolution = 11 [default = 4];
+  // Shrink factor for the car prediction mask.
+  optional float car_shrink_factor = 12 [default = 0.5];
 }
 
 message FillerParameter {
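
Note: because field 12 is optional with a default, DrivingData records serialized before this commit still parse, and car_shrink_factor() returns 0.5 for them, which reproduces the previous w / 4 shrink behavior in driving_data_layer.cpp.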
4 changes: 3 additions & 1 deletion tools/convert_driving_data.cpp
@@ -46,7 +46,7 @@ DEFINE_bool(shuffle, true,
     "Randomly shuffle the order of images and their labels");
 DEFINE_int32(width, 20, "Number of grids horizontally.");
 DEFINE_int32(height, 15, "Number of grids vertically.");
-DEFINE_int32(grid_dim, 4, "grid_dim x grid_dim number of pixels per each grid.");
+DEFINE_int32(grid_dim, 8, "grid_dim x grid_dim number of pixels per each grid.");
 DEFINE_int32(num_info_per_box, 4, "number of fields per box.");
 DEFINE_int32(resize_width, 640 + 32, "Width images are resized to");
 DEFINE_int32(resize_height, 480 + 32, "Height images are resized to");
@@ -132,6 +132,8 @@ int main(int argc, char** argv) {
   LOG(ERROR) << "Total to be processed: " << lines.size() << ".\n";
   for (int line_id = 0; line_id < lines.size(); ++line_id) {
     DrivingData data;
+    data.set_car_label_resolution(FLAGS_grid_dim);
+    data.set_car_shrink_factor(0.75);
     const string image_path = root_folder + lines[line_id].first;
     data.set_car_img_source(image_path);
     const vector<int>& bbs = lines[line_id].second;
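
Note: with width 20, height 15, and grid_dim 8, the generated label mask is 160 x 120 (up from 80 x 60), matching tile_dim: 8 in train_val_driving_softmax.prototxt. The shrink factor is hard-coded to 0.75 here rather than exposed as a flag, so changing it means regenerating the LMDB.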
