diff --git a/src/caffe/layers/dropout_fixed_layer.cpp b/src/caffe/layers/dropout_fixed_layer.cpp new file mode 100644 index 00000000000..405a9813976 --- /dev/null +++ b/src/caffe/layers/dropout_fixed_layer.cpp @@ -0,0 +1,79 @@ +// TODO (sergeyk): effect should not be dependent on phase. wasted memcpy. + +#include + +#include "caffe/common.hpp" +#include "caffe/layer.hpp" +#include "caffe/syncedmem.hpp" +#include "caffe/util/math_functions.hpp" +#include "caffe/vision_layers.hpp" + +namespace caffe { + +template +void DropoutFixedLayer::LayerSetUp(const vector*>& bottom, + vector*>* top) { + NeuronLayer::LayerSetUp(bottom, top); + threshold_ = this->layer_param_.dropout_param().dropout_ratio(); + DCHECK(threshold_ > 0.); + DCHECK(threshold_ < 1.); + scale_ = 1. / (1. - threshold_); + uint_thres_ = static_cast(UINT_MAX * threshold_); +} + +template +void DropoutFixedLayer::Reshape(const vector*>& bottom, + vector*>* top) { + NeuronLayer::Reshape(bottom, top); + // Set up the cache for random number generation + rand_vec_.Reshape(bottom[0]->num(), bottom[0]->channels(), + bottom[0]->height(), bottom[0]->width()); +} + +template +void DropoutFixedLayer::Forward_cpu(const vector*>& bottom, + vector*>* top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = (*top)[0]->mutable_cpu_data(); + unsigned int* mask = rand_vec_.mutable_cpu_data(); + const int count = bottom[0]->count(); + if (Caffe::phase() == Caffe::TRAIN) { + // Create random numbers + caffe_rng_bernoulli(count, 1. - threshold_, mask); + for (int i = 0; i < count; ++i) { + top_data[i] = bottom_data[i] * mask[i] * scale_; + } + } else { + caffe_copy(bottom[0]->count(), bottom_data, top_data); + caffe_scal(count, Dtype(1.0 - threshold_), top_data); + } +} + +template +void DropoutFixedLayer::Backward_cpu(const vector*>& top, + const vector& propagate_down, + vector*>* bottom) { + if (propagate_down[0]) { + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + if (Caffe::phase() == Caffe::TRAIN) { + const unsigned int* mask = rand_vec_.cpu_data(); + const int count = (*bottom)[0]->count(); + for (int i = 0; i < count; ++i) { + bottom_diff[i] = top_diff[i] * mask[i] * scale_; + } + } else { + caffe_copy(top[0]->count(), top_diff, bottom_diff); + } + } +} + + +#ifdef CPU_ONLY +STUB_GPU(DropoutFixedLayer); +#endif + +INSTANTIATE_CLASS(DropoutFixedLayer); + + +} // namespace caffe diff --git a/src/caffe/layers/dropout_fixed_layer.cu b/src/caffe/layers/dropout_fixed_layer.cu new file mode 100644 index 00000000000..4cda4842945 --- /dev/null +++ b/src/caffe/layers/dropout_fixed_layer.cu @@ -0,0 +1,78 @@ +#include +#include +#include + +#include "caffe/common.hpp" +#include "caffe/layer.hpp" +#include "caffe/syncedmem.hpp" +#include "caffe/util/math_functions.hpp" +#include "caffe/vision_layers.hpp" + +namespace caffe { + + +template +__global__ void DropoutFixedForward(const int n, const Dtype* in, + const unsigned int* mask, const unsigned int threshold, const float scale, + Dtype* out) { + CUDA_KERNEL_LOOP(index, n) { + out[index] = in[index] * (mask[index] > threshold) * scale; + } +} + +template +void DropoutFixedLayer::Forward_gpu(const vector*>& bottom, + vector*>* top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = (*top)[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + if (Caffe::phase() == Caffe::TRAIN) { + unsigned int* mask = + static_cast(rand_vec_.mutable_gpu_data()); + caffe_gpu_rng_uniform(count, mask); + // set thresholds + // NOLINT_NEXT_LINE(whitespace/operators) + DropoutFixedForward<<>>( + count, bottom_data, mask, uint_thres_, scale_, top_data); + CUDA_POST_KERNEL_CHECK; + } else { + caffe_copy(count, bottom_data, top_data); + caffe_gpu_scal(count, Dtype(1.0 - threshold_), top_data); + } +} + +template +__global__ void DropoutFixedBackward(const int n, const Dtype* in_diff, + const unsigned int* mask, const unsigned int threshold, const float scale, + Dtype* out_diff) { + CUDA_KERNEL_LOOP(index, n) { + out_diff[index] = in_diff[index] * scale * (mask[index] > threshold); + } +} + +template +void DropoutFixedLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, + vector*>* bottom) { + if (propagate_down[0]) { + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff(); + if (Caffe::phase() == Caffe::TRAIN) { + const unsigned int* mask = + static_cast(rand_vec_.gpu_data()); + const int count = (*bottom)[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + DropoutFixedBackward<<>>( + count, top_diff, mask, uint_thres_, scale_, bottom_diff); + CUDA_POST_KERNEL_CHECK; + } else { + caffe_copy(top[0]->count(), top_diff, bottom_diff); + } + } +} + +INSTANTIATE_CLASS(DropoutFixedLayer); + + +} // namespace caffe diff --git a/src/caffe/layers/lrn_fixed_layer.cpp b/src/caffe/layers/lrn_fixed_layer.cpp new file mode 100644 index 00000000000..c9714ff7572 --- /dev/null +++ b/src/caffe/layers/lrn_fixed_layer.cpp @@ -0,0 +1,256 @@ +#include + +#include "caffe/layer.hpp" +#include "caffe/util/math_functions.hpp" +#include "caffe/vision_layers.hpp" + +namespace caffe { + +template +void LRNFixedLayer::LayerSetUp(const vector*>& bottom, + vector*>* top) { + size_ = this->layer_param_.lrn_param().local_size(); + CHECK_EQ(size_ % 2, 1) << "LRN only supports odd values for local_size"; + pre_pad_ = (size_ - 1) / 2; + alpha_ = this->layer_param_.lrn_param().alpha(); + beta_ = this->layer_param_.lrn_param().beta(); + if (this->layer_param_.lrn_param().norm_region() == + LRNParameter_NormRegion_WITHIN_CHANNEL) { + // Set up split_layer_ to use inputs in the numerator and denominator. + split_top_vec_.clear(); + split_top_vec_.push_back(&product_input_); + split_top_vec_.push_back(&square_input_); + LayerParameter split_param; + split_layer_.reset(new SplitLayer(split_param)); + split_layer_->SetUp(bottom, &split_top_vec_); + // Set up square_layer_ to square the inputs. + square_bottom_vec_.clear(); + square_top_vec_.clear(); + square_bottom_vec_.push_back(&square_input_); + square_top_vec_.push_back(&square_output_); + LayerParameter square_param; + square_param.mutable_power_param()->set_power(Dtype(2)); + square_layer_.reset(new PowerLayer(square_param)); + square_layer_->SetUp(square_bottom_vec_, &square_top_vec_); + // Set up pool_layer_ to sum over square neighborhoods of the input. + pool_top_vec_.clear(); + pool_top_vec_.push_back(&pool_output_); + LayerParameter pool_param; + pool_param.mutable_pooling_param()->set_pool( + PoolingParameter_PoolMethod_AVE); + pool_param.mutable_pooling_param()->set_pad(pre_pad_); + pool_param.mutable_pooling_param()->set_kernel_size(size_); + pool_layer_.reset(new PoolingLayer(pool_param)); + pool_layer_->SetUp(square_top_vec_, &pool_top_vec_); + // Set up power_layer_ to compute (1 + alpha_/N^2 s)^-beta_, where s is + // the sum of a squared neighborhood (the output of pool_layer_). + power_top_vec_.clear(); + power_top_vec_.push_back(&power_output_); + LayerParameter power_param; + power_param.mutable_power_param()->set_power(-beta_); + power_param.mutable_power_param()->set_scale(alpha_); + power_param.mutable_power_param()->set_shift(Dtype(1)); + power_layer_.reset(new PowerLayer(power_param)); + power_layer_->SetUp(pool_top_vec_, &power_top_vec_); + // Set up a product_layer_ to compute outputs by multiplying inputs by the + // inverse demoninator computed by the power layer. + product_bottom_vec_.clear(); + product_bottom_vec_.push_back(&product_input_); + product_bottom_vec_.push_back(&power_output_); + LayerParameter product_param; + EltwiseParameter* eltwise_param = product_param.mutable_eltwise_param(); + eltwise_param->set_operation(EltwiseParameter_EltwiseOp_PROD); + product_layer_.reset(new EltwiseLayer(product_param)); + product_layer_->SetUp(product_bottom_vec_, top); + } +} + +template +void LRNFixedLayer::Reshape(const vector*>& bottom, + vector*>* top) { + num_ = bottom[0]->num(); + channels_ = bottom[0]->channels(); + height_ = bottom[0]->height(); + width_ = bottom[0]->width(); + switch (this->layer_param_.lrn_param().norm_region()) { + case LRNParameter_NormRegion_ACROSS_CHANNELS: + (*top)[0]->Reshape(num_, channels_, height_, width_); + scale_.Reshape(num_, channels_, height_, width_); + break; + case LRNParameter_NormRegion_WITHIN_CHANNEL: + split_layer_->Reshape(bottom, &split_top_vec_); + square_layer_->Reshape(square_bottom_vec_, &square_top_vec_); + pool_layer_->Reshape(square_top_vec_, &pool_top_vec_); + power_layer_->Reshape(pool_top_vec_, &power_top_vec_); + product_layer_->Reshape(product_bottom_vec_, top); + break; + } +} + +template +void LRNFixedLayer::Forward_cpu(const vector*>& bottom, + vector*>* top) { + switch (this->layer_param_.lrn_param().norm_region()) { + case LRNParameter_NormRegion_ACROSS_CHANNELS: + CrossChannelForward_cpu(bottom, top); + break; + case LRNParameter_NormRegion_WITHIN_CHANNEL: + WithinChannelForward(bottom, top); + break; + default: + LOG(FATAL) << "Unknown normalization region."; + } +} + +template +void LRNFixedLayer::CrossChannelForward_cpu( + const vector*>& bottom, vector*>* top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = (*top)[0]->mutable_cpu_data(); + Dtype* scale_data = scale_.mutable_cpu_data(); + // start with the constant value + for (int i = 0; i < scale_.count(); ++i) { + scale_data[i] = 1.; + } + Blob padded_square(1, channels_ + size_ - 1, height_, width_); + Dtype* padded_square_data = padded_square.mutable_cpu_data(); + caffe_set(padded_square.count(), Dtype(0), padded_square_data); + Dtype alpha_over_size = alpha_ / size_; + // go through the images + for (int n = 0; n < num_; ++n) { + // compute the padded square + caffe_sqr(channels_ * height_ * width_, + bottom_data + bottom[0]->offset(n), + padded_square_data + padded_square.offset(0, pre_pad_)); + // Create the first channel scale + for (int c = 0; c < size_; ++c) { + caffe_axpy(height_ * width_, alpha_over_size, + padded_square_data + padded_square.offset(0, c), + scale_data + scale_.offset(n, 0)); + } + for (int c = 1; c < channels_; ++c) { + // copy previous scale + caffe_copy(height_ * width_, + scale_data + scale_.offset(n, c - 1), + scale_data + scale_.offset(n, c)); + // add head + caffe_axpy(height_ * width_, alpha_over_size, + padded_square_data + padded_square.offset(0, c + size_ - 1), + scale_data + scale_.offset(n, c)); + // subtract tail + caffe_axpy(height_ * width_, -alpha_over_size, + padded_square_data + padded_square.offset(0, c - 1), + scale_data + scale_.offset(n, c)); + } + } + + // In the end, compute output + caffe_powx(scale_.count(), scale_data, -beta_, top_data); + caffe_mul(scale_.count(), top_data, bottom_data, top_data); +} + +template +void LRNFixedLayer::WithinChannelForward( + const vector*>& bottom, vector*>* top) { + split_layer_->Forward(bottom, &split_top_vec_); + square_layer_->Forward(square_bottom_vec_, &square_top_vec_); + pool_layer_->Forward(square_top_vec_, &pool_top_vec_); + power_layer_->Forward(pool_top_vec_, &power_top_vec_); + product_layer_->Forward(product_bottom_vec_, top); +} + +template +void LRNFixedLayer::Backward_cpu(const vector*>& top, + const vector& propagate_down, vector*>* bottom) { + switch (this->layer_param_.lrn_param().norm_region()) { + case LRNParameter_NormRegion_ACROSS_CHANNELS: + CrossChannelBackward_cpu(top, propagate_down, bottom); + break; + case LRNParameter_NormRegion_WITHIN_CHANNEL: + WithinChannelBackward(top, propagate_down, bottom); + break; + default: + LOG(FATAL) << "Unknown normalization region."; + } +} + +template +void LRNFixedLayer::CrossChannelBackward_cpu( + const vector*>& top, const vector& propagate_down, + vector*>* bottom) { + const Dtype* top_diff = top[0]->cpu_diff(); + const Dtype* top_data = top[0]->cpu_data(); + const Dtype* bottom_data = (*bottom)[0]->cpu_data(); + const Dtype* scale_data = scale_.cpu_data(); + Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + Blob padded_ratio(1, channels_ + size_ - 1, height_, width_); + Blob accum_ratio(1, 1, height_, width_); + Dtype* padded_ratio_data = padded_ratio.mutable_cpu_data(); + Dtype* accum_ratio_data = accum_ratio.mutable_cpu_data(); + // We hack a little bit by using the diff() to store an additional result + Dtype* accum_ratio_times_bottom = accum_ratio.mutable_cpu_diff(); + caffe_set(padded_ratio.count(), Dtype(0), padded_ratio_data); + Dtype cache_ratio_value = 2. * alpha_ * beta_ / size_; + + caffe_powx(scale_.count(), scale_data, -beta_, bottom_diff); + caffe_mul(scale_.count(), top_diff, bottom_diff, bottom_diff); + + // go through individual data + int inverse_pre_pad = size_ - (size_ + 1) / 2; + for (int n = 0; n < num_; ++n) { + int block_offset = scale_.offset(n); + // first, compute diff_i * y_i / s_i + caffe_mul(channels_ * height_ * width_, + top_diff + block_offset, top_data + block_offset, + padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad)); + caffe_div(channels_ * height_ * width_, + padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad), + scale_data + block_offset, + padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad)); + // Now, compute the accumulated ratios and the bottom diff + caffe_set(accum_ratio.count(), Dtype(0), accum_ratio_data); + for (int c = 0; c < size_ - 1; ++c) { + caffe_axpy(height_ * width_, 1., + padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data); + } + for (int c = 0; c < channels_; ++c) { + caffe_axpy(height_ * width_, 1., + padded_ratio_data + padded_ratio.offset(0, c + size_ - 1), + accum_ratio_data); + // compute bottom diff + caffe_mul(height_ * width_, + bottom_data + top[0]->offset(n, c), + accum_ratio_data, accum_ratio_times_bottom); + caffe_axpy(height_ * width_, -cache_ratio_value, + accum_ratio_times_bottom, bottom_diff + top[0]->offset(n, c)); + caffe_axpy(height_ * width_, -1., + padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data); + } + } +} + +template +void LRNFixedLayer::WithinChannelBackward( + const vector*>& top, const vector& propagate_down, + vector*>* bottom) { + if (propagate_down[0]) { + vector product_propagate_down(2, true); + product_layer_->Backward(top, product_propagate_down, &product_bottom_vec_); + power_layer_->Backward(power_top_vec_, propagate_down, &pool_top_vec_); + pool_layer_->Backward(pool_top_vec_, propagate_down, &square_top_vec_); + square_layer_->Backward(square_top_vec_, propagate_down, + &square_bottom_vec_); + split_layer_->Backward(split_top_vec_, propagate_down, bottom); + } +} + +#ifdef CPU_ONLY +STUB_GPU(LRNFixedLayer); +STUB_GPU_FORWARD(LRNFixedLayer, CrossChannelForward); +STUB_GPU_BACKWARD(LRNFixedLayer, CrossChannelBackward); +#endif + +INSTANTIATE_CLASS(LRNFixedLayer); + + +} // namespace caffe diff --git a/src/caffe/layers/lrn_fixed_layer.cu b/src/caffe/layers/lrn_fixed_layer.cu new file mode 100644 index 00000000000..7132f088d3a --- /dev/null +++ b/src/caffe/layers/lrn_fixed_layer.cu @@ -0,0 +1,193 @@ +#include + +#include "caffe/layer.hpp" +#include "caffe/util/math_functions.hpp" +#include "caffe/vision_layers.hpp" + +namespace caffe { + +template +__global__ void LRNFixedFillScale(const int nthreads, const Dtype* in, + const int num, const int channels, const int height, + const int width, const int size, const Dtype alpha, + Dtype* scale) { + CUDA_KERNEL_LOOP(index, nthreads) { + // find out the local offset + int w = index % width; + int h = (index / width) % height; + int n = index / width / height; + int offset = (n * channels * height + h) * width + w; + int step = height * width; + in += offset; + scale += offset; + int head = 0; + int pre_pad = (size - 1) / 2; + int post_pad = size - pre_pad - 1; + Dtype accum_scale = 0; + // fill the scale at [n, :, h, w] + // accumulate values + while (head < post_pad) { + accum_scale += in[head * step] * in[head * step]; + ++head; + } + // until we reach size, nothing needs to be subtracted + while (head < size) { + accum_scale += in[head * step] * in[head * step]; + scale[(head - post_pad) * step] = 2. + accum_scale * alpha; + ++head; + } + // both add and subtract + while (head < channels) { + accum_scale += in[head * step] * in[head * step]; + accum_scale -= in[(head - size) * step] * in[(head - size) * step]; + scale[(head - post_pad) * step] = 2. + accum_scale * alpha; + ++head; + } + // subtract only + while (head < channels + post_pad) { + accum_scale -= in[(head - size) * step] * in[(head - size) * step]; + scale[(head - post_pad) * step] = 2. + accum_scale * alpha; + ++head; + } + } +} + +template +void LRNFixedLayer::Forward_gpu(const vector*>& bottom, + vector*>* top) { + switch (this->layer_param_.lrn_param().norm_region()) { + case LRNParameter_NormRegion_ACROSS_CHANNELS: + CrossChannelForward_gpu(bottom, top); + break; + case LRNParameter_NormRegion_WITHIN_CHANNEL: + WithinChannelForward(bottom, top); + break; + default: + LOG(FATAL) << "Unknown normalization region."; + } +} + +// TODO: check if it would be faster to just put it into the previous kernel. +template +__global__ void LRNFixedComputeOutput(const int nthreads, const Dtype* in, + const Dtype* scale, const Dtype negative_beta, Dtype* out) { + CUDA_KERNEL_LOOP(index, nthreads) { + out[index] = in[index] * pow(scale[index], negative_beta); + } +} + +template +void LRNFixedLayer::CrossChannelForward_gpu( + const vector*>& bottom, vector*>* top) { + // First, compute scale + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = (*top)[0]->mutable_gpu_data(); + Dtype* scale_data = scale_.mutable_gpu_data(); + // We will launch one kernel for each pixel location, and have the kernel + // go through all the channels. + int n_threads = num_ * height_ * width_; + // NOLINT_NEXT_LINE(whitespace/operators) + LRNFixedFillScale<<>>( + n_threads, bottom_data, num_, channels_, height_, width_, size_, + alpha_, scale_data); + CUDA_POST_KERNEL_CHECK; + n_threads = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + LRNFixedComputeOutput<<>>( + n_threads, bottom_data, scale_data, -beta_, top_data); + CUDA_POST_KERNEL_CHECK; +} + +template +void LRNFixedLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, vector*>* bottom) { + switch (this->layer_param_.lrn_param().norm_region()) { + case LRNParameter_NormRegion_ACROSS_CHANNELS: + CrossChannelBackward_gpu(top, propagate_down, bottom); + break; + case LRNParameter_NormRegion_WITHIN_CHANNEL: + WithinChannelBackward(top, propagate_down, bottom); + break; + default: + LOG(FATAL) << "Unknown normalization region."; + } +} + +template +__global__ void LRNFixedComputeDiff(const int nthreads, const Dtype* bottom_data, + const Dtype* top_data, const Dtype* scale, const Dtype* top_diff, + const int num, const int channels, const int height, + const int width, const int size, const Dtype negative_beta, + const Dtype cache_ratio, + Dtype* bottom_diff) { + CUDA_KERNEL_LOOP(index, nthreads) { + // find out the local offset + int w = index % width; + int h = (index / width) % height; + int n = index / width / height; + int offset = (n * channels * height + h) * width + w; + int step = height * width; + bottom_data += offset; + top_data += offset; + scale += offset; + top_diff += offset; + bottom_diff += offset; + int head = 0; + int pre_pad = size - (size + 1) / 2; + int post_pad = size - pre_pad - 1; + Dtype accum_ratio = 0; + // accumulate values + while (head < post_pad) { + accum_ratio += top_diff[head * step] * top_data[head * step] / + scale[head * step]; + ++head; + } + // until we reach size, nothing needs to be subtracted + while (head < size) { + accum_ratio += top_diff[head * step] * top_data[head * step] / + scale[head * step]; + bottom_diff[(head - post_pad) * step] = top_diff[(head - post_pad) * step] + * pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio * + bottom_data[(head - post_pad) * step] * accum_ratio; + ++head; + } + // both add and subtract + while (head < channels) { + accum_ratio += top_diff[head * step] * top_data[head * step] / + scale[head * step]; + accum_ratio -= top_diff[(head - size) * step] * + top_data[(head - size) * step] / scale[(head - size) * step]; + bottom_diff[(head - post_pad) * step] = top_diff[(head - post_pad) * step] + * pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio * + bottom_data[(head - post_pad) * step] * accum_ratio; + ++head; + } + // subtract only + while (head < channels + post_pad) { + accum_ratio -= top_diff[(head - size) * step] * + top_data[(head - size) * step] / scale[(head - size) * step]; + bottom_diff[(head - post_pad) * step] = top_diff[(head - post_pad) * step] + * pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio * + bottom_data[(head - post_pad) * step] * accum_ratio; + ++head; + } + } +} + +template +void LRNFixedLayer::CrossChannelBackward_gpu( + const vector*>& top, const vector& propagate_down, + vector*>* bottom) { + int n_threads = num_ * height_ * width_; + // NOLINT_NEXT_LINE(whitespace/operators) + LRNFixedComputeDiff<<>>( + n_threads, (*bottom)[0]->gpu_data(), top[0]->gpu_data(), + scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_, + size_, -beta_, Dtype(2. * alpha_ * beta_ / size_), + (*bottom)[0]->mutable_gpu_diff()); +} + + +INSTANTIATE_CLASS(LRNFixedLayer); + +} // namespace caffe