From 3258f77117880066ecd281925363ba3d26611b17 Mon Sep 17 00:00:00 2001
From: William Song <cheeyos@github.com>
Date: Fri, 9 Jan 2015 21:08:40 -0800
Subject: [PATCH] adding forgotten fixed layer files.

---
 src/caffe/layers/dropout_fixed_layer.cpp |  79 +++++++
 src/caffe/layers/dropout_fixed_layer.cu  |  78 +++++++
 src/caffe/layers/lrn_fixed_layer.cpp     | 256 +++++++++++++++++++++++
 src/caffe/layers/lrn_fixed_layer.cu      | 193 +++++++++++++++++
 4 files changed, 606 insertions(+)
 create mode 100644 src/caffe/layers/dropout_fixed_layer.cpp
 create mode 100644 src/caffe/layers/dropout_fixed_layer.cu
 create mode 100644 src/caffe/layers/lrn_fixed_layer.cpp
 create mode 100644 src/caffe/layers/lrn_fixed_layer.cu
diff --git a/src/caffe/layers/dropout_fixed_layer.cpp b/src/caffe/layers/dropout_fixed_layer.cpp
new file mode 100644
index 00000000000..405a9813976
--- /dev/null
+++ b/src/caffe/layers/dropout_fixed_layer.cpp
@@ -0,0 +1,79 @@
+// TODO (sergeyk): effect should not be dependent on phase. wasted memcpy.
+
+#include <vector>
+
+#include "caffe/common.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/syncedmem.hpp"
+#include "caffe/util/math_functions.hpp"
+#include "caffe/vision_layers.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void DropoutFixedLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      vector<Blob<Dtype>*>* top) {
+  NeuronLayer<Dtype>::LayerSetUp(bottom, top);
+  threshold_ = this->layer_param_.dropout_param().dropout_ratio();
+  DCHECK(threshold_ > 0.);
+  DCHECK(threshold_ < 1.);
+  scale_ = 1. / (1. - threshold_);
+  uint_thres_ = static_cast<unsigned int>(UINT_MAX * threshold_);
+}
+
+template <typename Dtype>
+void DropoutFixedLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+      vector<Blob<Dtype>*>* top) {
+  NeuronLayer<Dtype>::Reshape(bottom, top);
+  // Set up the cache for random number generation
+  rand_vec_.Reshape(bottom[0]->num(), bottom[0]->channels(),
+      bottom[0]->height(), bottom[0]->width());
+}
+
+template <typename Dtype>
+void DropoutFixedLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+    vector<Blob<Dtype>*>* top) {
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  Dtype* top_data = (*top)[0]->mutable_cpu_data();
+  unsigned int* mask = rand_vec_.mutable_cpu_data();
+  const int count = bottom[0]->count();
+  if (Caffe::phase() == Caffe::TRAIN) {
+    // Create random numbers
+    caffe_rng_bernoulli(count, 1. - threshold_, mask);
+    for (int i = 0; i < count; ++i) {
+      top_data[i] = bottom_data[i] * mask[i] * scale_;
+    }
+  } else {
+    caffe_copy(bottom[0]->count(), bottom_data, top_data);
+    caffe_scal(count, Dtype(1.0 - threshold_), top_data);
+  }
+}
+
+template <typename Dtype>
+void DropoutFixedLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down,
+    vector<Blob<Dtype>*>* bottom) {
+  if (propagate_down[0]) {
+    const Dtype* top_diff = top[0]->cpu_diff();
+    Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff();
+    if (Caffe::phase() == Caffe::TRAIN) {
+      const unsigned int* mask = rand_vec_.cpu_data();
+      const int count = (*bottom)[0]->count();
+      for (int i = 0; i < count; ++i) {
+        bottom_diff[i] = top_diff[i] * mask[i] * scale_;
+      }
+    } else {
+      caffe_copy(top[0]->count(), top_diff, bottom_diff);
+    }
+  }
+}
+
+
+#ifdef CPU_ONLY
+STUB_GPU(DropoutFixedLayer);
+#endif
+
+INSTANTIATE_CLASS(DropoutFixedLayer);
+
+
+}  // namespace caffe
diff --git a/src/caffe/layers/dropout_fixed_layer.cu b/src/caffe/layers/dropout_fixed_layer.cu
new file mode 100644
index 00000000000..4cda4842945
--- /dev/null
+++ b/src/caffe/layers/dropout_fixed_layer.cu
@@ -0,0 +1,78 @@
+#include <algorithm>
+#include <limits>
+#include <vector>
+
+#include "caffe/common.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/syncedmem.hpp"
+#include "caffe/util/math_functions.hpp"
+#include "caffe/vision_layers.hpp"
+
+namespace caffe {
+
+
+template <typename Dtype>
+__global__ void DropoutFixedForward(const int n, const Dtype* in,
+    const unsigned int* mask, const unsigned int threshold, const float scale,
+    Dtype* out) {
+  CUDA_KERNEL_LOOP(index, n) {
+    out[index] = in[index] * (mask[index] > threshold) * scale;
+  }
+}
+
+template <typename Dtype>
+void DropoutFixedLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+    vector<Blob<Dtype>*>* top) {
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = (*top)[0]->mutable_gpu_data();
+  const int count = bottom[0]->count();
+  if (Caffe::phase() == Caffe::TRAIN) {
+    unsigned int* mask =
+        static_cast<unsigned int*>(rand_vec_.mutable_gpu_data());
+    caffe_gpu_rng_uniform(count, mask);
+    // set thresholds
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    DropoutFixedForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
+        count, bottom_data, mask, uint_thres_, scale_, top_data);
+    CUDA_POST_KERNEL_CHECK;
+  } else {
+    caffe_copy(count, bottom_data, top_data);
+    caffe_gpu_scal(count, Dtype(1.0 - threshold_), top_data);
+  }
+}
+
+template <typename Dtype>
+__global__ void DropoutFixedBackward(const int n, const Dtype* in_diff,
+    const unsigned int* mask, const unsigned int threshold, const float scale,
+    Dtype* out_diff) {
+  CUDA_KERNEL_LOOP(index, n) {
+    out_diff[index] = in_diff[index] * scale * (mask[index] > threshold);
+  }
+}
+
+template <typename Dtype>
+void DropoutFixedLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down,
+    vector<Blob<Dtype>*>* bottom) {
+  if (propagate_down[0]) {
+    const Dtype* top_diff = top[0]->gpu_diff();
+    Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff();
+    if (Caffe::phase() == Caffe::TRAIN) {
+      const unsigned int* mask =
+          static_cast<const unsigned int*>(rand_vec_.gpu_data());
+      const int count = (*bottom)[0]->count();
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      DropoutFixedBackward<Dtype><<<CAFFE_GET_BLOCKS(count),
+        CAFFE_CUDA_NUM_THREADS>>>(
+          count, top_diff, mask, uint_thres_, scale_, bottom_diff);
+      CUDA_POST_KERNEL_CHECK;
+    } else {
+      caffe_copy(top[0]->count(), top_diff, bottom_diff);
+    }
+  }
+}
+
+INSTANTIATE_CLASS(DropoutFixedLayer);
+
+
+}  // namespace caffe
diff --git a/src/caffe/layers/lrn_fixed_layer.cpp b/src/caffe/layers/lrn_fixed_layer.cpp
new file mode 100644
index 00000000000..c9714ff7572
--- /dev/null
+++ b/src/caffe/layers/lrn_fixed_layer.cpp
@@ -0,0 +1,256 @@
+#include <vector>
+
+#include "caffe/layer.hpp"
+#include "caffe/util/math_functions.hpp"
+#include "caffe/vision_layers.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void LRNFixedLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      vector<Blob<Dtype>*>* top) {
+  size_ = this->layer_param_.lrn_param().local_size();
+  CHECK_EQ(size_ % 2, 1) << "LRN only supports odd values for local_size";
+  pre_pad_ = (size_ - 1) / 2;
+  alpha_ = this->layer_param_.lrn_param().alpha();
+  beta_ = this->layer_param_.lrn_param().beta();
+  if (this->layer_param_.lrn_param().norm_region() ==
+      LRNParameter_NormRegion_WITHIN_CHANNEL) {
+    // Set up split_layer_ to use inputs in the numerator and denominator.
+    split_top_vec_.clear();
+    split_top_vec_.push_back(&product_input_);
+    split_top_vec_.push_back(&square_input_);
+    LayerParameter split_param;
+    split_layer_.reset(new SplitLayer<Dtype>(split_param));
+    split_layer_->SetUp(bottom, &split_top_vec_);
+    // Set up square_layer_ to square the inputs.
+    square_bottom_vec_.clear();
+    square_top_vec_.clear();
+    square_bottom_vec_.push_back(&square_input_);
+    square_top_vec_.push_back(&square_output_);
+    LayerParameter square_param;
+    square_param.mutable_power_param()->set_power(Dtype(2));
+    square_layer_.reset(new PowerLayer<Dtype>(square_param));
+    square_layer_->SetUp(square_bottom_vec_, &square_top_vec_);
+    // Set up pool_layer_ to sum over square neighborhoods of the input.
+    pool_top_vec_.clear();
+    pool_top_vec_.push_back(&pool_output_);
+    LayerParameter pool_param;
+    pool_param.mutable_pooling_param()->set_pool(
+        PoolingParameter_PoolMethod_AVE);
+    pool_param.mutable_pooling_param()->set_pad(pre_pad_);
+    pool_param.mutable_pooling_param()->set_kernel_size(size_);
+    pool_layer_.reset(new PoolingLayer<Dtype>(pool_param));
+    pool_layer_->SetUp(square_top_vec_, &pool_top_vec_);
+    // Set up power_layer_ to compute (1 + alpha_/N^2 s)^-beta_, where s is
+    // the sum of a squared neighborhood (the output of pool_layer_).
+    power_top_vec_.clear();
+    power_top_vec_.push_back(&power_output_);
+    LayerParameter power_param;
+    power_param.mutable_power_param()->set_power(-beta_);
+    power_param.mutable_power_param()->set_scale(alpha_);
+    power_param.mutable_power_param()->set_shift(Dtype(1));
+    power_layer_.reset(new PowerLayer<Dtype>(power_param));
+    power_layer_->SetUp(pool_top_vec_, &power_top_vec_);
+    // Set up a product_layer_ to compute outputs by multiplying inputs by the
+    // inverse demoninator computed by the power layer.
+    product_bottom_vec_.clear();
+    product_bottom_vec_.push_back(&product_input_);
+    product_bottom_vec_.push_back(&power_output_);
+    LayerParameter product_param;
+    EltwiseParameter* eltwise_param = product_param.mutable_eltwise_param();
+    eltwise_param->set_operation(EltwiseParameter_EltwiseOp_PROD);
+    product_layer_.reset(new EltwiseLayer<Dtype>(product_param));
+    product_layer_->SetUp(product_bottom_vec_, top);
+  }
+}
+
+template <typename Dtype>
+void LRNFixedLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+      vector<Blob<Dtype>*>* top) {
+  num_ = bottom[0]->num();
+  channels_ = bottom[0]->channels();
+  height_ = bottom[0]->height();
+  width_ = bottom[0]->width();
+  switch (this->layer_param_.lrn_param().norm_region()) {
+  case LRNParameter_NormRegion_ACROSS_CHANNELS:
+    (*top)[0]->Reshape(num_, channels_, height_, width_);
+    scale_.Reshape(num_, channels_, height_, width_);
+    break;
+  case LRNParameter_NormRegion_WITHIN_CHANNEL:
+    split_layer_->Reshape(bottom, &split_top_vec_);
+    square_layer_->Reshape(square_bottom_vec_, &square_top_vec_);
+    pool_layer_->Reshape(square_top_vec_, &pool_top_vec_);
+    power_layer_->Reshape(pool_top_vec_, &power_top_vec_);
+    product_layer_->Reshape(product_bottom_vec_, top);
+    break;
+  }
+}
+
+template <typename Dtype>
+void LRNFixedLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+    vector<Blob<Dtype>*>* top) {
+  switch (this->layer_param_.lrn_param().norm_region()) {
+  case LRNParameter_NormRegion_ACROSS_CHANNELS:
+    CrossChannelForward_cpu(bottom, top);
+    break;
+  case LRNParameter_NormRegion_WITHIN_CHANNEL:
+    WithinChannelForward(bottom, top);
+    break;
+  default:
+    LOG(FATAL) << "Unknown normalization region.";
+  }
+}
+
+template <typename Dtype>
+void LRNFixedLayer<Dtype>::CrossChannelForward_cpu(
+    const vector<Blob<Dtype>*>& bottom, vector<Blob<Dtype>*>* top) {
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  Dtype* top_data = (*top)[0]->mutable_cpu_data();
+  Dtype* scale_data = scale_.mutable_cpu_data();
+  // start with the constant value
+  for (int i = 0; i < scale_.count(); ++i) {
+    scale_data[i] = 1.;
+  }
+  Blob<Dtype> padded_square(1, channels_ + size_ - 1, height_, width_);
+  Dtype* padded_square_data = padded_square.mutable_cpu_data();
+  caffe_set(padded_square.count(), Dtype(0), padded_square_data);
+  Dtype alpha_over_size = alpha_ / size_;
+  // go through the images
+  for (int n = 0; n < num_; ++n) {
+    // compute the padded square
+    caffe_sqr(channels_ * height_ * width_,
+        bottom_data + bottom[0]->offset(n),
+        padded_square_data + padded_square.offset(0, pre_pad_));
+    // Create the first channel scale
+    for (int c = 0; c < size_; ++c) {
+      caffe_axpy<Dtype>(height_ * width_, alpha_over_size,
+          padded_square_data + padded_square.offset(0, c),
+          scale_data + scale_.offset(n, 0));
+    }
+    for (int c = 1; c < channels_; ++c) {
+      // copy previous scale
+      caffe_copy<Dtype>(height_ * width_,
+          scale_data + scale_.offset(n, c - 1),
+          scale_data + scale_.offset(n, c));
+      // add head
+      caffe_axpy<Dtype>(height_ * width_, alpha_over_size,
+          padded_square_data + padded_square.offset(0, c + size_ - 1),
+          scale_data + scale_.offset(n, c));
+      // subtract tail
+      caffe_axpy<Dtype>(height_ * width_, -alpha_over_size,
+          padded_square_data + padded_square.offset(0, c - 1),
+          scale_data + scale_.offset(n, c));
+    }
+  }
+
+  // In the end, compute output
+  caffe_powx<Dtype>(scale_.count(), scale_data, -beta_, top_data);
+  caffe_mul<Dtype>(scale_.count(), top_data, bottom_data, top_data);
+}
+
+template <typename Dtype>
+void LRNFixedLayer<Dtype>::WithinChannelForward(
+    const vector<Blob<Dtype>*>& bottom, vector<Blob<Dtype>*>* top) {
+  split_layer_->Forward(bottom, &split_top_vec_);
+  square_layer_->Forward(square_bottom_vec_, &square_top_vec_);
+  pool_layer_->Forward(square_top_vec_, &pool_top_vec_);
+  power_layer_->Forward(pool_top_vec_, &power_top_vec_);
+  product_layer_->Forward(product_bottom_vec_, top);
+}
+
+template <typename Dtype>
+void LRNFixedLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom) {
+  switch (this->layer_param_.lrn_param().norm_region()) {
+  case LRNParameter_NormRegion_ACROSS_CHANNELS:
+    CrossChannelBackward_cpu(top, propagate_down, bottom);
+    break;
+  case LRNParameter_NormRegion_WITHIN_CHANNEL:
+    WithinChannelBackward(top, propagate_down, bottom);
+    break;
+  default:
+    LOG(FATAL) << "Unknown normalization region.";
+  }
+}
+
+template <typename Dtype>
+void LRNFixedLayer<Dtype>::CrossChannelBackward_cpu(
+    const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
+    vector<Blob<Dtype>*>* bottom) {
+  const Dtype* top_diff = top[0]->cpu_diff();
+  const Dtype* top_data = top[0]->cpu_data();
+  const Dtype* bottom_data = (*bottom)[0]->cpu_data();
+  const Dtype* scale_data = scale_.cpu_data();
+  Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff();
+  Blob<Dtype> padded_ratio(1, channels_ + size_ - 1, height_, width_);
+  Blob<Dtype> accum_ratio(1, 1, height_, width_);
+  Dtype* padded_ratio_data = padded_ratio.mutable_cpu_data();
+  Dtype* accum_ratio_data = accum_ratio.mutable_cpu_data();
+  // We hack a little bit by using the diff() to store an additional result
+  Dtype* accum_ratio_times_bottom = accum_ratio.mutable_cpu_diff();
+  caffe_set(padded_ratio.count(), Dtype(0), padded_ratio_data);
+  Dtype cache_ratio_value = 2. * alpha_ * beta_ / size_;
+
+  caffe_powx<Dtype>(scale_.count(), scale_data, -beta_, bottom_diff);
+  caffe_mul<Dtype>(scale_.count(), top_diff, bottom_diff, bottom_diff);
+
+  // go through individual data
+  int inverse_pre_pad = size_ - (size_ + 1) / 2;
+  for (int n = 0; n < num_; ++n) {
+    int block_offset = scale_.offset(n);
+    // first, compute diff_i * y_i / s_i
+    caffe_mul<Dtype>(channels_ * height_ * width_,
+        top_diff + block_offset, top_data + block_offset,
+        padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad));
+    caffe_div<Dtype>(channels_ * height_ * width_,
+        padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad),
+        scale_data + block_offset,
+        padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad));
+    // Now, compute the accumulated ratios and the bottom diff
+    caffe_set(accum_ratio.count(), Dtype(0), accum_ratio_data);
+    for (int c = 0; c < size_ - 1; ++c) {
+      caffe_axpy<Dtype>(height_ * width_, 1.,
+          padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data);
+    }
+    for (int c = 0; c < channels_; ++c) {
+      caffe_axpy<Dtype>(height_ * width_, 1.,
+          padded_ratio_data + padded_ratio.offset(0, c + size_ - 1),
+          accum_ratio_data);
+      // compute bottom diff
+      caffe_mul<Dtype>(height_ * width_,
+          bottom_data + top[0]->offset(n, c),
+          accum_ratio_data, accum_ratio_times_bottom);
+      caffe_axpy<Dtype>(height_ * width_, -cache_ratio_value,
+          accum_ratio_times_bottom, bottom_diff + top[0]->offset(n, c));
+      caffe_axpy<Dtype>(height_ * width_, -1.,
+          padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data);
+    }
+  }
+}
+
+template <typename Dtype>
+void LRNFixedLayer<Dtype>::WithinChannelBackward(
+    const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
+    vector<Blob<Dtype>*>* bottom) {
+  if (propagate_down[0]) {
+    vector<bool> product_propagate_down(2, true);
+    product_layer_->Backward(top, product_propagate_down, &product_bottom_vec_);
+    power_layer_->Backward(power_top_vec_, propagate_down, &pool_top_vec_);
+    pool_layer_->Backward(pool_top_vec_, propagate_down, &square_top_vec_);
+    square_layer_->Backward(square_top_vec_, propagate_down,
+                            &square_bottom_vec_);
+    split_layer_->Backward(split_top_vec_, propagate_down, bottom);
+  }
+}
+
+#ifdef CPU_ONLY
+STUB_GPU(LRNFixedLayer);
+STUB_GPU_FORWARD(LRNFixedLayer, CrossChannelForward);
+STUB_GPU_BACKWARD(LRNFixedLayer, CrossChannelBackward);
+#endif
+
+INSTANTIATE_CLASS(LRNFixedLayer);
+
+
+}  // namespace caffe
diff --git a/src/caffe/layers/lrn_fixed_layer.cu b/src/caffe/layers/lrn_fixed_layer.cu
new file mode 100644
index 00000000000..7132f088d3a
--- /dev/null
+++ b/src/caffe/layers/lrn_fixed_layer.cu
@@ -0,0 +1,193 @@
+#include <vector>
+
+#include "caffe/layer.hpp"
+#include "caffe/util/math_functions.hpp"
+#include "caffe/vision_layers.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+__global__ void LRNFixedFillScale(const int nthreads, const Dtype* in,
+    const int num, const int channels, const int height,
+    const int width, const int size, const Dtype alpha,
+    Dtype* scale) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    // find out the local offset
+    int w = index % width;
+    int h = (index / width) % height;
+    int n = index / width / height;
+    int offset = (n * channels * height + h) * width + w;
+    int step = height * width;
+    in += offset;
+    scale += offset;
+    int head = 0;
+    int pre_pad = (size - 1) / 2;
+    int post_pad = size - pre_pad - 1;
+    Dtype accum_scale = 0;
+    // fill the scale at [n, :, h, w]
+    // accumulate values
+    while (head < post_pad) {
+      accum_scale += in[head * step] * in[head * step];
+      ++head;
+    }
+    // until we reach size, nothing needs to be subtracted
+    while (head < size) {
+      accum_scale += in[head * step] * in[head * step];
+      scale[(head - post_pad) * step] = 2. + accum_scale * alpha;
+      ++head;
+    }
+    // both add and subtract
+    while (head < channels) {
+      accum_scale += in[head * step] * in[head * step];
+      accum_scale -= in[(head - size) * step] * in[(head - size) * step];
+      scale[(head - post_pad) * step] = 2. + accum_scale * alpha;
+      ++head;
+    }
+    // subtract only
+    while (head < channels + post_pad) {
+      accum_scale -= in[(head - size) * step] * in[(head - size) * step];
+      scale[(head - post_pad) * step] = 2. + accum_scale * alpha;
+      ++head;
+    }
+  }
+}
+
+template <typename Dtype>
+void LRNFixedLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+    vector<Blob<Dtype>*>* top) {
+  switch (this->layer_param_.lrn_param().norm_region()) {
+  case LRNParameter_NormRegion_ACROSS_CHANNELS:
+    CrossChannelForward_gpu(bottom, top);
+    break;
+  case LRNParameter_NormRegion_WITHIN_CHANNEL:
+    WithinChannelForward(bottom, top);
+    break;
+  default:
+    LOG(FATAL) << "Unknown normalization region.";
+  }
+}
+
+// TODO: check if it would be faster to just put it into the previous kernel.
+template <typename Dtype>
+__global__ void LRNFixedComputeOutput(const int nthreads, const Dtype* in,
+    const Dtype* scale, const Dtype negative_beta, Dtype* out) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    out[index] = in[index] * pow(scale[index], negative_beta);
+  }
+}
+
+template <typename Dtype>
+void LRNFixedLayer<Dtype>::CrossChannelForward_gpu(
+    const vector<Blob<Dtype>*>& bottom, vector<Blob<Dtype>*>* top) {
+  // First, compute scale
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = (*top)[0]->mutable_gpu_data();
+  Dtype* scale_data = scale_.mutable_gpu_data();
+  // We will launch one kernel for each pixel location, and have the kernel
+  // go through all the channels.
+  int n_threads = num_ * height_ * width_;
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  LRNFixedFillScale<<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS>>>(
+      n_threads, bottom_data, num_, channels_, height_, width_, size_,
+      alpha_, scale_data);
+  CUDA_POST_KERNEL_CHECK;
+  n_threads = bottom[0]->count();
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  LRNFixedComputeOutput<<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS>>>(
+      n_threads, bottom_data, scale_data, -beta_, top_data);
+  CUDA_POST_KERNEL_CHECK;
+}
+
+template <typename Dtype>
+void LRNFixedLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom) {
+  switch (this->layer_param_.lrn_param().norm_region()) {
+  case LRNParameter_NormRegion_ACROSS_CHANNELS:
+    CrossChannelBackward_gpu(top, propagate_down, bottom);
+    break;
+  case LRNParameter_NormRegion_WITHIN_CHANNEL:
+    WithinChannelBackward(top, propagate_down, bottom);
+    break;
+  default:
+    LOG(FATAL) << "Unknown normalization region.";
+  }
+}
+
+template <typename Dtype>
+__global__ void LRNFixedComputeDiff(const int nthreads, const Dtype* bottom_data,
+    const Dtype* top_data, const Dtype* scale, const Dtype* top_diff,
+    const int num, const int channels, const int height,
+    const int width, const int size, const Dtype negative_beta,
+    const Dtype cache_ratio,
+    Dtype* bottom_diff) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    // find out the local offset
+    int w = index % width;
+    int h = (index / width) % height;
+    int n = index / width / height;
+    int offset = (n * channels * height + h) * width + w;
+    int step = height * width;
+    bottom_data += offset;
+    top_data += offset;
+    scale += offset;
+    top_diff += offset;
+    bottom_diff += offset;
+    int head = 0;
+    int pre_pad = size - (size + 1) / 2;
+    int post_pad = size - pre_pad - 1;
+    Dtype accum_ratio = 0;
+    // accumulate values
+    while (head < post_pad) {
+      accum_ratio += top_diff[head * step] * top_data[head * step] /
+          scale[head * step];
+      ++head;
+    }
+    // until we reach size, nothing needs to be subtracted
+    while (head < size) {
+      accum_ratio += top_diff[head * step] * top_data[head * step] /
+          scale[head * step];
+      bottom_diff[(head - post_pad) * step] = top_diff[(head - post_pad) * step]
+          * pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio *
+          bottom_data[(head - post_pad) * step] * accum_ratio;
+      ++head;
+    }
+    // both add and subtract
+    while (head < channels) {
+      accum_ratio += top_diff[head * step] * top_data[head * step] /
+          scale[head * step];
+      accum_ratio -= top_diff[(head - size) * step] *
+          top_data[(head - size) * step] / scale[(head - size) * step];
+      bottom_diff[(head - post_pad) * step] = top_diff[(head - post_pad) * step]
+          * pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio *
+          bottom_data[(head - post_pad) * step] * accum_ratio;
+      ++head;
+    }
+    // subtract only
+    while (head < channels + post_pad) {
+      accum_ratio -= top_diff[(head - size) * step] *
+          top_data[(head - size) * step] / scale[(head - size) * step];
+      bottom_diff[(head - post_pad) * step] = top_diff[(head - post_pad) * step]
+          * pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio *
+          bottom_data[(head - post_pad) * step] * accum_ratio;
+      ++head;
+    }
+  }
+}
+
+template <typename Dtype>
+void LRNFixedLayer<Dtype>::CrossChannelBackward_gpu(
+    const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
+    vector<Blob<Dtype>*>* bottom) {
+  int n_threads = num_ * height_ * width_;
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  LRNFixedComputeDiff<<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS>>>(
+      n_threads, (*bottom)[0]->gpu_data(), top[0]->gpu_data(),
+      scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_,
+      size_, -beta_, Dtype(2. * alpha_ * beta_ / size_),
+      (*bottom)[0]->mutable_gpu_diff());
+}
+
+
+INSTANTIATE_CLASS(LRNFixedLayer);
+
+}  // namespace caffe