From eea50e91fa6fbcfef59b6a252c5ec5f9400e1ac2 Mon Sep 17 00:00:00 2001
From: Jerry Zhang
Date: Fri, 18 Jan 2019 12:14:34 -0800
Subject: [PATCH] Fix SoftmaxOps (#16049)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/16049

We might see the following pattern in the existing code:

```
if (scale_.numel() != N) {
  scale_.Resize(N);
  // set initial value for scale_
}

// In class:
Tensor scale_{CPU};
```

where `scale_` is a member variable of type `caffe2::Tensor`. This pattern actually serves two purposes: if `scale_` is partially initialized with a device type but no size, the call initializes the Tensor with the correct size; if `scale_` is already initialized with a size, it checks whether that size matches the runtime value `N` and, if not, Resizes the Tensor.

To rewrite this we'll do the following:

```
if (!scale_.defined() || scale_.numel() != N) {
  ReinitializeTensor(&scale_, {N}, at::dtype<float>().device(CPU));
  // set initial value for scale_
}
```

There are some variants: if `scale_` is resized to a constant size, we can call `ReinitializeTensor` directly:

```
if (scale_.numel() != 1) {
  scale_.Resize(1);
}
```
-->
```
ReinitializeTensor(&scale_, {1}, at::dtype<float>().device(CPU));
```

A plain Resize is refactored directly into `ReinitializeTensor`:

```
scale_.Resize(N);
```
-->
```
ReinitializeTensor(&scale_, {N}, at::dtype<float>().device(CPU));
```

Reviewed By: dzhulgakov

Differential Revision: D13667883

fbshipit-source-id: 2c7cb61544b72765b594011b99150eb5a1b50836
---
 caffe2/operators/softmax_op.cc                 | 49 +++++++-----
 caffe2/operators/softmax_ops.cu                | 74 +++++++++++++------
 caffe2/operators/softmax_with_loss_op.cc       | 28 ++++---
 caffe2/operators/softmax_with_loss_op.h        |  6 +-
 .../operators/spatial_softmax_with_loss_op.cc  | 11 +--
 .../operators/spatial_softmax_with_loss_op.h   |  4 +-
 6 files changed, 108 insertions(+), 64 deletions(-)

diff --git a/caffe2/operators/softmax_op.cc b/caffe2/operators/softmax_op.cc
index f925c359f18f8b..2a021ab7304973 100644
--- a/caffe2/operators/softmax_op.cc
+++ b/caffe2/operators/softmax_op.cc
@@ -13,19 +13,26 @@ bool SoftmaxOp<float, CPUContext>::RunOnDevice() {
   const int D = X.size_from_dim(canonical_axis);
   auto* Y = Output(0, X.sizes(), at::dtype<float>());
   float* Ydata = Y->template mutable_data<float>();
-  // ReinitializeTensor itself has the effect of caching, so there is no need to check for numel of Tensor
   // First, get scales
-  ReinitializeTensor(
-      &scale_, {N}, at::dtype<float>().device(CPU));
+  if (!scale_.defined()) {
+    scale_ = caffe2::empty({N}, at::dtype<float>().device(CPU));
+  } else if (scale_.numel() != N) {
+    scale_.Resize(N);
+  }
 
-  ReinitializeTensor(
-      &rowmax_, {N}, at::dtype<float>().device(CPU));
+  if (!rowmax_.defined()) {
+    rowmax_ = caffe2::empty({N}, at::dtype<float>().device(CPU));
+  } else if (rowmax_.numel() != N) {
+    rowmax_.Resize(N);
+  }
 
-  ReinitializeTensor(
-      &sum_multiplier_,
-      {D},
-      at::dtype<float>().device(CPU));
-  math::Set<float, CPUContext>(D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
+  if (!sum_multiplier_.defined()) {
+    sum_multiplier_ = caffe2::empty({D}, at::dtype<float>().device(CPU));
+    math::Set<float, CPUContext>(D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
+  } else if (sum_multiplier_.numel() != D) {
+    sum_multiplier_.Resize(D);
+    math::Set<float, CPUContext>(D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
+  }
 
   SoftmaxCPU(
       context_,
@@ -50,18 +57,20 @@ bool SoftmaxGradientOp<float, CPUContext>::RunOnDevice() {
   const int64_t N = Y.size_to_dim(canonical_axis);
   const int64_t D = Y.size_from_dim(canonical_axis);
   // First, get scales
-  if (scale_.numel() != N) {
-    ReinitializeTensor(
-        &scale_, {N}, at::dtype<float>().device(CPU));
+  if (!scale_.defined()) {
+    scale_ = caffe2::empty({N}, at::dtype<float>().device(CPU));
+  } else if (scale_.numel() != N) {
+    scale_.Resize(N);
   }
-  if (sum_multiplier_.numel() != D) {
-    ReinitializeTensor(
-        &sum_multiplier_,
-        {D},
-        at::dtype<float>().device(CPU));
-    math::Set<float, CPUContext>(D, 1.f, sum_multiplier_.mutable_data<float>(),
-        &context_);
+
+  if (!sum_multiplier_.defined()) {
+    sum_multiplier_ = caffe2::empty({D}, at::dtype<float>().device(CPU));
+    math::Set<float, CPUContext>(D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
+  } else if (sum_multiplier_.numel() != D) {
+    sum_multiplier_.Resize(D);
+    math::Set<float, CPUContext>(D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
   }
+
   auto* dX = Output(0, Y.sizes(), at::dtype<float>());
   const float* Ydata = Y.data<float>();
   const float* dYdata = dY.data<float>();
diff --git a/caffe2/operators/softmax_ops.cu b/caffe2/operators/softmax_ops.cu
index 0876ad884f832c..d1dd96d90cecf9 100644
--- a/caffe2/operators/softmax_ops.cu
+++ b/caffe2/operators/softmax_ops.cu
@@ -311,17 +311,26 @@ bool SoftmaxWithLossOp<float, CUDAContext>::RunOnDevice() {
   auto* avg_loss = Output(1, vector<int64_t>(), at::dtype<float>()); // Average loss
-  if (losses_.size() != N) {
-    ReinitializeTensor(&losses_, {N}, at::dtype<float>().device(CUDA));
+  if (!losses_.defined()) {
+    losses_ = caffe2::empty({N}, at::dtype<float>().device(CUDA));
+  } else if (losses_.numel() != N) {
+    losses_.Resize(N);
   }
-  if (rowmax_.size() != N) {
-    ReinitializeTensor(&rowmax_, {N}, at::dtype<float>().device(CUDA));
+
+  if (!rowmax_.defined()) {
+    rowmax_ = caffe2::empty({N}, at::dtype<float>().device(CUDA));
+  } else if (rowmax_.numel() != N) {
+    rowmax_.Resize(N);
   }
-  if (sum_multiplier_.size() != D) {
-    ReinitializeTensor(&sum_multiplier_, {D}, at::dtype<float>().device(CUDA));
-    math::Set<float, CUDAContext>(
-        D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
+
+  if (!sum_multiplier_.defined()) {
+    sum_multiplier_ = caffe2::empty({D}, at::dtype<float>().device(CUDA));
+    math::Set<float, CUDAContext>(D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
+  } else if (sum_multiplier_.numel() != D) {
+    sum_multiplier_.Resize(D);
+    math::Set<float, CUDAContext>(D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
   }
+
   Softmax(
       N,
       D,
@@ -379,7 +388,7 @@ bool SoftmaxWithLossOp<float, CUDAContext>::RunOnDevice() {
     // Sum of all losses
     float* avg_loss_data = avg_loss->template mutable_data<float>();
     math::Sum<float, CUDAContext>(
-        losses_.size(), losses_.data<float>(), avg_loss_data, &context_, &scratch_);
+        losses_.numel(), losses_.data<float>(), avg_loss_data, &context_, &scratch_);
     // Average of input batch size
     if (total_weight > 0) {
       math::Scale(
@@ -409,11 +418,16 @@ bool SpatialSoftmaxWithLossOp<float, CUDAContext>::RunOnDevice() {
   int H = X.dim32(2);
   int W = X.dim32(3);
-  if (losses_.size() != N * W * H) {
-    ReinitializeTensor(&losses_, {N * W * H}, at::dtype<float>().device(CUDA));
+  if (!losses_.defined()) {
+    losses_ = caffe2::empty({N * W * H}, at::dtype<float>().device(CUDA));
+  } else if (losses_.numel() != N * W * H) {
+    losses_.Resize(N * W * H);
   }
-  if (weights_.size() != N * W * H) {
-    ReinitializeTensor(&weights_, {N * W * H}, at::dtype<float>().device(CUDA));
+
+  if (!weights_.defined()) {
+    weights_ = caffe2::empty({N * W * H}, at::dtype<float>().device(CUDA));
+  } else if (weights_.numel() != N * W * H) {
+    weights_.Resize(N * W * H);
   }
 
   const float* Xdata = X.data<float>();
@@ -454,7 +468,7 @@ bool SpatialSoftmaxWithLossOp<float, CUDAContext>::RunOnDevice() {
   // Somewhat awkward scalar passing from device to host
   float h_total_weight;
   math::Sum<float, CUDAContext>(
-      weights_.size(),
+      weights_.numel(),
      weights_.data<float>(),
       total_weight_ptr_.mutable_data<float>(),
       &context_,
       &scratch_);
@@ -467,7 +481,7 @@
       context_.cuda_stream()));
 
   math::Sum<float, CUDAContext>(
-      losses_.size(), losses_.data<float>(), avg_loss_data, &context_, &scratch_);
+      losses_.numel(), losses_.data<float>(), avg_loss_data, &context_, &scratch_);
   // Final scaling
   if (h_total_weight > 0) {
@@ -624,8 +638,10 @@ bool SpatialSoftmaxWithLossGradientOp<float, CUDAContext>::RunOnDevice() {
   int H = X.dim32(2);
   int W = X.dim32(3);
   dX->ResizeLike(X);
-  if (weights_.size() != N * W * H) {
-    ReinitializeTensor(&weights_, {N * W * H}, at::dtype<float>().device(CUDA));
+  if (!weights_.defined()) {
+    weights_ = caffe2::empty({N * W * H}, at::dtype<float>().device(CUDA));
+  } else if (weights_.numel() != N * W * H) {
+    weights_.Resize(N * W * H);
   }
 
   const float* Pdata = P.data<float>();
@@ -649,7 +665,7 @@
       N, D, W, H, label_data, weights, dX_data, weights_.mutable_data<float>());
 
   math::Sum<float, CUDAContext>(
-      weights_.size(),
+      weights_.numel(),
       weights_.data<float>(),
       total_weight_ptr_.mutable_data<float>(),
       &context_,
@@ -696,17 +712,27 @@ bool SoftmaxOp<float, CUDAContext>::RunOnDevice() {
   if (N == 0) {
     return true;
   }
-  if (sum_multiplier_.size() != D) {
-    ReinitializeTensor(&sum_multiplier_, {D}, at::dtype<float>().device(CUDA));
+  if (!sum_multiplier_.defined()) {
+    sum_multiplier_ = caffe2::empty({D}, at::dtype<float>().device(CUDA));
+    math::Set<float, CUDAContext>(
+        D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
+  } else if (sum_multiplier_.numel() != D) {
+    sum_multiplier_.Resize(D);
     math::Set<float, CUDAContext>(
         D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
   }
-  if (scale_.size() != N) {
-    ReinitializeTensor(&scale_, {N}, at::dtype<float>().device(CUDA));
+  if (!scale_.defined()) {
+    scale_ = caffe2::empty({N}, at::dtype<float>().device(CUDA));
+  } else if (scale_.numel() != N) {
+    scale_.Resize(N);
   }
-  if (rowmax_.size() != N) {
-    ReinitializeTensor(&rowmax_, {N}, at::dtype<float>().device(CUDA));
+
+  if (!rowmax_.defined()) {
+    rowmax_ = caffe2::empty({N}, at::dtype<float>().device(CUDA));
+  } else if (rowmax_.numel() != N) {
+    rowmax_.Resize(N);
   }
+
   Softmax(
       N,
       D,
diff --git a/caffe2/operators/softmax_with_loss_op.cc b/caffe2/operators/softmax_with_loss_op.cc
index 81a6c7ea297b8a..36a77408d4c254 100644
--- a/caffe2/operators/softmax_with_loss_op.cc
+++ b/caffe2/operators/softmax_with_loss_op.cc
@@ -178,19 +178,25 @@ bool SoftmaxWithLossOp<float, CPUContext>::RunOnDevice() {
     }
   }
 
-  if (sum_multiplier_.numel() != D) {
-    ReinitializeTensor(
-        &sum_multiplier_,
-        {D},
-        at::dtype<float>().device(CPU));
-    math::Set<float, CPUContext>(
-        D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
+  if (!sum_multiplier_.defined()) {
+    sum_multiplier_ = caffe2::empty({D}, at::dtype<float>().device(CPU));
+    math::Set<float, CPUContext>(D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
+  } else if (sum_multiplier_.numel() != D) {
+    sum_multiplier_.Resize(D);
+    math::Set<float, CPUContext>(D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
   }
 
-  ReinitializeTensor(
-      &rowmax_, {N}, at::dtype<float>().device(CPU));
-  ReinitializeTensor(
-      &losses_, {N}, at::dtype<float>().device(CPU));
+  if (!losses_.defined()) {
+    losses_ = caffe2::empty({N}, at::dtype<float>().device(CPU));
+  } else if (losses_.numel() != N) {
+    losses_.Resize(N);
+  }
+
+  if (!rowmax_.defined()) {
+    rowmax_ = caffe2::empty({N}, at::dtype<float>().device(CPU));
+  } else if (rowmax_.numel() != N) {
+    rowmax_.Resize(N);
+  }
 
   SoftmaxCPU(
       context_,
diff --git a/caffe2/operators/softmax_with_loss_op.h b/caffe2/operators/softmax_with_loss_op.h
index ce47072d5ddd6b..d72e3058905d80 100644
--- a/caffe2/operators/softmax_with_loss_op.h
+++ b/caffe2/operators/softmax_with_loss_op.h
@@ -34,9 +34,10 @@ class SoftmaxWithLossOp final : public Operator<Context> {
   Tensor losses_; // Per example loss
   Tensor rowmax_; // per example row max
-  Tensor weights_{Context::GetDeviceType()}; // unignored weights
+  Tensor weights_; // unignored weights
   Tensor sum_multiplier_; // Vector of ones for summing via dot prod
   Tensor total_weight_ptr_;
+  // passed to a function
   Tensor scratch_{Context::GetDeviceType()};
 };
@@ -62,8 +63,9 @@ class SoftmaxWithLossGradientOp final : public Operator<Context> {
  protected:
   float scale_;
   int label_prob_mode_;
+  // not used?
   Tensor sum_multiplier_{Context::GetDeviceType()};
-  Tensor weights_{Context::GetDeviceType()}; // unignored weights
+  Tensor weights_; // unignored weights
   Tensor total_weight_ptr_;
   StorageOrder order_;
   bool only_loss_;
diff --git a/caffe2/operators/spatial_softmax_with_loss_op.cc b/caffe2/operators/spatial_softmax_with_loss_op.cc
index 9c650bf46deff4..09464b0e05d7dc 100644
--- a/caffe2/operators/spatial_softmax_with_loss_op.cc
+++ b/caffe2/operators/spatial_softmax_with_loss_op.cc
@@ -72,11 +72,12 @@ bool SpatialSoftmaxWithLossOp<float, CPUContext>::RunOnDevice() {
   auto* P = Output(0, X.sizes(), at::dtype<float>()); // Probabilities from softmax
-  if (sum_multiplier_.numel() != D) {
-    ReinitializeTensor(
-        &sum_multiplier_,
-        {D},
-        at::dtype<float>().device(CPU));
+  if (!sum_multiplier_.defined()) {
+    sum_multiplier_ = caffe2::empty({D}, at::dtype<float>().device(CPU));
+    math::Set<float, CPUContext>(
+        D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
+  } else if (sum_multiplier_.numel() != D) {
+    sum_multiplier_.Resize(D);
     math::Set<float, CPUContext>(
         D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
   }
diff --git a/caffe2/operators/spatial_softmax_with_loss_op.h b/caffe2/operators/spatial_softmax_with_loss_op.h
index c728a458df3af1..97d38184694f20 100644
--- a/caffe2/operators/spatial_softmax_with_loss_op.h
+++ b/caffe2/operators/spatial_softmax_with_loss_op.h
@@ -29,7 +29,7 @@ class SpatialSoftmaxWithLossOp final : public Operator<Context> {
   StorageOrder order_;
 
   Tensor losses_; // Per example loss
-  Tensor rowmax_{Context::GetDeviceType()}; // per example row max
+  Tensor rowmax_; // per example row max
   Tensor weights_; // unignored weights
   Tensor sum_multiplier_; // Vector of ones for summing via dot prod
   Tensor total_weight_ptr_;
@@ -55,7 +55,7 @@ class SpatialSoftmaxWithLossGradientOp final : public Operator<Context> {
 
  protected:
   float scale_;
-  Tensor sum_multiplier_{Context::GetDeviceType()};
+  Tensor sum_multiplier_;
   Tensor weights_; // unignored weights
   Tensor total_weight_ptr_;
   StorageOrder order_;